langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.1.dist-info/RECORD +0 -162
  163. {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,718 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import itertools
4
- import logging
5
- import re
6
- from enum import Enum
7
- from io import BytesIO
8
- from typing import TYPE_CHECKING, Any, Generator, List, Tuple
9
-
10
- from langroid.exceptions import LangroidImportError
11
- from langroid.utils.object_registry import ObjectRegistry
12
-
13
- try:
14
- import fitz
15
- except ImportError:
16
- if not TYPE_CHECKING:
17
- fitz = None
18
-
19
- try:
20
- import pypdf
21
- except ImportError:
22
- if not TYPE_CHECKING:
23
- pypdf = None
24
-
25
- try:
26
- import pdfplumber
27
- except ImportError:
28
- if not TYPE_CHECKING:
29
- pdfplumber = None
30
-
31
- import requests
32
- from bs4 import BeautifulSoup
33
-
34
- if TYPE_CHECKING:
35
- from PIL import Image
36
-
37
- from langroid.mytypes import DocMetaData, Document
38
- from langroid.parsing.parser import Parser, ParsingConfig
39
-
40
- logger = logging.getLogger(__name__)
41
-
42
-
43
class DocumentType(str, Enum):
    """
    Document formats distinguished by the parsers in this module.

    The enum also derives from `str`, so each member compares equal to its
    lowercase string value (e.g. `DocumentType.PDF == "pdf"`), and
    `DocumentType("pdf")` looks a member up by that value.
    """

    PDF = "pdf"
    DOCX = "docx"
    DOC = "doc"
    TXT = "txt"
48
-
49
-
50
def find_last_full_char(possible_unicode: bytes) -> int:
    """
    Locate the start of the last UTF-8 character in a byte string.

    Scans backwards from the final byte, skipping UTF-8 continuation bytes
    (bit pattern `10xxxxxx`), and returns the index of the first byte found
    that is not a continuation byte. Slicing the input up to that index
    therefore drops the last character, whether or not it is complete.

    Args:
        possible_unicode (bytes): The bytes to check.
    Returns:
        int: The index at which the last character starts. Index 0 is never
            inspected: 0 is returned for empty input, for single-byte input,
            and when every byte after the first is a continuation byte.
    """
    position = len(possible_unicode) - 1
    while position > 0:
        is_continuation = (possible_unicode[position] & 0xC0) == 0x80
        if not is_continuation:
            return position
        position -= 1
    return 0
63
-
64
-
65
def is_plain_text(path_or_bytes: str | bytes) -> bool:
    """
    Decide whether a source looks like UTF-8 plain text.

    Only the first 1024 bytes are examined. They must be reported as a
    `text/*` MIME type by `magic`, and must decode as UTF-8 once the trailing
    (possibly truncated) character has been cut off.

    Args:
        path_or_bytes (str|bytes): An http(s) URL, a local file path,
            or a bytes object.
    Returns:
        bool: True if the sample is plain text, False otherwise.
    """
    # Collect at most the first 1024 bytes of the source.
    if not isinstance(path_or_bytes, str):
        sample = path_or_bytes[:1024]
    elif path_or_bytes.startswith(("http://", "https://")):
        reply = requests.get(path_or_bytes)
        reply.raise_for_status()
        sample = reply.content[:1024]
    else:
        with open(path_or_bytes, "rb") as handle:
            sample = handle.read(1024)

    try:
        # Use magic to detect the MIME type
        import magic

        detected = magic.from_buffer(sample, mime=True)
        if not detected.startswith("text/"):
            return False

        # Cut the sample where its last character starts, so a multi-byte
        # character split by the 1024-byte limit cannot fail the decode.
        sample = sample[: find_last_full_char(sample)]
        sample.decode("utf-8")
    except UnicodeDecodeError:
        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
        return False
    return True
106
-
107
-
108
class DocumentParser(Parser):
    """
    Abstract base class for extracting text from special types of docs
    such as PDFs or Docx.

    Subclasses provide `iterate_pages` and `extract_text_from_page`;
    `get_doc` and `get_doc_chunks` are built on top of those two methods.

    Attributes:
        source (str): The source, either a URL or a file path
            (the literal string "bytes" when constructed from a bytes object).
        doc_bytes (BytesIO): BytesIO object containing the doc data.
    """

    @classmethod
    def create(
        cls,
        source: str | bytes,
        config: ParsingConfig,
        doc_type: str | DocumentType | None = None,
    ) -> "DocumentParser":
        """
        Create a DocumentParser instance based on source type
        and config.<source_type>.library specified.

        Args:
            source (str|bytes): The source, could be a URL, file path,
                or bytes object.
            config (ParsingConfig): The parser configuration.
            doc_type (str|DocumentType|None): The type of document, if known

        Returns:
            DocumentParser: An instance of a DocumentParser subclass.

        Raises:
            ValueError: If the document type, or the library configured for
                that type, is not supported.
        """
        inferred_doc_type = DocumentParser._document_type(source, doc_type)
        if inferred_doc_type == DocumentType.PDF:
            if config.pdf.library == "fitz":
                return FitzPDFParser(source, config)
            elif config.pdf.library == "pypdf":
                return PyPDFParser(source, config)
            elif config.pdf.library == "pdfplumber":
                return PDFPlumberParser(source, config)
            elif config.pdf.library == "unstructured":
                return UnstructuredPDFParser(source, config)
            elif config.pdf.library == "pdf2image":
                return ImagePdfParser(source, config)
            else:
                raise ValueError(
                    f"Unsupported PDF library specified: {config.pdf.library}"
                )
        elif inferred_doc_type == DocumentType.DOCX:
            if config.docx.library == "unstructured":
                return UnstructuredDocxParser(source, config)
            elif config.docx.library == "python-docx":
                return PythonDocxParser(source, config)
            else:
                raise ValueError(
                    f"Unsupported DOCX library specified: {config.docx.library}"
                )
        elif inferred_doc_type == DocumentType.DOC:
            return UnstructuredDocParser(source, config)
        else:
            # Plain text has no dedicated parser class.
            source_name = source if isinstance(source, str) else "bytes"
            raise ValueError(f"Unsupported document type: {source_name}")

    def __init__(self, source: str | bytes, config: ParsingConfig):
        """
        Load the document into memory.

        Args:
            source (str|bytes): The source, which could be
                a path, a URL or a bytes object.
            config (ParsingConfig): The parser configuration.
        """
        super().__init__(config)
        self.config = config
        if isinstance(source, bytes):
            self.source = "bytes"
            self.doc_bytes = BytesIO(source)
        else:
            self.source = source
            self.doc_bytes = self._load_doc_as_bytesio()

    @staticmethod
    def _document_type(
        source: str | bytes, doc_type: str | DocumentType | None = None
    ) -> DocumentType:
        """
        Determine the type of document based on the source.

        An explicit `doc_type` wins. Otherwise the source is first tested for
        plain text, then classified by file extension (str sources) or by
        MIME type (bytes sources).

        Args:
            source (str|bytes): The source, which could be a URL,
                a file path, or a bytes object.
            doc_type (str|DocumentType|None): The type of document, if known.

        Returns:
            DocumentType: The document type.

        Raises:
            ValueError: If `doc_type` is not a valid `DocumentType` value,
                or the type cannot be determined from the source.
        """
        if isinstance(doc_type, DocumentType):
            return doc_type
        if doc_type:
            return DocumentType(doc_type.lower())
        if is_plain_text(source):
            return DocumentType.TXT
        if isinstance(source, str):
            # detect file type from path extension
            if source.lower().endswith(".pdf"):
                return DocumentType.PDF
            elif source.lower().endswith(".docx"):
                return DocumentType.DOCX
            elif source.lower().endswith(".doc"):
                return DocumentType.DOC
            else:
                raise ValueError(f"Unsupported document type: {source}")
        else:
            # must be bytes: attempt to detect type from content
            # using magic mime type detection
            import magic

            mime_type = magic.from_buffer(source, mime=True)
            if mime_type == "application/pdf":
                return DocumentType.PDF
            elif mime_type in [
                "application/vnd.openxmlformats-officedocument"
                ".wordprocessingml.document",
                "application/zip",
            ]:
                # DOCX files are essentially ZIP files,
                # but this might catch other ZIP-based formats too!
                return DocumentType.DOCX
            elif mime_type == "application/msword":
                return DocumentType.DOC
            else:
                raise ValueError("Unsupported document type from bytes")

    def _load_doc_as_bytesio(self) -> BytesIO:
        """
        Load the docs into a BytesIO object.

        `self.source` is downloaded when it is an http(s) URL,
        and read from disk otherwise.

        Returns:
            BytesIO: A BytesIO object containing the doc data.
        """
        if self.source.startswith(("http://", "https://")):
            response = requests.get(self.source)
            response.raise_for_status()
            return BytesIO(response.content)
        else:
            with open(self.source, "rb") as f:
                return BytesIO(f.read())

    @staticmethod
    def chunks_from_path_or_bytes(
        source: str | bytes,
        parser: Parser,
        doc_type: str | DocumentType | None = None,
        lines: int | None = None,
    ) -> List[Document]:
        """
        Get document chunks from a file path or bytes object.

        Args:
            source (str|bytes): The source, which could be a URL, path or bytes object.
            parser (Parser): The parser instance (for splitting the document).
            doc_type (str|DocumentType|None): The type of document, if known.
            lines (int|None): The number of lines to read from a plain text file.
        Returns:
            List[Document]: A list of `Document` objects,
                each containing a chunk of text, determined by the
                chunking and splitting settings in the parser config.
        """
        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
            # Pass the resolved type so `create` does not sniff the source
            # again (for URL sources, sniffing means another download).
            doc_parser = DocumentParser.create(
                source,
                parser.config,
                doc_type=dtype,
            )
            chunks = doc_parser.get_doc_chunks()
            # NOTE(review): `get_doc_chunks` emits one chunk even when the
            # extracted text is empty, so this image-PDF fallback only fires
            # when no pages were yielded at all -- confirm that is intended.
            if len(chunks) == 0 and dtype == DocumentType.PDF:
                doc_parser = ImagePdfParser(source, parser.config)
                chunks = doc_parser.get_doc_chunks()
            return chunks
        else:
            # try getting as plain text; these will be chunked downstream
            # -- could be a bytes object, a URL or a path
            raw: str | bytes = source
            if isinstance(source, str) and source.startswith(
                ("http://", "https://")
            ):
                # A URL cannot be `open()`ed; fetch it and treat the
                # response body like a bytes source.
                response = requests.get(source)
                response.raise_for_status()
                raw = response.content
            if isinstance(raw, bytes):
                content = raw.decode()
                if lines is not None:
                    file_lines = content.splitlines()[:lines]
                    content = "\n".join(line.strip() for line in file_lines)
            else:
                with open(raw, "r") as f:
                    if lines is not None:
                        file_lines = list(itertools.islice(f, lines))
                        content = "\n".join(line.strip() for line in file_lines)
                    else:
                        content = f.read()
            # Run the text through an HTML parser and keep only its text.
            soup = BeautifulSoup(content, "html.parser")
            text = soup.get_text()
            source_name = source if isinstance(source, str) else "bytes"
            doc = Document(
                content=text,
                metadata=DocMetaData(source=str(source_name)),
            )
            return parser.split([doc])

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Yield `(index, page)` pairs for the document.

        `get_doc_chunks` labels a page as `index + 1`.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def extract_text_from_page(self, page: Any) -> str:
        """Extract text from a given page. Must be implemented by subclasses."""
        raise NotImplementedError

    def fix_text(self, text: str) -> str:
        """
        Fix text extracted from a PDF.

        Args:
            text (str): The extracted text.

        Returns:
            str: The fixed text.
        """
        # Some pdf parsers introduce extra space before hyphen,
        # so use regular expression to replace 'space-hyphen' with just 'hyphen'
        return re.sub(r" +\-", "-", text)

    def get_doc(self) -> Document:
        """
        Get entire text from source as a single document.

        Returns:
            a `Document` object containing the content of the pdf file,
                and metadata containing source name (URL or path)
        """

        text = "".join(
            [self.extract_text_from_page(page) for _, page in self.iterate_pages()]
        )
        return Document(content=text, metadata=DocMetaData(source=self.source))

    def get_doc_chunks(self) -> List[Document]:
        """
        Get document chunks from a pdf source,
        with page references in the document metadata.

        Adapted from
        https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py

        Returns:
            List[Document]: a list of `Document` objects,
                each containing a chunk of text. The list is empty when
                the source yields no pages.
        """

        split = []  # tokens in curr split
        pages: List[str] = []
        docs: List[Document] = []
        # metadata.id to be shared by ALL chunks of this document
        common_id = ObjectRegistry.new_id()
        n_chunks = 0  # how many chunk so far
        for i, page in self.iterate_pages():
            page_text = self.extract_text_from_page(page)
            split += self.tokenizer.encode(page_text)
            pages.append(str(i + 1))
            # split could be so long it needs to be split
            # into multiple chunks. Or it could be so short
            # that it needs to be combined with the next chunk.
            # NOTE(review): assumes config.overlap < config.chunk_size,
            # otherwise `split` may never shrink -- TODO confirm.
            while len(split) > self.config.chunk_size:
                # page range label: "<first page>-<last page>"
                pg = "-".join([pages[0], pages[-1]])
                text = self.tokenizer.decode(split[: self.config.chunk_size])
                docs.append(
                    Document(
                        content=text,
                        metadata=DocMetaData(
                            source=f"{self.source} pages {pg}",
                            is_chunk=True,
                            id=common_id,
                        ),
                    )
                )
                n_chunks += 1
                split = split[self.config.chunk_size - self.config.overlap :]
                pages = [str(i + 1)]
        if not pages:
            # No pages were yielded, so there is nothing to chunk
            # (and `pages[0]` below would raise IndexError).
            return docs
        # there may be a last split remaining:
        # if it's shorter than the overlap, we shouldn't make a chunk for it
        # since it's already included in the prior chunk;
        # the only exception is if there have been no chunks so far.
        if len(split) > self.config.overlap or n_chunks == 0:
            pg = "-".join([pages[0], pages[-1]])
            text = self.tokenizer.decode(split[: self.config.chunk_size])
            docs.append(
                Document(
                    content=text,
                    metadata=DocMetaData(
                        source=f"{self.source} pages {pg}",
                        is_chunk=True,
                        id=common_id,
                    ),
                )
            )
        self.add_window_ids(docs)
        return docs
404
-
405
-
406
class FitzPDFParser(DocumentParser):
    """
    Parser for processing PDFs using the `fitz` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
        """
        Yield each page in the PDF using `fitz`.

        The document is closed when iteration finishes, and also when the
        generator is abandoned early or the consumer raises.

        Returns:
            Generator[fitz.Page]: Generator yielding `(index, page)` pairs,
                with `index` starting at 0.

        Raises:
            LangroidImportError: If `fitz` is not installed.
        """
        if fitz is None:
            raise LangroidImportError("fitz", "pdf-parsers")
        doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
        try:
            for i, page in enumerate(doc):
                yield i, page
        finally:
            # Release the document even if the caller stops iterating early.
            doc.close()

    def extract_text_from_page(self, page: "fitz.Page") -> str:
        """
        Extract text from a given `fitz` page.

        Args:
            page (fitz.Page): The `fitz` page object.

        Returns:
            str: Extracted text from the page.
        """
        return self.fix_text(page.get_text())
436
-
437
-
438
class PyPDFParser(DocumentParser):
    """
    PDF parser backed by the `pypdf` library.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, "pypdf.PageObject"], None, None]:
        """
        Walk through the pages of the PDF with `pypdf`.

        Returns:
            Generator[pypdf.PageObject]: Generator yielding `(index, page)`
                pairs, with `index` starting at 0.

        Raises:
            LangroidImportError: If `pypdf` is not installed.
        """
        if pypdf is None:
            raise LangroidImportError("pypdf", "pdf-parsers")
        yield from enumerate(pypdf.PdfReader(self.doc_bytes).pages)

    def extract_text_from_page(self, page: "pypdf.PageObject") -> str:
        """
        Pull the text out of a single `pypdf` page.

        Args:
            page (pypdf.PageObject): The `pypdf` page object.

        Returns:
            str: Extracted text from the page, passed through `fix_text`.
        """
        raw_text = page.extract_text()
        return self.fix_text(raw_text)
467
-
468
-
469
class PDFPlumberParser(DocumentParser):
    """
    Parser for processing PDFs using the `pdfplumber` library.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, "pdfplumber.pdf.Page"], None, None]:  # type: ignore
        """
        Yield each page in the PDF using `pdfplumber`.

        Returns:
            Generator[pdfplumber.Page]: Generator yielding `(index, page)`
                pairs, with `index` starting at 0.

        Raises:
            LangroidImportError: If `pdfplumber` is not installed.
        """
        if pdfplumber is None:
            raise LangroidImportError("pdfplumber", "pdf-parsers")
        with pdfplumber.open(self.doc_bytes) as pdf:
            for i, page in enumerate(pdf.pages):
                yield i, page

    def extract_text_from_page(self, page: "pdfplumber.pdf.Page") -> str:  # type: ignore
        """
        Extract text from a given `pdfplumber` page.

        Args:
            page (pdfplumber.Page): The `pdfplumber` page object.

        Returns:
            str: Extracted text from the page; empty if the page has no text.
        """
        # Some pdfplumber versions return None for a page without text,
        # which `fix_text` (a regex substitution) cannot handle.
        return self.fix_text(page.extract_text() or "")
500
-
501
-
502
class ImagePdfParser(DocumentParser):
    """
    Parser for PDFs whose pages are images, i.e. not "true" PDFs.
    Pages are rendered with `pdf2image` and read with `pytesseract`.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, "Image"], None, None]:  # type: ignore
        """
        Render the PDF to images and yield them one by one.

        Returns:
            Generator[Image]: Generator yielding `(index, image)` pairs,
                with `index` starting at 0.

        Raises:
            LangroidImportError: If `pdf2image` is not installed.
        """
        try:
            from pdf2image import convert_from_bytes
        except ImportError:
            raise LangroidImportError("pdf2image", "pdf-parsers")

        rendered_pages = convert_from_bytes(self.doc_bytes.getvalue())
        yield from enumerate(rendered_pages)

    def extract_text_from_page(self, page: "Image") -> str:  # type: ignore
        """
        Read the text of one rendered page via `pytesseract`.

        Args:
            page (Image): The PIL Image object.

        Returns:
            str: Extracted text from the image, passed through `fix_text`.

        Raises:
            LangroidImportError: If `pytesseract` is not installed.
        """
        try:
            import pytesseract
        except ImportError:
            raise LangroidImportError("pytesseract", "pdf-parsers")

        return self.fix_text(pytesseract.image_to_string(page))
536
-
537
-
538
class UnstructuredPDFParser(DocumentParser):
    """
    Parser for processing PDF files using the `unstructured` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Yield the elements of each non-empty page of the PDF.

        Elements are grouped into pages at every `PageBreak` element.

        Returns:
            Generator: Generator yielding `(index, elements)` pairs, where
                `index` is the 0-based page position (consumers such as
                `get_doc_chunks` label the page as `index + 1`).

        Raises:
            ImportError: If `unstructured` is not installed.
            Exception: If `unstructured` fails to parse the PDF.
        """
        try:
            from unstructured.partition.pdf import partition_pdf
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        # from unstructured.chunking.title import chunk_by_title

        try:
            elements = partition_pdf(file=self.doc_bytes, include_page_breaks=True)
        except Exception as e:
            # Chain the original error so its traceback is not lost.
            raise Exception(
                f"""
                Error parsing PDF: {e}
                The `unstructured` library failed to parse the pdf.
                Please try a different library by setting the `library` field
                in the `pdf` section of the `parsing` field in the config file.
                Supported libraries are:
                fitz, pypdf, pdfplumber, unstructured, pdf2image
                """
            ) from e

        # elements = chunk_by_title(elements)
        # 0-based, like the other parsers in this file: `get_doc_chunks`
        # adds 1 when building the page label.
        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from the `unstructured` elements of one page.

        Args:
            page (list of unstructured elements): The elements of the page,
                as yielded by `iterate_pages`.

        Returns:
            str: The elements' texts joined by single spaces.
        """
        text = " ".join(el.text for el in page)
        return self.fix_text(text)
599
-
600
-
601
class UnstructuredDocxParser(DocumentParser):
    """
    Parser for processing DOCX files using the `unstructured` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Yield the elements of each non-empty page of the DOCX file.

        Elements are grouped into pages at every `PageBreak` element.

        Returns:
            Generator: Generator yielding `(index, elements)` pairs, where
                `index` is the 0-based page position (consumers such as
                `get_doc_chunks` label the page as `index + 1`).

        Raises:
            ImportError: If `unstructured` is not installed.
        """
        try:
            from unstructured.partition.docx import partition_docx
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)

        # 0-based, like the other parsers in this file: `get_doc_chunks`
        # adds 1 when building the page label.
        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from the `unstructured` elements of one page.

        Note:
            The concept of "pages" doesn't actually exist in the .docx file format in
            the same way it does in formats like .pdf. A .docx file is made up of a
            series of elements like paragraphs and tables, but the division into
            pages is done dynamically based on the rendering settings (like the page
            size, margin size, font size, etc.).

        Args:
            page (list of unstructured elements): The elements of the page,
                as yielded by `iterate_pages`.

        Returns:
            str: The elements' texts joined by single spaces.
        """
        text = " ".join(el.text for el in page)
        return self.fix_text(text)
654
-
655
-
656
class UnstructuredDocParser(UnstructuredDocxParser):
    """
    Parser for processing DOC files using the `unstructured` library.

    Text extraction is inherited from `UnstructuredDocxParser`;
    only the partitioning function differs.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Yield the elements of each non-empty page of the DOC file.

        Elements are grouped into pages at every `PageBreak` element.

        Returns:
            Generator: Generator yielding `(index, elements)` pairs, where
                `index` is the 0-based page position (consumers such as
                `get_doc_chunks` label the page as `index + 1`).

        Raises:
            ImportError: If `unstructured` is not installed.
        """
        try:
            from unstructured.partition.doc import partition_doc
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        elements = partition_doc(file=self.doc_bytes, include_page_breaks=True)

        # 0-based, like the other parsers in this file: `get_doc_chunks`
        # adds 1 when building the page label.
        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements
685
-
686
-
687
class PythonDocxParser(DocumentParser):
    """
    Parser for processing DOCX files using the `python-docx` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Simulate iterating through pages.
        In a DOCX file, pages are not explicitly defined,
        so we consider each paragraph as a separate 'page' for simplicity.

        Returns:
            Generator: Generator yielding `(index, [paragraph])` pairs, where
                `index` is the 0-based paragraph position (consumers such as
                `get_doc_chunks` label the 'page' as `index + 1`).

        Raises:
            LangroidImportError: If `python-docx` is not installed.
        """
        try:
            import docx
        except ImportError:
            raise LangroidImportError("python-docx", "docx")

        doc = docx.Document(self.doc_bytes)
        # 0-based, like the other parsers in this file: `get_doc_chunks`
        # adds 1 when building the page label.
        for i, para in enumerate(doc.paragraphs):
            yield i, [para]

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from a given 'page', which in this case is a single paragraph.

        Args:
            page (list): A list containing a single Paragraph object.

        Returns:
            str: Extracted text from the paragraph.
        """
        paragraph = page[0]
        return self.fix_text(paragraph.text)