langroid 0.33.4__py3-none-any.whl → 0.33.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. langroid/__init__.py +106 -0
  2. langroid/agent/__init__.py +41 -0
  3. langroid/agent/base.py +1983 -0
  4. langroid/agent/batch.py +398 -0
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +598 -0
  7. langroid/agent/chat_agent.py +1899 -0
  8. langroid/agent/chat_document.py +454 -0
  9. langroid/agent/openai_assistant.py +882 -0
  10. langroid/agent/special/__init__.py +59 -0
  11. langroid/agent/special/arangodb/__init__.py +0 -0
  12. langroid/agent/special/arangodb/arangodb_agent.py +656 -0
  13. langroid/agent/special/arangodb/system_messages.py +186 -0
  14. langroid/agent/special/arangodb/tools.py +107 -0
  15. langroid/agent/special/arangodb/utils.py +36 -0
  16. langroid/agent/special/doc_chat_agent.py +1466 -0
  17. langroid/agent/special/lance_doc_chat_agent.py +262 -0
  18. langroid/agent/special/lance_rag/__init__.py +9 -0
  19. langroid/agent/special/lance_rag/critic_agent.py +198 -0
  20. langroid/agent/special/lance_rag/lance_rag_task.py +82 -0
  21. langroid/agent/special/lance_rag/query_planner_agent.py +260 -0
  22. langroid/agent/special/lance_tools.py +61 -0
  23. langroid/agent/special/neo4j/__init__.py +0 -0
  24. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  25. langroid/agent/special/neo4j/neo4j_chat_agent.py +433 -0
  26. langroid/agent/special/neo4j/system_messages.py +120 -0
  27. langroid/agent/special/neo4j/tools.py +32 -0
  28. langroid/agent/special/relevance_extractor_agent.py +127 -0
  29. langroid/agent/special/retriever_agent.py +56 -0
  30. langroid/agent/special/sql/__init__.py +17 -0
  31. langroid/agent/special/sql/sql_chat_agent.py +654 -0
  32. langroid/agent/special/sql/utils/__init__.py +21 -0
  33. langroid/agent/special/sql/utils/description_extractors.py +190 -0
  34. langroid/agent/special/sql/utils/populate_metadata.py +85 -0
  35. langroid/agent/special/sql/utils/system_message.py +35 -0
  36. langroid/agent/special/sql/utils/tools.py +64 -0
  37. langroid/agent/special/table_chat_agent.py +263 -0
  38. langroid/agent/task.py +2095 -0
  39. langroid/agent/tool_message.py +393 -0
  40. langroid/agent/tools/__init__.py +38 -0
  41. langroid/agent/tools/duckduckgo_search_tool.py +50 -0
  42. langroid/agent/tools/file_tools.py +234 -0
  43. langroid/agent/tools/google_search_tool.py +39 -0
  44. langroid/agent/tools/metaphor_search_tool.py +68 -0
  45. langroid/agent/tools/orchestration.py +303 -0
  46. langroid/agent/tools/recipient_tool.py +235 -0
  47. langroid/agent/tools/retrieval_tool.py +32 -0
  48. langroid/agent/tools/rewind_tool.py +137 -0
  49. langroid/agent/tools/segment_extract_tool.py +41 -0
  50. langroid/agent/xml_tool_message.py +382 -0
  51. langroid/cachedb/__init__.py +17 -0
  52. langroid/cachedb/base.py +58 -0
  53. langroid/cachedb/momento_cachedb.py +108 -0
  54. langroid/cachedb/redis_cachedb.py +153 -0
  55. langroid/embedding_models/__init__.py +39 -0
  56. langroid/embedding_models/base.py +74 -0
  57. langroid/embedding_models/models.py +461 -0
  58. langroid/embedding_models/protoc/__init__.py +0 -0
  59. langroid/embedding_models/protoc/embeddings.proto +19 -0
  60. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  61. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  62. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  63. langroid/embedding_models/remote_embeds.py +153 -0
  64. langroid/exceptions.py +71 -0
  65. langroid/language_models/__init__.py +53 -0
  66. langroid/language_models/azure_openai.py +153 -0
  67. langroid/language_models/base.py +678 -0
  68. langroid/language_models/config.py +18 -0
  69. langroid/language_models/mock_lm.py +124 -0
  70. langroid/language_models/openai_gpt.py +1964 -0
  71. langroid/language_models/prompt_formatter/__init__.py +16 -0
  72. langroid/language_models/prompt_formatter/base.py +40 -0
  73. langroid/language_models/prompt_formatter/hf_formatter.py +132 -0
  74. langroid/language_models/prompt_formatter/llama2_formatter.py +75 -0
  75. langroid/language_models/utils.py +151 -0
  76. langroid/mytypes.py +84 -0
  77. langroid/parsing/__init__.py +52 -0
  78. langroid/parsing/agent_chats.py +38 -0
  79. langroid/parsing/code_parser.py +121 -0
  80. langroid/parsing/document_parser.py +718 -0
  81. langroid/parsing/para_sentence_split.py +62 -0
  82. langroid/parsing/parse_json.py +155 -0
  83. langroid/parsing/parser.py +313 -0
  84. langroid/parsing/repo_loader.py +790 -0
  85. langroid/parsing/routing.py +36 -0
  86. langroid/parsing/search.py +275 -0
  87. langroid/parsing/spider.py +102 -0
  88. langroid/parsing/table_loader.py +94 -0
  89. langroid/parsing/url_loader.py +111 -0
  90. langroid/parsing/urls.py +273 -0
  91. langroid/parsing/utils.py +373 -0
  92. langroid/parsing/web_search.py +156 -0
  93. langroid/prompts/__init__.py +9 -0
  94. langroid/prompts/dialog.py +17 -0
  95. langroid/prompts/prompts_config.py +5 -0
  96. langroid/prompts/templates.py +141 -0
  97. langroid/pydantic_v1/__init__.py +10 -0
  98. langroid/pydantic_v1/main.py +4 -0
  99. langroid/utils/__init__.py +19 -0
  100. langroid/utils/algorithms/__init__.py +3 -0
  101. langroid/utils/algorithms/graph.py +103 -0
  102. langroid/utils/configuration.py +98 -0
  103. langroid/utils/constants.py +30 -0
  104. langroid/utils/git_utils.py +252 -0
  105. langroid/utils/globals.py +49 -0
  106. langroid/utils/logging.py +135 -0
  107. langroid/utils/object_registry.py +66 -0
  108. langroid/utils/output/__init__.py +20 -0
  109. langroid/utils/output/citations.py +41 -0
  110. langroid/utils/output/printing.py +99 -0
  111. langroid/utils/output/status.py +40 -0
  112. langroid/utils/pandas_utils.py +30 -0
  113. langroid/utils/pydantic_utils.py +602 -0
  114. langroid/utils/system.py +286 -0
  115. langroid/utils/types.py +93 -0
  116. langroid/vector_store/__init__.py +50 -0
  117. langroid/vector_store/base.py +359 -0
  118. langroid/vector_store/chromadb.py +214 -0
  119. langroid/vector_store/lancedb.py +406 -0
  120. langroid/vector_store/meilisearch.py +299 -0
  121. langroid/vector_store/momento.py +278 -0
  122. langroid/vector_store/qdrantdb.py +468 -0
  123. {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/METADATA +95 -94
  124. langroid-0.33.7.dist-info/RECORD +127 -0
  125. {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/WHEEL +1 -1
  126. langroid-0.33.4.dist-info/RECORD +0 -7
  127. langroid-0.33.4.dist-info/entry_points.txt +0 -4
  128. pyproject.toml +0 -356
  129. {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,718 @@
1
+ from __future__ import annotations
2
+
3
+ import itertools
4
+ import logging
5
+ import re
6
+ from enum import Enum
7
+ from io import BytesIO
8
+ from typing import TYPE_CHECKING, Any, Generator, List, Tuple
9
+
10
+ from langroid.exceptions import LangroidImportError
11
+ from langroid.utils.object_registry import ObjectRegistry
12
+
13
+ try:
14
+ import fitz
15
+ except ImportError:
16
+ if not TYPE_CHECKING:
17
+ fitz = None
18
+
19
+ try:
20
+ import pypdf
21
+ except ImportError:
22
+ if not TYPE_CHECKING:
23
+ pypdf = None
24
+
25
+ try:
26
+ import pdfplumber
27
+ except ImportError:
28
+ if not TYPE_CHECKING:
29
+ pdfplumber = None
30
+
31
+ import requests
32
+ from bs4 import BeautifulSoup
33
+
34
+ if TYPE_CHECKING:
35
+ from PIL import Image
36
+
37
+ from langroid.mytypes import DocMetaData, Document
38
+ from langroid.parsing.parser import Parser, ParsingConfig
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
class DocumentType(str, Enum):
    """
    Document kinds recognized by this module.

    Members are also `str` instances, so they compare equal to their
    lowercase string values (e.g. `DocumentType.PDF == "pdf"`).
    """

    PDF = "pdf"
    DOCX = "docx"
    DOC = "doc"
    TXT = "txt"
48
+
49
+
50
def find_last_full_char(possible_unicode: bytes) -> int:
    """
    Locate the start of the last character in a (possibly truncated)
    UTF-8 byte string.

    Scans backwards from the final byte, skipping UTF-8 continuation bytes
    (bit pattern `10xxxxxx`). Index 0 is never inspected.

    Args:
        possible_unicode (bytes): The bytes to check.
    Returns:
        int: Index of the last byte (at position >= 1) that is not a
            continuation byte, or 0 when there is no such byte
            (including for empty or single-byte input).
    """
    pos = len(possible_unicode) - 1
    while pos > 0:
        is_continuation = (possible_unicode[pos] & 0xC0) == 0x80
        if not is_continuation:
            return pos
        pos -= 1
    return 0
63
+
64
+
65
def is_plain_text(path_or_bytes: str | bytes) -> bool:
    """
    Decide whether a source looks like plain (UTF-8) text.

    Only the first 1024 bytes are examined: their MIME type (as reported by
    `magic`) must start with "text/", and they must decode as UTF-8 once
    trimmed at the position given by `find_last_full_char`.

    Args:
        path_or_bytes (str|bytes): A URL (http/https), a local file path,
            or a bytes object.
    Returns:
        bool: True if the sample is judged to be plain text, False otherwise.
    """
    # Collect the leading sample, whatever form the source takes.
    if not isinstance(path_or_bytes, str):
        sample = path_or_bytes[:1024]
    elif path_or_bytes.startswith(("http://", "https://")):
        resp = requests.get(path_or_bytes)
        resp.raise_for_status()
        sample = resp.content[:1024]
    else:
        with open(path_or_bytes, "rb") as fh:
            sample = fh.read(1024)

    try:
        # Imported lazily so that `magic` is only needed when this is called.
        import magic

        detected_mime = magic.from_buffer(sample, mime=True)
        if not detected_mime.startswith("text/"):
            return False

        # The 1024-byte cut may land inside a multi-byte character;
        # trim the tail before attempting to decode.
        sample = sample[: find_last_full_char(sample)]
        sample.decode("utf-8")
    except UnicodeDecodeError:
        # Not decodable as UTF-8: treat as non-text.
        return False
    return True
106
+
107
+
108
class DocumentParser(Parser):
    """
    Abstract base class for extracting text from special types of docs
    such as PDFs or Docx.

    Subclasses implement `iterate_pages` and `extract_text_from_page`.

    Attributes:
        source (str): The source, either a URL or a file path; set to the
            literal string "bytes" when constructed from a bytes object.
        doc_bytes (BytesIO): BytesIO object containing the doc data.
    """

    @classmethod
    def create(
        cls,
        source: str | bytes,
        config: ParsingConfig,
        doc_type: str | DocumentType | None = None,
    ) -> "DocumentParser":
        """
        Create a DocumentParser instance based on source type
        and config.<source_type>.library specified.

        Args:
            source (str|bytes): The source, could be a URL, file path,
                or bytes object.
            config (ParserConfig): The parser configuration.
            doc_type (str|None): The type of document, if known

        Returns:
            DocumentParser: An instance of a DocumentParser subclass.

        Raises:
            ValueError: if the document type, or the library configured
                for it, is not supported.
        """
        inferred_doc_type = DocumentParser._document_type(source, doc_type)

        if inferred_doc_type == DocumentType.PDF:
            pdf_parsers = {
                "fitz": FitzPDFParser,
                "pypdf": PyPDFParser,
                "pdfplumber": PDFPlumberParser,
                "unstructured": UnstructuredPDFParser,
                "pdf2image": ImagePdfParser,
            }
            parser_cls = pdf_parsers.get(config.pdf.library)
            if parser_cls is None:
                raise ValueError(
                    f"Unsupported PDF library specified: {config.pdf.library}"
                )
            return parser_cls(source, config)

        if inferred_doc_type == DocumentType.DOCX:
            docx_parsers = {
                "unstructured": UnstructuredDocxParser,
                "python-docx": PythonDocxParser,
            }
            parser_cls = docx_parsers.get(config.docx.library)
            if parser_cls is None:
                raise ValueError(
                    f"Unsupported DOCX library specified: {config.docx.library}"
                )
            return parser_cls(source, config)

        if inferred_doc_type == DocumentType.DOC:
            return UnstructuredDocParser(source, config)

        source_name = source if isinstance(source, str) else "bytes"
        raise ValueError(f"Unsupported document type: {source_name}")

    def __init__(self, source: str | bytes, config: ParsingConfig):
        """
        Args:
            source (str|bytes): The source, which could be
                a path, a URL or a bytes object.
            config (ParsingConfig): The parsing configuration.
        """
        super().__init__(config)
        self.config = config
        if isinstance(source, bytes):
            self.source = "bytes"
            self.doc_bytes = BytesIO(source)
        else:
            self.source = source
            self.doc_bytes = self._load_doc_as_bytesio()

    @staticmethod
    def _document_type(
        source: str | bytes, doc_type: str | DocumentType | None = None
    ) -> DocumentType:
        """
        Determine the type of document based on the source.

        An explicit `doc_type` wins; otherwise plain text is detected via
        `is_plain_text`, then the path extension (for str sources) or the
        MIME type reported by `magic` (for bytes sources) is used.

        Args:
            source (str|bytes): The source, which could be a URL,
                a file path, or a bytes object.
            doc_type (str|DocumentType|None): The type of document, if known.

        Returns:
            DocumentType: The document type.

        Raises:
            ValueError: if the type cannot be determined or is unsupported.
        """
        if isinstance(doc_type, DocumentType):
            return doc_type
        if doc_type:
            return DocumentType(doc_type.lower())
        if is_plain_text(source):
            return DocumentType.TXT

        if isinstance(source, str):
            # detect file type from path extension
            lowered = source.lower()
            if lowered.endswith(".pdf"):
                return DocumentType.PDF
            if lowered.endswith(".docx"):
                return DocumentType.DOCX
            if lowered.endswith(".doc"):
                return DocumentType.DOC
            raise ValueError(f"Unsupported document type: {source}")

        # must be bytes: attempt to detect type from content
        # using magic mime type detection
        import magic

        mime_type = magic.from_buffer(source, mime=True)
        if mime_type == "application/pdf":
            return DocumentType.PDF
        if mime_type in [
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document",
            "application/zip",
        ]:
            # DOCX files are essentially ZIP files,
            # but this might catch other ZIP-based formats too!
            return DocumentType.DOCX
        if mime_type == "application/msword":
            return DocumentType.DOC
        raise ValueError("Unsupported document type from bytes")

    def _load_doc_as_bytesio(self) -> BytesIO:
        """
        Load the docs into a BytesIO object.

        Returns:
            BytesIO: A BytesIO object containing the doc data.
        """
        if self.source.startswith(("http://", "https://")):
            response = requests.get(self.source)
            response.raise_for_status()
            return BytesIO(response.content)
        with open(self.source, "rb") as f:
            return BytesIO(f.read())

    @staticmethod
    def chunks_from_path_or_bytes(
        source: str | bytes,
        parser: Parser,
        doc_type: str | DocumentType | None = None,
        lines: int | None = None,
    ) -> List[Document]:
        """
        Get document chunks from a file path or bytes object.
        Args:
            source (str|bytes): The source, which could be a URL, path or bytes object.
            parser (Parser): The parser instance (for splitting the document).
            doc_type (str|DocumentType|None): The type of document, if known.
            lines (int|None): The number of lines to read from a plain text file.
        Returns:
            List[Document]: A list of `Document` objects,
                each containing a chunk of text, determined by the
                chunking and splitting settings in the parser config.
        """
        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
            # Pass the already-resolved type so `create` does not have to
            # sniff the source (possibly re-downloading it) a second time.
            doc_parser = DocumentParser.create(
                source,
                parser.config,
                doc_type=dtype,
            )
            chunks = doc_parser.get_doc_chunks()
            if not chunks and dtype == DocumentType.PDF:
                # No chunks: fall back to OCR on rendered page images.
                doc_parser = ImagePdfParser(source, parser.config)
                chunks = doc_parser.get_doc_chunks()
            return chunks

        # try getting as plain text; these will be chunked downstream
        # -- could be a bytes object, a URL, or a path
        if isinstance(source, bytes):
            content = source.decode()
            if lines is not None:
                file_lines = content.splitlines()[:lines]
                content = "\n".join(line.strip() for line in file_lines)
        elif source.startswith(("http://", "https://")):
            # Type detection accepts plain-text URLs, so they must be
            # fetched here rather than handed to `open()`.
            response = requests.get(source)
            response.raise_for_status()
            content = response.text
            if lines is not None:
                file_lines = content.splitlines()[:lines]
                content = "\n".join(line.strip() for line in file_lines)
        else:
            with open(source, "r") as f:
                if lines is not None:
                    file_lines = list(itertools.islice(f, lines))
                    content = "\n".join(line.strip() for line in file_lines)
                else:
                    content = f.read()

        # Strip any HTML markup, keeping only the text.
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text()
        source_name = source if isinstance(source, str) else "bytes"
        doc = Document(
            content=text,
            metadata=DocMetaData(source=str(source_name)),
        )
        return parser.split([doc])

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Yield `(index, page)` pairs for each page in the document.

        `get_doc_chunks` labels a page as `index + 1`, so implementations
        should yield 0-based indices.
        """
        raise NotImplementedError

    def extract_text_from_page(self, page: Any) -> str:
        """Extract text from a given page."""
        raise NotImplementedError

    def fix_text(self, text: str) -> str:
        """
        Fix text extracted from a PDF.

        Args:
            text (str): The extracted text.

        Returns:
            str: The fixed text.
        """
        # Some pdf parsers introduce extra space before hyphen,
        # so use regular expression to replace 'space-hyphen' with just 'hyphen'
        return re.sub(r" +\-", "-", text)

    def get_doc(self) -> Document:
        """
        Get entire text from source as a single document.

        Returns:
            a `Document` object containing the content of the pdf file,
                and metadata containing source name (URL or path)
        """
        text = "".join(
            self.extract_text_from_page(page) for _, page in self.iterate_pages()
        )
        return Document(content=text, metadata=DocMetaData(source=self.source))

    def _make_chunk(
        self, tokens: List[Any], page_labels: List[str], common_id: Any
    ) -> Document:
        """Build one chunk `Document` from at most `chunk_size` leading tokens."""
        # page range covered by this chunk, e.g. "1-3" (or "4-4" for one page)
        pg = "-".join([page_labels[0], page_labels[-1]])
        text = self.tokenizer.decode(tokens[: self.config.chunk_size])
        return Document(
            content=text,
            metadata=DocMetaData(
                source=f"{self.source} pages {pg}",
                is_chunk=True,
                id=common_id,
            ),
        )

    def get_doc_chunks(self) -> List[Document]:
        """
        Get document chunks from a pdf source,
        with page references in the document metadata.

        Adapted from
        https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py

        Returns:
            List[Document]: a list of `Document` objects,
                each containing a chunk of text. Empty if the source
                yields no pages at all.
        """
        split: List[Any] = []  # tokens in curr split
        pages: List[str] = []
        docs: List[Document] = []
        # metadata.id to be shared by ALL chunks of this document
        common_id = ObjectRegistry.new_id()
        n_chunks = 0  # how many chunk so far
        for i, page in self.iterate_pages():
            page_text = self.extract_text_from_page(page)
            split += self.tokenizer.encode(page_text)
            pages.append(str(i + 1))
            # split could be so long it needs to be split
            # into multiple chunks. Or it could be so short
            # that it needs to be combined with the next chunk.
            while len(split) > self.config.chunk_size:
                docs.append(self._make_chunk(split, pages, common_id))
                n_chunks += 1
                # keep the trailing `overlap` tokens for the next chunk
                split = split[self.config.chunk_size - self.config.overlap :]
                pages = [str(i + 1)]

        if not pages:
            # No pages were produced: there is nothing to label or chunk
            # (indexing `pages[0]` below would raise IndexError).
            return docs

        # there may be a last split remaining:
        # if it's shorter than the overlap, we shouldn't make a chunk for it
        # since it's already included in the prior chunk;
        # the only exception is if there have been no chunks so far.
        if len(split) > self.config.overlap or n_chunks == 0:
            docs.append(self._make_chunk(split, pages, common_id))
        self.add_window_ids(docs)
        return docs
404
+
405
+
406
class FitzPDFParser(DocumentParser):
    """
    Parser for processing PDFs using the `fitz` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
        """
        Yield each page in the PDF using `fitz`.

        Returns:
            Generator[fitz.Page]: Generator yielding `(index, page)` pairs,
                with 0-based indices.

        Raises:
            LangroidImportError: if `fitz` is not installed.
        """
        if fitz is None:
            raise LangroidImportError("fitz", "pdf-parsers")
        doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
        try:
            yield from enumerate(doc)
        finally:
            # Close the document even if the consumer stops early or raises
            # while processing a page.
            doc.close()

    def extract_text_from_page(self, page: "fitz.Page") -> str:
        """
        Extract text from a given `fitz` page.

        Args:
            page (fitz.Page): The `fitz` page object.

        Returns:
            str: Extracted text from the page.
        """
        return self.fix_text(page.get_text())
436
+
437
+
438
class PyPDFParser(DocumentParser):
    """
    PDF parser backed by the `pypdf` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, "pypdf.PageObject"], None, None]:
        """
        Walk the pages of the PDF via `pypdf.PdfReader`.

        Returns:
            Generator[pypdf.pdf.PageObject]: Generator yielding
                `(index, page)` pairs, with 0-based indices.
        """
        if pypdf is None:
            raise LangroidImportError("pypdf", "pdf-parsers")
        pdf_reader = pypdf.PdfReader(self.doc_bytes)
        yield from enumerate(pdf_reader.pages)

    def extract_text_from_page(self, page: "pypdf.PageObject") -> str:
        """
        Pull the text out of a single `pypdf` page and clean it up.

        Args:
            page (pypdf.pdf.PageObject): The `pypdf` page object.

        Returns:
            str: Extracted text from the page.
        """
        raw_text = page.extract_text()
        return self.fix_text(raw_text)
467
+
468
+
469
class PDFPlumberParser(DocumentParser):
    """
    Parser for processing PDFs using the `pdfplumber` library.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, "pdfplumber.pdf.Page"], None, None]:  # type: ignore
        """
        Yield each page in the PDF using `pdfplumber`.

        Returns:
            Generator[pdfplumber.Page]: Generator yielding `(index, page)`
                pairs, with 0-based indices.

        Raises:
            LangroidImportError: if `pdfplumber` is not installed.
        """
        if pdfplumber is None:
            raise LangroidImportError("pdfplumber", "pdf-parsers")
        with pdfplumber.open(self.doc_bytes) as pdf:
            yield from enumerate(pdf.pages)

    def extract_text_from_page(self, page: "pdfplumber.pdf.Page") -> str:  # type: ignore
        """
        Extract text from a given `pdfplumber` page.

        Args:
            page (pdfplumber.Page): The `pdfplumber` page object.

        Returns:
            str: Extracted text from the page ("" if the page has no text).
        """
        # NOTE(review): some pdfplumber releases appear to return None for
        # pages without text -- confirm against the pinned version. Falling
        # back to "" keeps `fix_text` (a regex substitution) from failing.
        return self.fix_text(page.extract_text() or "")
500
+
501
+
502
class ImagePdfParser(DocumentParser):
    """
    Parser for image-only PDFs, i.e. PDFs that are not "true" PDFs:
    pages are rendered to images and the text is recovered with OCR.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, "Image"], None, None]:  # type: ignore
        """
        Render the PDF bytes to images via `pdf2image` and yield them
        as `(index, image)` pairs, with 0-based indices.
        """
        try:
            from pdf2image import convert_from_bytes
        except ImportError:
            raise LangroidImportError("pdf2image", "pdf-parsers")

        rendered_pages = convert_from_bytes(self.doc_bytes.getvalue())
        yield from enumerate(rendered_pages)

    def extract_text_from_page(self, page: "Image") -> str:  # type: ignore
        """
        Run OCR (`pytesseract`) on one rendered page image.

        Args:
            page (Image): The PIL Image object.

        Returns:
            str: Extracted text from the image.
        """
        try:
            import pytesseract
        except ImportError:
            raise LangroidImportError("pytesseract", "pdf-parsers")

        ocr_text = pytesseract.image_to_string(page)
        return self.fix_text(ocr_text)
536
+
537
+
538
class UnstructuredPDFParser(DocumentParser):
    """
    Parser for processing PDF files using the `unstructured` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Partition the PDF with `unstructured` and yield its elements
        grouped by page, as `(index, elements)` pairs.

        Indices are 0-based, matching the other PDF parsers in this module
        (`get_doc_chunks` adds 1 when building page labels). Pages that
        contain no elements are not yielded, but still advance the index.

        Raises:
            ImportError: if `unstructured` is not installed.
            Exception: if `unstructured` fails to parse the PDF.
        """
        try:
            from unstructured.partition.pdf import partition_pdf
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        # from unstructured.chunking.title import chunk_by_title

        try:
            elements = partition_pdf(file=self.doc_bytes, include_page_breaks=True)
        except Exception as e:
            # Chain the original error so its traceback is preserved.
            raise Exception(
                f"""
                Error parsing PDF: {e}
                The `unstructured` library failed to parse the pdf.
                Please try a different library by setting the `library` field
                in the `pdf` section of the `parsing` field in the config file.
                Supported libraries are:
                fitz, pypdf, pdfplumber, unstructured
                """
            ) from e

        # elements = chunk_by_title(elements)
        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from one page's worth of `unstructured` elements.

        Args:
            page (list of unstructured elements): The elements of a page,
                as yielded by `iterate_pages`.

        Returns:
            str: The elements' text joined with single spaces, then cleaned.
        """
        text = " ".join(el.text for el in page)
        return self.fix_text(text)
599
+
600
+
601
class UnstructuredDocxParser(DocumentParser):
    """
    Parser for processing DOCX files using the `unstructured` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Partition the DOCX with `unstructured` and yield its elements
        grouped by page break, as `(index, elements)` pairs.

        Indices are 0-based, matching the PDF parsers in this module
        (`get_doc_chunks` adds 1 when building page labels). Pages that
        contain no elements are not yielded, but still advance the index.

        Raises:
            ImportError: if `unstructured` is not installed.
        """
        try:
            from unstructured.partition.docx import partition_docx
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)

        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from one page's worth of `unstructured` elements.

        Note:
            The concept of "pages" doesn't actually exist in the .docx file format in
            the same way it does in formats like .pdf. A .docx file is made up of a
            series of elements like paragraphs and tables, but the division into
            pages is done dynamically based on the rendering settings (like the page
            size, margin size, font size, etc.).

        Args:
            page (list of unstructured elements): The elements of a page,
                as yielded by `iterate_pages`.

        Returns:
            str: The elements' text joined with single spaces, then cleaned.
        """
        text = " ".join(el.text for el in page)
        return self.fix_text(text)
654
+
655
+
656
class UnstructuredDocParser(UnstructuredDocxParser):
    """
    Parser for processing legacy DOC files using the `unstructured` library.
    Text extraction is inherited from `UnstructuredDocxParser`.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Partition the DOC with `unstructured` and yield its elements
        grouped by page break, as `(index, elements)` pairs.

        Indices are 0-based, matching the PDF parsers in this module
        (`get_doc_chunks` adds 1 when building page labels). Pages that
        contain no elements are not yielded, but still advance the index.

        Raises:
            ImportError: if `unstructured` is not installed.
        """
        try:
            from unstructured.partition.doc import partition_doc
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        elements = partition_doc(file=self.doc_bytes, include_page_breaks=True)

        page_index = 0
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_index, page_elements
                page_index += 1
                page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_index, page_elements
685
+
686
+
687
class PythonDocxParser(DocumentParser):
    """
    Parser for processing DOCX files using the `python-docx` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Simulate iterating through pages.
        In a DOCX file, pages are not explicitly defined,
        so we consider each paragraph as a separate 'page' for simplicity.

        Yields `(index, [paragraph])` pairs with 0-based indices, matching
        the PDF parsers in this module (`get_doc_chunks` adds 1 when
        building page labels).

        Raises:
            LangroidImportError: if `python-docx` is not installed.
        """
        try:
            import docx
        except ImportError:
            raise LangroidImportError("python-docx", "docx")

        doc = docx.Document(self.doc_bytes)
        for i, para in enumerate(doc.paragraphs):
            yield i, [para]

    def extract_text_from_page(self, page: Any) -> str:
        """
        Extract text from a given 'page', which in this case is a single paragraph.

        Args:
            page (list): A list containing a single Paragraph object.

        Returns:
            str: Extracted text from the paragraph.
        """
        paragraph = page[0]
        return self.fix_text(paragraph.text)