langroid 0.32.2__py3-none-any.whl → 0.33.4__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info}/METADATA +149 -123
- langroid-0.33.4.dist-info/RECORD +7 -0
- {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info}/WHEEL +1 -1
- langroid-0.33.4.dist-info/entry_points.txt +4 -0
- pyproject.toml +317 -212
- langroid/__init__.py +0 -106
- langroid/agent/__init__.py +0 -41
- langroid/agent/base.py +0 -1983
- langroid/agent/batch.py +0 -398
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +0 -598
- langroid/agent/chat_agent.py +0 -1899
- langroid/agent/chat_document.py +0 -454
- langroid/agent/openai_assistant.py +0 -882
- langroid/agent/special/__init__.py +0 -59
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +0 -656
- langroid/agent/special/arangodb/system_messages.py +0 -186
- langroid/agent/special/arangodb/tools.py +0 -107
- langroid/agent/special/arangodb/utils.py +0 -36
- langroid/agent/special/doc_chat_agent.py +0 -1466
- langroid/agent/special/lance_doc_chat_agent.py +0 -262
- langroid/agent/special/lance_rag/__init__.py +0 -9
- langroid/agent/special/lance_rag/critic_agent.py +0 -198
- langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
- langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
- langroid/agent/special/lance_tools.py +0 -61
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
- langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
- langroid/agent/special/neo4j/system_messages.py +0 -120
- langroid/agent/special/neo4j/tools.py +0 -32
- langroid/agent/special/relevance_extractor_agent.py +0 -127
- langroid/agent/special/retriever_agent.py +0 -56
- langroid/agent/special/sql/__init__.py +0 -17
- langroid/agent/special/sql/sql_chat_agent.py +0 -654
- langroid/agent/special/sql/utils/__init__.py +0 -21
- langroid/agent/special/sql/utils/description_extractors.py +0 -190
- langroid/agent/special/sql/utils/populate_metadata.py +0 -85
- langroid/agent/special/sql/utils/system_message.py +0 -35
- langroid/agent/special/sql/utils/tools.py +0 -64
- langroid/agent/special/table_chat_agent.py +0 -263
- langroid/agent/task.py +0 -2095
- langroid/agent/tool_message.py +0 -393
- langroid/agent/tools/__init__.py +0 -38
- langroid/agent/tools/duckduckgo_search_tool.py +0 -50
- langroid/agent/tools/file_tools.py +0 -234
- langroid/agent/tools/google_search_tool.py +0 -39
- langroid/agent/tools/metaphor_search_tool.py +0 -67
- langroid/agent/tools/orchestration.py +0 -303
- langroid/agent/tools/recipient_tool.py +0 -235
- langroid/agent/tools/retrieval_tool.py +0 -32
- langroid/agent/tools/rewind_tool.py +0 -137
- langroid/agent/tools/segment_extract_tool.py +0 -41
- langroid/agent/xml_tool_message.py +0 -382
- langroid/cachedb/__init__.py +0 -17
- langroid/cachedb/base.py +0 -58
- langroid/cachedb/momento_cachedb.py +0 -108
- langroid/cachedb/redis_cachedb.py +0 -153
- langroid/embedding_models/__init__.py +0 -39
- langroid/embedding_models/base.py +0 -74
- langroid/embedding_models/models.py +0 -461
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +0 -19
- langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
- langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
- langroid/embedding_models/remote_embeds.py +0 -153
- langroid/exceptions.py +0 -65
- langroid/language_models/__init__.py +0 -53
- langroid/language_models/azure_openai.py +0 -153
- langroid/language_models/base.py +0 -678
- langroid/language_models/config.py +0 -18
- langroid/language_models/mock_lm.py +0 -124
- langroid/language_models/openai_gpt.py +0 -1964
- langroid/language_models/prompt_formatter/__init__.py +0 -16
- langroid/language_models/prompt_formatter/base.py +0 -40
- langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
- langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
- langroid/language_models/utils.py +0 -151
- langroid/mytypes.py +0 -84
- langroid/parsing/__init__.py +0 -52
- langroid/parsing/agent_chats.py +0 -38
- langroid/parsing/code_parser.py +0 -121
- langroid/parsing/document_parser.py +0 -718
- langroid/parsing/para_sentence_split.py +0 -62
- langroid/parsing/parse_json.py +0 -155
- langroid/parsing/parser.py +0 -313
- langroid/parsing/repo_loader.py +0 -790
- langroid/parsing/routing.py +0 -36
- langroid/parsing/search.py +0 -275
- langroid/parsing/spider.py +0 -102
- langroid/parsing/table_loader.py +0 -94
- langroid/parsing/url_loader.py +0 -111
- langroid/parsing/urls.py +0 -273
- langroid/parsing/utils.py +0 -373
- langroid/parsing/web_search.py +0 -155
- langroid/prompts/__init__.py +0 -9
- langroid/prompts/dialog.py +0 -17
- langroid/prompts/prompts_config.py +0 -5
- langroid/prompts/templates.py +0 -141
- langroid/pydantic_v1/__init__.py +0 -10
- langroid/pydantic_v1/main.py +0 -4
- langroid/utils/__init__.py +0 -19
- langroid/utils/algorithms/__init__.py +0 -3
- langroid/utils/algorithms/graph.py +0 -103
- langroid/utils/configuration.py +0 -98
- langroid/utils/constants.py +0 -30
- langroid/utils/git_utils.py +0 -252
- langroid/utils/globals.py +0 -49
- langroid/utils/logging.py +0 -135
- langroid/utils/object_registry.py +0 -66
- langroid/utils/output/__init__.py +0 -20
- langroid/utils/output/citations.py +0 -41
- langroid/utils/output/printing.py +0 -99
- langroid/utils/output/status.py +0 -40
- langroid/utils/pandas_utils.py +0 -30
- langroid/utils/pydantic_utils.py +0 -602
- langroid/utils/system.py +0 -286
- langroid/utils/types.py +0 -93
- langroid/vector_store/__init__.py +0 -50
- langroid/vector_store/base.py +0 -357
- langroid/vector_store/chromadb.py +0 -214
- langroid/vector_store/lancedb.py +0 -401
- langroid/vector_store/meilisearch.py +0 -299
- langroid/vector_store/momento.py +0 -278
- langroid/vector_store/qdrantdb.py +0 -468
- langroid-0.32.2.dist-info/RECORD +0 -128
- {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info/licenses}/LICENSE +0 -0
```diff
--- a/langroid/parsing/document_parser.py
+++ /dev/null
@@ -1,718 +0,0 @@
-from __future__ import annotations
-
-import itertools
-import logging
-import re
-from enum import Enum
-from io import BytesIO
-from typing import TYPE_CHECKING, Any, Generator, List, Tuple
-
-from langroid.exceptions import LangroidImportError
-from langroid.utils.object_registry import ObjectRegistry
-
-try:
-    import fitz
-except ImportError:
-    if not TYPE_CHECKING:
-        fitz = None
-
-try:
-    import pypdf
-except ImportError:
-    if not TYPE_CHECKING:
-        pypdf = None
-
-try:
-    import pdfplumber
-except ImportError:
-    if not TYPE_CHECKING:
-        pdfplumber = None
-
-import requests
-from bs4 import BeautifulSoup
-
-if TYPE_CHECKING:
-    from PIL import Image
-
-from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.parser import Parser, ParsingConfig
-
-logger = logging.getLogger(__name__)
-
-
-class DocumentType(str, Enum):
-    PDF = "pdf"
-    DOCX = "docx"
-    DOC = "doc"
-    TXT = "txt"
-
-
-def find_last_full_char(possible_unicode: bytes) -> int:
-    """
-    Find the index of the last full character in a byte string.
-    Args:
-        possible_unicode (bytes): The bytes to check.
-    Returns:
-        int: The index of the last full unicode character.
-    """
-
-    for i in range(len(possible_unicode) - 1, 0, -1):
-        if (possible_unicode[i] & 0xC0) != 0x80:
-            return i
-    return 0
-
-
-def is_plain_text(path_or_bytes: str | bytes) -> bool:
-    """
-    Check if a file is plain text by attempting to decode it as UTF-8.
-    Args:
-        path_or_bytes (str|bytes): The file path or bytes object.
-    Returns:
-        bool: True if the file is plain text, False otherwise.
-    """
-    if isinstance(path_or_bytes, str):
-        if path_or_bytes.startswith(("http://", "https://")):
-            response = requests.get(path_or_bytes)
-            response.raise_for_status()
-            content = response.content[:1024]
-        else:
-            with open(path_or_bytes, "rb") as f:
-                content = f.read(1024)
-    else:
-        content = path_or_bytes[:1024]
-    try:
-        # Use magic to detect the MIME type
-        import magic
-
-        mime_type = magic.from_buffer(content, mime=True)
-
-        # Check if the MIME type is not a text type
-        if not mime_type.startswith("text/"):
-            return False
-
-        # Attempt to decode the content as UTF-8
-        content = content[: find_last_full_char(content)]
-
-        try:
-            _ = content.decode("utf-8")
-            # Additional checks can go here, e.g., to verify that the content
-            # doesn't contain too many unusual characters for it to be considered text
-            return True
-        except UnicodeDecodeError:
-            return False
-    except UnicodeDecodeError:
-        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
-        return False
-
-
-class DocumentParser(Parser):
-    """
-    Abstract base class for extracting text from special types of docs
-    such as PDFs or Docx.
-
-    Attributes:
-        source (str): The source, either a URL or a file path.
-        doc_bytes (BytesIO): BytesIO object containing the doc data.
-    """
-
-    @classmethod
-    def create(
-        cls,
-        source: str | bytes,
-        config: ParsingConfig,
-        doc_type: str | DocumentType | None = None,
-    ) -> "DocumentParser":
-        """
-        Create a DocumentParser instance based on source type
-        and config.<source_type>.library specified.
-
-        Args:
-            source (str|bytes): The source, could be a URL, file path,
-                or bytes object.
-            config (ParserConfig): The parser configuration.
-            doc_type (str|None): The type of document, if known
-
-        Returns:
-            DocumentParser: An instance of a DocumentParser subclass.
-        """
-        inferred_doc_type = DocumentParser._document_type(source, doc_type)
-        if inferred_doc_type == DocumentType.PDF:
-            if config.pdf.library == "fitz":
-                return FitzPDFParser(source, config)
-            elif config.pdf.library == "pypdf":
-                return PyPDFParser(source, config)
-            elif config.pdf.library == "pdfplumber":
-                return PDFPlumberParser(source, config)
-            elif config.pdf.library == "unstructured":
-                return UnstructuredPDFParser(source, config)
-            elif config.pdf.library == "pdf2image":
-                return ImagePdfParser(source, config)
-            else:
-                raise ValueError(
-                    f"Unsupported PDF library specified: {config.pdf.library}"
-                )
-        elif inferred_doc_type == DocumentType.DOCX:
-            if config.docx.library == "unstructured":
-                return UnstructuredDocxParser(source, config)
-            elif config.docx.library == "python-docx":
-                return PythonDocxParser(source, config)
-            else:
-                raise ValueError(
-                    f"Unsupported DOCX library specified: {config.docx.library}"
-                )
-        elif inferred_doc_type == DocumentType.DOC:
-            return UnstructuredDocParser(source, config)
-        else:
-            source_name = source if isinstance(source, str) else "bytes"
-            raise ValueError(f"Unsupported document type: {source_name}")
-
-    def __init__(self, source: str | bytes, config: ParsingConfig):
-        """
-        Args:
-            source (str|bytes): The source, which could be
-                a path, a URL or a bytes object.
-        """
-        super().__init__(config)
-        self.config = config
-        if isinstance(source, bytes):
-            self.source = "bytes"
-            self.doc_bytes = BytesIO(source)
-        else:
-            self.source = source
-            self.doc_bytes = self._load_doc_as_bytesio()
-
-    @staticmethod
-    def _document_type(
-        source: str | bytes, doc_type: str | DocumentType | None = None
-    ) -> DocumentType:
-        """
-        Determine the type of document based on the source.
-
-        Args:
-            source (str|bytes): The source, which could be a URL,
-                a file path, or a bytes object.
-            doc_type (str|DocumentType|None): The type of document, if known.
-
-        Returns:
-            str: The document type.
-        """
-        if isinstance(doc_type, DocumentType):
-            return doc_type
-        if doc_type:
-            return DocumentType(doc_type.lower())
-        if is_plain_text(source):
-            return DocumentType.TXT
-        if isinstance(source, str):
-            # detect file type from path extension
-            if source.lower().endswith(".pdf"):
-                return DocumentType.PDF
-            elif source.lower().endswith(".docx"):
-                return DocumentType.DOCX
-            elif source.lower().endswith(".doc"):
-                return DocumentType.DOC
-            else:
-                raise ValueError(f"Unsupported document type: {source}")
-        else:
-            # must be bytes: attempt to detect type from content
-            # using magic mime type detection
-            import magic
-
-            mime_type = magic.from_buffer(source, mime=True)
-            if mime_type == "application/pdf":
-                return DocumentType.PDF
-            elif mime_type in [
-                "application/vnd.openxmlformats-officedocument"
-                ".wordprocessingml.document",
-                "application/zip",
-            ]:
-                # DOCX files are essentially ZIP files,
-                # but this might catch other ZIP-based formats too!
-                return DocumentType.DOCX
-            elif mime_type == "application/msword":
-                return DocumentType.DOC
-            else:
-                raise ValueError("Unsupported document type from bytes")
-
-    def _load_doc_as_bytesio(self) -> BytesIO:
-        """
-        Load the docs into a BytesIO object.
-
-        Returns:
-            BytesIO: A BytesIO object containing the doc data.
-        """
-        if self.source.startswith(("http://", "https://")):
-            response = requests.get(self.source)
-            response.raise_for_status()
-            return BytesIO(response.content)
-        else:
-            with open(self.source, "rb") as f:
-                return BytesIO(f.read())
-
-    @staticmethod
-    def chunks_from_path_or_bytes(
-        source: str | bytes,
-        parser: Parser,
-        doc_type: str | DocumentType | None = None,
-        lines: int | None = None,
-    ) -> List[Document]:
-        """
-        Get document chunks from a file path or bytes object.
-        Args:
-            source (str|bytes): The source, which could be a URL, path or bytes object.
-            parser (Parser): The parser instance (for splitting the document).
-            doc_type (str|DocumentType|None): The type of document, if known.
-            lines (int|None): The number of lines to read from a plain text file.
-        Returns:
-            List[Document]: A list of `Document` objects,
-                each containing a chunk of text, determined by the
-                chunking and splitting settings in the parser config.
-        """
-        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
-        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
-            doc_parser = DocumentParser.create(
-                source,
-                parser.config,
-                doc_type=doc_type,
-            )
-            chunks = doc_parser.get_doc_chunks()
-            if len(chunks) == 0 and dtype == DocumentType.PDF:
-                doc_parser = ImagePdfParser(source, parser.config)
-                chunks = doc_parser.get_doc_chunks()
-            return chunks
-        else:
-            # try getting as plain text; these will be chunked downstream
-            # -- could be a bytes object or a path
-            if isinstance(source, bytes):
-                content = source.decode()
-                if lines is not None:
-                    file_lines = content.splitlines()[:lines]
-                    content = "\n".join(line.strip() for line in file_lines)
-            else:
-                with open(source, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-            soup = BeautifulSoup(content, "html.parser")
-            text = soup.get_text()
-            source_name = source if isinstance(source, str) else "bytes"
-            doc = Document(
-                content=text,
-                metadata=DocMetaData(source=str(source_name)),
-            )
-            return parser.split([doc])
-
-    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
-        """Yield each page in the PDF."""
-        raise NotImplementedError
-
-    def extract_text_from_page(self, page: Any) -> str:
-        """Extract text from a given page."""
-        raise NotImplementedError
-
-    def fix_text(self, text: str) -> str:
-        """
-        Fix text extracted from a PDF.
-
-        Args:
-            text (str): The extracted text.
-
-        Returns:
-            str: The fixed text.
-        """
-        # Some pdf parsers introduce extra space before hyphen,
-        # so use regular expression to replace 'space-hyphen' with just 'hyphen'
-        return re.sub(r" +\-", "-", text)
-
-    def get_doc(self) -> Document:
-        """
-        Get entire text from source as a single document.
-
-        Returns:
-            a `Document` object containing the content of the pdf file,
-            and metadata containing source name (URL or path)
-        """
-
-        text = "".join(
-            [self.extract_text_from_page(page) for _, page in self.iterate_pages()]
-        )
-        return Document(content=text, metadata=DocMetaData(source=self.source))
-
-    def get_doc_chunks(self) -> List[Document]:
-        """
-        Get document chunks from a pdf source,
-        with page references in the document metadata.
-
-        Adapted from
-        https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py
-
-        Returns:
-            List[Document]: a list of `Document` objects,
-                each containing a chunk of text
-        """
-
-        split = []  # tokens in curr split
-        pages: List[str] = []
-        docs: List[Document] = []
-        # metadata.id to be shared by ALL chunks of this document
-        common_id = ObjectRegistry.new_id()
-        n_chunks = 0  # how many chunk so far
-        for i, page in self.iterate_pages():
-            page_text = self.extract_text_from_page(page)
-            split += self.tokenizer.encode(page_text)
-            pages.append(str(i + 1))
-            # split could be so long it needs to be split
-            # into multiple chunks. Or it could be so short
-            # that it needs to be combined with the next chunk.
-            while len(split) > self.config.chunk_size:
-                # pretty formatting of pages (e.g. 1-3, 4, 5-7)
-                pg = "-".join([pages[0], pages[-1]])
-                text = self.tokenizer.decode(split[: self.config.chunk_size])
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(
-                            source=f"{self.source} pages {pg}",
-                            is_chunk=True,
-                            id=common_id,
-                        ),
-                    )
-                )
-                n_chunks += 1
-                split = split[self.config.chunk_size - self.config.overlap :]
-                pages = [str(i + 1)]
-        # there may be a last split remaining:
-        # if it's shorter than the overlap, we shouldn't make a chunk for it
-        # since it's already included in the prior chunk;
-        # the only exception is if there have been no chunks so far.
-        if len(split) > self.config.overlap or n_chunks == 0:
-            pg = "-".join([pages[0], pages[-1]])
-            text = self.tokenizer.decode(split[: self.config.chunk_size])
-            docs.append(
-                Document(
-                    content=text,
-                    metadata=DocMetaData(
-                        source=f"{self.source} pages {pg}",
-                        is_chunk=True,
-                        id=common_id,
-                    ),
-                )
-            )
-        self.add_window_ids(docs)
-        return docs
-
-
-class FitzPDFParser(DocumentParser):
-    """
-    Parser for processing PDFs using the `fitz` library.
-    """
-
-    def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
-        """
-        Yield each page in the PDF using `fitz`.
-
-        Returns:
-            Generator[fitz.Page]: Generator yielding each page.
-        """
-        if fitz is None:
-            raise LangroidImportError("fitz", "pdf-parsers")
-        doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
-        for i, page in enumerate(doc):
-            yield i, page
-        doc.close()
-
-    def extract_text_from_page(self, page: "fitz.Page") -> str:
-        """
-        Extract text from a given `fitz` page.
-
-        Args:
-            page (fitz.Page): The `fitz` page object.
-
-        Returns:
-            str: Extracted text from the page.
-        """
-        return self.fix_text(page.get_text())
-
-
-class PyPDFParser(DocumentParser):
-    """
-    Parser for processing PDFs using the `pypdf` library.
-    """
-
-    def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
-        """
-        Yield each page in the PDF using `pypdf`.
-
-        Returns:
-            Generator[pypdf.pdf.PageObject]: Generator yielding each page.
-        """
-        if pypdf is None:
-            raise LangroidImportError("pypdf", "pdf-parsers")
-        reader = pypdf.PdfReader(self.doc_bytes)
-        for i, page in enumerate(reader.pages):
-            yield i, page
-
-    def extract_text_from_page(self, page: pypdf.PageObject) -> str:
-        """
-        Extract text from a given `pypdf` page.
-
-        Args:
-            page (pypdf.pdf.PageObject): The `pypdf` page object.
-
-        Returns:
-            str: Extracted text from the page.
-        """
-        return self.fix_text(page.extract_text())
-
-
-class PDFPlumberParser(DocumentParser):
-    """
-    Parser for processing PDFs using the `pdfplumber` library.
-    """
-
-    def iterate_pages(
-        self,
-    ) -> (Generator)[Tuple[int, pdfplumber.pdf.Page], None, None]:  # type: ignore
-        """
-        Yield each page in the PDF using `pdfplumber`.
-
-        Returns:
-            Generator[pdfplumber.Page]: Generator yielding each page.
-        """
-        if pdfplumber is None:
-            raise LangroidImportError("pdfplumber", "pdf-parsers")
-        with pdfplumber.open(self.doc_bytes) as pdf:
-            for i, page in enumerate(pdf.pages):
-                yield i, page
-
-    def extract_text_from_page(self, page: pdfplumber.pdf.Page) -> str:  # type: ignore
-        """
-        Extract text from a given `pdfplumber` page.
-
-        Args:
-            page (pdfplumber.Page): The `pdfplumber` page object.
-
-        Returns:
-            str: Extracted text from the page.
-        """
-        return self.fix_text(page.extract_text())
-
-
-class ImagePdfParser(DocumentParser):
-    """
-    Parser for processing PDFs that are images, i.e. not "true" PDFs.
-    """
-
-    def iterate_pages(
-        self,
-    ) -> Generator[Tuple[int, "Image"], None, None]:  # type: ignore
-        try:
-            from pdf2image import convert_from_bytes
-        except ImportError:
-            raise LangroidImportError("pdf2image", "pdf-parsers")
-
-        images = convert_from_bytes(self.doc_bytes.getvalue())
-        for i, image in enumerate(images):
-            yield i, image
-
-    def extract_text_from_page(self, page: "Image") -> str:  # type: ignore
-        """
-        Extract text from a given `pdf2image` page.
-
-        Args:
-            page (Image): The PIL Image object.
-
-        Returns:
-            str: Extracted text from the image.
-        """
-        try:
-            import pytesseract
-        except ImportError:
-            raise LangroidImportError("pytesseract", "pdf-parsers")
-
-        text = pytesseract.image_to_string(page)
-        return self.fix_text(text)
-
-
-class UnstructuredPDFParser(DocumentParser):
-    """
-    Parser for processing PDF files using the `unstructured` library.
-    """
-
-    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-        try:
-            from unstructured.partition.pdf import partition_pdf
-        except ImportError:
-            raise ImportError(
-                """
-                The `unstructured` library is not installed by default with langroid.
-                To include this library, please install langroid with the
-                `unstructured` extra by running `pip install "langroid[unstructured]"`
-                or equivalent.
-                """
-            )
-
-        # from unstructured.chunking.title import chunk_by_title
-
-        try:
-            elements = partition_pdf(file=self.doc_bytes, include_page_breaks=True)
-        except Exception as e:
-            raise Exception(
-                f"""
-                Error parsing PDF: {e}
-                The `unstructured` library failed to parse the pdf.
-                Please try a different library by setting the `library` field
-                in the `pdf` section of the `parsing` field in the config file.
-                Supported libraries are:
-                fitz, pypdf, pdfplumber, unstructured
-                """
-            )
-
-        # elements = chunk_by_title(elements)
-        page_number = 1
-        page_elements = []  # type: ignore
-        for el in elements:
-            if el.category == "PageBreak":
-                if page_elements:  # Avoid yielding empty pages at the start
-                    yield page_number, page_elements
-                page_number += 1
-                page_elements = []
-            else:
-                page_elements.append(el)
-        # Yield the last page if it's not empty
-        if page_elements:
-            yield page_number, page_elements
-
-    def extract_text_from_page(self, page: Any) -> str:
-        """
-        Extract text from a given `unstructured` element.
-
-        Args:
-            page (unstructured element): The `unstructured` element object.
-
-        Returns:
-            str: Extracted text from the element.
-        """
-        text = " ".join(el.text for el in page)
-        return self.fix_text(text)
-
-
-class UnstructuredDocxParser(DocumentParser):
-    """
-    Parser for processing DOCX files using the `unstructured` library.
-    """
-
-    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-        try:
-            from unstructured.partition.docx import partition_docx
-        except ImportError:
-            raise ImportError(
-                """
-                The `unstructured` library is not installed by default with langroid.
-                To include this library, please install langroid with the
-                `unstructured` extra by running `pip install "langroid[unstructured]"`
-                or equivalent.
-                """
-            )
-
-        elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)
-
-        page_number = 1
-        page_elements = []  # type: ignore
-        for el in elements:
-            if el.category == "PageBreak":
-                if page_elements:  # Avoid yielding empty pages at the start
-                    yield page_number, page_elements
-                page_number += 1
-                page_elements = []
-            else:
-                page_elements.append(el)
-        # Yield the last page if it's not empty
-        if page_elements:
-            yield page_number, page_elements
-
-    def extract_text_from_page(self, page: Any) -> str:
-        """
-        Extract text from a given `unstructured` element.
-
-        Note:
-            The concept of "pages" doesn't actually exist in the .docx file format in
-            the same way it does in formats like .pdf. A .docx file is made up of a
-            series of elements like paragraphs and tables, but the division into
-            pages is done dynamically based on the rendering settings (like the page
-            size, margin size, font size, etc.).
-
-        Args:
-            page (unstructured element): The `unstructured` element object.
-
-        Returns:
-            str: Extracted text from the element.
-        """
-        text = " ".join(el.text for el in page)
-        return self.fix_text(text)
-
-
-class UnstructuredDocParser(UnstructuredDocxParser):
-    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-        try:
-            from unstructured.partition.doc import partition_doc
-        except ImportError:
-            raise ImportError(
-                """
-                The `unstructured` library is not installed by default with langroid.
-                To include this library, please install langroid with the
-                `unstructured` extra by running `pip install "langroid[unstructured]"`
-                or equivalent.
-                """
-            )
-
-        elements = partition_doc(file=self.doc_bytes, include_page_breaks=True)
-
-        page_number = 1
-        page_elements = []  # type: ignore
-        for el in elements:
-            if el.category == "PageBreak":
-                if page_elements:  # Avoid yielding empty pages at the start
-                    yield page_number, page_elements
-                page_number += 1
-                page_elements = []
-            else:
-                page_elements.append(el)
-        # Yield the last page if it's not empty
-        if page_elements:
-            yield page_number, page_elements
-
-
-class PythonDocxParser(DocumentParser):
-    """
-    Parser for processing DOCX files using the `python-docx` library.
-    """
-
-    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
-        """
-        Simulate iterating through pages.
-        In a DOCX file, pages are not explicitly defined,
-        so we consider each paragraph as a separate 'page' for simplicity.
-        """
-        try:
-            import docx
-        except ImportError:
-            raise LangroidImportError("python-docx", "docx")
-
-        doc = docx.Document(self.doc_bytes)
-        for i, para in enumerate(doc.paragraphs, start=1):
-            yield i, [para]
-
-    def extract_text_from_page(self, page: Any) -> str:
-        """
-        Extract text from a given 'page', which in this case is a single paragraph.
-
-        Args:
-            page (list): A list containing a single Paragraph object.
-
-        Returns:
-            str: Extracted text from the paragraph.
-        """
-        paragraph = page[0]
-        return self.fix_text(paragraph.text)
```
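For context on what this removal covers: the deleted `document_parser.py` exposed its parsers mainly through the `DocumentParser.create` factory and the `DocumentParser.chunks_from_path_or_bytes` helper, both visible in the hunk above. A minimal usage sketch follows; the file name `paper.pdf` and the default-constructed `ParsingConfig` are illustrative assumptions, not taken from this diff.

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import Parser, ParsingConfig

config = ParsingConfig()  # assumed defaults; config.pdf.library selects the backend

# The factory infers the document type (file extension, or magic MIME sniffing
# for bytes) and returns the matching subclass: FitzPDFParser, PyPDFParser, etc.
doc_parser = DocumentParser.create("paper.pdf", config)
chunks = doc_parser.get_doc_chunks()  # one Document per chunk, with page refs

# One-step helper; per the code above, a PDF that yields no text chunks is
# retried with ImagePdfParser (pdf2image + pytesseract OCR).
chunks = DocumentParser.chunks_from_path_or_bytes("paper.pdf", Parser(config))
```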
|