langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +95 -0
- langroid/agent/__init__.py +40 -0
- langroid/agent/base.py +222 -91
- langroid/agent/batch.py +264 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +247 -101
- langroid/agent/chat_document.py +41 -4
- langroid/agent/openai_assistant.py +842 -0
- langroid/agent/special/__init__.py +50 -0
- langroid/agent/special/doc_chat_agent.py +837 -141
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +32 -198
- langroid/agent/special/sql/__init__.py +11 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +22 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +475 -122
- langroid/agent/tool_message.py +75 -13
- langroid/agent/tools/__init__.py +13 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +16 -29
- langroid/agent/tools/run_python_code.py +60 -0
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/agent/tools/segment_extract_tool.py +36 -0
- langroid/cachedb/__init__.py +9 -0
- langroid/cachedb/base.py +22 -2
- langroid/cachedb/momento_cachedb.py +26 -2
- langroid/cachedb/redis_cachedb.py +78 -11
- langroid/embedding_models/__init__.py +34 -0
- langroid/embedding_models/base.py +21 -2
- langroid/embedding_models/models.py +120 -18
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +45 -0
- langroid/language_models/azure_openai.py +80 -27
- langroid/language_models/base.py +117 -12
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_assistants.py +3 -0
- langroid/language_models/openai_gpt.py +558 -174
- langroid/language_models/prompt_formatter/__init__.py +15 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +18 -21
- langroid/mytypes.py +25 -8
- langroid/parsing/__init__.py +46 -0
- langroid/parsing/document_parser.py +260 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +122 -59
- langroid/parsing/repo_loader.py +114 -52
- langroid/parsing/search.py +68 -63
- langroid/parsing/spider.py +3 -2
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -11
- langroid/parsing/urls.py +85 -37
- langroid/parsing/utils.py +298 -4
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +11 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +17 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +36 -5
- langroid/utils/constants.py +4 -0
- langroid/utils/globals.py +2 -2
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +21 -0
- langroid/utils/output/printing.py +47 -1
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +616 -2
- langroid/utils/system.py +98 -0
- langroid/vector_store/__init__.py +40 -0
- langroid/vector_store/base.py +203 -6
- langroid/vector_store/chromadb.py +59 -32
- langroid/vector_store/lancedb.py +463 -0
- langroid/vector_store/meilisearch.py +10 -7
- langroid/vector_store/momento.py +262 -0
- langroid/vector_store/qdrantdb.py +104 -22
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
- langroid-0.1.219.dist-info/RECORD +127 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.85.dist-info/RECORD +0 -94
- /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
- {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import itertools
|
1
2
|
import logging
|
2
3
|
import re
|
3
4
|
from enum import Enum
|
@@ -8,10 +9,11 @@ import fitz
|
|
8
9
|
import pdfplumber
|
9
10
|
import pypdf
|
10
11
|
import requests
|
12
|
+
from bs4 import BeautifulSoup
|
13
|
+
from PIL import Image
|
11
14
|
|
12
15
|
from langroid.mytypes import DocMetaData, Document
|
13
16
|
from langroid.parsing.parser import Parser, ParsingConfig
|
14
|
-
from langroid.parsing.urls import url_to_tempfile
|
15
17
|
|
16
18
|
logger = logging.getLogger(__name__)
|
17
19
|
|
@@ -19,6 +21,30 @@ logger = logging.getLogger(__name__)
|
|
19
21
|
class DocumentType(str, Enum):
    """
    Document formats distinguished by the parsers in this module.

    Each member is also a `str`, so it compares equal to its lowercase
    format name (e.g. `DocumentType.PDF == "pdf"`).
    """

    PDF = "pdf"
    DOCX = "docx"
    DOC = "doc"
    TXT = "txt"
|
26
|
+
|
27
|
+
|
28
|
+
def is_plain_text(path_or_bytes: str | bytes) -> bool:
|
29
|
+
if isinstance(path_or_bytes, str):
|
30
|
+
if path_or_bytes.startswith(("http://", "https://")):
|
31
|
+
response = requests.get(path_or_bytes)
|
32
|
+
response.raise_for_status()
|
33
|
+
content = response.content[:1024]
|
34
|
+
else:
|
35
|
+
with open(path_or_bytes, "rb") as f:
|
36
|
+
content = f.read(1024)
|
37
|
+
else:
|
38
|
+
content = path_or_bytes[:1024]
|
39
|
+
try:
|
40
|
+
# Attempt to decode the content as UTF-8
|
41
|
+
_ = content.decode("utf-8")
|
42
|
+
# Additional checks can go here, e.g., to verify that the content
|
43
|
+
# doesn't contain too many unusual characters for it to be considered text
|
44
|
+
return True
|
45
|
+
except UnicodeDecodeError:
|
46
|
+
# If decoding fails, it's likely not plain text (or not encoded in UTF-8)
|
47
|
+
return False
|
22
48
|
|
23
49
|
|
24
50
|
class DocumentParser(Parser):
|
@@ -32,19 +58,26 @@ class DocumentParser(Parser):
|
|
32
58
|
"""
|
33
59
|
|
34
60
|
@classmethod
|
35
|
-
def create(
|
61
|
+
def create(
|
62
|
+
cls,
|
63
|
+
source: str | bytes,
|
64
|
+
config: ParsingConfig,
|
65
|
+
doc_type: str | DocumentType | None = None,
|
66
|
+
) -> "DocumentParser":
|
36
67
|
"""
|
37
68
|
Create a DocumentParser instance based on source type
|
38
69
|
and config.<source_type>.library specified.
|
39
70
|
|
40
71
|
Args:
|
41
|
-
source (str): The source
|
72
|
+
source (str|bytes): The source, could be a URL, file path,
|
73
|
+
or bytes object.
|
42
74
|
config (ParserConfig): The parser configuration.
|
75
|
+
doc_type (str|None): The type of document, if known
|
43
76
|
|
44
77
|
Returns:
|
45
78
|
DocumentParser: An instance of a DocumentParser subclass.
|
46
79
|
"""
|
47
|
-
if DocumentParser._document_type(source) == DocumentType.PDF:
|
80
|
+
if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
|
48
81
|
if config.pdf.library == "fitz":
|
49
82
|
return FitzPDFParser(source, config)
|
50
83
|
elif config.pdf.library == "pypdf":
|
@@ -53,51 +86,93 @@ class DocumentParser(Parser):
|
|
53
86
|
return PDFPlumberParser(source, config)
|
54
87
|
elif config.pdf.library == "unstructured":
|
55
88
|
return UnstructuredPDFParser(source, config)
|
56
|
-
elif config.pdf.library == "
|
57
|
-
return
|
89
|
+
elif config.pdf.library == "pdf2image":
|
90
|
+
return ImagePdfParser(source, config)
|
58
91
|
else:
|
59
92
|
raise ValueError(
|
60
93
|
f"Unsupported PDF library specified: {config.pdf.library}"
|
61
94
|
)
|
62
|
-
elif DocumentParser._document_type(source) == DocumentType.DOCX:
|
95
|
+
elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
|
63
96
|
if config.docx.library == "unstructured":
|
64
97
|
return UnstructuredDocxParser(source, config)
|
98
|
+
elif config.docx.library == "python-docx":
|
99
|
+
return PythonDocxParser(source, config)
|
65
100
|
else:
|
66
101
|
raise ValueError(
|
67
102
|
f"Unsupported DOCX library specified: {config.docx.library}"
|
68
103
|
)
|
104
|
+
elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
|
105
|
+
return UnstructuredDocParser(source, config)
|
69
106
|
else:
|
70
|
-
|
107
|
+
source_name = source if isinstance(source, str) else "bytes"
|
108
|
+
raise ValueError(f"Unsupported document type: {source_name}")
|
71
109
|
|
72
|
-
def __init__(self, source: str, config: ParsingConfig):
|
110
|
+
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Set up the parser and load the document content into memory.

    Args:
        source (str|bytes): The source, which could be
            a path, a URL or a bytes object.
        config (ParsingConfig): The parsing configuration, passed to the
            base `Parser` and kept on the instance.
    """
    super().__init__(config)
    self.config = config
    given_raw_bytes = isinstance(source, bytes)
    # Raw bytes carry no name, so the literal "bytes" stands in as the source.
    # `self.source` must be set before loading, since the loader reads it.
    self.source = "bytes" if given_raw_bytes else source
    self.doc_bytes = (
        BytesIO(source) if given_raw_bytes else self._load_doc_as_bytesio()
    )
|
83
124
|
|
84
125
|
@staticmethod
def _document_type(
    source: str | bytes, doc_type: str | DocumentType | None = None
) -> DocumentType:
    """
    Work out which kind of document `source` is.

    Args:
        source (str|bytes): The source, which could be a URL,
            a file path, or a bytes object.
        doc_type (str|DocumentType|None): The type of document, if known;
            when given it is returned without inspecting `source`.

    Returns:
        DocumentType: The document type.

    Raises:
        ValueError: if `doc_type` is not a known type name, or the type
            cannot be determined from the path extension / MIME type.
    """
    # An explicit type always wins over any sniffing.
    if isinstance(doc_type, DocumentType):
        return doc_type
    if doc_type:
        return DocumentType(doc_type.lower())

    # Content sniffing runs before the extension check.
    if is_plain_text(source):
        return DocumentType.TXT

    if isinstance(source, str):
        # detect file type from path extension
        lowered = source.lower()
        by_suffix = (
            (".pdf", DocumentType.PDF),
            (".docx", DocumentType.DOCX),
            (".doc", DocumentType.DOC),
        )
        for suffix, kind in by_suffix:
            if lowered.endswith(suffix):
                return kind
        raise ValueError(f"Unsupported document type: {source}")

    # must be bytes: attempt to detect type from content
    # using magic mime type detection
    import magic

    mime_type = magic.from_buffer(source, mime=True)
    if mime_type == "application/pdf":
        return DocumentType.PDF
    docx_like = [
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document",
        "application/zip",
    ]
    if mime_type in docx_like:
        # DOCX files are essentially ZIP files,
        # but this might catch other ZIP-based formats too!
        return DocumentType.DOCX
    if mime_type == "application/msword":
        return DocumentType.DOC
    raise ValueError("Unsupported document type from bytes")
|
101
176
|
|
102
177
|
def _load_doc_as_bytesio(self) -> BytesIO:
|
103
178
|
"""
|
@@ -114,6 +189,61 @@ class DocumentParser(Parser):
|
|
114
189
|
with open(self.source, "rb") as f:
|
115
190
|
return BytesIO(f.read())
|
116
191
|
|
192
|
+
@staticmethod
def chunks_from_path_or_bytes(
    source: str | bytes,
    parser: Parser,
    doc_type: str | DocumentType | None = None,
    lines: int | None = None,
) -> List[Document]:
    """
    Get document chunks from a file path or bytes object.

    Args:
        source (str|bytes): The source, which could be a URL, path or bytes object.
        parser (Parser): The parser instance (for splitting the document).
        doc_type (str|DocumentType|None): The type of document, if known.
        lines (int|None): The number of lines to read from a plain text file.

    Returns:
        List[Document]: A list of `Document` objects,
            each containing a chunk of text, determined by the
            chunking and splitting settings in the parser config.
    """
    dtype: DocumentType = DocumentParser._document_type(source, doc_type)
    if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
        doc_parser = DocumentParser.create(
            source,
            parser.config,
            doc_type=doc_type,
        )
        chunks = doc_parser.get_doc_chunks()
        if len(chunks) == 0 and dtype == DocumentType.PDF:
            # Nothing extracted from the PDF: retry with the image-based parser.
            doc_parser = ImagePdfParser(source, parser.config)
            chunks = doc_parser.get_doc_chunks()
        return chunks

    # try getting as plain text; these will be chunked downstream
    # -- could be a bytes object or a path
    if isinstance(source, bytes):
        content = source.decode()
        if lines is not None:
            file_lines = content.splitlines()[:lines]
            content = "\n".join(line.strip() for line in file_lines)
    else:
        # Read as UTF-8 explicitly: the bytes branch above decodes as UTF-8 and
        # plain-text detection also validates UTF-8, so relying on the platform
        # default encoding here could mis-decode the same file.
        # NOTE(review): a URL string that was sniffed as plain text still ends
        # up in open() here -- confirm whether URLs are expected on this path.
        with open(source, "r", encoding="utf-8") as f:
            if lines is not None:
                file_lines = list(itertools.islice(f, lines))
                content = "\n".join(line.strip() for line in file_lines)
            else:
                content = f.read()

    # Run the text through an HTML parser and keep only the text content.
    soup = BeautifulSoup(content, "html.parser")
    text = soup.get_text()
    source_name = source if isinstance(source, str) else "bytes"
    doc = Document(
        content=text,
        metadata=DocMetaData(source=str(source_name)),
    )
    return parser.split([doc])
|
246
|
+
|
117
247
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
118
248
|
"""Yield each page in the PDF."""
|
119
249
|
raise NotImplementedError
|
@@ -138,7 +268,7 @@ class DocumentParser(Parser):
|
|
138
268
|
|
139
269
|
def get_doc(self) -> Document:
|
140
270
|
"""
|
141
|
-
Get entire text from
|
271
|
+
Get entire text from source as a single document.
|
142
272
|
|
143
273
|
Returns:
|
144
274
|
a `Document` object containing the content of the pdf file,
|
@@ -200,6 +330,7 @@ class DocumentParser(Parser):
|
|
200
330
|
),
|
201
331
|
)
|
202
332
|
)
|
333
|
+
self.add_window_ids(docs)
|
203
334
|
return docs
|
204
335
|
|
205
336
|
|
@@ -293,50 +424,34 @@ class PDFPlumberParser(DocumentParser):
|
|
293
424
|
return self.fix_text(page.extract_text())
|
294
425
|
|
295
426
|
|
296
|
-
class
|
427
|
+
class ImagePdfParser(DocumentParser):
    """
    Parser for processing PDFs that are images, i.e. not "true" PDFs.
    """

    def iterate_pages(
        self,
    ) -> Generator[Tuple[int, Image.Image], None, None]:
        """
        Render the PDF held in `self.doc_bytes` to images and yield them.

        Yields:
            Tuple[int, Image.Image]: 0-based page index and the page image.
        """
        # Imported lazily so the module loads without `pdf2image` installed.
        from pdf2image import convert_from_bytes

        images = convert_from_bytes(self.doc_bytes.getvalue())
        yield from enumerate(images)

    def extract_text_from_page(self, page: Image.Image) -> str:
        """
        Extract text from a given `pdf2image` page.

        Args:
            page (Image.Image): The PIL Image object. (`Image` itself is the
                imported PIL module; the class is `Image.Image`.)

        Returns:
            str: Extracted text from the image.
        """
        # Imported lazily so the module loads without `pytesseract` installed.
        import pytesseract

        text = pytesseract.image_to_string(page)
        return self.fix_text(text)
|
340
455
|
|
341
456
|
|
342
457
|
class UnstructuredPDFParser(DocumentParser):
|
@@ -345,7 +460,17 @@ class UnstructuredPDFParser(DocumentParser):
|
|
345
460
|
"""
|
346
461
|
|
347
462
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]: # type: ignore
|
348
|
-
|
463
|
+
try:
|
464
|
+
from unstructured.partition.pdf import partition_pdf
|
465
|
+
except ImportError:
|
466
|
+
raise ImportError(
|
467
|
+
"""
|
468
|
+
The `unstructured` library is not installed by default with langroid.
|
469
|
+
To include this library, please install langroid with the
|
470
|
+
`unstructured` extra by running `pip install "langroid[unstructured]"`
|
471
|
+
or equivalent.
|
472
|
+
"""
|
473
|
+
)
|
349
474
|
|
350
475
|
# from unstructured.chunking.title import chunk_by_title
|
351
476
|
|
@@ -359,7 +484,7 @@ class UnstructuredPDFParser(DocumentParser):
|
|
359
484
|
Please try a different library by setting the `library` field
|
360
485
|
in the `pdf` section of the `parsing` field in the config file.
|
361
486
|
Supported libraries are:
|
362
|
-
fitz, pypdf, pdfplumber, unstructured
|
487
|
+
fitz, pypdf, pdfplumber, unstructured
|
363
488
|
"""
|
364
489
|
)
|
365
490
|
|
@@ -398,7 +523,17 @@ class UnstructuredDocxParser(DocumentParser):
|
|
398
523
|
"""
|
399
524
|
|
400
525
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]: # type: ignore
|
401
|
-
|
526
|
+
try:
|
527
|
+
from unstructured.partition.docx import partition_docx
|
528
|
+
except ImportError:
|
529
|
+
raise ImportError(
|
530
|
+
"""
|
531
|
+
The `unstructured` library is not installed by default with langroid.
|
532
|
+
To include this library, please install langroid with the
|
533
|
+
`unstructured` extra by running `pip install "langroid[unstructured]"`
|
534
|
+
or equivalent.
|
535
|
+
"""
|
536
|
+
)
|
402
537
|
|
403
538
|
elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)
|
404
539
|
|
@@ -435,3 +570,65 @@ class UnstructuredDocxParser(DocumentParser):
|
|
435
570
|
"""
|
436
571
|
text = " ".join(el.text for el in page)
|
437
572
|
return self.fix_text(text)
|
573
|
+
|
574
|
+
|
575
|
+
class UnstructuredDocParser(UnstructuredDocxParser):
    """
    Parser for legacy `.doc` files using the `unstructured` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
        """
        Partition the document and yield its elements grouped by page.

        Yields:
            Tuple[int, Any]: 1-based page number and the list of elements
                found before the next page break.

        Raises:
            ImportError: if the `unstructured` library is not installed.
        """
        try:
            from unstructured.partition.doc import partition_doc
        except ImportError:
            raise ImportError(
                """
                The `unstructured` library is not installed by default with langroid.
                To include this library, please install langroid with the
                `unstructured` extra by running `pip install "langroid[unstructured]"`
                or equivalent.
                """
            )

        # Pass the in-memory content rather than `self.source`: for bytes input
        # `self.source` is only the placeholder "bytes", and for URLs it is not
        # a local filename. This matches how the DOCX parser feeds
        # `partition_docx`.
        elements = partition_doc(file=self.doc_bytes, include_page_breaks=True)

        page_number = 1
        page_elements = []  # type: ignore
        for el in elements:
            if el.category == "PageBreak":
                if page_elements:  # Avoid yielding empty pages at the start
                    yield page_number, page_elements
                    page_number += 1
                    page_elements = []
            else:
                page_elements.append(el)
        # Yield the last page if it's not empty
        if page_elements:
            yield page_number, page_elements
|
604
|
+
|
605
|
+
|
606
|
+
class PythonDocxParser(DocumentParser):
    """
    Parser for processing DOCX files using the `python-docx` library.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Yield one pseudo-page per paragraph.

        DOCX files do not define pages explicitly, so every paragraph is
        treated as its own 'page', numbered from 1 and wrapped in a
        single-element list.
        """
        import docx

        paragraphs = docx.Document(self.doc_bytes).paragraphs
        page_no = 0
        for para in paragraphs:
            page_no += 1
            yield page_no, [para]

    def extract_text_from_page(self, page: Any) -> str:
        """
        Return the cleaned text of a 'page', i.e. of a single paragraph.

        Args:
            page (list): A list containing a single Paragraph object.

        Returns:
            str: Extracted text from the paragraph.
        """
        return self.fix_text(page[0].text)
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from typing import Union
|
2
|
+
|
3
|
+
import pytesseract
|
4
|
+
from pdf2image import convert_from_bytes, convert_from_path
|
5
|
+
|
6
|
+
|
7
|
+
def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
    """
    Converts a PDF that contains images to text using OCR.

    Args:
        input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
            of the PDF content.

    Returns:
        str: The extracted text from the PDF, i.e. the OCR output of every
            page image concatenated in order.

    Raises:
        ValueError: if `input_data` is neither `str` nor `bytes`.
    """

    # Check if the input is a file path (str) or bytes, and
    # convert PDF to images accordingly
    if isinstance(input_data, str):
        images = convert_from_path(input_data)
    elif isinstance(input_data, bytes):
        images = convert_from_bytes(input_data)
    else:
        raise ValueError("input_data must be a file path (str) or bytes-like object")

    # Join once instead of growing a string with += per page.
    return "".join(pytesseract.image_to_string(image) for image in images)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import json
|
2
|
+
from typing import Any, Iterator, List
|
3
|
+
|
4
|
+
import yaml
|
5
|
+
from pyparsing import nestedExpr, originalTextFor
|
6
|
+
|
7
|
+
|
8
|
+
def is_valid_json(json_str: str) -> bool:
    """Check if the input string is a valid JSON.

    Args:
        json_str (str): The input string to check.

    Returns:
        bool: True if the input string is a valid JSON, False otherwise
            (including when the input is not a string at all, e.g. None).
    """
    try:
        json.loads(json_str)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is what
        # json.loads raises for non str/bytes input such as None.
        return False
    return True
|
22
|
+
|
23
|
+
|
24
|
+
def flatten(nested_list) -> Iterator[str]:  # type: ignore
    """Lazily flatten arbitrarily nested lists/tuples into a flat stream.

    Only `list` and `tuple` values are descended into; every other item is
    yielded unchanged, in depth-first, left-to-right order.
    """
    for item in nested_list:
        if isinstance(item, (list, tuple)):
            yield from flatten(item)
        else:
            yield item
|
32
|
+
|
33
|
+
|
34
|
+
def get_json_candidates(s: str) -> List[str]:
    """Get top-level JSON candidates, i.e. strings between curly braces.

    Returns the original text of each balanced `{...}` group found in `s`,
    or an empty list if the search raises.
    """
    # Grammar: a (possibly nested) brace group, reported as its source text.
    brace_group = originalTextFor(nestedExpr("{", "}"))

    try:
        return [match[0] for match in brace_group.searchString(s)]
    except Exception:
        return []
|
46
|
+
|
47
|
+
|
48
|
+
def add_quotes(s: str) -> str:
    """
    Replace accidentally un-quoted string-like keys and values in a potential json str.
    Intended to handle cases where a weak LLM may produce a JSON-like string
    containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
    or city: "New York" where it "forgot" to put quotes on the key.
    It will even handle cases like 'address: do not know'.

    The repair works by loading the text with a safe YAML loader and dumping
    the result back out as JSON; approach taken from
    https://stackoverflow.com/a/66053900/10940584

    Args:
        - s (str): The potential JSON string to parse.

    Returns:
        - str: The (potential) JSON string with un-quoted string-like values
            replaced by quoted values. Already-valid JSON, and input that
            cannot be repaired, is returned unchanged.
    """
    if not is_valid_json(s):
        try:
            return json.dumps(yaml.load(s, yaml.SafeLoader))
        except Exception:
            # Best effort only: fall through and hand back the input as-is.
            pass
    return s
|
74
|
+
|
75
|
+
|
76
|
+
def repair_newlines(s: str) -> str:
    """
    Attempt to load as json, and if it fails, return it with newlines replaced by space.
    Intended to handle cases where weak LLMs produce JSON-like strings where
    some string-values contain explicit newlines, e.g.:
    {"text": "This is a text\n with a newline"}
    These would not be valid JSON, so we try to clean them up here.

    Args:
        s (str): The potential JSON string.

    Returns:
        str: `s` unchanged if it already parses as JSON; otherwise `s` with
            every newline replaced by a space (the result is NOT re-validated,
            callers must check it themselves).
    """
    try:
        json.loads(s)
        return s
    except Exception:
        pass
    # No need to parse a second time: the replaced string is returned whether
    # or not it is valid JSON.
    try:
        return s.replace("\n", " ")
    except (AttributeError, TypeError):
        # Non-str input (e.g. None or bytes) is handed back untouched.
        return s
|
94
|
+
|
95
|
+
|
96
|
+
def extract_top_level_json(s: str) -> List[str]:
    """Extract all top-level JSON-formatted substrings from a given string.

    Args:
        s (str): The input string to search for JSON substrings.

    Returns:
        List[str]: A list of top-level JSON-formatted substrings, after
            un-escaping, quote repair and newline repair; candidates that
            still fail to parse are dropped.
    """
    valid: List[str] = []
    # Candidates are brace-delimited groups only.
    for raw in get_json_candidates(s):
        # Undo backslash-escaping of braces and underscores.
        unescaped = raw.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
        repaired = repair_newlines(add_quotes(unescaped))
        if is_valid_json(repaired):
            valid.append(repaired)
    return valid
|
119
|
+
|
120
|
+
|
121
|
+
def top_level_json_field(s: str, f: str) -> Any:
    """
    Extract the value of a field f from a top-level JSON object.
    If several top-level objects contain the field, the first one wins.

    Args:
        s (str): The input string to search for JSON substrings.
        f (str): The field to extract from the JSON object.

    Returns:
        Any: The value of the field f in the first top-level JSON object
            that has it. If no object has the field (or no JSON is found),
            an empty string is returned.
    """
    for candidate in extract_top_level_json(s):
        parsed = json.loads(candidate)
        if f in parsed:
            return parsed[f]
    return ""
|