langroid 0.1.85__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. langroid/__init__.py +95 -0
  2. langroid/agent/__init__.py +40 -0
  3. langroid/agent/base.py +222 -91
  4. langroid/agent/batch.py +264 -0
  5. langroid/agent/callbacks/chainlit.py +608 -0
  6. langroid/agent/chat_agent.py +247 -101
  7. langroid/agent/chat_document.py +41 -4
  8. langroid/agent/openai_assistant.py +842 -0
  9. langroid/agent/special/__init__.py +50 -0
  10. langroid/agent/special/doc_chat_agent.py +837 -141
  11. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  12. langroid/agent/special/lance_rag/__init__.py +9 -0
  13. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  14. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  15. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  16. langroid/agent/special/lance_tools.py +44 -0
  17. langroid/agent/special/neo4j/__init__.py +0 -0
  18. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  19. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  20. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  21. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  22. langroid/agent/special/relevance_extractor_agent.py +127 -0
  23. langroid/agent/special/retriever_agent.py +32 -198
  24. langroid/agent/special/sql/__init__.py +11 -0
  25. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  26. langroid/agent/special/sql/utils/__init__.py +22 -0
  27. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  28. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  29. langroid/agent/special/table_chat_agent.py +43 -9
  30. langroid/agent/task.py +475 -122
  31. langroid/agent/tool_message.py +75 -13
  32. langroid/agent/tools/__init__.py +13 -0
  33. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  34. langroid/agent/tools/google_search_tool.py +11 -0
  35. langroid/agent/tools/metaphor_search_tool.py +67 -0
  36. langroid/agent/tools/recipient_tool.py +16 -29
  37. langroid/agent/tools/run_python_code.py +60 -0
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/agent/tools/segment_extract_tool.py +36 -0
  40. langroid/cachedb/__init__.py +9 -0
  41. langroid/cachedb/base.py +22 -2
  42. langroid/cachedb/momento_cachedb.py +26 -2
  43. langroid/cachedb/redis_cachedb.py +78 -11
  44. langroid/embedding_models/__init__.py +34 -0
  45. langroid/embedding_models/base.py +21 -2
  46. langroid/embedding_models/models.py +120 -18
  47. langroid/embedding_models/protoc/embeddings.proto +19 -0
  48. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  49. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  50. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  51. langroid/embedding_models/remote_embeds.py +153 -0
  52. langroid/language_models/__init__.py +45 -0
  53. langroid/language_models/azure_openai.py +80 -27
  54. langroid/language_models/base.py +117 -12
  55. langroid/language_models/config.py +5 -0
  56. langroid/language_models/openai_assistants.py +3 -0
  57. langroid/language_models/openai_gpt.py +558 -174
  58. langroid/language_models/prompt_formatter/__init__.py +15 -0
  59. langroid/language_models/prompt_formatter/base.py +4 -6
  60. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  61. langroid/language_models/utils.py +18 -21
  62. langroid/mytypes.py +25 -8
  63. langroid/parsing/__init__.py +46 -0
  64. langroid/parsing/document_parser.py +260 -63
  65. langroid/parsing/image_text.py +32 -0
  66. langroid/parsing/parse_json.py +143 -0
  67. langroid/parsing/parser.py +122 -59
  68. langroid/parsing/repo_loader.py +114 -52
  69. langroid/parsing/search.py +68 -63
  70. langroid/parsing/spider.py +3 -2
  71. langroid/parsing/table_loader.py +44 -0
  72. langroid/parsing/url_loader.py +59 -11
  73. langroid/parsing/urls.py +85 -37
  74. langroid/parsing/utils.py +298 -4
  75. langroid/parsing/web_search.py +73 -0
  76. langroid/prompts/__init__.py +11 -0
  77. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  78. langroid/prompts/prompts_config.py +1 -1
  79. langroid/utils/__init__.py +17 -0
  80. langroid/utils/algorithms/__init__.py +3 -0
  81. langroid/utils/algorithms/graph.py +103 -0
  82. langroid/utils/configuration.py +36 -5
  83. langroid/utils/constants.py +4 -0
  84. langroid/utils/globals.py +2 -2
  85. langroid/utils/logging.py +2 -5
  86. langroid/utils/output/__init__.py +21 -0
  87. langroid/utils/output/printing.py +47 -1
  88. langroid/utils/output/status.py +33 -0
  89. langroid/utils/pandas_utils.py +30 -0
  90. langroid/utils/pydantic_utils.py +616 -2
  91. langroid/utils/system.py +98 -0
  92. langroid/vector_store/__init__.py +40 -0
  93. langroid/vector_store/base.py +203 -6
  94. langroid/vector_store/chromadb.py +59 -32
  95. langroid/vector_store/lancedb.py +463 -0
  96. langroid/vector_store/meilisearch.py +10 -7
  97. langroid/vector_store/momento.py +262 -0
  98. langroid/vector_store/qdrantdb.py +104 -22
  99. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/METADATA +329 -149
  100. langroid-0.1.219.dist-info/RECORD +127 -0
  101. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/WHEEL +1 -1
  102. langroid/agent/special/recipient_validator_agent.py +0 -157
  103. langroid/parsing/json.py +0 -64
  104. langroid/utils/web/selenium_login.py +0 -36
  105. langroid-0.1.85.dist-info/RECORD +0 -94
  106. /langroid/{scripts → agent/callbacks}/__init__.py +0 -0
  107. {langroid-0.1.85.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
langroid/parsing/document_parser.py
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from enum import Enum
@@ -8,10 +9,11 @@ import fitz
 import pdfplumber
 import pypdf
 import requests
+from bs4 import BeautifulSoup
+from PIL import Image
 
 from langroid.mytypes import DocMetaData, Document
 from langroid.parsing.parser import Parser, ParsingConfig
-from langroid.parsing.urls import url_to_tempfile
 
 logger = logging.getLogger(__name__)
 
@@ -19,6 +21,30 @@ logger = logging.getLogger(__name__)
 class DocumentType(str, Enum):
     PDF = "pdf"
     DOCX = "docx"
+    DOC = "doc"
+    TXT = "txt"
+
+
+def is_plain_text(path_or_bytes: str | bytes) -> bool:
+    if isinstance(path_or_bytes, str):
+        if path_or_bytes.startswith(("http://", "https://")):
+            response = requests.get(path_or_bytes)
+            response.raise_for_status()
+            content = response.content[:1024]
+        else:
+            with open(path_or_bytes, "rb") as f:
+                content = f.read(1024)
+    else:
+        content = path_or_bytes[:1024]
+    try:
+        # Attempt to decode the content as UTF-8
+        _ = content.decode("utf-8")
+        # Additional checks can go here, e.g., to verify that the content
+        # doesn't contain too many unusual characters for it to be considered text
+        return True
+    except UnicodeDecodeError:
+        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+        return False
 
 
 class DocumentParser(Parser):
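
Note: the new `is_plain_text` helper probes only the first 1 KB of the source and equates "decodes as UTF-8" with "is plain text", so it is a heuristic. A minimal usage sketch (the byte strings are invented for illustration):

```python
from langroid.parsing.document_parser import is_plain_text

# UTF-8-decodable bytes are classified as plain text
assert is_plain_text(b"hello, world")

# a PDF-like header with a 0xFF byte cannot decode as UTF-8
assert not is_plain_text(b"%PDF-1.7 \xff\xfe binary payload")
```
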
@@ -32,19 +58,26 @@ class DocumentParser(Parser):
     """
 
     @classmethod
-    def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+    def create(
+        cls,
+        source: str | bytes,
+        config: ParsingConfig,
+        doc_type: str | DocumentType | None = None,
+    ) -> "DocumentParser":
         """
         Create a DocumentParser instance based on source type
         and config.<source_type>.library specified.
 
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, could be a URL, file path,
+                or bytes object.
             config (ParserConfig): The parser configuration.
+            doc_type (str|None): The type of document, if known
 
         Returns:
             DocumentParser: An instance of a DocumentParser subclass.
         """
-        if DocumentParser._document_type(source) == DocumentType.PDF:
+        if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
             elif config.pdf.library == "pypdf":
@@ -53,51 +86,93 @@ class DocumentParser(Parser):
                 return PDFPlumberParser(source, config)
             elif config.pdf.library == "unstructured":
                 return UnstructuredPDFParser(source, config)
-            elif config.pdf.library == "haystack":
-                return HaystackPDFParser(source, config)
+            elif config.pdf.library == "pdf2image":
+                return ImagePdfParser(source, config)
             else:
                 raise ValueError(
                     f"Unsupported PDF library specified: {config.pdf.library}"
                 )
-        elif DocumentParser._document_type(source) == DocumentType.DOCX:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
             if config.docx.library == "unstructured":
                 return UnstructuredDocxParser(source, config)
+            elif config.docx.library == "python-docx":
+                return PythonDocxParser(source, config)
             else:
                 raise ValueError(
                     f"Unsupported DOCX library specified: {config.docx.library}"
                 )
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
+            return UnstructuredDocParser(source, config)
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            source_name = source if isinstance(source, str) else "bytes"
+            raise ValueError(f"Unsupported document type: {source_name}")
 
-    def __init__(self, source: str, config: ParsingConfig):
+    def __init__(self, source: str | bytes, config: ParsingConfig):
         """
-        Initialize the PDFParser.
-
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be
+                a path, a URL or a bytes object.
         """
         super().__init__(config)
-        self.source = source
         self.config = config
-        self.doc_bytes = self._load_doc_as_bytesio()
+        if isinstance(source, bytes):
+            self.source = "bytes"
+            self.doc_bytes = BytesIO(source)
+        else:
+            self.source = source
+            self.doc_bytes = self._load_doc_as_bytesio()
 
     @staticmethod
-    def _document_type(source: str) -> DocumentType:
+    def _document_type(
+        source: str | bytes, doc_type: str | DocumentType | None = None
+    ) -> DocumentType:
         """
         Determine the type of document based on the source.
 
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be a URL,
+                a file path, or a bytes object.
+            doc_type (str|DocumentType|None): The type of document, if known.
 
         Returns:
             str: The document type.
         """
-        if source.lower().endswith(".pdf"):
-            return DocumentType.PDF
-        elif source.lower().endswith(".docx"):
-            return DocumentType.DOCX
+        if isinstance(doc_type, DocumentType):
+            return doc_type
+        if doc_type:
+            return DocumentType(doc_type.lower())
+        if is_plain_text(source):
+            return DocumentType.TXT
+        if isinstance(source, str):
+            # detect file type from path extension
+            if source.lower().endswith(".pdf"):
+                return DocumentType.PDF
+            elif source.lower().endswith(".docx"):
+                return DocumentType.DOCX
+            elif source.lower().endswith(".doc"):
+                return DocumentType.DOC
+            else:
+                raise ValueError(f"Unsupported document type: {source}")
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            # must be bytes: attempt to detect type from content
+            # using magic mime type detection
+            import magic
+
+            mime_type = magic.from_buffer(source, mime=True)
+            if mime_type == "application/pdf":
+                return DocumentType.PDF
+            elif mime_type in [
+                "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document",
+                "application/zip",
+            ]:
+                # DOCX files are essentially ZIP files,
+                # but this might catch other ZIP-based formats too!
+                return DocumentType.DOCX
+            elif mime_type == "application/msword":
+                return DocumentType.DOC
+            else:
+                raise ValueError("Unsupported document type from bytes")
 
     def _load_doc_as_bytesio(self) -> BytesIO:
         """
@@ -114,6 +189,61 @@ class DocumentParser(Parser):
             with open(self.source, "rb") as f:
                 return BytesIO(f.read())
 
+    @staticmethod
+    def chunks_from_path_or_bytes(
+        source: str | bytes,
+        parser: Parser,
+        doc_type: str | DocumentType | None = None,
+        lines: int | None = None,
+    ) -> List[Document]:
+        """
+        Get document chunks from a file path or bytes object.
+        Args:
+            source (str|bytes): The source, which could be a URL, path or bytes object.
+            parser (Parser): The parser instance (for splitting the document).
+            doc_type (str|DocumentType|None): The type of document, if known.
+            lines (int|None): The number of lines to read from a plain text file.
+        Returns:
+            List[Document]: A list of `Document` objects,
+            each containing a chunk of text, determined by the
+            chunking and splitting settings in the parser config.
+        """
+        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            doc_parser = DocumentParser.create(
+                source,
+                parser.config,
+                doc_type=doc_type,
+            )
+            chunks = doc_parser.get_doc_chunks()
+            if len(chunks) == 0 and dtype == DocumentType.PDF:
+                doc_parser = ImagePdfParser(source, parser.config)
+                chunks = doc_parser.get_doc_chunks()
+            return chunks
+        else:
+            # try getting as plain text; these will be chunked downstream
+            # -- could be a bytes object or a path
+            if isinstance(source, bytes):
+                content = source.decode()
+                if lines is not None:
+                    file_lines = content.splitlines()[:lines]
+                    content = "\n".join(line.strip() for line in file_lines)
+            else:
+                with open(source, "r") as f:
+                    if lines is not None:
+                        file_lines = list(itertools.islice(f, lines))
+                        content = "\n".join(line.strip() for line in file_lines)
+                    else:
+                        content = f.read()
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text()
+            source_name = source if isinstance(source, str) else "bytes"
+            doc = Document(
+                content=text,
+                metadata=DocMetaData(source=str(source_name)),
+            )
+            return parser.split([doc])
+
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """Yield each page in the PDF."""
         raise NotImplementedError
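
Note: `chunks_from_path_or_bytes` is the new one-call entry point: PDF/DOC/DOCX sources go through `create` (with an OCR fallback via `ImagePdfParser` when a PDF yields no chunks), and everything else is treated as text or HTML and split by the supplied `Parser`. A sketch under the same assumptions as above (`notes.txt` is a hypothetical file):

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import Parser, ParsingConfig

parser = Parser(ParsingConfig())
# Plain-text path: read at most 100 lines, strip any HTML, split into chunks
chunks = DocumentParser.chunks_from_path_or_bytes("notes.txt", parser, lines=100)
for chunk in chunks:
    print(chunk.metadata.source, chunk.content[:60])
```
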
@@ -138,7 +268,7 @@
 
     def get_doc(self) -> Document:
         """
-        Get entire text from pdf source as a single document.
+        Get entire text from source as a single document.
 
         Returns:
             a `Document` object containing the content of the pdf file,
@@ -200,6 +330,7 @@
                     ),
                 )
             )
+        self.add_window_ids(docs)
         return docs
 
@@ -293,50 +424,34 @@ class PDFPlumberParser(DocumentParser):
         return self.fix_text(page.extract_text())
 
 
-class HaystackPDFParser(DocumentParser):
+class ImagePdfParser(DocumentParser):
     """
-    Parser for processing PDFs using the `haystack` library.
+    Parser for processing PDFs that are images, i.e. not "true" PDFs.
     """
 
-    def get_doc_chunks(self) -> List[Document]:
-        """
-        Overrides the base class method to use the `haystack` library.
-        See there for more details.
+    def iterate_pages(
+        self,
+    ) -> Generator[Tuple[int, Image], None, None]:
+        from pdf2image import convert_from_bytes
+
+        images = convert_from_bytes(self.doc_bytes.getvalue())
+        for i, image in enumerate(images):
+            yield i, image
+
+    def extract_text_from_page(self, page: Image) -> str:
         """
+        Extract text from a given `pdf2image` page.
 
-        from haystack.nodes import PDFToTextConverter, PreProcessor
+        Args:
+            page (Image): The PIL Image object.
 
-        converter = PDFToTextConverter(
-            remove_numeric_tables=True,
-        )
-        path = self.source
-        if path.startswith(("http://", "https://")):
-            path = url_to_tempfile(path)
-        doc = converter.convert(file_path=path, meta=None)
-        # note self.config.chunk_size is in token units,
-        # and we use an approximation of 75 words per 100 tokens
-        # to convert to word units
-        preprocessor = PreProcessor(
-            clean_empty_lines=True,
-            clean_whitespace=True,
-            clean_header_footer=False,
-            split_by="word",
-            split_length=int(0.75 * self.config.chunk_size),
-            split_overlap=int(0.75 * self.config.overlap),
-            split_respect_sentence_boundary=True,
-            add_page_number=True,
-        )
-        chunks = preprocessor.process(doc)
-        return [
-            Document(
-                content=chunk.content,
-                metadata=DocMetaData(
-                    source=f"{self.source} page {chunk.meta['page']}",
-                    is_chunk=True,
-                ),
-            )
-            for chunk in chunks
-        ]
+        Returns:
+            str: Extracted text from the image.
+        """
+        import pytesseract
+
+        text = pytesseract.image_to_string(page)
+        return self.fix_text(text)
 
 
 class UnstructuredPDFParser(DocumentParser):
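
Note: `ImagePdfParser` replaces the removed `HaystackPDFParser` and is reached through the same `config.pdf.library` switch used in `create`. A sketch (assumes `pdf2image` with poppler, and `pytesseract` with the Tesseract binary, are installed; `scanned.pdf` is hypothetical):

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import ParsingConfig

config = ParsingConfig()
config.pdf.library = "pdf2image"  # route create() to ImagePdfParser

parser = DocumentParser.create("scanned.pdf", config)  # hypothetical scanned PDF
doc = parser.get_doc()  # OCR'd text of all pages, as one Document
```
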
@@ -345,7 +460,17 @@ class UnstructuredPDFParser(DocumentParser):
     """
 
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-        from unstructured.partition.pdf import partition_pdf
+        try:
+            from unstructured.partition.pdf import partition_pdf
+        except ImportError:
+            raise ImportError(
+                """
+                The `unstructured` library is not installed by default with langroid.
+                To include this library, please install langroid with the
+                `unstructured` extra by running `pip install "langroid[unstructured]"`
+                or equivalent.
+                """
+            )
 
         # from unstructured.chunking.title import chunk_by_title
 
@@ -359,7 +484,7 @@
                 Please try a different library by setting the `library` field
                 in the `pdf` section of the `parsing` field in the config file.
                 Supported libraries are:
-                fitz, pypdf, pdfplumber, unstructured, haystack
+                fitz, pypdf, pdfplumber, unstructured
                 """
             )
 
@@ -398,7 +523,17 @@ class UnstructuredDocxParser(DocumentParser):
     """
 
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-        from unstructured.partition.docx import partition_docx
+        try:
+            from unstructured.partition.docx import partition_docx
+        except ImportError:
+            raise ImportError(
+                """
+                The `unstructured` library is not installed by default with langroid.
+                To include this library, please install langroid with the
+                `unstructured` extra by running `pip install "langroid[unstructured]"`
+                or equivalent.
+                """
+            )
 
         elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)
 
@@ -435,3 +570,65 @@
         """
         text = " ".join(el.text for el in page)
         return self.fix_text(text)
+
+
+class UnstructuredDocParser(UnstructuredDocxParser):
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
+        try:
+            from unstructured.partition.doc import partition_doc
+        except ImportError:
+            raise ImportError(
+                """
+                The `unstructured` library is not installed by default with langroid.
+                To include this library, please install langroid with the
+                `unstructured` extra by running `pip install "langroid[unstructured]"`
+                or equivalent.
+                """
+            )
+
+        elements = partition_doc(filename=self.source, include_page_breaks=True)
+
+        page_number = 1
+        page_elements = []  # type: ignore
+        for el in elements:
+            if el.category == "PageBreak":
+                if page_elements:  # Avoid yielding empty pages at the start
+                    yield page_number, page_elements
+                page_number += 1
+                page_elements = []
+            else:
+                page_elements.append(el)
+        # Yield the last page if it's not empty
+        if page_elements:
+            yield page_number, page_elements
+
+
+class PythonDocxParser(DocumentParser):
+    """
+    Parser for processing DOCX files using the `python-docx` library.
+    """
+
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Simulate iterating through pages.
+        In a DOCX file, pages are not explicitly defined,
+        so we consider each paragraph as a separate 'page' for simplicity.
+        """
+        import docx
+
+        doc = docx.Document(self.doc_bytes)
+        for i, para in enumerate(doc.paragraphs, start=1):
+            yield i, [para]
+
+    def extract_text_from_page(self, page: Any) -> str:
+        """
+        Extract text from a given 'page', which in this case is a single paragraph.
+
+        Args:
+            page (list): A list containing a single Paragraph object.
+
+        Returns:
+            str: Extracted text from the paragraph.
+        """
+        paragraph = page[0]
+        return self.fix_text(paragraph.text)
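
Note: the last hunk adds two parsers reachable from `create`: `UnstructuredDocParser` for legacy `.doc` files, and `PythonDocxParser`, selected when `config.docx.library == "python-docx"`. A sketch (`report.docx` is a hypothetical file):

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import ParsingConfig

config = ParsingConfig()
config.docx.library = "python-docx"  # route create() to PythonDocxParser

parser = DocumentParser.create("report.docx", config)
doc = parser.get_doc()  # whole document; paragraphs are treated as "pages"
```
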
langroid/parsing/image_text.py (new file)
@@ -0,0 +1,32 @@
+from typing import Union
+
+import pytesseract
+from pdf2image import convert_from_bytes, convert_from_path
+
+
+def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
+    """
+    Converts a PDF that contains images to text using OCR.
+
+    Args:
+        input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
+            of the PDF content.
+
+    Returns:
+        str: The extracted text from the PDF.
+    """
+
+    # Check if the input is a file path (str) or bytes, and
+    # convert PDF to images accordingly
+    if isinstance(input_data, str):
+        images = convert_from_path(input_data)
+    elif isinstance(input_data, bytes):
+        images = convert_from_bytes(input_data)
+    else:
+        raise ValueError("input_data must be a file path (str) or bytes-like object")
+
+    text = ""
+    for image in images:
+        text += pytesseract.image_to_string(image)
+
+    return text
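
Note: this new module exposes the same OCR path as a standalone helper. A usage sketch (same external dependencies as above; the file name is hypothetical):

```python
from langroid.parsing.image_text import pdf_image_to_text

text = pdf_image_to_text("scanned.pdf")  # from a path

with open("scanned.pdf", "rb") as f:  # or from raw bytes
    text = pdf_image_to_text(f.read())
```
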
langroid/parsing/parse_json.py (new file)
@@ -0,0 +1,143 @@
+import json
+from typing import Any, Iterator, List
+
+import yaml
+from pyparsing import nestedExpr, originalTextFor
+
+
+def is_valid_json(json_str: str) -> bool:
+    """Check if the input string is a valid JSON.
+
+    Args:
+        json_str (str): The input string to check.
+
+    Returns:
+        bool: True if the input string is a valid JSON, False otherwise.
+    """
+    try:
+        json.loads(json_str)
+        return True
+    except ValueError:
+        return False
+
+
+def flatten(nested_list) -> Iterator[str]:  # type: ignore
+    """Flatten a nested list into a single list of strings"""
+    for item in nested_list:
+        if isinstance(item, (list, tuple)):
+            for subitem in flatten(item):
+                yield subitem
+        else:
+            yield item
+
+
+def get_json_candidates(s: str) -> List[str]:
+    """Get top-level JSON candidates, i.e. strings between curly braces."""
+    # Define the grammar for matching curly braces
+    curly_braces = originalTextFor(nestedExpr("{", "}"))
+
+    # Parse the string
+    try:
+        results = curly_braces.searchString(s)
+        # Properly convert nested lists to strings
+        return [r[0] for r in results]
+    except Exception:
+        return []
+
+
+def add_quotes(s: str) -> str:
+    """
+    Replace accidentally un-quoted string-like keys and values in a potential json str.
+    Intended to handle cases where a weak LLM may produce a JSON-like string
+    containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
+    or city: "New York" where it "forgot" to put quotes on the key.
+    It will even handle cases like 'address: do not know'.
+
+    Got this fiendishly clever solution from
+    https://stackoverflow.com/a/66053900/10940584
+    Far better/safer than trying to do it with regexes.
+
+    Args:
+    - s (str): The potential JSON string to parse.
+
+    Returns:
+    - str: The (potential) JSON string with un-quoted string-like values
+        replaced by quoted values.
+    """
+    if is_valid_json(s):
+        return s
+    try:
+        dct = yaml.load(s, yaml.SafeLoader)
+        return json.dumps(dct)
+    except Exception:
+        return s
+
+
+def repair_newlines(s: str) -> str:
+    """
+    Attempt to load as json, and if it fails, try with newlines replaced by space.
+    Intended to handle cases where weak LLMs produce JSON-like strings where
+    some string-values contain explicit newlines, e.g.:
+    {"text": "This is a text\n with a newline"}
+    These would not be valid JSON, so we try to clean them up here.
+    """
+    try:
+        json.loads(s)
+        return s
+    except Exception:
+        try:
+            s = s.replace("\n", " ")
+            json.loads(s)
+            return s
+        except Exception:
+            return s
+
+
+def extract_top_level_json(s: str) -> List[str]:
+    """Extract all top-level JSON-formatted substrings from a given string.
+
+    Args:
+        s (str): The input string to search for JSON substrings.
+
+    Returns:
+        List[str]: A list of top-level JSON-formatted substrings.
+    """
+    # Find JSON object and array candidates
+    json_candidates = get_json_candidates(s)
+
+    normalized_candidates = [
+        candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
+        for candidate in json_candidates
+    ]
+    candidates = [add_quotes(candidate) for candidate in normalized_candidates]
+    candidates = [repair_newlines(candidate) for candidate in candidates]
+    top_level_jsons = [
+        candidate for candidate in candidates if is_valid_json(candidate)
+    ]
+
+    return top_level_jsons
+
+
+def top_level_json_field(s: str, f: str) -> Any:
+    """
+    Extract the value of a field f from a top-level JSON object.
+    If there are multiple, just return the first.
+
+    Args:
+        s (str): The input string to search for JSON substrings.
+        f (str): The field to extract from the JSON object.
+
+    Returns:
+        str: The value of the field f in the top-level JSON object, if any.
+            Otherwise, return an empty string.
+    """
+
+    jsons = extract_top_level_json(s)
+    if len(jsons) == 0:
+        return ""
+    for j in jsons:
+        json_data = json.loads(j)
+        if f in json_data:
+            return json_data[f]
+
+    return ""
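
Note: this new module (replacing the deleted `langroid/parsing/json.py`, per the file list) extracts brace-delimited candidates, repairs unquoted scalars via YAML and stray newlines, then validates. A sketch with an invented LLM-style reply; the expected outputs follow from the functions above:

```python
from langroid.parsing.parse_json import extract_top_level_json, top_level_json_field

# Prose around JSON, with an un-quoted value a weak LLM might emit
reply = 'Sure: {"city": "New York", "rent": DO-NOT-KNOW} hope that helps!'

print(extract_top_level_json(reply))
# expected: ['{"city": "New York", "rent": "DO-NOT-KNOW"}']  (add_quotes repaired it)

print(top_level_json_field(reply, "city"))
# expected: New York
```
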