langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. langroid/__init__.py +70 -0
  2. langroid/agent/__init__.py +22 -0
  3. langroid/agent/base.py +120 -33
  4. langroid/agent/batch.py +134 -35
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +608 -0
  7. langroid/agent/chat_agent.py +164 -100
  8. langroid/agent/chat_document.py +19 -2
  9. langroid/agent/openai_assistant.py +20 -10
  10. langroid/agent/special/__init__.py +33 -10
  11. langroid/agent/special/doc_chat_agent.py +521 -108
  12. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  13. langroid/agent/special/lance_rag/__init__.py +9 -0
  14. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  15. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  16. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  17. langroid/agent/special/lance_tools.py +44 -0
  18. langroid/agent/special/neo4j/__init__.py +0 -0
  19. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  20. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  21. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  22. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  23. langroid/agent/special/relevance_extractor_agent.py +23 -7
  24. langroid/agent/special/retriever_agent.py +29 -174
  25. langroid/agent/special/sql/__init__.py +7 -0
  26. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  27. langroid/agent/special/sql/utils/__init__.py +11 -0
  28. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  29. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  30. langroid/agent/special/table_chat_agent.py +43 -9
  31. langroid/agent/task.py +423 -114
  32. langroid/agent/tool_message.py +67 -10
  33. langroid/agent/tools/__init__.py +8 -0
  34. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  35. langroid/agent/tools/google_search_tool.py +11 -0
  36. langroid/agent/tools/metaphor_search_tool.py +67 -0
  37. langroid/agent/tools/recipient_tool.py +6 -24
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/cachedb/__init__.py +6 -0
  40. langroid/embedding_models/__init__.py +24 -0
  41. langroid/embedding_models/base.py +9 -1
  42. langroid/embedding_models/models.py +117 -17
  43. langroid/embedding_models/protoc/embeddings.proto +19 -0
  44. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  45. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  46. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  47. langroid/embedding_models/remote_embeds.py +153 -0
  48. langroid/language_models/__init__.py +22 -0
  49. langroid/language_models/azure_openai.py +47 -4
  50. langroid/language_models/base.py +26 -10
  51. langroid/language_models/config.py +5 -0
  52. langroid/language_models/openai_gpt.py +407 -121
  53. langroid/language_models/prompt_formatter/__init__.py +9 -0
  54. langroid/language_models/prompt_formatter/base.py +4 -6
  55. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  56. langroid/language_models/utils.py +10 -9
  57. langroid/mytypes.py +10 -4
  58. langroid/parsing/__init__.py +33 -1
  59. langroid/parsing/document_parser.py +259 -63
  60. langroid/parsing/image_text.py +32 -0
  61. langroid/parsing/parse_json.py +143 -0
  62. langroid/parsing/parser.py +20 -7
  63. langroid/parsing/repo_loader.py +108 -46
  64. langroid/parsing/search.py +8 -0
  65. langroid/parsing/table_loader.py +44 -0
  66. langroid/parsing/url_loader.py +59 -13
  67. langroid/parsing/urls.py +18 -9
  68. langroid/parsing/utils.py +130 -9
  69. langroid/parsing/web_search.py +73 -0
  70. langroid/prompts/__init__.py +7 -0
  71. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  72. langroid/prompts/prompts_config.py +1 -1
  73. langroid/utils/__init__.py +10 -0
  74. langroid/utils/algorithms/__init__.py +3 -0
  75. langroid/utils/configuration.py +0 -1
  76. langroid/utils/constants.py +4 -0
  77. langroid/utils/logging.py +2 -5
  78. langroid/utils/output/__init__.py +15 -2
  79. langroid/utils/output/status.py +33 -0
  80. langroid/utils/pandas_utils.py +30 -0
  81. langroid/utils/pydantic_utils.py +446 -4
  82. langroid/utils/system.py +36 -1
  83. langroid/vector_store/__init__.py +34 -2
  84. langroid/vector_store/base.py +33 -2
  85. langroid/vector_store/chromadb.py +42 -13
  86. langroid/vector_store/lancedb.py +226 -60
  87. langroid/vector_store/meilisearch.py +7 -6
  88. langroid/vector_store/momento.py +3 -2
  89. langroid/vector_store/qdrantdb.py +82 -11
  90. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
  91. langroid-0.1.219.dist-info/RECORD +127 -0
  92. langroid/agent/special/recipient_validator_agent.py +0 -157
  93. langroid/parsing/json.py +0 -64
  94. langroid/utils/web/selenium_login.py +0 -36
  95. langroid-0.1.139.dist-info/RECORD +0 -103
  96. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
  97. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/parsing/document_parser.py
@@ -1,3 +1,4 @@
+ import itertools
  import logging
  import re
  from enum import Enum
@@ -8,10 +9,11 @@ import fitz
  import pdfplumber
  import pypdf
  import requests
+ from bs4 import BeautifulSoup
+ from PIL import Image

  from langroid.mytypes import DocMetaData, Document
  from langroid.parsing.parser import Parser, ParsingConfig
- from langroid.parsing.urls import url_to_tempfile

  logger = logging.getLogger(__name__)

@@ -19,6 +21,30 @@ logger = logging.getLogger(__name__)
  class DocumentType(str, Enum):
      PDF = "pdf"
      DOCX = "docx"
+     DOC = "doc"
+     TXT = "txt"
+
+
+ def is_plain_text(path_or_bytes: str | bytes) -> bool:
+     if isinstance(path_or_bytes, str):
+         if path_or_bytes.startswith(("http://", "https://")):
+             response = requests.get(path_or_bytes)
+             response.raise_for_status()
+             content = response.content[:1024]
+         else:
+             with open(path_or_bytes, "rb") as f:
+                 content = f.read(1024)
+     else:
+         content = path_or_bytes[:1024]
+     try:
+         # Attempt to decode the content as UTF-8
+         _ = content.decode("utf-8")
+         # Additional checks can go here, e.g., to verify that the content
+         # doesn't contain too many unusual characters for it to be considered text
+         return True
+     except UnicodeDecodeError:
+         # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+         return False


  class DocumentParser(Parser):
@@ -32,19 +58,26 @@ class DocumentParser(Parser):
      """

      @classmethod
-     def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+     def create(
+         cls,
+         source: str | bytes,
+         config: ParsingConfig,
+         doc_type: str | DocumentType | None = None,
+     ) -> "DocumentParser":
          """
          Create a DocumentParser instance based on source type
          and config.<source_type>.library specified.

          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, could be a URL, file path,
+                 or bytes object.
              config (ParserConfig): The parser configuration.
+             doc_type (str|None): The type of document, if known

          Returns:
              DocumentParser: An instance of a DocumentParser subclass.
          """
-         if DocumentParser._document_type(source) == DocumentType.PDF:
+         if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
              if config.pdf.library == "fitz":
                  return FitzPDFParser(source, config)
              elif config.pdf.library == "pypdf":
@@ -53,51 +86,93 @@ class DocumentParser(Parser):
                  return PDFPlumberParser(source, config)
              elif config.pdf.library == "unstructured":
                  return UnstructuredPDFParser(source, config)
-             elif config.pdf.library == "haystack":
-                 return HaystackPDFParser(source, config)
+             elif config.pdf.library == "pdf2image":
+                 return ImagePdfParser(source, config)
              else:
                  raise ValueError(
                      f"Unsupported PDF library specified: {config.pdf.library}"
                  )
-         elif DocumentParser._document_type(source) == DocumentType.DOCX:
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
              if config.docx.library == "unstructured":
                  return UnstructuredDocxParser(source, config)
+             elif config.docx.library == "python-docx":
+                 return PythonDocxParser(source, config)
              else:
                  raise ValueError(
                      f"Unsupported DOCX library specified: {config.docx.library}"
                  )
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
+             return UnstructuredDocParser(source, config)
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             source_name = source if isinstance(source, str) else "bytes"
+             raise ValueError(f"Unsupported document type: {source_name}")

-     def __init__(self, source: str, config: ParsingConfig):
+     def __init__(self, source: str | bytes, config: ParsingConfig):
          """
-         Initialize the PDFParser.
-
          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be
+                 a path, a URL or a bytes object.
          """
          super().__init__(config)
-         self.source = source
          self.config = config
-         self.doc_bytes = self._load_doc_as_bytesio()
+         if isinstance(source, bytes):
+             self.source = "bytes"
+             self.doc_bytes = BytesIO(source)
+         else:
+             self.source = source
+             self.doc_bytes = self._load_doc_as_bytesio()

      @staticmethod
-     def _document_type(source: str) -> DocumentType:
+     def _document_type(
+         source: str | bytes, doc_type: str | DocumentType | None = None
+     ) -> DocumentType:
          """
          Determine the type of document based on the source.

          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be a URL,
+                 a file path, or a bytes object.
+             doc_type (str|DocumentType|None): The type of document, if known.

          Returns:
              str: The document type.
          """
-         if source.lower().endswith(".pdf"):
-             return DocumentType.PDF
-         elif source.lower().endswith(".docx"):
-             return DocumentType.DOCX
+         if isinstance(doc_type, DocumentType):
+             return doc_type
+         if doc_type:
+             return DocumentType(doc_type.lower())
+         if is_plain_text(source):
+             return DocumentType.TXT
+         if isinstance(source, str):
+             # detect file type from path extension
+             if source.lower().endswith(".pdf"):
+                 return DocumentType.PDF
+             elif source.lower().endswith(".docx"):
+                 return DocumentType.DOCX
+             elif source.lower().endswith(".doc"):
+                 return DocumentType.DOC
+             else:
+                 raise ValueError(f"Unsupported document type: {source}")
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             # must be bytes: attempt to detect type from content
+             # using magic mime type detection
+             import magic
+
+             mime_type = magic.from_buffer(source, mime=True)
+             if mime_type == "application/pdf":
+                 return DocumentType.PDF
+             elif mime_type in [
+                 "application/vnd.openxmlformats-officedocument"
+                 ".wordprocessingml.document",
+                 "application/zip",
+             ]:
+                 # DOCX files are essentially ZIP files,
+                 # but this might catch other ZIP-based formats too!
+                 return DocumentType.DOCX
+             elif mime_type == "application/msword":
+                 return DocumentType.DOC
+             else:
+                 raise ValueError("Unsupported document type from bytes")

      def _load_doc_as_bytesio(self) -> BytesIO:
          """
@@ -114,6 +189,61 @@ class DocumentParser(Parser):
              with open(self.source, "rb") as f:
                  return BytesIO(f.read())

+     @staticmethod
+     def chunks_from_path_or_bytes(
+         source: str | bytes,
+         parser: Parser,
+         doc_type: str | DocumentType | None = None,
+         lines: int | None = None,
+     ) -> List[Document]:
+         """
+         Get document chunks from a file path or bytes object.
+         Args:
+             source (str|bytes): The source, which could be a URL, path or bytes object.
+             parser (Parser): The parser instance (for splitting the document).
+             doc_type (str|DocumentType|None): The type of document, if known.
+             lines (int|None): The number of lines to read from a plain text file.
+         Returns:
+             List[Document]: A list of `Document` objects,
+                 each containing a chunk of text, determined by the
+                 chunking and splitting settings in the parser config.
+         """
+         dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+         if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+             doc_parser = DocumentParser.create(
+                 source,
+                 parser.config,
+                 doc_type=doc_type,
+             )
+             chunks = doc_parser.get_doc_chunks()
+             if len(chunks) == 0 and dtype == DocumentType.PDF:
+                 doc_parser = ImagePdfParser(source, parser.config)
+                 chunks = doc_parser.get_doc_chunks()
+             return chunks
+         else:
+             # try getting as plain text; these will be chunked downstream
+             # -- could be a bytes object or a path
+             if isinstance(source, bytes):
+                 content = source.decode()
+                 if lines is not None:
+                     file_lines = content.splitlines()[:lines]
+                     content = "\n".join(line.strip() for line in file_lines)
+             else:
+                 with open(source, "r") as f:
+                     if lines is not None:
+                         file_lines = list(itertools.islice(f, lines))
+                         content = "\n".join(line.strip() for line in file_lines)
+                     else:
+                         content = f.read()
+             soup = BeautifulSoup(content, "html.parser")
+             text = soup.get_text()
+             source_name = source if isinstance(source, str) else "bytes"
+             doc = Document(
+                 content=text,
+                 metadata=DocMetaData(source=str(source_name)),
+             )
+             return parser.split([doc])
+

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
          """Yield each page in the PDF."""
@@ -138,7 +268,7 @@ class DocumentParser(Parser):

      def get_doc(self) -> Document:
          """
-         Get entire text from pdf source as a single document.
+         Get entire text from source as a single document.

          Returns:
              a `Document` object containing the content of the pdf file,
@@ -294,50 +424,34 @@ class PDFPlumberParser(DocumentParser):
          return self.fix_text(page.extract_text())


- class HaystackPDFParser(DocumentParser):
+ class ImagePdfParser(DocumentParser):
      """
-     Parser for processing PDFs using the `haystack` library.
+     Parser for processing PDFs that are images, i.e. not "true" PDFs.
      """

-     def get_doc_chunks(self) -> List[Document]:
-         """
-         Overrides the base class method to use the `haystack` library.
-         See there for more details.
+     def iterate_pages(
+         self,
+     ) -> Generator[Tuple[int, Image], None, None]:
+         from pdf2image import convert_from_bytes
+
+         images = convert_from_bytes(self.doc_bytes.getvalue())
+         for i, image in enumerate(images):
+             yield i, image
+
+     def extract_text_from_page(self, page: Image) -> str:
          """
+         Extract text from a given `pdf2image` page.

-         from haystack.nodes import PDFToTextConverter, PreProcessor
+         Args:
+             page (Image): The PIL Image object.

-         converter = PDFToTextConverter(
-             remove_numeric_tables=True,
-         )
-         path = self.source
-         if path.startswith(("http://", "https://")):
-             path = url_to_tempfile(path)
-         doc = converter.convert(file_path=path, meta=None)
-         # note self.config.chunk_size is in token units,
-         # and we use an approximation of 75 words per 100 tokens
-         # to convert to word units
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=False,
-             split_by="word",
-             split_length=int(0.75 * self.config.chunk_size),
-             split_overlap=int(0.75 * self.config.overlap),
-             split_respect_sentence_boundary=True,
-             add_page_number=True,
-         )
-         chunks = preprocessor.process(doc)
-         return [
-             Document(
-                 content=chunk.content,
-                 metadata=DocMetaData(
-                     source=f"{self.source} page {chunk.meta['page']}",
-                     is_chunk=True,
-                 ),
-             )
-             for chunk in chunks
-         ]
+         Returns:
+             str: Extracted text from the image.
+         """
+         import pytesseract
+
+         text = pytesseract.image_to_string(page)
+         return self.fix_text(text)


  class UnstructuredPDFParser(DocumentParser):
@@ -346,7 +460,17 @@ class UnstructuredPDFParser(DocumentParser):
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.pdf import partition_pdf
+         try:
+             from unstructured.partition.pdf import partition_pdf
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          # from unstructured.chunking.title import chunk_by_title

@@ -360,7 +484,7 @@ class UnstructuredPDFParser(DocumentParser):
                  Please try a different library by setting the `library` field
                  in the `pdf` section of the `parsing` field in the config file.
                  Supported libraries are:
-                 fitz, pypdf, pdfplumber, unstructured, haystack
+                 fitz, pypdf, pdfplumber, unstructured
                  """
              )

@@ -399,7 +523,17 @@ class UnstructuredDocxParser(DocumentParser):
      """

      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
-         from unstructured.partition.docx import partition_docx
+         try:
+             from unstructured.partition.docx import partition_docx
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )

          elements = partition_docx(file=self.doc_bytes, include_page_breaks=True)

@@ -436,3 +570,65 @@ class UnstructuredDocxParser(DocumentParser):
          """
          text = " ".join(el.text for el in page)
          return self.fix_text(text)
+
+
+ class UnstructuredDocParser(UnstructuredDocxParser):
+     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:  # type: ignore
+         try:
+             from unstructured.partition.doc import partition_doc
+         except ImportError:
+             raise ImportError(
+                 """
+                 The `unstructured` library is not installed by default with langroid.
+                 To include this library, please install langroid with the
+                 `unstructured` extra by running `pip install "langroid[unstructured]"`
+                 or equivalent.
+                 """
+             )
+
+         elements = partition_doc(filename=self.source, include_page_breaks=True)
+
+         page_number = 1
+         page_elements = []  # type: ignore
+         for el in elements:
+             if el.category == "PageBreak":
+                 if page_elements:  # Avoid yielding empty pages at the start
+                     yield page_number, page_elements
+                 page_number += 1
+                 page_elements = []
+             else:
+                 page_elements.append(el)
+         # Yield the last page if it's not empty
+         if page_elements:
+             yield page_number, page_elements
+
+
+ class PythonDocxParser(DocumentParser):
+     """
+     Parser for processing DOCX files using the `python-docx` library.
+     """
+
+     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+         """
+         Simulate iterating through pages.
+         In a DOCX file, pages are not explicitly defined,
+         so we consider each paragraph as a separate 'page' for simplicity.
+         """
+         import docx
+
+         doc = docx.Document(self.doc_bytes)
+         for i, para in enumerate(doc.paragraphs, start=1):
+             yield i, [para]
+
+     def extract_text_from_page(self, page: Any) -> str:
+         """
+         Extract text from a given 'page', which in this case is a single paragraph.
+
+         Args:
+             page (list): A list containing a single Paragraph object.
+
+         Returns:
+             str: Extracted text from the paragraph.
+         """
+         paragraph = page[0]
+         return self.fix_text(paragraph.text)
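For orientation, here is a minimal usage sketch of the parsing entry points introduced above (DocumentParser.create and the new chunks_from_path_or_bytes static method). The file path and config values are illustrative assumptions, not part of this diff; the imports follow the module paths shown in the file list.

    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig

    # hypothetical config: use pdfplumber for "true" PDFs
    config = ParsingConfig(pdf=PdfParsingConfig(library="pdfplumber"))
    parser = Parser(config)

    # from a path or URL; the type is inferred from the extension
    chunks = DocumentParser.chunks_from_path_or_bytes("report.pdf", parser)

    # from raw bytes; the type is detected via MIME sniffing, with an OCR
    # fallback (ImagePdfParser) when a PDF yields no extractable text
    with open("report.pdf", "rb") as f:
        chunks = DocumentParser.chunks_from_path_or_bytes(f.read(), parser)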
langroid/parsing/image_text.py
@@ -0,0 +1,32 @@
+ from typing import Union
+
+ import pytesseract
+ from pdf2image import convert_from_bytes, convert_from_path
+
+
+ def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
+     """
+     Converts a PDF that contains images to text using OCR.
+
+     Args:
+         input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
+             of the PDF content.
+
+     Returns:
+         str: The extracted text from the PDF.
+     """
+
+     # Check if the input is a file path (str) or bytes, and
+     # convert PDF to images accordingly
+     if isinstance(input_data, str):
+         images = convert_from_path(input_data)
+     elif isinstance(input_data, bytes):
+         images = convert_from_bytes(input_data)
+     else:
+         raise ValueError("input_data must be a file path (str) or bytes-like object")
+
+     text = ""
+     for image in images:
+         text += pytesseract.image_to_string(image)
+
+     return text
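A one-line sketch of calling the new OCR helper; the path is a placeholder, and pytesseract/pdf2image must be installed for it to run.

    from langroid.parsing.image_text import pdf_image_to_text

    # hypothetical scanned-PDF path; a bytes object is also accepted
    text = pdf_image_to_text("scanned.pdf")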
langroid/parsing/parse_json.py
@@ -0,0 +1,143 @@
+ import json
+ from typing import Any, Iterator, List
+
+ import yaml
+ from pyparsing import nestedExpr, originalTextFor
+
+
+ def is_valid_json(json_str: str) -> bool:
+     """Check if the input string is a valid JSON.
+
+     Args:
+         json_str (str): The input string to check.
+
+     Returns:
+         bool: True if the input string is a valid JSON, False otherwise.
+     """
+     try:
+         json.loads(json_str)
+         return True
+     except ValueError:
+         return False
+
+
+ def flatten(nested_list) -> Iterator[str]:  # type: ignore
+     """Flatten a nested list into a single list of strings"""
+     for item in nested_list:
+         if isinstance(item, (list, tuple)):
+             for subitem in flatten(item):
+                 yield subitem
+         else:
+             yield item
+
+
+ def get_json_candidates(s: str) -> List[str]:
+     """Get top-level JSON candidates, i.e. strings between curly braces."""
+     # Define the grammar for matching curly braces
+     curly_braces = originalTextFor(nestedExpr("{", "}"))
+
+     # Parse the string
+     try:
+         results = curly_braces.searchString(s)
+         # Properly convert nested lists to strings
+         return [r[0] for r in results]
+     except Exception:
+         return []
+
+
+ def add_quotes(s: str) -> str:
+     """
+     Replace accidentally un-quoted string-like keys and values in a potential json str.
+     Intended to handle cases where a weak LLM may produce a JSON-like string
+     containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
+     or city: "New York" where it "forgot" to put quotes on the key.
+     It will even handle cases like 'address: do not know'.
+
+     Got this fiendishly clever solution from
+     https://stackoverflow.com/a/66053900/10940584
+     Far better/safer than trying to do it with regexes.
+
+     Args:
+     - s (str): The potential JSON string to parse.
+
+     Returns:
+     - str: The (potential) JSON string with un-quoted string-like values
+         replaced by quoted values.
+     """
+     if is_valid_json(s):
+         return s
+     try:
+         dct = yaml.load(s, yaml.SafeLoader)
+         return json.dumps(dct)
+     except Exception:
+         return s
+
+
+ def repair_newlines(s: str) -> str:
+     """
+     Attempt to load as json, and if it fails, try with newlines replaced by space.
+     Intended to handle cases where weak LLMs produce JSON-like strings where
+     some string-values contain explicit newlines, e.g.:
+     {"text": "This is a text\n with a newline"}
+     These would not be valid JSON, so we try to clean them up here.
+     """
+     try:
+         json.loads(s)
+         return s
+     except Exception:
+         try:
+             s = s.replace("\n", " ")
+             json.loads(s)
+             return s
+         except Exception:
+             return s
+
+
+ def extract_top_level_json(s: str) -> List[str]:
+     """Extract all top-level JSON-formatted substrings from a given string.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+
+     Returns:
+         List[str]: A list of top-level JSON-formatted substrings.
+     """
+     # Find JSON object and array candidates
+     json_candidates = get_json_candidates(s)
+
+     normalized_candidates = [
+         candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
+         for candidate in json_candidates
+     ]
+     candidates = [add_quotes(candidate) for candidate in normalized_candidates]
+     candidates = [repair_newlines(candidate) for candidate in candidates]
+     top_level_jsons = [
+         candidate for candidate in candidates if is_valid_json(candidate)
+     ]
+
+     return top_level_jsons
+
+
+ def top_level_json_field(s: str, f: str) -> Any:
+     """
+     Extract the value of a field f from a top-level JSON object.
+     If there are multiple, just return the first.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+         f (str): The field to extract from the JSON object.
+
+     Returns:
+         str: The value of the field f in the top-level JSON object, if any.
+             Otherwise, return an empty string.
+     """
+
+     jsons = extract_top_level_json(s)
+     if len(jsons) == 0:
+         return ""
+     for j in jsons:
+         json_data = json.loads(j)
+         if f in json_data:
+             return json_data[f]
+
+     return ""
langroid/parsing/parser.py
@@ -1,6 +1,6 @@
  import logging
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Literal

  import tiktoken
  from pydantic import BaseSettings
@@ -19,11 +19,21 @@ class Splitter(str, Enum):


  class PdfParsingConfig(BaseSettings):
-     library: str = "pdfplumber"
+     library: Literal[
+         "fitz",
+         "pdfplumber",
+         "pypdf",
+         "unstructured",
+         "pdf2image",
+     ] = "pdfplumber"


  class DocxParsingConfig(BaseSettings):
-     library: str = "unstructured"
+     library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+ class DocParsingConfig(BaseSettings):
+     library: Literal["unstructured"] = "unstructured"


  class ParsingConfig(BaseSettings):
@@ -40,6 +50,7 @@ class ParsingConfig(BaseSettings):
      token_encoding_model: str = "text-embedding-ada-002"
      pdf: PdfParsingConfig = PdfParsingConfig()
      docx: DocxParsingConfig = DocxParsingConfig()
+     doc: DocParsingConfig = DocParsingConfig()


  class Parser:
@@ -55,6 +66,10 @@ class Parser:
          """Chunks may belong to multiple docs, but for each doc,
          they appear consecutively. Add window_ids in metadata"""

+         # discard empty chunks
+         chunks = [c for c in chunks if c.content.strip() != ""]
+         if len(chunks) == 0:
+             return
          # The original metadata.id (if any) is ignored since it will be same for all
          # chunks and is useless. We want a distinct id for each chunk.
          orig_ids = [c.metadata.id for c in chunks]
@@ -65,8 +80,8 @@ class Parser:
          orig_id_to_ids: Dict[str, List[str]] = {}
          for orig_id, id in zip(orig_ids, ids):
              if orig_id not in orig_id_to_ids:
-                 orig_id_to_ids[orig_id] = []  # type: ignore
-             orig_id_to_ids[orig_id].append(id)  # type: ignore
+                 orig_id_to_ids[orig_id] = []
+             orig_id_to_ids[orig_id].append(id)

          # now each orig_id maps to a sequence of ids within a single doc

@@ -77,8 +92,6 @@ class Parser:
          window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
          for i, _ in enumerate(ids):
              c = id2chunk[ids[i]]
-             if c.content.strip() == "":
-                 continue
              c.metadata.window_ids = window_ids[i]
              c.metadata.id = ids[i]
              c.metadata.is_chunk = True