langroid 0.41.5__py3-none-any.whl → 0.42.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/parsing/__init__.py +6 -0
- langroid/parsing/document_parser.py +99 -4
- langroid/parsing/parser.py +17 -1
- {langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/METADATA +8 -1
- {langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/RECORD +7 -7
- {langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/WHEEL +0 -0
- {langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/__init__.py
CHANGED
@@ -14,6 +14,9 @@ from . import web_search
|
|
14
14
|
|
15
15
|
from .parser import (
|
16
16
|
Splitter,
|
17
|
+
MarkitdownXLSParsingConfig,
|
18
|
+
MarkitdownXLSXParsingConfig,
|
19
|
+
MarkitdownPPTXParsingConfig,
|
17
20
|
PdfParsingConfig,
|
18
21
|
DocxParsingConfig,
|
19
22
|
DocParsingConfig,
|
@@ -40,6 +43,9 @@ __all__ = [
|
|
40
43
|
"DocxParsingConfig",
|
41
44
|
"DocParsingConfig",
|
42
45
|
"ParsingConfig",
|
46
|
+
"MarkitdownXLSXParsingConfig",
|
47
|
+
"MarkitdownXLSParsingConfig",
|
48
|
+
"MarkitdownPPTXParsingConfig",
|
43
49
|
"Parser",
|
44
50
|
]
|
45
51
|
|
langroid/parsing/document_parser.py
CHANGED
@@ -56,6 +56,9 @@ class DocumentType(str, Enum):
|
|
56
56
|
DOCX = "docx"
|
57
57
|
DOC = "doc"
|
58
58
|
TXT = "txt"
|
59
|
+
XLSX = "xlsx"
|
60
|
+
XLS = "xls"
|
61
|
+
PPTX = "pptx"
|
59
62
|
|
60
63
|
|
61
64
|
def find_last_full_char(possible_unicode: bytes) -> int:
|
@@ -175,6 +178,12 @@ class DocumentParser(Parser):
|
|
175
178
|
)
|
176
179
|
elif inferred_doc_type == DocumentType.DOC:
|
177
180
|
return UnstructuredDocParser(source, config)
|
181
|
+
elif inferred_doc_type == DocumentType.XLS:
|
182
|
+
return MarkitdownXLSXParser(source, config)
|
183
|
+
elif inferred_doc_type == DocumentType.XLSX:
|
184
|
+
return MarkitdownXLSXParser(source, config)
|
185
|
+
elif inferred_doc_type == DocumentType.PPTX:
|
186
|
+
return MarkitdownPPTXParser(source, config)
|
178
187
|
else:
|
179
188
|
source_name = source if isinstance(source, str) else "bytes"
|
180
189
|
raise ValueError(f"Unsupported document type: {source_name}")
|
@@ -223,6 +232,12 @@ class DocumentParser(Parser):
|
|
223
232
|
return DocumentType.DOCX
|
224
233
|
elif source.lower().endswith(".doc"):
|
225
234
|
return DocumentType.DOC
|
235
|
+
elif source.lower().endswith(".xlsx"):
|
236
|
+
return DocumentType.XLSX
|
237
|
+
elif source.lower().endswith(".xls"):
|
238
|
+
return DocumentType.XLS
|
239
|
+
elif source.lower().endswith(".pptx"):
|
240
|
+
return DocumentType.PPTX
|
226
241
|
else:
|
227
242
|
raise ValueError(f"Unsupported document type: {source}")
|
228
243
|
else:
|
@@ -236,13 +251,17 @@ class DocumentParser(Parser):
|
|
236
251
|
elif mime_type in [
|
237
252
|
"application/vnd.openxmlformats-officedocument"
|
238
253
|
".wordprocessingml.document",
|
239
|
-
"application/zip",
|
240
254
|
]:
|
241
|
-
# DOCX files are essentially ZIP files,
|
242
|
-
# but this might catch other ZIP-based formats too!
|
243
255
|
return DocumentType.DOCX
|
244
256
|
elif mime_type == "application/msword":
|
245
257
|
return DocumentType.DOC
|
258
|
+
elif (
|
259
|
+
mime_type
|
260
|
+
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
261
|
+
):
|
262
|
+
return DocumentType.XLSX
|
263
|
+
elif mime_type == "application/vnd.ms-excel":
|
264
|
+
return DocumentType.XLS
|
246
265
|
else:
|
247
266
|
raise ValueError("Unsupported document type from bytes")
|
248
267
|
|
@@ -281,7 +300,14 @@ class DocumentParser(Parser):
|
|
281
300
|
chunking and splitting settings in the parser config.
|
282
301
|
"""
|
283
302
|
dtype: DocumentType = DocumentParser._document_type(source, doc_type)
|
284
|
-
if dtype in [
|
303
|
+
if dtype in [
|
304
|
+
DocumentType.PDF,
|
305
|
+
DocumentType.DOC,
|
306
|
+
DocumentType.DOCX,
|
307
|
+
DocumentType.PPTX,
|
308
|
+
DocumentType.XLS,
|
309
|
+
DocumentType.XLSX,
|
310
|
+
]:
|
285
311
|
doc_parser = DocumentParser.create(
|
286
312
|
source,
|
287
313
|
parser.config,
|
@@ -857,3 +883,72 @@ class PythonDocxParser(DocumentParser):
|
|
857
883
|
content=self.fix_text(paragraph.text),
|
858
884
|
metadata=DocMetaData(source=self.source),
|
859
885
|
)
|
886
|
+
|
887
|
+
|
888
|
+
class MarkitdownXLSXParser(DocumentParser):
    """Convert an Excel workbook to markdown with `markitdown`, one chunk per sheet."""

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Yield the workbook's markdown, split just before each sheet heading.

        Yields:
            Tuple[int, Any]: `(index, markdown_chunk)` pairs, in document order.

        Raises:
            LangroidImportError: if the optional `markitdown` package is
                not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The error has to be raised: merely constructing it would fall
            # through to `MarkItDown()` below and fail with an unrelated
            # unbound-name error instead of the install hint.
            raise LangroidImportError("markitdown", "doc-parsers")
        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start

        # Save stream to a temp file since md.convert() expects a path or URL
        # Temporary workaround until markitdown fixes convert_stream function
        # for xls and xlsx files
        # See issue here https://github.com/microsoft/markitdown/issues/321
        # NOTE(review): the suffix is always ".xlsx", although
        # DocumentParser.create also routes DocumentType.XLS to this class --
        # confirm markitdown still picks the right converter for legacy .xls.
        with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as temp_file:
            temp_file.write(self.doc_bytes.read())
            temp_file.flush()  # Ensure data is written before reading
            # Must run inside the `with`: the file is deleted on exit.
            result = md.convert(temp_file.name)

        # Zero-width lookahead keeps each "## Sheet<N>" heading attached to
        # the chunk it introduces. Only headings of that exact form split the
        # text; if the text starts with such a heading, the first chunk is "".
        sheets = re.split(r"(?=## Sheet\d+)", result.text_content)

        yield from enumerate(sheets)

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given 1-page markdown string.

        Args:
            md_content (str): The markdown content for the page.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
|
925
|
+
|
926
|
+
|
927
|
+
class MarkitdownPPTXParser(DocumentParser):
    """Convert a PowerPoint deck to markdown with `markitdown`, one chunk per slide."""

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Yield the deck's markdown, split just before each slide marker.

        Yields:
            Tuple[int, Any]: `(index, markdown_chunk)` pairs, in document order.

        Raises:
            LangroidImportError: if the optional `markitdown` package is
                not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The error has to be raised: merely constructing it would fall
            # through to `MarkItDown()` below and fail with an unrelated
            # unbound-name error instead of the install hint.
            raise LangroidImportError("markitdown", "doc-parsers")

        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start
        result = md.convert_stream(self.doc_bytes, file_extension=".pptx")

        # Zero-width lookahead keeps each "<!-- Slide number: N -->" marker
        # attached to the chunk it introduces; if the text starts with a
        # marker, the first chunk is "".
        slides = re.split(r"(?=<!-- Slide number: \d+ -->)", result.text_content)

        yield from enumerate(slides)

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given 1-page markdown string.

        Args:
            md_content (str): The markdown content for the page.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
|
langroid/parsing/parser.py
CHANGED
@@ -28,6 +28,7 @@ class PdfParsingConfig(BaseSettings):
|
|
28
28
|
"pypdf",
|
29
29
|
"unstructured",
|
30
30
|
"pdf2image",
|
31
|
+
"markitdown",
|
31
32
|
] = "pymupdf4llm"
|
32
33
|
|
33
34
|
|
@@ -39,6 +40,18 @@ class DocParsingConfig(BaseSettings):
|
|
39
40
|
library: Literal["unstructured"] = "unstructured"
|
40
41
|
|
41
42
|
|
43
|
+
class MarkitdownPPTXParsingConfig(BaseSettings):
    """Settings for PPTX parsing; `markitdown` is the only accepted library."""

    library: Literal["markitdown"] = "markitdown"
|
45
|
+
|
46
|
+
|
47
|
+
class MarkitdownXLSXParsingConfig(BaseSettings):
    """Settings for XLSX parsing; `markitdown` is the only accepted library."""

    library: Literal["markitdown"] = "markitdown"
|
49
|
+
|
50
|
+
|
51
|
+
class MarkitdownXLSParsingConfig(BaseSettings):
    """Settings for XLS parsing; `markitdown` is the only accepted library."""

    library: Literal["markitdown"] = "markitdown"
|
53
|
+
|
54
|
+
|
42
55
|
class ParsingConfig(BaseSettings):
|
43
56
|
splitter: str = Splitter.TOKENS
|
44
57
|
chunk_by_page: bool = False # split by page?
|
@@ -55,6 +68,9 @@ class ParsingConfig(BaseSettings):
|
|
55
68
|
pdf: PdfParsingConfig = PdfParsingConfig()
|
56
69
|
docx: DocxParsingConfig = DocxParsingConfig()
|
57
70
|
doc: DocParsingConfig = DocParsingConfig()
|
71
|
+
pptx: MarkitdownPPTXParsingConfig = MarkitdownPPTXParsingConfig()
|
72
|
+
xls: MarkitdownXLSParsingConfig = MarkitdownXLSParsingConfig()
|
73
|
+
xlsx: MarkitdownXLSXParsingConfig = MarkitdownXLSXParsingConfig()
|
58
74
|
|
59
75
|
|
60
76
|
class Parser:
|
@@ -66,7 +82,7 @@ class Parser:
|
|
66
82
|
self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
|
67
83
|
|
68
84
|
def num_tokens(self, text: str) -> int:
|
69
|
-
tokens = self.tokenizer.encode(text)
|
85
|
+
tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
|
70
86
|
return len(tokens)
|
71
87
|
|
72
88
|
def truncate_tokens(self, text: str, max_tokens: int) -> str:
|
{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.41.5
|
3
|
+
Version: 0.42.1
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
Author-email: Prasad Chalasani <pchalasani@gmail.com>
|
6
6
|
License: MIT
|
@@ -104,6 +104,12 @@ Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
|
|
104
104
|
Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
|
105
105
|
Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
|
106
106
|
Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
|
107
|
+
Provides-Extra: doc-parsers
|
108
|
+
Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
|
109
|
+
Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
|
110
|
+
Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
|
111
|
+
Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
|
112
|
+
Requires-Dist: xlrd>=2.0.1; extra == 'doc-parsers'
|
107
113
|
Provides-Extra: docling
|
108
114
|
Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
|
109
115
|
Provides-Extra: docx
|
@@ -142,6 +148,7 @@ Provides-Extra: neo4j
|
|
142
148
|
Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
|
143
149
|
Provides-Extra: pdf-parsers
|
144
150
|
Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
|
151
|
+
Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
|
145
152
|
Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
|
146
153
|
Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
|
147
154
|
Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
|
{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/RECORD
CHANGED
@@ -78,13 +78,13 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8
|
|
78
78
|
langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
|
79
79
|
langroid/language_models/prompt_formatter/hf_formatter.py,sha256=PVJppmjRvD-2DF-XNC6mE05vTZ9wbu37SmXwZBQhad0,5055
|
80
80
|
langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
|
81
|
-
langroid/parsing/__init__.py,sha256=
|
81
|
+
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
|
82
82
|
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
|
83
83
|
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
|
84
|
-
langroid/parsing/document_parser.py,sha256=
|
84
|
+
langroid/parsing/document_parser.py,sha256=NKmN_HjwNdfUjTbXhpyK_Wjay3QYEA26ZnewmbO6moA,33632
|
85
85
|
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
|
86
86
|
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
|
87
|
-
langroid/parsing/parser.py,sha256=
|
87
|
+
langroid/parsing/parser.py,sha256=moJKI5Cn_Pxd7xbNrY220dqQu-0FeEWUI7ogeq63Kec,12842
|
88
88
|
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
|
89
89
|
langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
|
90
90
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
@@ -128,7 +128,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
|
|
128
128
|
langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
|
129
129
|
langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
|
130
130
|
langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
|
131
|
-
langroid-0.
|
132
|
-
langroid-0.
|
133
|
-
langroid-0.
|
134
|
-
langroid-0.
|
131
|
+
langroid-0.42.1.dist-info/METADATA,sha256=MASCH2qw35RSfqTiDsVpYMj1OfzMJpFBCJdNxJ7jDPA,61699
|
132
|
+
langroid-0.42.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
133
|
+
langroid-0.42.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
134
|
+
langroid-0.42.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|