langroid 0.41.4__py3-none-any.whl → 0.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,6 +64,7 @@ class GeminiModel(ModelName):
64
64
  GEMINI_1_5_FLASH_8B = "gemini/gemini-1.5-flash-8b"
65
65
  GEMINI_1_5_PRO = "gemini/gemini-1.5-pro"
66
66
  GEMINI_2_FLASH = "gemini/gemini-2.0-flash"
67
+ GEMINI_2_FLASH_LITE = "gemini/gemini-2.0-flash-lite-preview"
67
68
  GEMINI_2_FLASH_THINKING = "gemini/gemini-2.0-flash-thinking-exp"
68
69
 
69
70
 
@@ -282,9 +283,21 @@ MODEL_INFO: Dict[str, ModelInfo] = {
282
283
  provider=ModelProvider.GOOGLE,
283
284
  context_length=1_056_768,
284
285
  max_output_tokens=8192,
286
+ input_cost_per_million=0.10,
287
+ output_cost_per_million=0.40,
285
288
  rename_params={"max_tokens": "max_completion_tokens"},
286
289
  description="Gemini 2.0 Flash",
287
290
  ),
291
+ GeminiModel.GEMINI_2_FLASH_LITE.value: ModelInfo(
292
+ name=GeminiModel.GEMINI_2_FLASH_LITE.value,
293
+ provider=ModelProvider.GOOGLE,
294
+ context_length=1_056_768,
295
+ max_output_tokens=8192,
296
+ input_cost_per_million=0.075,
297
+ output_cost_per_million=0.30,
298
+ rename_params={"max_tokens": "max_completion_tokens"},
299
+ description="Gemini 2.0 Flash Lite Preview",
300
+ ),
288
301
  GeminiModel.GEMINI_1_5_FLASH.value: ModelInfo(
289
302
  name=GeminiModel.GEMINI_1_5_FLASH.value,
290
303
  provider=ModelProvider.GOOGLE,
@@ -14,6 +14,9 @@ from . import web_search
14
14
 
15
15
  from .parser import (
16
16
  Splitter,
17
+ MarkitdownXLSParsingConfig,
18
+ MarkitdownXLSXParsingConfig,
19
+ MarkitdownPPTXParsingConfig,
17
20
  PdfParsingConfig,
18
21
  DocxParsingConfig,
19
22
  DocParsingConfig,
@@ -40,6 +43,9 @@ __all__ = [
40
43
  "DocxParsingConfig",
41
44
  "DocParsingConfig",
42
45
  "ParsingConfig",
46
+ "MarkitdownXLSXParsingConfig",
47
+ "MarkitdownXLSParsingConfig",
48
+ "MarkitdownPPTXParsingConfig",
43
49
  "Parser",
44
50
  ]
45
51
 
@@ -56,6 +56,9 @@ class DocumentType(str, Enum):
56
56
  DOCX = "docx"
57
57
  DOC = "doc"
58
58
  TXT = "txt"
59
+ XLSX = "xlsx"
60
+ XLS = "xls"
61
+ PPTX = "pptx"
59
62
 
60
63
 
61
64
  def find_last_full_char(possible_unicode: bytes) -> int:
@@ -175,6 +178,12 @@ class DocumentParser(Parser):
175
178
  )
176
179
  elif inferred_doc_type == DocumentType.DOC:
177
180
  return UnstructuredDocParser(source, config)
181
+ elif inferred_doc_type == DocumentType.XLS:
182
+ return MarkitdownXLSXParser(source, config)
183
+ elif inferred_doc_type == DocumentType.XLSX:
184
+ return MarkitdownXLSXParser(source, config)
185
+ elif inferred_doc_type == DocumentType.PPTX:
186
+ return MarkitdownPPTXParser(source, config)
178
187
  else:
179
188
  source_name = source if isinstance(source, str) else "bytes"
180
189
  raise ValueError(f"Unsupported document type: {source_name}")
@@ -223,6 +232,12 @@ class DocumentParser(Parser):
223
232
  return DocumentType.DOCX
224
233
  elif source.lower().endswith(".doc"):
225
234
  return DocumentType.DOC
235
+ elif source.lower().endswith(".xlsx"):
236
+ return DocumentType.XLSX
237
+ elif source.lower().endswith(".xls"):
238
+ return DocumentType.XLS
239
+ elif source.lower().endswith(".pptx"):
240
+ return DocumentType.PPTX
226
241
  else:
227
242
  raise ValueError(f"Unsupported document type: {source}")
228
243
  else:
@@ -236,13 +251,17 @@ class DocumentParser(Parser):
236
251
  elif mime_type in [
237
252
  "application/vnd.openxmlformats-officedocument"
238
253
  ".wordprocessingml.document",
239
- "application/zip",
240
254
  ]:
241
- # DOCX files are essentially ZIP files,
242
- # but this might catch other ZIP-based formats too!
243
255
  return DocumentType.DOCX
244
256
  elif mime_type == "application/msword":
245
257
  return DocumentType.DOC
258
+ elif (
259
+ mime_type
260
+ == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
261
+ ):
262
+ return DocumentType.XLSX
263
+ elif mime_type == "application/vnd.ms-excel":
264
+ return DocumentType.XLS
246
265
  else:
247
266
  raise ValueError("Unsupported document type from bytes")
248
267
 
@@ -281,7 +300,14 @@ class DocumentParser(Parser):
281
300
  chunking and splitting settings in the parser config.
282
301
  """
283
302
  dtype: DocumentType = DocumentParser._document_type(source, doc_type)
284
- if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
303
+ if dtype in [
304
+ DocumentType.PDF,
305
+ DocumentType.DOC,
306
+ DocumentType.DOCX,
307
+ DocumentType.PPTX,
308
+ DocumentType.XLS,
309
+ DocumentType.XLSX,
310
+ ]:
285
311
  doc_parser = DocumentParser.create(
286
312
  source,
287
313
  parser.config,
@@ -857,3 +883,72 @@ class PythonDocxParser(DocumentParser):
857
883
  content=self.fix_text(paragraph.text),
858
884
  metadata=DocMetaData(source=self.source),
859
885
  )
886
+
887
+
888
class MarkitdownXLSXParser(DocumentParser):
    """
    Parser for Excel workbooks that converts them to markdown via `markitdown`.

    `DocumentParser.create` returns this parser for both the XLS and XLSX
    document types.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Convert the workbook to markdown and yield it in sheet-sized chunks.

        Yields:
            Tuple[int, Any]: (index, markdown text) for each chunk obtained by
                splitting the converted text just before every "## Sheet<N>"
                header.

        Raises:
            LangroidImportError: if the `markitdown` package is not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The exception must actually be raised; merely constructing it
            # lets execution fall through to a NameError on `MarkItDown`.
            raise LangroidImportError("markitdown", "doc-parsers")
        import os

        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start

        # Save stream to a temp file since md.convert() expects a path or URL
        # Temporary workaround until markitdown fixes convert_stream function
        # for xls and xlsx files
        # See issue here https://github.com/microsoft/markitdown/issues/321
        #
        # The file is created with delete=False and closed before conversion:
        # a NamedTemporaryFile that is still open cannot be reopened by name
        # on Windows. It is removed explicitly once conversion is done.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
        try:
            with temp_file:
                temp_file.write(self.doc_bytes.read())
            result = md.convert(temp_file.name)
        finally:
            os.unlink(temp_file.name)

        # The lookahead keeps each header attached to its own chunk; whatever
        # precedes the first header (possibly an empty string) is chunk 0.
        sheets = re.split(r"(?=## Sheet\d+)", result.text_content)

        for i, sheet in enumerate(sheets):
            yield i, sheet

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given 1-page markdown string.

        Args:
            md_content (str): The markdown content for the page.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
925
+
926
+
927
class MarkitdownPPTXParser(DocumentParser):
    """
    Parser for PPTX presentations that converts them to markdown via
    `markitdown`.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Convert the presentation to markdown and yield it in slide-sized chunks.

        Yields:
            Tuple[int, Any]: (index, markdown text) for each chunk obtained by
                splitting the converted text just before every
                "<!-- Slide number: N -->" marker.

        Raises:
            LangroidImportError: if the `markitdown` package is not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The exception must actually be raised; merely constructing it
            # lets execution fall through to a NameError on `MarkItDown`.
            raise LangroidImportError("markitdown", "doc-parsers")

        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start before handing over the stream
        result = md.convert_stream(self.doc_bytes, file_extension=".pptx")

        # The lookahead keeps each slide marker attached to its own chunk;
        # whatever precedes the first marker (possibly an empty string) is
        # chunk 0.
        slides = re.split(r"(?=<!-- Slide number: \d+ -->)", result.text_content)
        for i, slide in enumerate(slides):
            yield i, slide

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given 1-page markdown string.

        Args:
            md_content (str): The markdown content for the page.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
@@ -28,6 +28,7 @@ class PdfParsingConfig(BaseSettings):
28
28
  "pypdf",
29
29
  "unstructured",
30
30
  "pdf2image",
31
+ "markitdown",
31
32
  ] = "pymupdf4llm"
32
33
 
33
34
 
@@ -39,6 +40,18 @@ class DocParsingConfig(BaseSettings):
39
40
  library: Literal["unstructured"] = "unstructured"
40
41
 
41
42
 
43
class MarkitdownPPTXParsingConfig(BaseSettings):
    """Parsing settings for PPTX files; `markitdown` is the only library."""

    library: Literal["markitdown"] = "markitdown"
45
+
46
+
47
class MarkitdownXLSXParsingConfig(BaseSettings):
    """Parsing settings for XLSX files; `markitdown` is the only library."""

    library: Literal["markitdown"] = "markitdown"
49
+
50
+
51
class MarkitdownXLSParsingConfig(BaseSettings):
    """Parsing settings for XLS files; `markitdown` is the only library."""

    library: Literal["markitdown"] = "markitdown"
53
+
54
+
42
55
  class ParsingConfig(BaseSettings):
43
56
  splitter: str = Splitter.TOKENS
44
57
  chunk_by_page: bool = False # split by page?
@@ -55,6 +68,9 @@ class ParsingConfig(BaseSettings):
55
68
  pdf: PdfParsingConfig = PdfParsingConfig()
56
69
  docx: DocxParsingConfig = DocxParsingConfig()
57
70
  doc: DocParsingConfig = DocParsingConfig()
71
+ pptx: MarkitdownPPTXParsingConfig = MarkitdownPPTXParsingConfig()
72
+ xls: MarkitdownXLSParsingConfig = MarkitdownXLSParsingConfig()
73
+ xlsx: MarkitdownXLSXParsingConfig = MarkitdownXLSXParsingConfig()
58
74
 
59
75
 
60
76
  class Parser:
@@ -45,6 +45,9 @@ class WeaviateDBConfig(VectorStoreConfig):
45
45
  embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
46
46
  distance: str = VectorDistances.COSINE
47
47
  cloud: bool = False
48
+ docker: bool = False
49
+ host: str = "127.0.0.1"
50
+ port: int = 8080
48
51
  storage_path: str = ".weaviate_embedded/data"
49
52
 
50
53
 
@@ -55,11 +58,13 @@ class WeaviateDB(VectorStore):
55
58
  raise LangroidImportError("weaviate", "weaviate")
56
59
  self.config: WeaviateDBConfig = config
57
60
  load_dotenv()
58
- if not self.config.cloud:
59
- self.client = weaviate.connect_to_embedded(
60
- version="latest", persistence_data_path=self.config.storage_path
61
+ if self.config.docker:
62
+ self.client = weaviate.connect_to_local(
63
+ host=self.config.host,
64
+ port=self.config.port,
61
65
  )
62
- else: # Cloud mode
66
+ self.config.cloud = False
67
+ elif self.config.cloud:
63
68
  key = os.getenv("WEAVIATE_API_KEY")
64
69
  url = os.getenv("WEAVIATE_API_URL")
65
70
  if url is None or key is None:
@@ -73,6 +78,10 @@ class WeaviateDB(VectorStore):
73
78
  cluster_url=url,
74
79
  auth_credentials=Auth.api_key(key),
75
80
  )
81
+ else:
82
+ self.client = weaviate.connect_to_embedded(
83
+ version="latest", persistence_data_path=self.config.storage_path
84
+ )
76
85
 
77
86
  if config.collection_name is not None:
78
87
  WeaviateDB.validate_and_format_collection_name(config.collection_name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.41.4
3
+ Version: 0.42.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -104,6 +104,12 @@ Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
104
104
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
105
105
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
106
106
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
107
+ Provides-Extra: doc-parsers
108
+ Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
109
+ Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
110
+ Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
111
+ Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
112
+ Requires-Dist: xlrd>=2.0.1; extra == 'doc-parsers'
107
113
  Provides-Extra: docling
108
114
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
109
115
  Provides-Extra: docx
@@ -142,6 +148,7 @@ Provides-Extra: neo4j
142
148
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
143
149
  Provides-Extra: pdf-parsers
144
150
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
151
+ Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
145
152
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
146
153
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
147
154
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
@@ -71,20 +71,20 @@ langroid/language_models/azure_openai.py,sha256=zNQzzsERxNestq-hFfQZbvTzK43G2vjR
71
71
  langroid/language_models/base.py,sha256=is4l3x858tdPHbrJU2jxJXe2j9PCGb9kk_c5nyfShxs,26150
72
72
  langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
73
73
  langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
74
- langroid/language_models/model_info.py,sha256=yKAaKoCanPoqaoHCzRVNPjg-M9a4S2Vm2AJGnwMeO-M,11360
74
+ langroid/language_models/model_info.py,sha256=GEIyfypSzuev6ZG81-nb8OhvSxH4PHQ_m5UhBAQ8kSA,11910
75
75
  langroid/language_models/openai_gpt.py,sha256=aajZ3ZvGkwI-3QdsNWgJ4QSyGpnyXJ5n4p2fYGUmdo4,77317
76
76
  langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
77
77
  langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
78
78
  langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
79
79
  langroid/language_models/prompt_formatter/hf_formatter.py,sha256=PVJppmjRvD-2DF-XNC6mE05vTZ9wbu37SmXwZBQhad0,5055
80
80
  langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
81
- langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
81
+ langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
82
82
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
83
83
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
84
- langroid/parsing/document_parser.py,sha256=WSdNAiFDMVDS7wIF6XNIkRbE2BFLr1YYtgsitWkb4xM,30233
84
+ langroid/parsing/document_parser.py,sha256=NKmN_HjwNdfUjTbXhpyK_Wjay3QYEA26ZnewmbO6moA,33632
85
85
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
86
86
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
87
- langroid/parsing/parser.py,sha256=pPzM3zXQvFtwTyQPtDha15oZhu1O3OKDLECnkB8waxg,12276
87
+ langroid/parsing/parser.py,sha256=ydouAGlb8XOFCkfSgh_w49d2qPhCcrYb3MrNFRr73Jg,12807
88
88
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
89
89
  langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
90
90
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -127,8 +127,8 @@ langroid/vector_store/momento.py,sha256=xOaU7Hlyyn_5ihb0ARS5JHtmrKrTCt2IdRA-ioMM
127
127
  langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZXpEY9M,14994
128
128
  langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
129
129
  langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
130
- langroid/vector_store/weaviatedb.py,sha256=ONEr2iGS0Ii73oMe7tRk6bB-BEXQUa70fYSrdI8d3yo,11481
131
- langroid-0.41.4.dist-info/METADATA,sha256=hU062qh537keZYiNQZhdKWffOldsn_lz5BWYew1TQbg,61331
132
- langroid-0.41.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
133
- langroid-0.41.4.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
134
- langroid-0.41.4.dist-info/RECORD,,
130
+ langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
131
+ langroid-0.42.0.dist-info/METADATA,sha256=83GY_mPN5raNf0cSz556IErJWP6nreriBWkbf514jV8,61699
132
+ langroid-0.42.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
133
+ langroid-0.42.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
134
+ langroid-0.42.0.dist-info/RECORD,,