langroid 0.1.159__py3-none-any.whl → 0.1.160__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,8 @@ class DocumentParser(Parser):
62
62
  elif DocumentParser._document_type(source) == DocumentType.DOCX:
63
63
  if config.docx.library == "unstructured":
64
64
  return UnstructuredDocxParser(source, config)
65
+ elif config.docx.library == "python-docx":
66
+ return PythonDocxParser(source, config)
65
67
  else:
66
68
  raise ValueError(
67
69
  f"Unsupported DOCX library specified: {config.docx.library}"
@@ -436,3 +438,34 @@ class UnstructuredDocxParser(DocumentParser):
436
438
  """
437
439
  text = " ".join(el.text for el in page)
438
440
  return self.fix_text(text)
441
+
442
+
443
class PythonDocxParser(DocumentParser):
    """
    DOCX parser backed by the `python-docx` library.

    DOCX files carry no explicit page boundaries, so this parser exposes
    every paragraph of the document as its own one-element 'page'.
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """
        Walk the document paragraph by paragraph, treating each as a 'page'.

        Yields:
            Tuple[int, Any]: the paragraph's position (counting from 1) and a
                single-element list wrapping that paragraph.
        """
        # Local import: `docx` is only loaded once iteration actually starts.
        import docx

        document = docx.Document(self.doc_bytes)
        numbered = enumerate(document.paragraphs, start=1)
        # Each paragraph is wrapped in a list so that a 'page' is always a
        # list, which is what extract_text_from_page indexes into.
        yield from ((number, [paragraph]) for number, paragraph in numbered)

    def extract_text_from_page(self, page: Any) -> str:
        """
        Return the cleaned-up text of one 'page' produced by `iterate_pages`.

        Args:
            page (list): A list whose first element is a paragraph object
                exposing a `text` attribute.

        Returns:
            str: The paragraph text after being passed through `fix_text`.
        """
        raw_text = page[0].text
        return self.fix_text(raw_text)
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from enum import Enum
3
- from typing import Dict, List
3
+ from typing import Dict, List, Literal
4
4
 
5
5
  import tiktoken
6
6
  from pydantic import BaseSettings
@@ -19,11 +19,13 @@ class Splitter(str, Enum):
19
19
 
20
20
 
21
21
  class PdfParsingConfig(BaseSettings):
22
- library: str = "pdfplumber"
22
+ library: Literal[
23
+ "fitz", "pdfplumber", "pypdf", "unstructured", "haystack"
24
+ ] = "pdfplumber"
23
25
 
24
26
 
25
27
  class DocxParsingConfig(BaseSettings):
26
- library: str = "unstructured"
28
+ library: Literal["python-docx", "unstructured"] = "unstructured"
27
29
 
28
30
 
29
31
  class ParsingConfig(BaseSettings):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langroid
3
- Version: 0.1.159
3
+ Version: 0.1.160
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  License: MIT
6
6
  Author: Prasad Chalasani
@@ -64,6 +64,7 @@ Requires-Dist: pytest-asyncio (>=0.21.1,<0.22.0)
64
64
  Requires-Dist: pytest-mysql (>=2.4.2,<3.0.0) ; extra == "mysql"
65
65
  Requires-Dist: pytest-postgresql (>=5.0.0,<6.0.0) ; extra == "postgres"
66
66
  Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
67
+ Requires-Dist: python-docx (>=1.1.0,<2.0.0)
67
68
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
68
69
  Requires-Dist: qdrant-client (>=1.7.0,<2.0.0)
69
70
  Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
@@ -59,10 +59,10 @@ langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulr
59
59
  langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
60
60
  langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
61
61
  langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
- langroid/parsing/document_parser.py,sha256=YC3IXQ9ErpBGBZh6Be9gfJWHcTwGTSMfNQMT5ARrj5g,14615
62
+ langroid/parsing/document_parser.py,sha256=Msv8acFzVDex-nKPNxyGOvTw4eNKswrSQluYOa1qfAE,15670
63
63
  langroid/parsing/json.py,sha256=KfIIma_6IurQ09WTUyBn3mbSK67QeXZ8eHGDxGlOsv0,2551
64
64
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
65
- langroid/parsing/parser.py,sha256=BwVJboobG71N08w5LC7Tu36LI4pEJoSgAdiBSLChWGY,10251
65
+ langroid/parsing/parser.py,sha256=0QDDfRrcO9jUwEj9WQiWi8ayVZ19MRC1xjwTLDrCKwg,10372
66
66
  langroid/parsing/repo_loader.py,sha256=hhMfQBBSo-HvsZDQEcgmk_idKQQAeDQ_MMPd38x2ACU,29338
67
67
  langroid/parsing/search.py,sha256=xmQdAdTIwZ0REEUeQVFlGZlqf7k8Poah7-ALuyW7Ov0,8440
68
68
  langroid/parsing/spider.py,sha256=w_mHR1B4KOmxsBLoVI8kMkMTEbwTzeK3ath9fOMJrTk,3043
@@ -103,7 +103,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
103
103
  langroid/vector_store/momento.py,sha256=j6Eo6oIDN2fe7lsBOlCXJn3uvvERHHTFL5QJfeREeOM,10044
104
104
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
105
105
  langroid/vector_store/qdrantdb.py,sha256=qt7Dye6rcgoe0551WzmOxRGIlJfL87D4MX7HdqxuEok,13393
106
- langroid-0.1.159.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
107
- langroid-0.1.159.dist-info/METADATA,sha256=-qIZRl58PZFOsLnKyeFFktfjVedKKQufuogeH6TN2qw,42701
108
- langroid-0.1.159.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
109
- langroid-0.1.159.dist-info/RECORD,,
106
+ langroid-0.1.160.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
107
+ langroid-0.1.160.dist-info/METADATA,sha256=C2tjQ0HKr3e3x0iWWa6vrbraLd9qgHZuslb1nrN5ERA,42745
108
+ langroid-0.1.160.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
109
+ langroid-0.1.160.dist-info/RECORD,,