langroid 0.37.0__py3-none-any.whl → 0.37.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,12 +3,12 @@ from __future__ import annotations
3
3
  import itertools
4
4
  import logging
5
5
  import re
6
- import tempfile
7
6
  from enum import Enum
8
7
  from io import BytesIO
9
8
  from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
10
9
 
11
10
  from langroid.exceptions import LangroidImportError
11
+ from langroid.parsing.pdf_utils import pdf_split_pages
12
12
  from langroid.utils.object_registry import ObjectRegistry
13
13
 
14
14
  try:
@@ -515,29 +515,23 @@ class DoclingParser(DocumentParser):
515
515
  raise LangroidImportError(
516
516
  "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
517
517
  )
518
- from docling.datamodel.document import TextItem # type: ignore
518
+
519
519
  from docling.document_converter import ( # type: ignore
520
520
  ConversionResult,
521
521
  DocumentConverter,
522
522
  )
523
+ from docling_core.types.doc import ImageRefMode # type: ignore
523
524
 
525
+ page_files, tmp_dir = pdf_split_pages(self.doc_bytes)
524
526
  converter = DocumentConverter()
525
- file_path = self.source
526
- if file_path == "bytes":
527
- with tempfile.NamedTemporaryFile(delete=False) as tmp:
528
- tmp.write(self.doc_bytes.getvalue())
529
- file_path = tmp.name
530
- result: ConversionResult = converter.convert(file_path)
531
- doc = result.document
532
- n_pages = doc.num_pages() # type: ignore
533
- for i in range(n_pages):
534
- texts = [
535
- item[0].text
536
- for item in doc.iterate_items(page_no=i + 1)
537
- if isinstance(item[0], TextItem)
538
- ]
539
- text = "\n".join(texts)
540
- yield i, text
527
+ for i, page_file in enumerate(page_files):
528
+ result: ConversionResult = converter.convert(page_file)
529
+ md_text = result.document.export_to_markdown(
530
+ image_mode=ImageRefMode.REFERENCED
531
+ )
532
+ yield i, md_text
533
+
534
+ tmp_dir.cleanup()
541
535
 
542
536
  def get_document_from_page(self, page: str) -> Document:
543
537
  """
@@ -0,0 +1,55 @@
1
+ import tempfile
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from tempfile import TemporaryDirectory
5
+ from typing import TYPE_CHECKING, Any, BinaryIO, List, Tuple, Union
6
+
7
+ try:
8
+ import pypdf
9
+ except ImportError:
10
+ if not TYPE_CHECKING:
11
+ pypdf = None
12
+
13
+ from langroid.exceptions import LangroidImportError
14
+
15
+ if pypdf is None:
16
+ raise LangroidImportError(
17
+ "pypdf", ["pypdf", "docling", "all", "pdf-parsers", "doc-chat"]
18
+ )
19
+ from pypdf import PdfReader, PdfWriter
20
+
21
+
22
+ def pdf_split_pages(
23
+ input_pdf: Union[str, Path, BytesIO, BinaryIO],
24
+ ) -> Tuple[List[Path], TemporaryDirectory[Any]]:
25
+ """Splits a PDF into individual pages in a temporary directory.
26
+
27
+ Args:
28
+ input_pdf: Input PDF file path or file-like object
29
+ max_workers: Maximum number of concurrent workers for parallel processing
30
+
31
+ Returns:
32
+ Tuple containing:
33
+ - List of paths to individual PDF pages
34
+ - Temporary directory object (caller must call cleanup())
35
+
36
+ Example:
37
+ paths, tmp_dir = split_pdf_temp("input.pdf")
38
+ # Use paths...
39
+ tmp_dir.cleanup() # Clean up temp files when done
40
+ """
41
+ tmp_dir = tempfile.TemporaryDirectory()
42
+ reader = PdfReader(input_pdf)
43
+ paths = []
44
+
45
+ for i in range(len(reader.pages)):
46
+ writer = PdfWriter()
47
+ writer.add_page(reader.pages[i])
48
+ writer.add_metadata(reader.metadata or {})
49
+
50
+ output = Path(tmp_dir.name) / f"page_{i+1}.pdf"
51
+ with open(output, "wb") as f:
52
+ writer.write(f)
53
+ paths.append(output)
54
+
55
+ return paths, tmp_dir # Return dir object so caller can control cleanup
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.37.0
3
+ Version: 0.37.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -102,6 +102,7 @@ Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
102
102
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
103
103
  Provides-Extra: docling
104
104
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
105
+ Requires-Dist: pypdf>=5.1.0; extra == 'docling'
105
106
  Provides-Extra: docx
106
107
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
107
108
  Provides-Extra: fastembed
@@ -78,10 +78,11 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
78
78
  langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
79
79
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
80
80
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
81
- langroid/parsing/document_parser.py,sha256=1DjkoiieuPxlPtX-3FGzr3frDSKOjfKM4PhaKbVNQ1c,28570
81
+ langroid/parsing/document_parser.py,sha256=Xcf_yA4admhx75N123_ouWcgnYXHztxX0S3TxqlWKNU,28334
82
82
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
83
83
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
84
84
  langroid/parsing/parser.py,sha256=WDv4QnNtAcLSiPe6cPhHOa-aMhrt3OV-kKnVXdgwtmI,12276
85
+ langroid/parsing/pdf_utils.py,sha256=IFs2GH9_ZOYJ159YF5MomQ8RKRj1YPBIxkv0gx4Xz7o,1629
85
86
  langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
86
87
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
87
88
  langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
@@ -122,7 +123,7 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
122
123
  langroid/vector_store/momento.py,sha256=UNHGT6jXuQtqY9f6MdqGU14bVnS0zHgIJUa30ULpUJo,10474
123
124
  langroid/vector_store/qdrantdb.py,sha256=Cen6f-y6witiR53UQ-5a605Reo0gTj3ygXpE_ehYoZo,18116
124
125
  langroid/vector_store/weaviatedb.py,sha256=C6jd1Twl5_jux3JYyrcTfQb63Lk9HuiUzVF4NahXuGo,10642
125
- langroid-0.37.0.dist-info/METADATA,sha256=hlweiAhkVzVb_sVOPF-adwqwDPpAUUsgE1wJFRYNnKg,60524
126
- langroid-0.37.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
127
- langroid-0.37.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
128
- langroid-0.37.0.dist-info/RECORD,,
126
+ langroid-0.37.1.dist-info/METADATA,sha256=XL8VnB7r3uUJ6-BkwZkUPeSQO4pfvo8YfH3GvbX_gFg,60572
127
+ langroid-0.37.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
128
+ langroid-0.37.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
129
+ langroid-0.37.1.dist-info/RECORD,,