langroid 0.37.0__py3-none-any.whl → 0.37.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/parsing/document_parser.py +12 -18
- langroid/parsing/pdf_utils.py +55 -0
- {langroid-0.37.0.dist-info → langroid-0.37.1.dist-info}/METADATA +2 -1
- {langroid-0.37.0.dist-info → langroid-0.37.1.dist-info}/RECORD +6 -5
- {langroid-0.37.0.dist-info → langroid-0.37.1.dist-info}/WHEEL +0 -0
- {langroid-0.37.0.dist-info → langroid-0.37.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,12 +3,12 @@ from __future__ import annotations
|
|
3
3
|
import itertools
|
4
4
|
import logging
|
5
5
|
import re
|
6
|
-
import tempfile
|
7
6
|
from enum import Enum
|
8
7
|
from io import BytesIO
|
9
8
|
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
|
10
9
|
|
11
10
|
from langroid.exceptions import LangroidImportError
|
11
|
+
from langroid.parsing.pdf_utils import pdf_split_pages
|
12
12
|
from langroid.utils.object_registry import ObjectRegistry
|
13
13
|
|
14
14
|
try:
|
@@ -515,29 +515,23 @@ class DoclingParser(DocumentParser):
|
|
515
515
|
raise LangroidImportError(
|
516
516
|
"docling", ["docling", "pdf-parsers", "all", "doc-chat"]
|
517
517
|
)
|
518
|
-
|
518
|
+
|
519
519
|
from docling.document_converter import ( # type: ignore
|
520
520
|
ConversionResult,
|
521
521
|
DocumentConverter,
|
522
522
|
)
|
523
|
+
from docling_core.types.doc import ImageRefMode # type: ignore
|
523
524
|
|
525
|
+
page_files, tmp_dir = pdf_split_pages(self.doc_bytes)
|
524
526
|
converter = DocumentConverter()
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
for i in range(n_pages):
|
534
|
-
texts = [
|
535
|
-
item[0].text
|
536
|
-
for item in doc.iterate_items(page_no=i + 1)
|
537
|
-
if isinstance(item[0], TextItem)
|
538
|
-
]
|
539
|
-
text = "\n".join(texts)
|
540
|
-
yield i, text
|
527
|
+
for i, page_file in enumerate(page_files):
|
528
|
+
result: ConversionResult = converter.convert(page_file)
|
529
|
+
md_text = result.document.export_to_markdown(
|
530
|
+
image_mode=ImageRefMode.REFERENCED
|
531
|
+
)
|
532
|
+
yield i, md_text
|
533
|
+
|
534
|
+
tmp_dir.cleanup()
|
541
535
|
|
542
536
|
def get_document_from_page(self, page: str) -> Document:
|
543
537
|
"""
|
@@ -0,0 +1,55 @@
|
|
1
|
+
import tempfile
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from tempfile import TemporaryDirectory
|
5
|
+
from typing import TYPE_CHECKING, Any, BinaryIO, List, Tuple, Union
|
6
|
+
|
7
|
+
try:
|
8
|
+
import pypdf
|
9
|
+
except ImportError:
|
10
|
+
if not TYPE_CHECKING:
|
11
|
+
pypdf = None
|
12
|
+
|
13
|
+
from langroid.exceptions import LangroidImportError
|
14
|
+
|
15
|
+
if pypdf is None:
|
16
|
+
raise LangroidImportError(
|
17
|
+
"pypdf", ["pypdf", "docling", "all", "pdf-parsers", "doc-chat"]
|
18
|
+
)
|
19
|
+
from pypdf import PdfReader, PdfWriter
|
20
|
+
|
21
|
+
|
22
|
+
def pdf_split_pages(
|
23
|
+
input_pdf: Union[str, Path, BytesIO, BinaryIO],
|
24
|
+
) -> Tuple[List[Path], TemporaryDirectory[Any]]:
|
25
|
+
"""Splits a PDF into individual pages in a temporary directory.
|
26
|
+
|
27
|
+
Args:
|
28
|
+
input_pdf: Input PDF file path or file-like object
|
29
|
+
max_workers: Maximum number of concurrent workers for parallel processing
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
Tuple containing:
|
33
|
+
- List of paths to individual PDF pages
|
34
|
+
- Temporary directory object (caller must call cleanup())
|
35
|
+
|
36
|
+
Example:
|
37
|
+
paths, tmp_dir = split_pdf_temp("input.pdf")
|
38
|
+
# Use paths...
|
39
|
+
tmp_dir.cleanup() # Clean up temp files when done
|
40
|
+
"""
|
41
|
+
tmp_dir = tempfile.TemporaryDirectory()
|
42
|
+
reader = PdfReader(input_pdf)
|
43
|
+
paths = []
|
44
|
+
|
45
|
+
for i in range(len(reader.pages)):
|
46
|
+
writer = PdfWriter()
|
47
|
+
writer.add_page(reader.pages[i])
|
48
|
+
writer.add_metadata(reader.metadata or {})
|
49
|
+
|
50
|
+
output = Path(tmp_dir.name) / f"page_{i+1}.pdf"
|
51
|
+
with open(output, "wb") as f:
|
52
|
+
writer.write(f)
|
53
|
+
paths.append(output)
|
54
|
+
|
55
|
+
return paths, tmp_dir # Return dir object so caller can control cleanup
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.37.0
|
3
|
+
Version: 0.37.1
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
Author-email: Prasad Chalasani <pchalasani@gmail.com>
|
6
6
|
License: MIT
|
@@ -102,6 +102,7 @@ Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
|
|
102
102
|
Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
|
103
103
|
Provides-Extra: docling
|
104
104
|
Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
|
105
|
+
Requires-Dist: pypdf>=5.1.0; extra == 'docling'
|
105
106
|
Provides-Extra: docx
|
106
107
|
Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'docx'
|
107
108
|
Provides-Extra: fastembed
|
@@ -78,10 +78,11 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
|
|
78
78
|
langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
|
79
79
|
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
|
80
80
|
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
|
81
|
-
langroid/parsing/document_parser.py,sha256=
|
81
|
+
langroid/parsing/document_parser.py,sha256=Xcf_yA4admhx75N123_ouWcgnYXHztxX0S3TxqlWKNU,28334
|
82
82
|
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
|
83
83
|
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
|
84
84
|
langroid/parsing/parser.py,sha256=WDv4QnNtAcLSiPe6cPhHOa-aMhrt3OV-kKnVXdgwtmI,12276
|
85
|
+
langroid/parsing/pdf_utils.py,sha256=IFs2GH9_ZOYJ159YF5MomQ8RKRj1YPBIxkv0gx4Xz7o,1629
|
85
86
|
langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
|
86
87
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
87
88
|
langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
|
@@ -122,7 +123,7 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
|
|
122
123
|
langroid/vector_store/momento.py,sha256=UNHGT6jXuQtqY9f6MdqGU14bVnS0zHgIJUa30ULpUJo,10474
|
123
124
|
langroid/vector_store/qdrantdb.py,sha256=Cen6f-y6witiR53UQ-5a605Reo0gTj3ygXpE_ehYoZo,18116
|
124
125
|
langroid/vector_store/weaviatedb.py,sha256=C6jd1Twl5_jux3JYyrcTfQb63Lk9HuiUzVF4NahXuGo,10642
|
125
|
-
langroid-0.37.
|
126
|
-
langroid-0.37.
|
127
|
-
langroid-0.37.
|
128
|
-
langroid-0.37.
|
126
|
+
langroid-0.37.1.dist-info/METADATA,sha256=XL8VnB7r3uUJ6-BkwZkUPeSQO4pfvo8YfH3GvbX_gFg,60572
|
127
|
+
langroid-0.37.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
128
|
+
langroid-0.37.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
129
|
+
langroid-0.37.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|