swarmauri_parser_pypdf2 0.6.0.dev154__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_parser_pypdf2/PyPDF2Parser.py +68 -0
- swarmauri_parser_pypdf2/__init__.py +15 -0
- swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/METADATA +20 -0
- swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/RECORD +6 -0
- swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/WHEEL +4 -0
- swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import List, Literal, Union
|
|
2
|
+
|
|
3
|
+
import PyPDF2
|
|
4
|
+
from swarmauri_core.ComponentBase import ComponentBase
|
|
5
|
+
from swarmauri_standard.documents.Document import Document
|
|
6
|
+
from swarmauri_base.parsers.ParserBase import ParserBase
|
|
7
|
+
from swarmauri_core.documents.IDocument import IDocument
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@ComponentBase.register_type(ParserBase, "PyPDF2Parser")
class PyPDF2Parser(ParserBase):
    """
    Parser for reading and extracting text from PDF files using PyPDF2.

    Each page that yields non-blank text becomes one Document whose metadata
    records the 1-based page number and where the PDF came from.
    """

    type: Literal["PyPDF2Parser"] = "PyPDF2Parser"

    def parse(self, source: Union[str, bytes]) -> List[IDocument]:
        """
        Parses a PDF file and extracts text from each page as Document instances.

        Parameters:
        - source (Union[str, bytes]): The path to the PDF file or bytes of the PDF content.

        Returns:
        - List[IDocument]: A list of IDocument instances with the extracted text.
          Pages without extractable text are skipped. If reading or parsing
          fails, the error is printed and an empty list is returned.

        Raises:
        - TypeError: If source is neither str nor bytes.
        """
        if isinstance(source, str):
            try:
                # The file must stay open while pages are read, so extraction
                # happens inside the `with` block.
                with open(source, "rb") as file:
                    return self._extract_documents(file, source)
            except Exception as e:
                # Best-effort contract: report the failure and return no
                # documents rather than a partial result.
                print(f"An error occurred while parsing the PDF '{source}': {e}")
                return []

        if isinstance(source, bytes):
            try:
                from io import BytesIO

                # In-memory PDFs have no path, so the source is labelled "bytes".
                return self._extract_documents(BytesIO(source), "bytes")
            except Exception as e:
                print("An error occurred while parsing the PDF from bytes:", e)
                return []

        raise TypeError("Source must be of type str (file path) or bytes.")

    @staticmethod
    def _extract_documents(stream, source_label: str) -> List[IDocument]:
        """
        Builds one Document per page of the PDF read from a binary stream.

        Parameters:
        - stream: A binary file-like object positioned at the start of the PDF.
        - source_label (str): Value stored under the "source" metadata key.

        Returns:
        - List[IDocument]: Documents for every page with non-blank text.
        """
        # NOTE(review): this uses the module-level `PyPDF2` import, while the
        # wheel METADATA declares `pypdf` as the dependency - confirm which
        # distribution is intended.
        reader = PyPDF2.PdfReader(stream)

        documents = []
        for page_number, page in enumerate(reader.pages, start=1):
            # Strip before testing so whitespace-only pages do not produce
            # Documents with empty content; `or ""` covers a None result.
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            documents.append(
                Document(
                    content=text,
                    metadata={
                        "page_number": page_number,
                        "source": source_label,
                    },
                )
            )
        return documents
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# swarmauri_parser_pypdf2/__init__.py
from importlib.metadata import PackageNotFoundError, version

try:
    # Read the version from the installed distribution so it cannot drift
    # from the wheel metadata.
    __version__ = version("swarmauri_parser_pypdf2")
except PackageNotFoundError:
    # Not installed (e.g. imported from a source checkout): fall back to the
    # version this file was released with.
    __version__ = "0.6.0.dev154"

# Long description for this package, in Markdown.
__long_desc__ = """

# Swarmauri PyPDF2 Parser

This package provides a Swarmauri parser plugin that extracts text from PDF files using PyPDF2.

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
|
|
14
|
+
|
|
15
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_parser_pypdf2
|
|
3
|
+
Version: 0.6.0.dev154
|
|
4
|
+
Summary: PyPDF2 Parser for Swarmauri.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Jacob Stewart
|
|
7
|
+
Author-email: jacob@swarmauri.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: pypdf (>=5.0.1,<6.0.0)
|
|
15
|
+
Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
|
|
16
|
+
Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
|
|
17
|
+
Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Swarmauri PyPDF2 Parser
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
swarmauri_parser_pypdf2/__init__.py,sha256=0JPWFMiRDsSmfgYFpZWp31sknn1CPYYATVUlg4tY14o,336
|
|
2
|
+
swarmauri_parser_pypdf2/PyPDF2Parser.py,sha256=aFxSVbZUcohWFPbLlw4ChhlPDgjwH9wk8FvdKztXW4o,2683
|
|
3
|
+
swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/entry_points.txt,sha256=Ay1D3vv7qKTKfWIWJqTgodYxit45IkQ3Fl3yaMuKWRw,84
|
|
4
|
+
swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/METADATA,sha256=a3nTfp-ucUiaDCIeVhzuisVJ5_POP-Z9X3hUpTv60k0,774
|
|
5
|
+
swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
|
6
|
+
swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/RECORD,,
|