swarmauri_parser_pypdf2 0.6.0.dev154__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ from typing import List, Literal, Union
2
+
3
+ import PyPDF2
4
+ from swarmauri_core.ComponentBase import ComponentBase
5
+ from swarmauri_standard.documents.Document import Document
6
+ from swarmauri_base.parsers.ParserBase import ParserBase
7
+ from swarmauri_core.documents.IDocument import IDocument
8
+
9
+
10
# NOTE(review): this module imports `PyPDF2`, while the wheel METADATA lists
# `pypdf` as the runtime requirement. Confirm which distribution is intended
# so that the import cannot fail on a clean install.
def _extract_page_documents(stream, source_label):
    """
    Reads a PDF from a binary stream and builds one Document per page that
    contains text.

    Parameters:
    - stream: A binary file-like object positioned at the start of the PDF data.
    - source_label (str): Value stored under the "source" metadata key.

    Returns:
    - List[Document]: One Document per page with non-blank text, in page order.
      Page numbers in the metadata are 1-based.
    """
    page_documents = []
    reader = PyPDF2.PdfReader(stream)
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Strip before testing so that pages holding only whitespace are
        # skipped instead of producing a Document with empty content.
        content = text.strip() if text else ""
        if not content:
            continue
        page_documents.append(
            Document(
                content=content,
                metadata={
                    "page_number": page_number,
                    "source": source_label,
                },
            )
        )
    return page_documents


@ComponentBase.register_type(ParserBase, "PyPDF2Parser")
class PyPDF2Parser(ParserBase):
    """
    Parser for reading and extracting text from PDF files using PyPDF2.
    """

    type: Literal["PyPDF2Parser"] = "PyPDF2Parser"

    def parse(self, source: Union[str, bytes]) -> List[IDocument]:
        """
        Parses a PDF file and extracts text from each page as Document instances.

        Parameters:
        - source (Union[str, bytes]): The path to the PDF file or bytes of the PDF content.

        Returns:
        - List[IDocument]: A list of IDocument instances with the extracted text,
          one per page that has text. Each document's metadata holds the 1-based
          "page_number" and a "source" entry (the file path, or the literal
          "bytes" for in-memory input). An empty list is returned when the PDF
          cannot be opened or parsed; the error is printed rather than raised.

        Raises:
        - TypeError: If source is neither str nor bytes.
        """
        if isinstance(source, str):
            # Best-effort contract: any failure while opening or parsing is
            # reported and turned into an empty result instead of propagating.
            try:
                with open(source, "rb") as file:
                    return _extract_page_documents(file, source)
            except Exception as e:
                print(f"An error occurred while parsing the PDF '{source}': {e}")
                return []

        if isinstance(source, bytes):
            from io import BytesIO

            try:
                return _extract_page_documents(BytesIO(source), "bytes")
            except Exception as e:
                print("An error occurred while parsing the PDF from bytes:", e)
                return []

        raise TypeError("Source must be of type str (file path) or bytes.")
@@ -0,0 +1,15 @@
1
+ # swarmauri_parser_pypdf2/__init__.py
2
+ __version__ = "0.6.0.dev26"
3
+ __long_desc__ = """
4
+
5
+ # Swarmauri Example Plugin
6
+
7
+ This repository includes an example of a Swarmauri Plugin.
8
+
9
+ Visit us at: https://swarmauri.com
10
+ Follow us at: https://github.com/swarmauri
11
+ Star us at: https://github.com/swarmauri/swarmauri-sdk
12
+
13
+ """
14
+
15
+
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.3
2
+ Name: swarmauri_parser_pypdf2
3
+ Version: 0.6.0.dev154
4
+ Summary: PyPDF2 Parser for Swarmauri.
5
+ License: Apache-2.0
6
+ Author: Jacob Stewart
7
+ Author-email: jacob@swarmauri.com
8
+ Requires-Python: >=3.10,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: pypdf (>=5.0.1,<6.0.0)
15
+ Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
16
+ Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
17
+ Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
18
+ Description-Content-Type: text/markdown
19
+
20
+ # Swarmauri Example Community Package
@@ -0,0 +1,6 @@
1
+ swarmauri_parser_pypdf2/__init__.py,sha256=0JPWFMiRDsSmfgYFpZWp31sknn1CPYYATVUlg4tY14o,336
2
+ swarmauri_parser_pypdf2/PyPDF2Parser.py,sha256=aFxSVbZUcohWFPbLlw4ChhlPDgjwH9wk8FvdKztXW4o,2683
3
+ swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/entry_points.txt,sha256=Ay1D3vv7qKTKfWIWJqTgodYxit45IkQ3Fl3yaMuKWRw,84
4
+ swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/METADATA,sha256=a3nTfp-ucUiaDCIeVhzuisVJ5_POP-Z9X3hUpTv60k0,774
5
+ swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
6
+ swarmauri_parser_pypdf2-0.6.0.dev154.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.0.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [swarmauri.parsers]
2
+ PyPDF2Parser=swarmauri_parser_pypdf2.PyPDF2Parser:PyPDF2Parser
3
+