swarmauri_parser_fitzpdf 0.6.0.dev154__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_parser_fitzpdf-0.6.0.dev154/PKG-INFO +20 -0
- swarmauri_parser_fitzpdf-0.6.0.dev154/README.md +1 -0
- swarmauri_parser_fitzpdf-0.6.0.dev154/pyproject.toml +57 -0
- swarmauri_parser_fitzpdf-0.6.0.dev154/swarmauri_parser_fitzpdf/FitzPdfParser.py +47 -0
- swarmauri_parser_fitzpdf-0.6.0.dev154/swarmauri_parser_fitzpdf/__init__.py +15 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_parser_fitzpdf
|
|
3
|
+
Version: 0.6.0.dev154
|
|
4
|
+
Summary: Fitz PDF Parser for Swarmauri.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Jacob Stewart
|
|
7
|
+
Author-email: jacob@swarmauri.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: PyMuPDF (>=1.24.12,<2.0.0)
|
|
15
|
+
Requires-Dist: swarmauri_base (>=0.6.0.dev154,<0.7.0)
|
|
16
|
+
Requires-Dist: swarmauri_core (>=0.6.0.dev154,<0.7.0)
|
|
17
|
+
Project-URL: Repository, http://github.com/swarmauri/swarmauri-sdk
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Swarmauri Fitz PDF Parser
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Swarmauri Fitz PDF Parser
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "swarmauri_parser_fitzpdf"
|
|
3
|
+
version = "0.6.0.dev154"
|
|
4
|
+
description = "Fitz PDF Parser for Swarmauri."
|
|
5
|
+
authors = ["Jacob Stewart <jacob@swarmauri.com>"]
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
repository = "http://github.com/swarmauri/swarmauri-sdk"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"License :: OSI Approved :: Apache Software License",
|
|
11
|
+
"Programming Language :: Python :: 3.10",
|
|
12
|
+
"Programming Language :: Python :: 3.11",
|
|
13
|
+
"Programming Language :: Python :: 3.12"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.poetry.dependencies]
|
|
17
|
+
python = ">=3.10,<3.13"
|
|
18
|
+
|
|
19
|
+
# Swarmauri
|
|
20
|
+
swarmauri_core = {version = "^0.6.0.dev154"}
|
|
21
|
+
swarmauri_base = {version = "^0.6.0.dev154"}
|
|
22
|
+
|
|
23
|
+
# Dependencies
|
|
24
|
+
PyMuPDF = "^1.24.12"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
[tool.poetry.group.dev.dependencies]
|
|
28
|
+
flake8 = "^7.0"
|
|
29
|
+
pytest = "^8.0"
|
|
30
|
+
pytest-asyncio = ">=0.24.0"
|
|
31
|
+
pytest-xdist = "^3.6.1"
|
|
32
|
+
pytest-json-report = "^1.5.0"
|
|
33
|
+
python-dotenv = "*"
|
|
34
|
+
requests = "^2.32.3"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["poetry-core>=1.0.0"]
|
|
38
|
+
build-backend = "poetry.core.masonry.api"
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
norecursedirs = ["combined", "scripts"]
|
|
42
|
+
|
|
43
|
+
markers = [
|
|
44
|
+
"test: standard test",
|
|
45
|
+
"unit: Unit tests",
|
|
46
|
+
"integration: Integration tests",
|
|
47
|
+
"acceptance: Acceptance tests",
|
|
48
|
+
"experimental: Experimental tests"
|
|
49
|
+
]
|
|
50
|
+
log_cli = true
|
|
51
|
+
log_cli_level = "INFO"
|
|
52
|
+
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
|
|
53
|
+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
54
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
55
|
+
|
|
56
|
+
[tool.poetry.plugins."swarmauri.parsers"]
|
|
57
|
+
FitzPdfParser = "swarmauri_parser_fitzpdf.FitzPdfParser:FitzPdfParser"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from swarmauri_core.ComponentBase import ComponentBase
|
|
2
|
+
import pymupdf # PyMuPDF
|
|
3
|
+
from typing import List, Union, Any, Literal
|
|
4
|
+
from swarmauri.parsers.base.ParserBase import ParserBase
|
|
5
|
+
from swarmauri_core.documents.IDocument import IDocument
|
|
6
|
+
from swarmauri.documents.concrete.Document import Document
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@ComponentBase.register_type(ParserBase, "FitzPdfParser")
class PDFtoTextParser(ParserBase):
    """
    A parser that extracts the text of a PDF file using PyMuPDF.

    The class is registered with ComponentBase under the type name
    "FitzPdfParser".
    """

    type: Literal["FitzPdfParser"] = "FitzPdfParser"

    def parse(self, data: Union[str, Any]) -> List[IDocument]:
        """
        Parses a PDF file and extracts its text content as Document instances.

        Parameters:
        - data (Union[str, Any]): The path to the PDF file. Must be a str.

        Returns:
        - List[IDocument]: A list with a single Document whose content is the
          text of every page concatenated in page order and whose metadata
          stores the path under the "source" key. An empty list is returned
          when the PDF cannot be opened or read.

        Raises:
        - ValueError: If data is not a str.
        """
        # Ensure data is a valid str path to a PDF file
        if not isinstance(data, str):
            raise ValueError("PDFtoTextParser expects a file path in str format.")

        try:
            # The context manager closes the PDF (and its file handle) even
            # when text extraction fails part-way through.
            with pymupdf.open(data) as doc:
                # Join once instead of growing a string page by page
                text = "".join(page.get_text() for page in doc)

            # Create a document with the extracted text
            return [Document(content=text, metadata={"source": data})]

        except Exception as e:
            # Best-effort contract: report the failure and return no documents
            print(f"An error occurred while parsing the PDF: {e}")
            return []


# The package's "swarmauri.parsers" plugin entry point refers to this module's
# ``FitzPdfParser`` attribute, so expose the class under that name as well.
FitzPdfParser = PDFtoTextParser
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# swarmauri_parser_fitzpdf/__init__.py
# Package metadata exposed at import time.

# Kept in step with the version declared in pyproject.toml / PKG-INFO.
__version__ = "0.6.0.dev154"

# Long-form description text for the package.
__long_desc__ = """

# Swarmauri Example Plugin

This repository includes an example of a Swarmauri Plugin.

Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk

"""
|
|
14
|
+
|
|
15
|
+
|