swarmauri_parser_slate 0.1.0.dev20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_parser_slate-0.1.0.dev20/PKG-INFO +58 -0
- swarmauri_parser_slate-0.1.0.dev20/README.md +39 -0
- swarmauri_parser_slate-0.1.0.dev20/pyproject.toml +65 -0
- swarmauri_parser_slate-0.1.0.dev20/swarmauri_parser_slate/SlateParser.py +51 -0
- swarmauri_parser_slate-0.1.0.dev20/swarmauri_parser_slate/__init__.py +17 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: swarmauri_parser_slate
|
|
3
|
+
Version: 0.1.0.dev20
|
|
4
|
+
Summary: A parser for extracting text from PDFs using Slate.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Author: Vijay Vignesh
|
|
7
|
+
Author-email: vijayvigneshp02@gmail.com
|
|
8
|
+
Requires-Python: >=3.10,<3.13
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Dist: slate3k (>=0.5)
|
|
14
|
+
Requires-Dist: swarmauri_base
|
|
15
|
+
Requires-Dist: swarmauri_core
|
|
16
|
+
Requires-Dist: swarmauri_standard
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+

|
|
20
|
+
|
|
21
|
+
<div align="center">
|
|
22
|
+
|
|
23
|
+

|
|
24
|
+

|
|
25
|
+

|
|
26
|
+

|
|
27
|
+
|
|
28
|
+
</div>
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
# Swarmauri SlateParser
|
|
33
|
+
|
|
34
|
+
A parser for reading and extracting data fields from PDF files using Slate.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install swarmauri_parser_slate
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
Basic usage example with code snippet:
|
|
44
|
+
```python
|
|
45
|
+
from swarmauri_parser_slate import SlateParser
|
|
46
|
+
|
|
47
|
+
parser = SlateParser()
|
|
48
|
+
file_path = "path/to/your/pdf_file.pdf"
|
|
49
|
+
documents = parser.parse(file_path)
|
|
50
|
+
|
|
51
|
+
for document in documents:
|
|
52
|
+
print(document.content)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Want to help?
|
|
56
|
+
|
|
57
|
+
If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
|
|
58
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
<div align="center">
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# Swarmauri SlateParser
|
|
15
|
+
|
|
16
|
+
A parser for reading and extracting data fields from PDF files using Slate.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install swarmauri_parser_slate
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
Basic usage example with code snippet:
|
|
26
|
+
```python
|
|
27
|
+
from swarmauri_parser_slate import SlateParser
|
|
28
|
+
|
|
29
|
+
parser = SlateParser()
|
|
30
|
+
file_path = "path/to/your/pdf_file.pdf"
|
|
31
|
+
documents = parser.parse(file_path)
|
|
32
|
+
|
|
33
|
+
for document in documents:
|
|
34
|
+
print(document.content)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Want to help?
|
|
38
|
+
|
|
39
|
+
If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "swarmauri_parser_slate"
|
|
3
|
+
version = "0.1.0.dev20"
|
|
4
|
+
description = "A parser for extracting text from PDFs using Slate."
|
|
5
|
+
license = "Apache-2.0"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
repository = "http://github.com/swarmauri/swarmauri-sdk"
|
|
8
|
+
requires-python = ">=3.10,<3.13"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"License :: OSI Approved :: Apache Software License",
|
|
11
|
+
"Programming Language :: Python :: 3.10",
|
|
12
|
+
"Programming Language :: Python :: 3.11",
|
|
13
|
+
"Programming Language :: Python :: 3.12",
|
|
14
|
+
]
|
|
15
|
+
authors = [{ name = "Vijay Vignesh", email = "vijayvigneshp02@gmail.com" }]
|
|
16
|
+
dependencies = [
|
|
17
|
+
"slate3k>=0.5",
|
|
18
|
+
"swarmauri_core",
|
|
19
|
+
"swarmauri_base",
|
|
20
|
+
"swarmauri_standard",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.uv.sources]
|
|
24
|
+
swarmauri_core = { workspace = true }
|
|
25
|
+
swarmauri_base = { workspace = true }
|
|
26
|
+
swarmauri_standard = { workspace = true }
|
|
27
|
+
|
|
28
|
+
[tool.pytest.ini_options]
|
|
29
|
+
norecursedirs = ["combined", "scripts"]
|
|
30
|
+
markers = [
|
|
31
|
+
"test: standard test",
|
|
32
|
+
"unit: Unit tests",
|
|
33
|
+
"i9n: Integration tests",
|
|
34
|
+
"r8n: Regression tests",
|
|
35
|
+
"timeout: mark test to timeout after X seconds",
|
|
36
|
+
"xpass: Expected passes",
|
|
37
|
+
"xfail: Expected failures",
|
|
38
|
+
"acceptance: Acceptance tests",
|
|
39
|
+
]
|
|
40
|
+
timeout = 300
|
|
41
|
+
log_cli = true
|
|
42
|
+
log_cli_level = "INFO"
|
|
43
|
+
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
|
|
44
|
+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
45
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
46
|
+
|
|
47
|
+
[project.entry-points."swarmauri.parsers"]
|
|
48
|
+
SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser"
|
|
49
|
+
|
|
50
|
+
[build-system]
|
|
51
|
+
requires = ["poetry-core>=1.0.0"]
|
|
52
|
+
build-backend = "poetry.core.masonry.api"
|
|
53
|
+
|
|
54
|
+
[dependency-groups]
|
|
55
|
+
dev = [
|
|
56
|
+
"pytest>=8.0",
|
|
57
|
+
"pytest-asyncio>=0.24.0",
|
|
58
|
+
"pytest-xdist>=3.6.1",
|
|
59
|
+
"pytest-json-report>=1.5.0",
|
|
60
|
+
"python-dotenv",
|
|
61
|
+
"requests>=2.32.3",
|
|
62
|
+
"flake8>=7.0",
|
|
63
|
+
"pytest-timeout>=2.3.1",
|
|
64
|
+
"ruff>=0.9.9",
|
|
65
|
+
]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from typing import List, Literal
|
|
2
|
+
|
|
3
|
+
import slate3k as slate
|
|
4
|
+
from swarmauri_standard.documents.Document import Document
|
|
5
|
+
from swarmauri_base.parsers.ParserBase import ParserBase
|
|
6
|
+
from swarmauri_base.ComponentBase import ComponentBase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@ComponentBase.register_type(ParserBase, "SlateParser")
class SlateParser(ParserBase):
    """
    Parser that extracts the text of a PDF file, page by page, using slate3k.
    """

    type: Literal["SlateParser"] = "SlateParser"

    def parse(self, source: str) -> List[Document]:
        """
        Parse a PDF file into one Document per non-empty page.

        Parameters:
        - source (str): The path to the PDF file.

        Returns:
        - List[Document]: One Document for every page whose extracted text is
          non-empty. Each Document holds the stripped page text as its content
          and metadata with the 1-based "page_number" and the "source" path.
          An empty list is returned if the file cannot be opened or parsed.

        Raises:
        - TypeError: If source is not a str.
        """
        # Imported locally so this block does not depend on a module-level
        # logging import being present.
        import logging

        # Reject unsupported inputs up front; only file paths are handled.
        if not isinstance(source, str):
            raise TypeError("Source must be of type str (file path).")

        documents = []
        try:
            with open(source, "rb") as file:
                pages = slate.PDF(file)
                # Iterating the slate PDF object yields the text of each page.
                for page_number, text in enumerate(pages, start=1):
                    if not text:
                        # Skip pages with no extracted text.
                        continue
                    documents.append(
                        Document(
                            content=text.strip(),
                            metadata={
                                "page_number": page_number,
                                "source": source,
                            },
                        )
                    )
        except Exception as e:
            # Deliberately best-effort: any failure to open or parse the file
            # is reported and yields an empty result instead of propagating.
            logging.getLogger(__name__).exception(
                "An error occurred while parsing the PDF '%s': %s", source, e
            )
            return []

        return documents
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# importlib.metadata is part of the standard library on every Python version
# this package supports (requires-python >= 3.10), so no backport fallback is
# needed.
from importlib.metadata import PackageNotFoundError, version

from .SlateParser import SlateParser

__all__ = ["SlateParser"]

try:
    # Read the version from the installed distribution's metadata.
    __version__ = version("swarmauri_parser_slate")
except PackageNotFoundError:
    # The package is not installed (for example, when run from a source
    # checkout during development), so fall back to a placeholder version.
    __version__ = "0.0.0"
|