swarmauri_parser_slate 0.1.0.dev20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.3
2
+ Name: swarmauri_parser_slate
3
+ Version: 0.1.0.dev20
4
+ Summary: A parser for extracting text from PDFs using Slate.
5
+ License: Apache-2.0
6
+ Author: Vijay Vignesh
7
+ Author-email: vijayvigneshp02@gmail.com
8
+ Requires-Python: >=3.10,<3.13
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: slate3k (>=0.5)
14
+ Requires-Dist: swarmauri_base
15
+ Requires-Dist: swarmauri_core
16
+ Requires-Dist: swarmauri_standard
17
+ Description-Content-Type: text/markdown
18
+
19
+ ![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png)
20
+
21
+ <div align="center">
22
+
23
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate)
24
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate)
25
+ ![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate)
26
+ ![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green)
27
+
28
+ </div>
29
+
30
+ ---
31
+
32
+ # Swarmauri SlateParser
33
+
34
+ A parser for reading and extracting data fields from PDF files using Slate.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install swarmauri_parser_slate
40
+ ```
41
+
42
+ ## Usage
43
+ Basic usage example with code snippet:
44
+ ```python
45
+ from swarmauri_parser_slate import SlateParser
46
+
47
+ parser = SlateParser()
48
+ file_path = "path/to/your/pdf_file.pdf"
49
+ documents = parser.parse(file_path)
50
+
51
+ for document in documents:
52
+ print(document.content)
53
+ ```
54
+
55
+ ## Want to help?
56
+
57
+ If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
58
+
@@ -0,0 +1,39 @@
1
+ ![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png)
2
+
3
+ <div align="center">
4
+
5
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate)
6
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate)
7
+ ![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate)
8
+ ![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green)
9
+
10
+ </div>
11
+
12
+ ---
13
+
14
+ # Swarmauri SlateParser
15
+
16
+ A parser for reading and extracting data fields from PDF files using Slate.
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install swarmauri_parser_slate
22
+ ```
23
+
24
+ ## Usage
25
+ Basic usage example with code snippet:
26
+ ```python
27
+ from swarmauri_parser_slate import SlateParser
28
+
29
+ parser = SlateParser()
30
+ file_path = "path/to/your/pdf_file.pdf"
31
+ documents = parser.parse(file_path)
32
+
33
+ for document in documents:
34
+ print(document.content)
35
+ ```
36
+
37
+ ## Want to help?
38
+
39
+ If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
@@ -0,0 +1,65 @@
1
+ [project]
2
+ name = "swarmauri_parser_slate"
3
+ version = "0.1.0.dev20"
4
+ description = "A parser for extracting text from PDFs using Slate."
5
+ license = "Apache-2.0"
6
+ readme = "README.md"
7
+ repository = "http://github.com/swarmauri/swarmauri-sdk"
8
+ requires-python = ">=3.10,<3.13"
9
+ classifiers = [
10
+ "License :: OSI Approved :: Apache Software License",
11
+ "Programming Language :: Python :: 3.10",
12
+ "Programming Language :: Python :: 3.11",
13
+ "Programming Language :: Python :: 3.12",
14
+ ]
15
+ authors = [{ name = "Vijay Vignesh", email = "vijayvigneshp02@gmail.com" }]
16
+ dependencies = [
17
+ "slate3k>=0.5",
18
+ "swarmauri_core",
19
+ "swarmauri_base",
20
+ "swarmauri_standard",
21
+ ]
22
+
23
+ [tool.uv.sources]
24
+ swarmauri_core = { workspace = true }
25
+ swarmauri_base = { workspace = true }
26
+ swarmauri_standard = { workspace = true }
27
+
28
+ [tool.pytest.ini_options]
29
+ norecursedirs = ["combined", "scripts"]
30
+ markers = [
31
+ "test: standard test",
32
+ "unit: Unit tests",
33
+ "i9n: Integration tests",
34
+ "r8n: Regression tests",
35
+ "timeout: mark test to timeout after X seconds",
36
+ "xpass: Expected passes",
37
+ "xfail: Expected failures",
38
+ "acceptance: Acceptance tests",
39
+ ]
40
+ timeout = 300
41
+ log_cli = true
42
+ log_cli_level = "INFO"
43
+ log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
44
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
45
+ asyncio_default_fixture_loop_scope = "function"
46
+
47
+ [project.entry-points."swarmauri.parsers"]
48
+ SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser"
49
+
50
+ [build-system]
51
+ requires = ["poetry-core>=1.0.0"]
52
+ build-backend = "poetry.core.masonry.api"
53
+
54
+ [dependency-groups]
55
+ dev = [
56
+ "pytest>=8.0",
57
+ "pytest-asyncio>=0.24.0",
58
+ "pytest-xdist>=3.6.1",
59
+ "pytest-json-report>=1.5.0",
60
+ "python-dotenv",
61
+ "requests>=2.32.3",
62
+ "flake8>=7.0",
63
+ "pytest-timeout>=2.3.1",
64
+ "ruff>=0.9.9",
65
+ ]
@@ -0,0 +1,51 @@
1
+ from typing import List, Literal
2
+
3
+ import slate3k as slate
4
+ from swarmauri_standard.documents.Document import Document
5
+ from swarmauri_base.parsers.ParserBase import ParserBase
6
+ from swarmauri_base.ComponentBase import ComponentBase
7
+
8
+
9
@ComponentBase.register_type(ParserBase, "SlateParser")
class SlateParser(ParserBase):
    """
    Parser that extracts the text of a PDF file, page by page, using slate3k.
    """

    type: Literal["SlateParser"] = "SlateParser"

    def parse(self, source: str) -> List[Document]:
        """
        Parse a PDF file into one Document per page that contains text.

        Parameters:
        - source (str): The path to the PDF file.

        Returns:
        - List[Document]: One Document per non-blank page, in page order.
          Each Document's content is the stripped page text and its metadata
          holds the 1-based "page_number" and the "source" path. An empty
          list is returned if opening or parsing the file raises an error.

        Raises:
        - TypeError: If source is not a str.
        """
        # Reject unsupported inputs up front; only a file path is handled.
        if not isinstance(source, str):
            raise TypeError("Source must be of type str (file path).")

        documents: List[Document] = []
        try:
            with open(source, "rb") as file:
                # Each item yielded by slate.PDF is used as one page's text.
                for page_number, page in enumerate(slate.PDF(file), start=1):
                    # Strip before testing so whitespace-only pages are
                    # skipped instead of producing empty Documents.
                    content = page.strip() if page else ""
                    if not content:
                        continue
                    documents.append(
                        Document(
                            content=content,
                            metadata={
                                "page_number": page_number,
                                "source": source,
                            },
                        )
                    )
        except Exception as e:
            # Best-effort contract: report the failure and return no
            # documents (pages parsed before the error are discarded).
            print(f"An error occurred while parsing the PDF '{source}': {e}")
            return []

        return documents
@@ -0,0 +1,17 @@
1
+ from .SlateParser import SlateParser
2
+
3
+
4
+ __all__ = ["SlateParser"]
5
+
6
# The package requires Python >=3.10, so importlib.metadata is always
# available from the standard library; no backport fallback is needed.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("swarmauri_parser_slate")
except PackageNotFoundError:
    # The distribution metadata is missing (for example, when running from a
    # source checkout during development), so fall back to a placeholder.
    __version__ = "0.0.0"