maybankpdf2json 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: maybankpdf2json
3
+ Version: 0.1.0
4
+ Summary: A package for extracting data from Maybank account statements in PDF format.
5
+ Home-page: https://github.com/nordinz7/maybankpdf2json
6
+ Author: Nordin
7
+ Author-email: Nordin <vipnordin@gmail.com>
8
+ Project-URL: Homepage, https://github.com/nordinz7/maybankpdf2json
9
+ Keywords: maybank,pdf,JSON,account statements,Email delivery statement,Malayan Banking
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.6
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: pdfplumber>=0.7.4
16
+ Dynamic: author
17
+ Dynamic: home-page
18
+ Dynamic: requires-python
19
+
20
+ # Maybank Account Statement Extractor
21
+
22
+ This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
23
+
24
+ ## Features
25
+
26
+ - Extract data from PDF statements.
27
+ - Filter and map extracted data into a structured format.
28
+ - Utility functions for data manipulation and validation.
29
+
30
+ ## Installation
31
+
32
+ To install the package, clone the repository and run the following command:
33
+
34
+ ```
35
+ pip install .
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ Here is a basic example of how to use the package:
41
+
42
+ ```python
43
+ from maybankpdf2json.extractor import MaybankAccExtractor
44
+
45
+ # Open the statement PDF in binary mode (a list of open files also works)
46
+ with open('path/to/statement.pdf', 'rb') as pdf_file:
47
+     # The password is only needed for password-protected statements
48
+     extractor = MaybankAccExtractor(pdf_file, pwd='your_password')
49
+
50
+     # Read, filter and map the statement into a list of transaction dicts
51
+     mapped_data = extractor.extract_data()
52
+
53
+ # Each transaction is a dict with "date", "desc", "trans" and "bal" keys
54
+ # Output the mapped data
55
+ print(mapped_data)
56
+ ```
57
+
58
+ ## Testing
59
+
60
+ To run the tests, navigate to the project directory and execute:
61
+
62
+ ```
63
+ pytest tests/
64
+ ```
65
+
66
+ ## Contributing
67
+
68
+ Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
69
+
70
+ ## License
71
+
72
+ This project is licensed under the MIT License. See the LICENSE file for more details.
@@ -0,0 +1,53 @@
1
+ # Maybank Account Statement Extractor
2
+
3
+ This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
4
+
5
+ ## Features
6
+
7
+ - Extract data from PDF statements.
8
+ - Filter and map extracted data into a structured format.
9
+ - Utility functions for data manipulation and validation.
10
+
11
+ ## Installation
12
+
13
+ To install the package, clone the repository and run the following command:
14
+
15
+ ```
16
+ pip install .
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ Here is a basic example of how to use the package:
22
+
23
+ ```python
24
+ from maybankpdf2json.extractor import MaybankAccExtractor
25
+
26
+ # Open the statement PDF in binary mode (a list of open files also works)
27
+ with open('path/to/statement.pdf', 'rb') as pdf_file:
28
+     # The password is only needed for password-protected statements
29
+     extractor = MaybankAccExtractor(pdf_file, pwd='your_password')
30
+
31
+     # Read, filter and map the statement into a list of transaction dicts
32
+     mapped_data = extractor.extract_data()
33
+
34
+ # Each transaction is a dict with "date", "desc", "trans" and "bal" keys
35
+ # Output the mapped data
36
+ print(mapped_data)
37
+ ```
38
+
39
+ ## Testing
40
+
41
+ To run the tests, navigate to the project directory and execute:
42
+
43
+ ```
44
+ pytest tests/
45
+ ```
46
+
47
+ ## Contributing
48
+
49
+ Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
50
+
51
+ ## License
52
+
53
+ This project is licensed under the MIT License. See the LICENSE file for more details.
@@ -0,0 +1,3 @@
1
# This file marks the directory as a Python package.
# You can add package-level imports or initialization code here if needed.
from .extractor import MaybankAccExtractor

# The extractor module defines ``MaybankAccExtractor``; importing the
# non-existent ``MaybankPDFExtractor`` made the whole package unimportable.
# Keep the previously advertised name available as an alias.
MaybankPDFExtractor = MaybankAccExtractor

__all__ = ["MaybankAccExtractor", "MaybankPDFExtractor"]
@@ -0,0 +1,68 @@
1
+ from typing import List, Dict, Union, BinaryIO
2
+ import pdfplumber
3
+
4
# Line prefixes that delimit the transaction section of a statement.
START_ENTRY = "BEGINNING BALANCE"
END_ENTRY = "TOTAL DEBIT"
# Summary lines that are dropped from the filtered section.
EXCLUDE_ITEMS = ["TOTAL CREDIT", "TOTAL DEBIT", "ENDING BALANCE"]


class MaybankAccExtractor:
    """Extract transaction rows from one or more statement PDF buffers.

    The pipeline is: read every PDF into text lines (``read_pdfs``), keep the
    lines between the ``START_ENTRY`` and ``END_ENTRY`` markers
    (``get_filtered_data``), and turn each remaining line into a dict
    (``get_mapped_data``). ``extract_data`` runs all three steps.
    """

    def __init__(self, buffers: Union[List[BinaryIO], BinaryIO], pwd: Union[str, None] = None):
        """Store the PDF buffers and the optional password.

        Args:
            buffers: A single binary file-like object or a list of them.
            pwd: Password passed to ``pdfplumber.open``; ``None`` for none.
        """
        # Normalise to a list so the rest of the class can always iterate.
        self.buffers = buffers if isinstance(buffers, list) else [buffers]
        self.pwd = pwd

    def read_pdfs(self) -> List[List[str]]:
        """Return the text lines of every buffer, one inner list per PDF.

        Each buffer is first opened with the configured password. If that
        fails and a password was configured, it is retried without one.

        Raises:
            Exception: Whatever the final read attempt raises.
        """
        pdf_files = []
        for buffer in self.buffers:
            try:
                lines = self.read_single_pdf_file(buffer, self.pwd)
            except Exception:
                # Retrying with the same (absent) password cannot succeed,
                # so surface the original error instead of repeating it.
                if self.pwd is None:
                    raise
                lines = self.read_single_pdf_file(buffer, None)
            pdf_files.append(lines)
        return pdf_files

    def read_single_pdf_file(self, buffer: BinaryIO, pwd: Union[str, None]) -> List[str]:
        """Return every text line of one PDF, in page order.

        Args:
            buffer: Binary file-like object; it is rewound before reading.
            pwd: Password passed to ``pdfplumber.open`` (may be ``None``).
        """
        # Rewind so a buffer can be read again after a failed attempt.
        buffer.seek(0)
        with pdfplumber.open(buffer, password=pwd) as pdf:
            return [
                line
                for page in pdf.pages
                # Treat a missing extraction result as empty text so a page
                # without extractable text cannot break the split.
                for line in (page.extract_text() or "").split("\n")
            ]

    def get_filtered_data(self, arr: List[str]) -> List[str]:
        """Return the lines of the transaction section without summary lines.

        The section starts at the last ``START_ENTRY`` line seen before the
        first ``END_ENTRY`` line (or at the first line when there is none) and
        ends at that ``END_ENTRY`` line (or at the last line when there is
        none). Lines starting with any of ``EXCLUDE_ITEMS`` are removed.
        """
        start, end = 0, len(arr)
        for index, line in enumerate(arr):
            if line.startswith(START_ENTRY):
                start = index
            elif line.startswith(END_ENTRY):
                end = index + 1
                break
        excluded = tuple(EXCLUDE_ITEMS)
        return [line for line in arr[start:end] if not line.startswith(excluded)]

    @staticmethod
    def _parse_amount(token: str) -> float:
        """Convert an amount token such as ``1,234.50-`` or ``10.00+`` to a float.

        Raises:
            ValueError: If the token is not a number after removing thousands
                separators and an optional trailing sign.
        """
        cleaned = token.replace(",", "")
        if cleaned.endswith("-"):
            return -float(cleaned[:-1])
        if cleaned.endswith("+"):
            return float(cleaned[:-1])
        return float(cleaned)

    def get_mapped_data(self, arr: List[str]) -> List[Dict[str, Union[str, float]]]:
        """Map statement lines to transaction dicts.

        A line is split on whitespace: the first token becomes ``date``, the
        last two tokens become ``trans`` and ``bal`` (as floats) and whatever
        lies in between becomes ``desc``. Lines with fewer than three tokens,
        or whose last two tokens are not amounts (for example the
        ``BEGINNING BALANCE`` line kept by ``get_filtered_data``), are skipped.
        """
        rows = []
        for line in arr:
            tokens = line.split()
            if len(tokens) < 3:
                continue
            try:
                trans = self._parse_amount(tokens[-2])
                bal = self._parse_amount(tokens[-1])
            except ValueError:
                # Not a transaction row; skip it rather than abort the run.
                continue
            rows.append(
                {
                    "date": tokens[0],
                    "desc": " ".join(tokens[1:-2]),
                    "trans": trans,
                    "bal": bal,
                }
            )
        return rows

    def extract_data(self) -> List[Dict[str, Union[str, float]]]:
        """Read all buffers and return their transactions as one flat list."""
        all_mapped_data = []
        for pdf_lines in self.read_pdfs():
            filtered_data = self.get_filtered_data(pdf_lines)
            all_mapped_data.extend(self.get_mapped_data(filtered_data))
        return all_mapped_data
@@ -0,0 +1,32 @@
1
def parse_acc_value(value: str) -> float:
    """Convert a statement amount string to a float.

    Thousands separators (``,``) are removed. A trailing ``-`` makes the
    result negative and a trailing ``+`` is dropped.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    cleaned = value.replace(",", "")
    sign_suffix = cleaned[-1:]
    if sign_suffix == "-":
        return -float(cleaned[:-1])
    digits = cleaned[:-1] if sign_suffix == "+" else cleaned
    return float(digits)
9
+
10
+
11
def is_valid_date(date_str: str) -> bool:
    """Return True when ``date_str`` is a ``dd/mm/yy`` date, else False.

    Non-string input is reported as invalid instead of raising.
    """
    # Imported locally: this module has no import of ``datetime``, so the
    # original call raised NameError.
    from datetime import datetime

    try:
        datetime.strptime(date_str, "%d/%m/%y")
    except (ValueError, TypeError):
        return False
    return True
17
+
18
+
19
def output_extracted_data(value, options):
    """Write extracted transactions to a JSON or CSV file.

    Args:
        value: List of transaction dicts with ``date``, ``desc``, ``trans``
            and ``bal`` keys.
        options: Mapping with a ``format`` key (``"json"`` writes JSON, any
            other value writes CSV and is used as the file extension) and a
            ``merge`` key (truthy names the file ``...-COMBINED``, otherwise
            the name is derived from a transaction date).

    Raises:
        IndexError: If ``value`` is empty and ``options["merge"]`` is falsy.
        ValueError: If the sampled transaction date is not ``dd/mm/yy``.
    """
    # Imported locally: the module never imported these names, so the
    # original body raised NameError on first use.
    import csv
    import json
    from datetime import datetime

    fmt = options["format"]
    is_json = fmt == "json"

    if options["merge"]:
        file_date = "-COMBINED"
    else:
        # The third record is sampled as before; shorter inputs fall back to
        # their last record instead of raising IndexError.
        sample = value[min(2, len(value) - 1)]
        sample_date = datetime.strptime(sample["date"], "%d/%m/%y")
        file_date = sample_date.strftime("%Y%m %B ")

    # NOTE(review): OUTPUT_FILENAME is referenced but not defined in this
    # module; use it when present, otherwise a generic prefix. Confirm the
    # intended file name prefix.
    prefix = globals().get("OUTPUT_FILENAME", "maybank_statement")

    # The csv module expects files opened with newline="".
    newline = None if is_json else ""
    with open(f"{prefix}{file_date}.{fmt}", "w", newline=newline) as o_file:
        if is_json:
            json.dump(value, o_file, indent=4)
        else:
            writer = csv.DictWriter(o_file, ["date", "desc", "trans", "bal"])
            writer.writeheader()
            writer.writerows(value)
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: maybankpdf2json
3
+ Version: 0.1.0
4
+ Summary: A package for extracting data from Maybank account statements in PDF format.
5
+ Home-page: https://github.com/nordinz7/maybankpdf2json
6
+ Author: Nordin
7
+ Author-email: Nordin <vipnordin@gmail.com>
8
+ Project-URL: Homepage, https://github.com/nordinz7/maybankpdf2json
9
+ Keywords: maybank,pdf,JSON,account statements,Email delivery statement,Malayan Banking
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.6
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: pdfplumber>=0.7.4
16
+ Dynamic: author
17
+ Dynamic: home-page
18
+ Dynamic: requires-python
19
+
20
+ # Maybank Account Statement Extractor
21
+
22
+ This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
23
+
24
+ ## Features
25
+
26
+ - Extract data from PDF statements.
27
+ - Filter and map extracted data into a structured format.
28
+ - Utility functions for data manipulation and validation.
29
+
30
+ ## Installation
31
+
32
+ To install the package, clone the repository and run the following command:
33
+
34
+ ```
35
+ pip install .
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ Here is a basic example of how to use the package:
41
+
42
+ ```python
43
+ from maybankpdf2json.extractor import MaybankAccExtractor
44
+
45
+ # Open the statement PDF in binary mode (a list of open files also works)
46
+ with open('path/to/statement.pdf', 'rb') as pdf_file:
47
+     # The password is only needed for password-protected statements
48
+     extractor = MaybankAccExtractor(pdf_file, pwd='your_password')
49
+
50
+     # Read, filter and map the statement into a list of transaction dicts
51
+     mapped_data = extractor.extract_data()
52
+
53
+ # Each transaction is a dict with "date", "desc", "trans" and "bal" keys
54
+ # Output the mapped data
55
+ print(mapped_data)
56
+ ```
57
+
58
+ ## Testing
59
+
60
+ To run the tests, navigate to the project directory and execute:
61
+
62
+ ```
63
+ pytest tests/
64
+ ```
65
+
66
+ ## Contributing
67
+
68
+ Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
69
+
70
+ ## License
71
+
72
+ This project is licensed under the MIT License. See the LICENSE file for more details.
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ maybankpdf2json/__init__.py
5
+ maybankpdf2json/extractor.py
6
+ maybankpdf2json/utils.py
7
+ maybankpdf2json.egg-info/PKG-INFO
8
+ maybankpdf2json.egg-info/SOURCES.txt
9
+ maybankpdf2json.egg-info/dependency_links.txt
10
+ maybankpdf2json.egg-info/requires.txt
11
+ maybankpdf2json.egg-info/top_level.txt
12
+ tests/test_extractor.py
@@ -0,0 +1 @@
1
+ pdfplumber>=0.7.4
@@ -0,0 +1 @@
1
+ maybankpdf2json
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "maybankpdf2json"
7
+ description = "A package for extracting data from Maybank account statements in PDF format."
8
+ authors = [{ name = "Nordin", email = "vipnordin@gmail.com" }]
9
+ license = { file = "LICENSE" }
10
+ readme = "README.md"
11
+ urls = { "Homepage" = "https://github.com/nordinz7/maybankpdf2json" }
12
+ keywords = [
13
+ "maybank",
14
+ "pdf",
15
+ "JSON",
16
+ "account statements",
17
+ "Email delivery statement",
18
+ "Malayan Banking",
19
+ ]
20
+ classifiers = [
21
+ "Programming Language :: Python :: 3",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Operating System :: OS Independent",
24
+ ]
25
+ dependencies = ["pdfplumber>=0.7.4"]
26
+ dynamic = ["version"]
27
+ requires-python = ">=3.6"
28
+
29
+ [tool.setuptools]
30
+ packages = ["maybankpdf2json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,22 @@
1
+ from setuptools import setup, find_packages
2
+
3
# Read the long description with an explicit encoding and close the file
# promptly instead of leaving an open handle behind.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="maybankpdf2json",
    version="0.1.0",
    author="Nordin",
    author_email="vipnordin@gmail.com",
    description="A package for extracting JSON data from Maybank account statements(PDF format).",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/nordinz7/maybankpdf2json",
    packages=find_packages(),
    # Same minimum version as declared in pyproject.toml.
    install_requires=[
        "pdfplumber>=0.7.4",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
@@ -0,0 +1,35 @@
1
import io
import os
import unittest

from maybankpdf2json.extractor import MaybankAccExtractor


class TestExtractor(unittest.TestCase):
    """Tests for the line filtering and mapping done by MaybankAccExtractor."""

    def setUp(self):
        self.test_pdf_path = "path/to/test/pdf"  # Update with actual test PDF path
        self.test_password = "test_password"  # Update with actual test password
        # The filtering and mapping methods never read the buffer, so an
        # empty in-memory stream is enough to build an extractor.
        self.extractor = MaybankAccExtractor(io.BytesIO(), self.test_password)
        self.test_data = [
            "BEGINNING BALANCE 1000.00",
            "01/01/22 Transaction Description 100.00 1100.00",
            "02/01/22 Transaction Description -50.00 1050.00",
            "TOTAL DEBIT 50.00"
        ]

    def test_read_pdfs(self):
        # Needs a real statement; skip instead of failing on the placeholder.
        if not os.path.isfile(self.test_pdf_path):
            self.skipTest("no sample statement PDF available")
        with open(self.test_pdf_path, "rb") as pdf_file:
            extractor = MaybankAccExtractor(pdf_file, self.test_password)
            pdf_data = extractor.read_pdfs()
        self.assertIsInstance(pdf_data, list)

    def test_get_filtered_data(self):
        filtered_data = self.extractor.get_filtered_data(self.test_data)
        self.assertGreater(len(filtered_data), 0)

    def test_get_mapped_data(self):
        filtered_data = self.extractor.get_filtered_data(self.test_data)
        mapped_data = self.extractor.get_mapped_data(filtered_data)
        self.assertIsInstance(mapped_data, list)
        self.assertGreater(len(mapped_data), 0)
        self.assertIn("date", mapped_data[0])
        self.assertIn("desc", mapped_data[0])
        self.assertIn("trans", mapped_data[0])
        self.assertIn("bal", mapped_data[0])


if __name__ == "__main__":
    unittest.main()