maybankpdf2json 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maybankpdf2json-0.1.0/PKG-INFO +72 -0
- maybankpdf2json-0.1.0/README.md +53 -0
- maybankpdf2json-0.1.0/maybankpdf2json/__init__.py +3 -0
- maybankpdf2json-0.1.0/maybankpdf2json/extractor.py +68 -0
- maybankpdf2json-0.1.0/maybankpdf2json/utils.py +32 -0
- maybankpdf2json-0.1.0/maybankpdf2json.egg-info/PKG-INFO +72 -0
- maybankpdf2json-0.1.0/maybankpdf2json.egg-info/SOURCES.txt +12 -0
- maybankpdf2json-0.1.0/maybankpdf2json.egg-info/dependency_links.txt +1 -0
- maybankpdf2json-0.1.0/maybankpdf2json.egg-info/requires.txt +1 -0
- maybankpdf2json-0.1.0/maybankpdf2json.egg-info/top_level.txt +1 -0
- maybankpdf2json-0.1.0/pyproject.toml +30 -0
- maybankpdf2json-0.1.0/setup.cfg +4 -0
- maybankpdf2json-0.1.0/setup.py +22 -0
- maybankpdf2json-0.1.0/tests/test_extractor.py +35 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: maybankpdf2json
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package for extracting data from Maybank account statements in PDF format.
|
|
5
|
+
Home-page: https://github.com/nordinz7/maybankpdf2json
|
|
6
|
+
Author: Nordin
|
|
7
|
+
Author-email: Nordin <vipnordin@gmail.com>
|
|
8
|
+
Project-URL: Homepage, https://github.com/nordinz7/maybankpdf2json
|
|
9
|
+
Keywords: maybank,pdf,JSON,account statements,Email delivery statement,Malayan Banking
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.6
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: pdfplumber>=0.7.4
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: requires-python
|
|
19
|
+
|
|
20
|
+
# Maybank Account Statement Extractor
|
|
21
|
+
|
|
22
|
+
This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- Extract data from PDF statements.
|
|
27
|
+
- Filter and map extracted data into a structured format.
|
|
28
|
+
- Utility functions for data manipulation and validation.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
To install the package, clone the repository and run the following command:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
Here is a basic example of how to use the package:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from maybankpdf2json.extractor import MaybankAccExtractor
|
|
44
|
+
|
|
45
|
+
# Open a statement PDF in binary mode
|
|
46
|
+
pdf_file = open('path/to/statement.pdf', 'rb')
|
|
47
|
+
|
|
48
|
+
# Create the extractor (the password is only needed for protected PDFs)
|
|
49
|
+
extractor = MaybankAccExtractor(pdf_file, 'your_password')
|
|
50
|
+
|
|
51
|
+
# Extract the transactions as a list of dictionaries
|
|
52
|
+
mapped_data = extractor.extract_data()
|
|
53
|
+
|
|
54
|
+
# Output the mapped data
|
|
55
|
+
print(mapped_data)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Testing
|
|
59
|
+
|
|
60
|
+
To run the tests, navigate to the project directory and execute:
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
pytest tests/
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Contributing
|
|
67
|
+
|
|
68
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
This project is licensed under the MIT License. See the LICENSE file for more details.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Maybank Account Statement Extractor
|
|
2
|
+
|
|
3
|
+
This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Extract data from PDF statements.
|
|
8
|
+
- Filter and map extracted data into a structured format.
|
|
9
|
+
- Utility functions for data manipulation and validation.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
To install the package, clone the repository and run the following command:
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
pip install .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
Here is a basic example of how to use the package:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from maybankpdf2json.extractor import MaybankAccExtractor
|
|
25
|
+
|
|
26
|
+
# Open a statement PDF in binary mode
|
|
27
|
+
pdf_file = open('path/to/statement.pdf', 'rb')
|
|
28
|
+
|
|
29
|
+
# Create the extractor (the password is only needed for protected PDFs)
|
|
30
|
+
extractor = MaybankAccExtractor(pdf_file, 'your_password')
|
|
31
|
+
|
|
32
|
+
# Extract the transactions as a list of dictionaries
|
|
33
|
+
mapped_data = extractor.extract_data()
|
|
34
|
+
|
|
35
|
+
# Output the mapped data
|
|
36
|
+
print(mapped_data)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Testing
|
|
40
|
+
|
|
41
|
+
To run the tests, navigate to the project directory and execute:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
pytest tests/
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Contributing
|
|
48
|
+
|
|
49
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
This project is licensed under the MIT License. See the LICENSE file for more details.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import List, Dict, Union, BinaryIO
|
|
2
|
+
import pdfplumber
|
|
3
|
+
|
|
4
|
+
# Line prefixes that delimit the transaction table in the extracted statement text.
START_ENTRY = "BEGINNING BALANCE"
END_ENTRY = "TOTAL DEBIT"
# Summary rows that fall inside the delimited section but are not transactions.
EXCLUDE_ITEMS = ["TOTAL CREDIT", "TOTAL DEBIT", "ENDING BALANCE"]


class MaybankAccExtractor:
    """Extract transaction rows from Maybank account statement PDFs.

    The extractor works on already-opened binary file objects. Each PDF is
    read into a list of text lines, trimmed to the section between the
    ``START_ENTRY`` and ``END_ENTRY`` markers, and every line that ends in two
    numeric amounts is mapped to a dictionary with the keys ``date``,
    ``desc``, ``trans`` and ``bal``.
    """

    def __init__(self, buffers: Union[List[BinaryIO], BinaryIO], pwd: Union[str, None] = None):
        """Store the PDF buffers and the optional password.

        Args:
            buffers: A single binary file object or a list of them.
            pwd: Password used to open the PDFs, or ``None`` if unprotected.
        """
        # Normalise to a list so the rest of the class can always iterate.
        self.buffers = buffers if isinstance(buffers, list) else [buffers]
        self.pwd = pwd

    def read_pdfs(self) -> List[List[str]]:
        """Read every buffer and return one list of text lines per PDF.

        If reading with the configured password fails, the PDF is read once
        more without a password. Any error from that second attempt (or from
        the first attempt when no password was configured) propagates.
        """
        pdf_files = []
        for buffer in self.buffers:
            try:
                lines = self.read_single_pdf_file(buffer, self.pwd)
            except Exception:
                if self.pwd is None:
                    # Retrying without a password would repeat the exact same
                    # call, so surface the original error instead.
                    raise
                # Best effort: the file may simply not be password protected.
                lines = self.read_single_pdf_file(buffer, None)
            pdf_files.append(lines)
        return pdf_files

    def read_single_pdf_file(self, buffer: BinaryIO, pwd: Union[str, None]) -> List[str]:
        """Return the text of every page of one PDF, split into lines.

        Args:
            buffer: Binary file object holding the PDF; it is rewound first.
            pwd: Password passed to ``pdfplumber.open``, or ``None``.
        """
        # Rewind so a buffer that was already consumed (e.g. by a failed
        # first attempt in read_pdfs) can be read again from the start.
        buffer.seek(0)
        with pdfplumber.open(buffer, password=pwd) as pdf:
            return [
                txt
                for page in pdf.pages
                # Guard against a page yielding no text object at all.
                for txt in (page.extract_text() or "").split("\n")
            ]

    def get_filtered_data(self, arr: List[str]) -> List[str]:
        """Return the lines between the start and end markers.

        The slice starts at the last ``START_ENTRY`` line seen before the
        first ``END_ENTRY`` line and ends at that ``END_ENTRY`` line. When a
        marker is missing, the corresponding end of ``arr`` is used. Lines
        starting with any of ``EXCLUDE_ITEMS`` are dropped from the result.
        """
        start, end = 0, len(arr)
        for i, line in enumerate(arr):
            if line.startswith(START_ENTRY):
                start = i
            elif line.startswith(END_ENTRY):
                end = i + 1
                break
        excluded = tuple(EXCLUDE_ITEMS)
        return [line for line in arr[start:end] if not line.startswith(excluded)]

    @staticmethod
    def _parse_amount(token: str) -> Union[float, None]:
        """Convert one amount token to a float, or return ``None`` if it is not numeric.

        Accepts thousands separators (``1,234.56``) and a sign written either
        before or after the digits (``-50.00``, ``50.00-``, ``100.00+``).
        """
        cleaned = token.replace(",", "")
        sign = 1.0
        if cleaned.endswith(("+", "-")):
            if cleaned.endswith("-"):
                sign = -1.0
            cleaned = cleaned[:-1]
        elif cleaned.startswith(("+", "-")):
            if cleaned.startswith("-"):
                sign = -1.0
            cleaned = cleaned[1:]
        # Only digits with at most one decimal point count as an amount; this
        # also rejects words such as "nan" or "infinity" that float() accepts.
        if not cleaned.replace(".", "", 1).isdigit():
            return None
        try:
            return sign * float(cleaned)
        except ValueError:
            return None

    def get_mapped_data(self, arr: List[str]) -> List[Dict[str, Union[str, float]]]:
        """Map statement lines to transaction dictionaries.

        A line is treated as a transaction when it has at least three
        whitespace-separated tokens and its last two tokens are amounts. The
        first token becomes ``date``, the last two become ``trans`` and
        ``bal``, and everything in between becomes ``desc``. All other lines
        (for example the "BEGINNING BALANCE" row) are skipped.
        """
        mapped = []
        for current in arr:
            tokens = current.split()
            if len(tokens) < 3:
                continue
            trans = self._parse_amount(tokens[-2])
            bal = self._parse_amount(tokens[-1])
            if trans is None or bal is None:
                # Not a transaction row (header, balance line, free text).
                continue
            mapped.append(
                {
                    "date": tokens[0],
                    "desc": " ".join(tokens[1:-2]),
                    "trans": trans,
                    "bal": bal,
                }
            )
        return mapped

    def extract_data(self) -> List[Dict[str, Union[str, float]]]:
        """Read all PDFs and return their transactions as one flat list."""
        all_mapped_data = []
        for pdf_lines in self.read_pdfs():
            filtered_data = self.get_filtered_data(pdf_lines)
            all_mapped_data.extend(self.get_mapped_data(filtered_data))
        return all_mapped_data
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
def parse_acc_value(value: str) -> float:
    """Convert a statement amount string to a float.

    Thousands separators are removed and a trailing ``-`` or ``+`` is read as
    the sign of the amount (``"1,234.56-"`` -> ``-1234.56``). Surrounding
    whitespace is ignored so tokens taken straight from extracted text parse
    cleanly.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    cleaned = value.strip().replace(",", "")
    if cleaned.endswith("-"):
        return -float(cleaned[:-1])
    if cleaned.endswith("+"):
        return float(cleaned[:-1])
    return float(cleaned)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def is_valid_date(date_str: str) -> bool:
    """Return True if ``date_str`` is a real date in ``dd/mm/yy`` format."""
    # Imported locally because this module has no top-level datetime import;
    # without it every call failed with NameError.
    from datetime import datetime

    try:
        datetime.strptime(date_str, "%d/%m/%y")
        return True
    except ValueError:
        return False
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def output_extracted_data(value, options):
    """Write extracted transactions to a JSON or CSV file.

    Args:
        value: List of transaction dictionaries with the keys ``date``,
            ``desc``, ``trans`` and ``bal``.
        options: Mapping with the keys
            ``format`` - ``"json"`` for JSON output; any other value is used
            as the file extension and the rows are written as CSV.
            ``merge`` - when truthy the file name gets the suffix
            ``-COMBINED``; otherwise the suffix is derived from a
            transaction date.
            ``filename`` (optional) - base name/path of the output file,
            defaulting to ``"maybank_statement"``.

    Raises:
        ValueError: If ``merge`` is falsy and ``value`` is empty, or if the
            sampled date is not in ``dd/mm/yy`` format.
    """
    # Imported locally because this module has no top-level imports for them.
    import csv
    import json
    from datetime import datetime

    output_format = options["format"]
    is_json = output_format == "json"
    # The csv module requires newline="" so it can control line endings itself.
    newline = None if is_json else ""

    if options["merge"]:
        file_date = "-COMBINED"
    else:
        if not value:
            raise ValueError("cannot derive a statement date from empty data")
        # The third row is sampled when present; shorter inputs fall back to
        # their last row instead of failing with IndexError.
        sample = value[min(2, len(value) - 1)]
        file_date = datetime.strptime(sample["date"], "%d/%m/%y").strftime("%Y%m %B ")

    # NOTE(review): the original code referenced an OUTPUT_FILENAME constant
    # that is not defined anywhere in this module; the base name is now taken
    # from options with a fixed default - confirm the intended default name.
    base_name = options.get("filename", "maybank_statement")

    with open(f"{base_name}{file_date}.{output_format}", "w", newline=newline) as o_file:
        if is_json:
            json.dump(value, o_file, indent=4)
        else:
            writer = csv.DictWriter(o_file, ["date", "desc", "trans", "bal"])
            writer.writeheader()
            writer.writerows(value)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: maybankpdf2json
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package for extracting data from Maybank account statements in PDF format.
|
|
5
|
+
Home-page: https://github.com/nordinz7/maybankpdf2json
|
|
6
|
+
Author: Nordin
|
|
7
|
+
Author-email: Nordin <vipnordin@gmail.com>
|
|
8
|
+
Project-URL: Homepage, https://github.com/nordinz7/maybankpdf2json
|
|
9
|
+
Keywords: maybank,pdf,JSON,account statements,Email delivery statement,Malayan Banking
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.6
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: pdfplumber>=0.7.4
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: requires-python
|
|
19
|
+
|
|
20
|
+
# Maybank Account Statement Extractor
|
|
21
|
+
|
|
22
|
+
This package provides functionality to extract and process data from Maybank account statement PDFs. It allows users to read PDF files, filter relevant data, and map it into a structured format for further analysis or reporting.
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- Extract data from PDF statements.
|
|
27
|
+
- Filter and map extracted data into a structured format.
|
|
28
|
+
- Utility functions for data manipulation and validation.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
To install the package, clone the repository and run the following command:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
Here is a basic example of how to use the package:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from maybankpdf2json.extractor import MaybankAccExtractor
|
|
44
|
+
|
|
45
|
+
# Open a statement PDF in binary mode
|
|
46
|
+
pdf_file = open('path/to/statement.pdf', 'rb')
|
|
47
|
+
|
|
48
|
+
# Create the extractor (the password is only needed for protected PDFs)
|
|
49
|
+
extractor = MaybankAccExtractor(pdf_file, 'your_password')
|
|
50
|
+
|
|
51
|
+
# Extract the transactions as a list of dictionaries
|
|
52
|
+
mapped_data = extractor.extract_data()
|
|
53
|
+
|
|
54
|
+
# Output the mapped data
|
|
55
|
+
print(mapped_data)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Testing
|
|
59
|
+
|
|
60
|
+
To run the tests, navigate to the project directory and execute:
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
pytest tests/
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Contributing
|
|
67
|
+
|
|
68
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue for any enhancements or bug fixes.
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
This project is licensed under the MIT License. See the LICENSE file for more details.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
setup.py
|
|
4
|
+
maybankpdf2json/__init__.py
|
|
5
|
+
maybankpdf2json/extractor.py
|
|
6
|
+
maybankpdf2json/utils.py
|
|
7
|
+
maybankpdf2json.egg-info/PKG-INFO
|
|
8
|
+
maybankpdf2json.egg-info/SOURCES.txt
|
|
9
|
+
maybankpdf2json.egg-info/dependency_links.txt
|
|
10
|
+
maybankpdf2json.egg-info/requires.txt
|
|
11
|
+
maybankpdf2json.egg-info/top_level.txt
|
|
12
|
+
tests/test_extractor.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdfplumber>=0.7.4
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
maybankpdf2json
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "maybankpdf2json"
|
|
7
|
+
description = "A package for extracting data from Maybank account statements in PDF format."
|
|
8
|
+
authors = [{ name = "Nordin", email = "vipnordin@gmail.com" }]
|
|
9
|
+
license = { file = "LICENSE" }
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
urls = { "Homepage" = "https://github.com/nordinz7/maybankpdf2json" }
|
|
12
|
+
keywords = [
|
|
13
|
+
"maybank",
|
|
14
|
+
"pdf",
|
|
15
|
+
"JSON",
|
|
16
|
+
"account statements",
|
|
17
|
+
"Email delivery statement",
|
|
18
|
+
"Malayan Banking",
|
|
19
|
+
]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
]
|
|
25
|
+
dependencies = ["pdfplumber>=0.7.4"]
|
|
26
|
+
dynamic = ["version"]
|
|
27
|
+
requires-python = ">=3.6"
|
|
28
|
+
|
|
29
|
+
[tool.setuptools]
|
|
30
|
+
packages = ["maybankpdf2json"]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Read the long description up front so the file handle is closed promptly
# and decoding does not depend on the platform's default encoding.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="maybankpdf2json",
    version="0.1.0",
    author="Nordin",
    author_email="vipnordin@gmail.com",
    description="A package for extracting JSON data from Maybank account statements(PDF format).",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/nordinz7/maybankpdf2json",
    packages=find_packages(),
    install_requires=[
        # Keep the lower bound in sync with the dependency in pyproject.toml.
        "pdfplumber>=0.7.4",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import os

from maybankpdf2json.extractor import MaybankAccExtractor


class TestExtractor(unittest.TestCase):
    """Tests for ``MaybankAccExtractor`` using in-memory statement lines."""

    def setUp(self):
        self.test_pdf_path = "path/to/test/pdf"  # Update with actual test PDF path
        self.test_password = "test_password"  # Update with actual test password
        self.test_data = [
            "BEGINNING BALANCE 1000.00",
            "01/01/22 Transaction Description 100.00 1100.00",
            "02/01/22 Transaction Description -50.00 1050.00",
            "TOTAL DEBIT 50.00",
        ]
        # The line-based methods do not touch the buffers, so none are needed.
        self.extractor = MaybankAccExtractor([], self.test_password)

    def test_read_pdfs(self):
        # Skip rather than fail when no sample statement has been configured.
        if not os.path.isfile(self.test_pdf_path):
            self.skipTest("no sample statement PDF available at test_pdf_path")
        with open(self.test_pdf_path, "rb") as pdf_file:
            extractor = MaybankAccExtractor(pdf_file, self.test_password)
            pdf_data = extractor.read_pdfs()
        self.assertIsInstance(pdf_data, list)

    def test_get_filtered_data(self):
        filtered_data = self.extractor.get_filtered_data(self.test_data)
        self.assertGreater(len(filtered_data), 0)

    def test_get_mapped_data(self):
        filtered_data = self.extractor.get_filtered_data(self.test_data)
        # Drop the opening-balance header, which is not a transaction row.
        transactions = [
            line for line in filtered_data if not line.startswith("BEGINNING BALANCE")
        ]
        mapped_data = self.extractor.get_mapped_data(transactions)
        self.assertIsInstance(mapped_data, list)
        self.assertGreater(len(mapped_data), 0)
        self.assertIn("date", mapped_data[0])
        self.assertIn("desc", mapped_data[0])
        self.assertIn("trans", mapped_data[0])
        self.assertIn("bal", mapped_data[0])


if __name__ == "__main__":
    unittest.main()
|