fintl 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fintl/__init__.py +0 -0
- fintl/accounts_etl/__init__.py +0 -0
- fintl/accounts_etl/dkb/__init__.py +0 -0
- fintl/accounts_etl/dkb/credit0.py +242 -0
- fintl/accounts_etl/dkb/festgeld0.py +245 -0
- fintl/accounts_etl/dkb/files.py +230 -0
- fintl/accounts_etl/dkb/giro0.py +249 -0
- fintl/accounts_etl/dkb/giro202307.py +251 -0
- fintl/accounts_etl/dkb/giro202312.py +263 -0
- fintl/accounts_etl/dkb/plugin.py +95 -0
- fintl/accounts_etl/dkb/tagesgeld0.py +268 -0
- fintl/accounts_etl/dkb/tagesgeld202307.py +257 -0
- fintl/accounts_etl/dkb/tagesgeld202312.py +261 -0
- fintl/accounts_etl/exceptions.py +6 -0
- fintl/accounts_etl/file_helper.py +143 -0
- fintl/accounts_etl/files.py +62 -0
- fintl/accounts_etl/gls/__init__.py +0 -0
- fintl/accounts_etl/gls/credit0.py +94 -0
- fintl/accounts_etl/gls/giro0.py +91 -0
- fintl/accounts_etl/gls/helper.py +204 -0
- fintl/accounts_etl/gls/plugin.py +38 -0
- fintl/accounts_etl/labels.py +57 -0
- fintl/accounts_etl/postbank/__init__.py +0 -0
- fintl/accounts_etl/postbank/giro0.py +250 -0
- fintl/accounts_etl/postbank/giro202305.py +271 -0
- fintl/accounts_etl/postbank/plugin.py +32 -0
- fintl/accounts_etl/process_accounts.py +99 -0
- fintl/accounts_etl/registry.py +28 -0
- fintl/accounts_etl/runner.py +205 -0
- fintl/accounts_etl/scalable/__init__.py +0 -0
- fintl/accounts_etl/scalable/broker0.py +209 -0
- fintl/accounts_etl/scalable/broker20231028.py +165 -0
- fintl/accounts_etl/scalable/broker20260309.py +293 -0
- fintl/accounts_etl/scalable/files.py +135 -0
- fintl/accounts_etl/scalable/plugin.py +47 -0
- fintl/accounts_etl/schemas.py +336 -0
- fintl/accounts_etl/store.py +169 -0
- fintl/accounts_etl/utils.py +149 -0
- fintl/cli/README.md +392 -0
- fintl/cli/__init__.py +0 -0
- fintl/cli/etl.py +23 -0
- fintl/cli/main.py +27 -0
- fintl/cli/plot.py +46 -0
- fintl/cli/search.py +388 -0
- fintl/cli/search.tcss +45 -0
- fintl/cli/store.py +121 -0
- fintl/fine_logging/__init__.py +237 -0
- fintl/path_utils.py +20 -0
- fintl-0.1.0.dist-info/METADATA +112 -0
- fintl-0.1.0.dist-info/RECORD +52 -0
- fintl-0.1.0.dist-info/WHEEL +4 -0
- fintl-0.1.0.dist-info/entry_points.txt +3 -0
fintl/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
import typing as T
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from fintl.accounts_etl.exceptions import (
|
|
10
|
+
ExtractBalanceException,
|
|
11
|
+
ExtractTransactionsException,
|
|
12
|
+
)
|
|
13
|
+
from fintl.accounts_etl.file_helper import (
|
|
14
|
+
concatenate_new_information_to_history,
|
|
15
|
+
detect_new_raw_files,
|
|
16
|
+
detect_relevant_target_files,
|
|
17
|
+
get_parser_source_files,
|
|
18
|
+
store_balance,
|
|
19
|
+
store_transactions,
|
|
20
|
+
)
|
|
21
|
+
from fintl.accounts_etl.files import copy_new_files, load_lines, select_files_to_copy
|
|
22
|
+
from fintl.accounts_etl.schemas import (
|
|
23
|
+
HASH_COLUMNS,
|
|
24
|
+
TRANSACTION_COLUMNS,
|
|
25
|
+
BalanceInfo,
|
|
26
|
+
Case,
|
|
27
|
+
Config,
|
|
28
|
+
DKBCreditParserEnum,
|
|
29
|
+
ProviderEnum,
|
|
30
|
+
ServiceEnum,
|
|
31
|
+
)
|
|
32
|
+
from fintl.accounts_etl.utils import (
|
|
33
|
+
detect_encoding,
|
|
34
|
+
find_line_with_pattern,
|
|
35
|
+
german_string_numbers_to_floats,
|
|
36
|
+
hash_transactions,
|
|
37
|
+
verify_transactions,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
CASE = Case(
|
|
43
|
+
provider=ProviderEnum.dkb.value,
|
|
44
|
+
service=ServiceEnum.credit.value,
|
|
45
|
+
parser=DKBCreditParserEnum.credit0.value,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def check_if_parser_applies(file_path: Path) -> bool:
    """Tell whether ``file_path`` is named like an input file of this parser.

    Only the file name is inspected; the file itself is never opened. A
    matching name ends with
    ``<YYYY-MM-DD>_to_<YYYY-MM-DD>_<4 digits>________<4 digits>.csv``.

    Args:
        file_path: Path of the candidate file.

    Returns:
        True if the file name ends with the expected pattern, else False.
    """
    # The dot in front of "csv" is escaped so that only a literal ".csv"
    # extension matches; an unescaped "." would accept any character there
    # (e.g. a name ending in "5678xcsv").
    file_name_pattern = (
        r"(\d{4}-\d{2}-\d{2}_to_\d{4}-\d{2}-\d{2}_\d{4}________\d{4}\.csv)$"
    )
    return re.search(file_name_pattern, str(file_path.name)) is not None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_transactions(
    case: Case, file_path: Path, lines: T.List[str], encoding: str
) -> pl.DataFrame:
    """Read the transaction table of one CSV export into the common layout.

    The header row of the table is the first line in ``lines`` matching
    ``^("?Umsatz)``; everything before it is skipped when the file is read.

    Args:
        case: Provider / service / parser triple written into every row.
        file_path: CSV file to read; also stored in the ``file`` column.
        lines: The lines of ``file_path``, used only to locate the header row.
        encoding: Encoding handed to the CSV reader.

    Returns:
        The transactions after ``hash_transactions`` and
        ``verify_transactions``, restricted to ``TRANSACTION_COLUMNS``.
    """
    ix_start_transactions, transactions_header = find_line_with_pattern(
        lines, pattern='^("?Umsatz)'
    )
    logger.debug(
        f"{file_path=} has {ix_start_transactions=} and {transactions_header=}"
    )

    # All raw columns are read as text; dates and amounts are converted below.
    raw_column_names = (
        "Umsatz abgerechnet und nicht im Saldo enthalten",
        "Wertstellung",
        "Belegdatum",
        "Beschreibung",
        "Betrag (EUR)",
        "Ursprünglicher Betrag",
    )
    betrag = pl.col("Betrag (EUR)")
    beschreibung = pl.col("Beschreibung")

    transactions = (
        pl.read_csv(
            file_path,
            skip_rows=ix_start_transactions,
            separator=";",
            truncate_ragged_lines=True,
            encoding=encoding,
            schema={name: pl.Utf8 for name in raw_column_names},
        )
        # text -> date
        .with_columns([pl.col("Belegdatum").str.to_date("%d.%m.%Y")])
        # German formatted number text -> float
        .with_columns(
            betrag.str.strip_chars_end().map_elements(
                german_string_numbers_to_floats, return_dtype=pl.Float64
            )
        )
        # Common columns: the description is used as the counterparty, as
        # source for positive amounts and as recipient for negative ones;
        # the respective other side is "myself".
        .with_columns(
            amount=betrag,
            description=beschreibung,
            date=pl.col("Belegdatum"),
            source=pl.when(betrag > 0).then(beschreibung).otherwise(pl.lit("myself")),
            recipient=pl.when(betrag < 0)
            .then(beschreibung)
            .otherwise(pl.lit("myself")),
            provider=pl.lit(case.provider),
            service=pl.lit(case.service),
            parser=pl.lit(case.parser),
            file=pl.lit(str(file_path)),
        )
    )

    transactions = hash_transactions(transactions, hash_columns=HASH_COLUMNS)
    verify_transactions(TRANSACTION_COLUMNS, transactions, file_path)
    return transactions.select(TRANSACTION_COLUMNS)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def extract_balance(case: Case, file_path: Path, lines: T.List[str]) -> BalanceInfo:
    """Read the account balance and its date from the header of a CSV export.

    The first line matching ``^("?Saldo:)`` carries the total as
    ``<amount> <currency>`` in its second ``;``-separated field; the line
    directly after it carries the date as ``DD.MM.YYYY`` in its second field.

    Args:
        case: Provider / service / parser triple copied into the result.
        file_path: File the lines were read from; stored in the result.
        lines: The lines of ``file_path``.

    Returns:
        A ``BalanceInfo`` with date, amount, currency and the case details.
    """
    ix_start_balance, balance_line = find_line_with_pattern(
        lines, pattern='^("?Saldo:)'
    )

    logger.debug(f"{file_path=} has {ix_start_balance=} and {balance_line=}")

    balance_window = lines[ix_start_balance : ix_start_balance + 2]
    total_line, date_line = balance_window[0], balance_window[1]

    # second field of the date line, without separators and quotes
    date_text = date_line.split(";")[1].strip(";").strip('"')
    day_month_year = [int(part) for part in date_text.split(".")]
    balance_date = datetime.date(
        day_month_year[2], day_month_year[1], day_month_year[0]
    )

    # second field of the total line: "<amount> <currency>"
    total_text = total_line.split(";")[1].strip(";").strip(":").strip('"')
    total_parts = total_text.split(" ")
    amount_text, currency = total_parts[0], total_parts[1]

    return BalanceInfo(
        date=balance_date,
        amount=float(amount_text),
        currency=currency,
        provider=case.provider,
        service=case.service,
        parser=case.parser,
        file=str(file_path),
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def parse_csv_file(case: Case, file_path: Path) -> tuple[pl.DataFrame, BalanceInfo]:
    """Parse one CSV export into its transactions and its balance.

    Args:
        case: Provider / service / parser triple passed on to the extractors.
        file_path: CSV file to parse.

    Returns:
        A tuple ``(transactions, balance)``.

    Raises:
        ExtractTransactionsException: If extracting the transactions fails.
        ExtractBalanceException: If extracting the balance fails.
    """
    encoding = detect_encoding(file_path)
    logger.debug(f"{file_path=} has {encoding=}")

    lines = load_lines(file_path, encoding)

    try:
        transactions = extract_transactions(case, file_path, lines, encoding)
    except Exception as e:
        msg = f"failed to parse {case=} transactions: {file_path=}"
        # logger.exception logs at ERROR level and appends the traceback.
        # parse_new_files drops the raised exception ("already logged"), so
        # this is the only place where the root cause can reach the log.
        logger.exception(msg)
        raise ExtractTransactionsException(msg) from e

    try:
        balance = extract_balance(case, file_path, lines)
    except Exception as e:
        msg = f"failed to parse {case=} balance: {file_path=}"
        logger.exception(msg)  # keep the traceback, see above
        raise ExtractBalanceException(msg) from e

    return transactions, balance
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def parse_new_files(
    case: Case,
    new_files_to_parse: list[Path],
    parsed_dir: Path,
):
    """Parse every new raw file and store its transactions and balance.

    Files for which ``parse_csv_file`` raises an extraction exception are
    skipped; the remaining files are still processed.

    Args:
        case: Provider / service / parser triple passed to the parser.
        new_files_to_parse: Raw files to parse; nothing happens if empty.
        parsed_dir: Directory handed to the store helpers; created if missing.
    """
    if len(new_files_to_parse) == 0:
        logger.info("No new files to parse")
        return

    if not parsed_dir.exists():
        logger.info(f"Creating {parsed_dir=}")
        parsed_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Parsing {len(new_files_to_parse):_} new files to {parsed_dir=}")

    for file_path in new_files_to_parse:
        logger.debug(f"Parsing {file_path=} to {parsed_dir=}")
        try:
            transactions, balance = parse_csv_file(case, file_path)
        except (ExtractBalanceException, ExtractTransactionsException):
            # already logged in parse_csv_file
            continue
        else:
            store_transactions(parsed_dir, file_path, transactions)
            store_balance(parsed_dir, file_path, balance)

    logger.info(f"Finished parsing {len(new_files_to_parse):_d} new files")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def main(config: Config):
    """Run all processing steps of this parser for the module level ``CASE``.

    Steps, in order: collect the source files this parser applies to, copy
    the ones selected by ``select_files_to_copy`` into the raw directory,
    detect the raw files that still need parsing, parse them, and finally
    hand them to ``concatenate_new_information_to_history``.

    Args:
        config: Configuration providing the raw, parsed and parser
            directories for ``CASE``.
    """
    logger.info(f"Processing {CASE=}")

    # source side: files this parser is responsible for
    source_files = get_parser_source_files(CASE, config, check_if_parser_applies)

    # target side: what the raw directory already holds
    raw_dir = config.get_raw_dir(CASE)
    target_files = detect_relevant_target_files(raw_dir)

    # copy the selected source files into the raw directory
    copy_new_files(raw_dir, select_files_to_copy(source_files, target_files))

    # raw files that still have to be parsed
    parsed_dir = config.get_parsed_dir(CASE)
    files_to_parse = detect_new_raw_files(
        raw_dir, check_if_parser_applies, parsed_dir, CASE.provider, CASE.service
    )

    # raw files -> parquet (transactions & balance)
    parse_new_files(CASE, files_to_parse, parsed_dir)

    # extend the pre-existing parquets of this parser
    concatenate_new_information_to_history(
        config.get_parser_dir(CASE), parsed_dir, files_to_parse
    )

    logger.info(f"Done processing {CASE=}")
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import typing as T
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from fintl.accounts_etl.dkb.giro202307 import extract_balance
|
|
9
|
+
from fintl.accounts_etl.dkb.giro202312 import detect_separator
|
|
10
|
+
from fintl.accounts_etl.exceptions import (
|
|
11
|
+
ExtractBalanceException,
|
|
12
|
+
ExtractTransactionsException,
|
|
13
|
+
)
|
|
14
|
+
from fintl.accounts_etl.file_helper import (
|
|
15
|
+
concatenate_new_information_to_history,
|
|
16
|
+
detect_new_parsed_files,
|
|
17
|
+
detect_new_raw_files,
|
|
18
|
+
detect_relevant_target_files,
|
|
19
|
+
get_parser_source_files,
|
|
20
|
+
store_balance,
|
|
21
|
+
store_transactions,
|
|
22
|
+
)
|
|
23
|
+
from fintl.accounts_etl.files import copy_new_files, load_lines, select_files_to_copy
|
|
24
|
+
from fintl.accounts_etl.schemas import (
|
|
25
|
+
HASH_COLUMNS,
|
|
26
|
+
TRANSACTION_COLUMNS,
|
|
27
|
+
BalanceInfo,
|
|
28
|
+
Case,
|
|
29
|
+
Config,
|
|
30
|
+
DKBFestgeltParserEnum,
|
|
31
|
+
ProviderEnum,
|
|
32
|
+
ServiceEnum,
|
|
33
|
+
)
|
|
34
|
+
from fintl.accounts_etl.utils import (
|
|
35
|
+
detect_encoding,
|
|
36
|
+
find_line_with_pattern,
|
|
37
|
+
german_string_numbers_to_floats,
|
|
38
|
+
hash_transactions,
|
|
39
|
+
verify_transactions,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
CASE = Case(
|
|
45
|
+
provider=ProviderEnum.dkb.value,
|
|
46
|
+
service=ServiceEnum.festgeld.value,
|
|
47
|
+
parser=DKBFestgeltParserEnum.festgeld0.value,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def check_if_parser_applies(file_path: Path) -> bool:
    """Tell whether this parser is responsible for ``file_path``.

    Two conditions must hold: the file name ends with ``DE<20 digits>.csv``,
    and ``detect_separator`` finds ``,`` or ``;`` as separator in the file.

    Args:
        file_path: Path of the candidate file.

    Returns:
        True if both conditions hold, else False.
    """
    is_file_name_match = re.search(r"(DE\d{20}\.csv$)", str(file_path.name)) is not None
    logger.debug(f"{is_file_name_match=}")
    if not is_file_name_match:
        # The result is already decided; do not open and decode a file that
        # this parser can never handle.
        return False

    # check that the csv file at file_path uses one of the expected separators
    encoding = detect_encoding(file_path)
    lines = load_lines(file_path, encoding)

    separator = detect_separator(lines)
    # a separator of None (nothing detected) is not in the tuple either
    return separator in (",", ";")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def extract_transactions(
    case: Case, file_path: Path, lines: T.List[str], encoding: str
) -> pl.DataFrame:
    """Read the transaction table of one CSV export into the common layout.

    The header row of the table is the first line in ``lines`` matching
    ``^("?Buchungsdatum)``. The column separator is taken from
    ``detect_separator``.

    Args:
        case: Provider / service / parser triple written into every row.
        file_path: CSV file to read; also stored in the ``file`` column.
        lines: The lines of ``file_path``, used to locate the header row and
            to detect the separator.
        encoding: Encoding handed to the CSV reader.

    Returns:
        The transactions after ``hash_transactions`` and
        ``verify_transactions``, restricted to ``TRANSACTION_COLUMNS``.

    Raises:
        ValueError: If no separator could be detected.
        pl.exceptions.InvalidOperationError: If a date column cannot be
            converted; the offending values are logged before re-raising.
    """
    transaction_pattern: str = '^("?Buchungsdatum)'  # start of transactions

    date_format: str = "%d.%m.%y"
    date_cols: list = ["Buchungsdatum"]

    ix_start_transactions, transactions_header = find_line_with_pattern(
        lines, pattern=transaction_pattern
    )
    is_empty_1st_line = len(lines[0].strip()) == 0
    logger.debug(
        f"{file_path=} ({is_empty_1st_line=}) has {ix_start_transactions=} and {transactions_header=}"
    )

    # All raw columns are read as text; dates and amounts are converted below.
    schema = {
        "Buchungsdatum": pl.Utf8,
        "Wertstellung": pl.Utf8,
        "Status": pl.Utf8,
        "Zahlungspflichtige*r": pl.Utf8,
        "Zahlungsempfänger*in": pl.Utf8,
        "Verwendungszweck": pl.Utf8,
        "Umsatztyp": pl.Utf8,
        "IBAN": pl.Utf8,
        "Betrag (€)": pl.Utf8,
        "Gläubiger-ID": pl.Utf8,
        "Mandatsreferenz": pl.Utf8,
        "Kundenreferenz": pl.Utf8,
    }
    separator = detect_separator(lines)
    if separator is None:
        raise ValueError(
            f"{separator=} but it is not allowed to be None in the following."
        )

    # NOTE(review): one row fewer is skipped when the first line of the file
    # is empty -- presumably the CSV reader does not count that line; confirm.
    rows_to_skip = (
        ix_start_transactions - 1 if is_empty_1st_line else ix_start_transactions
    )
    transactions = pl.read_csv(
        file_path,
        skip_rows=rows_to_skip,
        separator=separator,
        truncate_ragged_lines=True,
        encoding=encoding,
        schema=schema,
    )

    try:
        transactions = transactions.with_columns(
            [pl.col(col).str.to_date(date_format) for col in date_cols],
        )
    except pl.exceptions.InvalidOperationError:
        # Diagnostics only: find out which values could not be converted.
        logger.error(f"{separator=}")
        logger.error(f"{len(transactions)=:_}")
        logger.error(f"{transactions[date_cols[0]].to_list()=}")
        msg = f"{file_path=}: Failed to convert dates for values in one of the columns:"
        for col in date_cols:
            for v in transactions[col].unique():
                if v is None:
                    # A missing value is not a malformed date; probing it via
                    # an untyped one-element Series would report it as one.
                    continue
                try:
                    pl.Series([v]).str.to_date(date_format)
                except Exception:
                    # Any failure marks the value as offending, but
                    # KeyboardInterrupt / SystemExit must not be swallowed.
                    msg += f"\ncolumn '{col}' failed for value '{v}'"
        logger.error(msg)
        raise  # re-raise the original conversion error unchanged

    # Strip the currency sign and trailing whitespace, then convert the
    # German formatted number text to float.
    transactions = transactions.with_columns(
        pl.col("Betrag (€)")
        .str.replace("€", "")
        .str.strip_chars_end()
        .map_elements(german_string_numbers_to_floats, return_dtype=pl.Float64),
    )
    # Common columns: the payer is the source of positive amounts, the payee
    # the recipient of negative ones; the respective other side is "myself".
    transactions = transactions.with_columns(
        amount=pl.col("Betrag (€)"),
        description=pl.col("Verwendungszweck"),
        date=pl.col("Buchungsdatum"),
        source=pl.when(pl.col("Betrag (€)") > 0)
        .then(pl.col("Zahlungspflichtige*r"))
        .otherwise(pl.lit("myself")),
        recipient=pl.when(pl.col("Betrag (€)") < 0)
        .then(pl.col("Zahlungsempfänger*in"))
        .otherwise(pl.lit("myself")),
        provider=pl.lit(case.provider),
        service=pl.lit(case.service),
        parser=pl.lit(case.parser),
        file=pl.lit(str(file_path)),
    )
    transactions = hash_transactions(transactions, HASH_COLUMNS)

    verify_transactions(TRANSACTION_COLUMNS, transactions, file_path)

    transactions = transactions.select(TRANSACTION_COLUMNS)

    return transactions
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def parse_csv_file(case: Case, file_path: Path) -> tuple[pl.DataFrame, BalanceInfo]:
    """Parse one CSV export into its transactions and its balance.

    Args:
        case: Provider / service / parser triple passed on to the extractors.
        file_path: CSV file to parse.

    Returns:
        A tuple ``(transactions, balance)``.

    Raises:
        ExtractTransactionsException: If extracting the transactions fails.
        ExtractBalanceException: If extracting the balance fails.
    """
    encoding = detect_encoding(file_path)
    logger.debug(f"{file_path=} has {encoding=}")

    lines = load_lines(file_path, encoding)

    try:
        transactions = extract_transactions(case, file_path, lines, encoding)
    except Exception as e:
        msg = f"failed to parse {case=} transactions: {file_path=}"
        # logger.exception logs at ERROR level and appends the traceback.
        # parse_new_files drops the raised exception ("already logged"), so
        # this is the only place where the root cause can reach the log.
        logger.exception(msg)
        raise ExtractTransactionsException(msg) from e

    try:
        balance = extract_balance(case, file_path, lines)
    except Exception as e:
        msg = f"failed to parse {case=} balance: {file_path=}"
        logger.exception(msg)  # keep the traceback, see above
        raise ExtractBalanceException(msg) from e

    return transactions, balance
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def parse_new_files(
    case: Case,
    new_files_to_parse: list[Path],
    parsed_dir: Path,
):
    """Parse every new raw file and store its transactions and balance.

    Files for which ``parse_csv_file`` raises an extraction exception are
    skipped; the remaining files are still processed.

    Args:
        case: Provider / service / parser triple passed to the parser.
        new_files_to_parse: Raw files to parse; nothing happens if empty.
        parsed_dir: Directory handed to the store helpers; created if missing.
    """
    if len(new_files_to_parse) == 0:
        logger.info("No new files to parse")
        return

    if not parsed_dir.exists():
        logger.info(f"Creating {parsed_dir=}")
        parsed_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Parsing {len(new_files_to_parse):_} new files to {parsed_dir=}")

    for file_path in new_files_to_parse:
        logger.debug(f"Parsing {file_path=} to {parsed_dir=}")
        try:
            transactions, balance = parse_csv_file(case, file_path)
        except (ExtractBalanceException, ExtractTransactionsException):
            # already logged in parse_csv_file
            continue
        else:
            store_transactions(parsed_dir, file_path, transactions)
            store_balance(parsed_dir, file_path, balance)

    logger.info(f"Finished parsing {len(new_files_to_parse):_d} new files")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def main(config: Config):
    """Run all processing steps of this parser for the module level ``CASE``.

    Steps, in order: collect the source files this parser applies to, copy
    the ones selected by ``select_files_to_copy`` into the raw directory,
    detect the raw files that still need parsing, parse them, then detect
    the new parsed files and hand them to
    ``concatenate_new_information_to_history``.

    Args:
        config: Configuration providing the raw, parsed and parser
            directories for ``CASE``.
    """
    logger.info(f"Processing {CASE=}")

    # source side: files this parser is responsible for
    source_files = get_parser_source_files(CASE, config, check_if_parser_applies)

    # target side: what the raw directory already holds
    raw_dir = config.get_raw_dir(CASE)
    target_files = detect_relevant_target_files(raw_dir)

    # copy the selected source files into the raw directory
    copy_new_files(raw_dir, select_files_to_copy(source_files, target_files))

    # raw files that still have to be parsed
    parsed_dir = config.get_parsed_dir(CASE)
    files_to_parse = detect_new_raw_files(
        raw_dir, check_if_parser_applies, parsed_dir, CASE.provider, CASE.service
    )

    # raw files -> parquet (transactions & balance)
    parse_new_files(CASE, files_to_parse, parsed_dir)

    # extend the pre-existing parquets of this parser
    parser_dir = config.get_parser_dir(CASE)
    concatenate_new_information_to_history(
        parser_dir,
        parsed_dir,
        detect_new_parsed_files(raw_dir, parser_dir, parsed_dir),
    )

    logger.info(f"Done processing {CASE=}")
|