extras_de_cont 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/extras_de_cont.gemspec +2 -39
- data/lib/extras_de_cont/rules/revolut.rb +58 -11
- data/lib/extras_de_cont/rules/unicredit.rb +197 -0
- metadata +3 -41
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '029f7b904cb76faa331a686ec02f81a0fad9ffce39e0bea1c05e0ed95a6bb34c'
|
|
4
|
+
data.tar.gz: c98d9dc06a00db1cface2c7ea7b58f563502c2fa58da4481d2a928e99fe08766
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cd20be09910a3ca8ad2a13bc07ce619d76950e080e3bb927a2f3dee0f24441f173ad842437dfc9f8196b24579d5976519c5558031aa8cd33e93cd5263e229705
|
|
7
|
+
data.tar.gz: 27c5cda6efa0043af432ac596c7d2ff736dd3f26c520f3cd2ea79a45506eb41a48d5c9ca236432d1f9999e73f9d8c11dabc201e4ee23cab82f08e9bc7a351b6b
|
data/extras_de_cont.gemspec
CHANGED
|
@@ -2,51 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = "extras_de_cont"
|
|
5
|
-
s.version = "1.0
|
|
5
|
+
s.version = "1.1.0"
|
|
6
6
|
s.licenses = ["GPLv3"]
|
|
7
7
|
s.summary = "A simple library which helps you extract transactions from a PDF bank statement."
|
|
8
8
|
s.description = <<~TEXT
|
|
9
9
|
A simple library which helps you extract transactions from a PDF bank statement.
|
|
10
10
|
Fine tuned for Romanian bank statements.
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
A ruby gem for extracting bank statements from PDFs.
|
|
15
|
-
|
|
16
|
-
## Simple usage
|
|
17
|
-
|
|
18
|
-
Create a PDF parser and print the extracted text:
|
|
19
|
-
|
|
20
|
-
```ruby
|
|
21
|
-
require "bundler/setup"
|
|
22
|
-
require "extras_de_cont"
|
|
23
|
-
|
|
24
|
-
parser = ExtrasDeCont::Parser.new("/home/dnutiu/Documents/tranzactii_revolut.pdf")
|
|
25
|
-
puts parser.text
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
Or, extract all the transactions from a Revolut Bank statement PDF:
|
|
29
|
-
|
|
30
|
-
```ruby
|
|
31
|
-
transactions = ExtrasDeCont.parse(file, bank: :revolut)
|
|
32
|
-
|
|
33
|
-
transactions.each do |t|
|
|
34
|
-
puts "\#{t.date}, \#{t.description}, \#{t.amount}, \#{t.currency}"
|
|
35
|
-
end
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
Or use the included entrypoint:
|
|
39
|
-
|
|
40
|
-
```bash
|
|
41
|
-
bundle exec ruby -Ilib bin/main /home/dnutiu/Documents/tranzactii_revolut.pdf
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
Run the Revolut parser test with:
|
|
45
|
-
|
|
46
|
-
```bash
|
|
47
|
-
ruby -Ilib:test test/revolut_rule_test.rb
|
|
48
|
-
```
|
|
49
|
-
|
|
12
|
+
Repository: https://github.com/dnutiu/extras-de-cont
|
|
50
13
|
TEXT
|
|
51
14
|
s.authors = ["Denis Nutiu"]
|
|
52
15
|
s.email = "dnutiu@nuculabs.dev"
|
|
@@ -11,24 +11,45 @@ module ExtrasDeCont
|
|
|
11
11
|
"Pending from ",
|
|
12
12
|
"Account transactions from ",
|
|
13
13
|
"Reverted from ",
|
|
14
|
-
"Deposit transactions from "
|
|
14
|
+
"Deposit transactions from ",
|
|
15
|
+
"Transactions from "
|
|
15
16
|
].freeze
|
|
16
17
|
|
|
17
18
|
DOCUMENT_NOISE_HEADERS = [
|
|
19
|
+
"Account statement",
|
|
18
20
|
"Balance summary",
|
|
19
21
|
"The balance on your statement might differ",
|
|
22
|
+
"There were no transactions during this period",
|
|
23
|
+
"Transaction types",
|
|
24
|
+
"Your funds are held and protected by a licensed bank",
|
|
20
25
|
"Report lost or stolen card",
|
|
21
26
|
"+",
|
|
22
27
|
"Get help directly in app",
|
|
28
|
+
"Get help directly In app",
|
|
23
29
|
"Scan the QR code",
|
|
24
30
|
"RON Statement",
|
|
31
|
+
" Statement",
|
|
25
32
|
"Generated on the ",
|
|
26
33
|
"Revolut Bank UAB",
|
|
27
34
|
"© "
|
|
28
35
|
].freeze
|
|
29
36
|
|
|
30
|
-
|
|
31
|
-
|
|
37
|
+
CURRENCY_SYMBOLS = {
|
|
38
|
+
"$" => "USD",
|
|
39
|
+
"€" => "EUR",
|
|
40
|
+
"£" => "GBP",
|
|
41
|
+
"zł" => "PLN",
|
|
42
|
+
"Kč" => "CZK",
|
|
43
|
+
"Ft" => "HUF",
|
|
44
|
+
"лв" => "BGN",
|
|
45
|
+
"₺" => "TRY",
|
|
46
|
+
"₴" => "UAH"
|
|
47
|
+
}.freeze
|
|
48
|
+
DATE_FORMATS = ["%b %e, %Y", "%e %b %Y"].freeze
|
|
49
|
+
DATE_PREFIX = /\A(?<date>(?:[A-Z][a-z]{2} \d{1,2}, \d{4}|\d{1,2} [A-Z][a-z]{2} \d{4}))\b/
|
|
50
|
+
NUMBER = /-?(?:\d{1,3}(?:[ ,]\d{3})+|\d+)\.\d{2}/
|
|
51
|
+
CURRENCY_SYMBOL = Regexp.union(CURRENCY_SYMBOLS.keys.sort_by { |symbol| -symbol.length })
|
|
52
|
+
AMOUNT = /(?:#{NUMBER} [A-Z]{3}|#{CURRENCY_SYMBOL}#{NUMBER}|#{NUMBER} ?#{CURRENCY_SYMBOL})/
|
|
32
53
|
|
|
33
54
|
def parse(text)
|
|
34
55
|
transactions = []
|
|
@@ -105,16 +126,41 @@ module ExtrasDeCont
|
|
|
105
126
|
parse_date(match[:date]),
|
|
106
127
|
description,
|
|
107
128
|
amount,
|
|
108
|
-
amount_string
|
|
129
|
+
parse_currency(amount_string)
|
|
109
130
|
)
|
|
110
131
|
end
|
|
111
132
|
|
|
112
133
|
# Parses a statement date string into a Date.
#
# Each entry of DATE_FORMATS is tried in order and the first one that
# Date.strptime accepts wins.
#
# @param value [String] the date text captured from a statement row
# @return [Date] the parsed date
# @raise [Date::Error] when no format in DATE_FORMATS matches the value
def parse_date(value)
  DATE_FORMATS.each do |format|
    return Date.strptime(value, format)
  rescue Date::Error
    next
  end

  # Without this, falling out of the loop would return DATE_FORMATS itself
  # (the receiver of #each) and the caller would store an Array as the date.
  raise Date::Error, "unrecognized date: #{value.inspect}"
end
|
|
115
140
|
|
|
116
141
|
# Converts an amount string into a Float.
#
# The currency marker is removed by numeric_value first; comma and space
# characters are then deleted before the conversion.
#
# @param value [String] amount text, with its currency code or symbol
# @return [Float] the numeric amount
def parse_amount(value)
  digits = numeric_value(value)
  digits.delete(", ").to_f
end
|
|
144
|
+
|
|
145
|
+
# Determines the currency code of an amount string.
#
# When the string starts or ends with a key of CURRENCY_SYMBOLS, the mapped
# code is returned; otherwise the last whitespace-separated token of the
# string is returned as-is.
#
# @param value [String] amount text, with its currency code or symbol
# @return [String, nil] the currency code (nil when the string has no tokens)
def parse_currency(value)
  symbol = currency_symbol(value)
  symbol ? CURRENCY_SYMBOLS.fetch(symbol) : value.split.last
end
|
|
151
|
+
|
|
152
|
+
# Strips the currency marker from an amount string, leaving the number text.
#
# A leading symbol is removed as a prefix; a trailing symbol is removed as a
# suffix and the remainder is stripped. When no known symbol is present, a
# trailing three-capital-letter code (preceded by whitespace) is removed.
#
# @param value [String] amount text, with its currency code or symbol
# @return [String] the amount text without the currency marker
def numeric_value(value)
  symbol = currency_symbol(value)
  return value.sub(/\s+[A-Z]{3}\z/, "") unless symbol

  if value.start_with?(symbol)
    value.delete_prefix(symbol)
  else
    value.delete_suffix(symbol).strip
  end
end
|
|
159
|
+
|
|
160
|
+
# Finds the currency symbol attached to an amount string.
#
# Keys of CURRENCY_SYMBOLS are checked in their definition order and the
# first one the string starts or ends with is returned.
#
# @param value [String] amount text
# @return [String, nil] the matching symbol, or nil when none is attached
def currency_symbol(value)
  CURRENCY_SYMBOLS.each_key do |symbol|
    return symbol if value.start_with?(symbol) || value.end_with?(symbol)
  end

  nil
end
|
|
119
165
|
|
|
120
166
|
def section_header?(line)
|
|
@@ -147,11 +193,12 @@ module ExtrasDeCont
|
|
|
147
193
|
amount_matches = row.to_enum(:scan, AMOUNT).map { Regexp.last_match }
|
|
148
194
|
return if amount_matches.empty?
|
|
149
195
|
|
|
150
|
-
transaction_match =
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
196
|
+
transaction_match =
|
|
197
|
+
if table.fetch(:has_balance)
|
|
198
|
+
amount_matches[-2] if amount_matches.length > 1
|
|
199
|
+
else
|
|
200
|
+
amount_matches[-1]
|
|
201
|
+
end
|
|
155
202
|
return if transaction_match.nil?
|
|
156
203
|
|
|
157
204
|
description = row[date_match.end(0)...transaction_match.begin(0)].to_s.strip
|
|
@@ -1,8 +1,205 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "date"
|
|
4
|
+
require "extras_de_cont/transaction"
|
|
5
|
+
|
|
3
6
|
module ExtrasDeCont
|
|
4
7
|
module Rules
|
|
5
8
|
# Rule that extracts transactions from the text of a UniCredit account
# statement.
#
# The text is walked line by line. A transaction is anchored on a "date
# line" (a line starting with a Romanian-formatted date that also carries
# at least two amounts). Lines collected before the date line ("above") and
# after it ("below") are merged into the transaction description.
class UniCredit < Rules::Base
  # Romanian month names mapped to their month number.
  ROMANIAN_MONTHS = {
    "ianuarie" => 1, "februarie" => 2, "martie" => 3, "aprilie" => 4,
    "mai" => 5, "iunie" => 6, "iulie" => 7, "august" => 8,
    "septembrie" => 9, "octombrie" => 10, "noiembrie" => 11, "decembrie" => 12
  }.freeze

  RO_MONTH_NAMES = ROMANIAN_MONTHS.keys.freeze
  # "<day> <romanian month name> <year>", case-insensitive.
  DATE_PATTERN = /\b(\d{1,2})\s+(#{RO_MONTH_NAMES.join("|")})\s+(\d{4})\b/i
  # Same date, anchored at the start of a line (leading whitespace allowed).
  DATE_PREFIX = /\A\s*#{DATE_PATTERN}/

  # Header row of a transactions table.
  TABLE_HEADER_PATTERN = /Data\s+Descriere\s+Debit\s+Credit\s+Sold/

  # Lines that are skipped when they match exactly.
  SECTION_HEADERS = [
    "TRANZACȚII",
    "SUMAR CONT",
    "EXTRAS DE CONT"
  ].freeze

  # Line prefixes that are skipped and that terminate the pending transaction.
  NOISE_PATTERNS = [
    /\AUniCredit Bank S\.A\./,
    /\ABulevardul/,
    /\ASector \d/,
    /\ATel:/,
    /\AEmail:/,
    /\Aunicredit\.ro/,
    /\ACapital social:/,
    /\APrezentul extras/,
    /\AFondurile disponibile/,
    /\APentru mai multe/,
    /\ANUME CLIENT:/,
    /\AADRESA:/,
    /\ASUCURSALA:/,
    /\ADATA EXTRAS CONT/,
    /\APERIOADA/,
    /\ATIP CONT:/,
    /\AIBAN:/,
    /\AMONEDA:/,
    /\AOperator de date/,
    /\ASold inițial/,
    /\ASold final/
  ].freeze

  # Line prefixes that, when seen after a date line, close the pending
  # transaction and become the first "above" line of the next one.
  NEW_TRANSACTION_MARKERS = [
    /\A\+CMS CLT-/,
    /\A\+GPP/,
    /\APlata electronica/,
    /\APlata Instant/,
    /\AIncasare Instant/,
    /\ATransfer electronic/
  ].freeze

  # An amount with two decimals. Thousands groups may be separated by "." or
  # ","; the second alternative accepts ungrouped digits so that a value such
  # as "12345.67" is matched whole instead of being cut down to "345.67".
  AMOUNT_PATTERN = /(?:\d{1,3}(?:[.,]\d{3})+|\d+)\.\d{2}/
  # Captures the currency code from a "Sold(XXX)" column title.
  CURRENCY_FROM_HEADER = /Sold\(([A-Z]{3})\)/

  # Extracts every transaction found in the statement text.
  #
  # @param text [String] the statement text
  # @return [Array<Transaction>] the transactions, in order of appearance
  def parse(text)
    transactions = []
    current_currency = nil
    current_table = nil
    above_lines = []
    below_lines = []
    date_line = nil

    each_normalized_line(text) do |line|
      if (m = line.match(CURRENCY_FROM_HEADER))
        current_currency = m[1]
      end

      if table_header?(line)
        try_flush(date_line, above_lines, below_lines, current_table, current_currency, transactions)
        current_table = extract_column_positions(line)
        above_lines, below_lines, date_line = [], [], nil
        next
      end

      if noise?(line) || section_header?(line)
        try_flush(date_line, above_lines, below_lines, current_table, current_currency, transactions)
        above_lines, below_lines, date_line = [], [], nil
        next
      end

      # Nothing is collected until a table header has been seen.
      next if current_table.nil?

      if date_line?(line)
        try_flush(date_line, above_lines, below_lines, current_table, current_currency, transactions)
        # NOTE(review): above_lines is not cleared here, so two consecutive
        # date lines share the same "above" lines in their descriptions.
        # Confirm against real statements whether that is intended.
        date_line = line
        below_lines = []
        next
      end

      if date_line
        if new_transaction_marker?(line)
          try_flush(date_line, above_lines, below_lines, current_table, current_currency, transactions)
          date_line, below_lines = nil, []
          above_lines = [line]
        else
          below_lines << line
        end
      else
        above_lines << line
      end
    end

    # Emit the transaction that was still pending at end of input.
    try_flush(date_line, above_lines, below_lines, current_table, current_currency, transactions)
    transactions
  end

  private

  # Yields each non-blank line with non-breaking spaces replaced by regular
  # spaces and surrounding whitespace removed.
  def each_normalized_line(text)
    text.each_line do |line|
      normalized = line.tr("\u00A0", " ").strip
      next if normalized.empty?

      yield normalized
    end
  end

  # @return [Boolean] whether the line is a transactions table header
  def table_header?(line)
    line.match?(TABLE_HEADER_PATTERN)
  end

  # @return [Boolean] whether the line starts with a Romanian-formatted date
  def date_line?(line)
    line.match?(DATE_PREFIX)
  end

  # @return [Boolean] whether the line matches any of NOISE_PATTERNS
  def noise?(line)
    NOISE_PATTERNS.any? { |pattern| line.match?(pattern) }
  end

  # @return [Boolean] whether the line is exactly one of SECTION_HEADERS
  def section_header?(line)
    SECTION_HEADERS.any? { |header| line == header }
  end

  # @return [Boolean] whether the line matches any of NEW_TRANSACTION_MARKERS
  def new_transaction_marker?(line)
    NEW_TRANSACTION_MARKERS.any? { |pattern| line.match?(pattern) }
  end

  # Records the character offsets of the amount column titles in a header line.
  #
  # @param line [String] a line matching TABLE_HEADER_PATTERN
  # @return [Hash{Symbol => Integer}] offsets under :debit, :credit and :sold
  def extract_column_positions(line)
    {
      debit: line.index("Debit"),
      credit: line.index("Credit"),
      sold: line.index("Sold")
    }
  end

  # Builds the pending transaction, if any, and appends it to +transactions+.
  # Does nothing when there is no pending date line or no table.
  def try_flush(date_line, above_lines, below_lines, table, currency, transactions)
    return if date_line.nil? || table.nil?

    transaction = build_transaction(date_line, above_lines, below_lines, table, currency)
    transactions << transaction if transaction
  end

  # Builds a Transaction from a date line and its surrounding lines.
  #
  # The second-to-last amount on the date line is used as the transaction
  # amount. It is negated when it starts left of the midpoint between the
  # "Debit" and "Credit" column titles.
  #
  # @return [Transaction, nil] nil when the line has no leading date or
  #   fewer than two amounts
  def build_transaction(date_line, above_lines, below_lines, table, currency)
    date_match = date_line.match(DATE_PREFIX)
    return if date_match.nil?

    amounts = date_line.to_enum(:scan, AMOUNT_PATTERN).map { Regexp.last_match }
    return if amounts.size < 2

    transaction_amount_match = amounts[-2]
    description_start = date_match.end(0)
    description_end = transaction_amount_match.begin(0)
    main_description = date_line[description_start...description_end].to_s.strip

    amount = amount_to_f(transaction_amount_match[0])
    # NOTE(review): both the header line and the date line are stripped before
    # offsets are taken, so this comparison presumably relies on the two lines
    # having the same leading indentation in the statement text - confirm.
    midpoint = (table[:debit] + table[:credit]) / 2
    amount = -amount if transaction_amount_match.begin(0) < midpoint

    description = build_description(main_description, above_lines, below_lines)

    Transaction.new(
      parse_date(date_match[1].to_i, date_match[2], date_match[3].to_i),
      description,
      amount,
      currency || extract_currency_from_header(date_line)
    )
  end

  # Converts text matched by AMOUNT_PATTERN into a Float.
  #
  # Only the last "." is the decimal point; every "." or "," before it is a
  # thousands separator and is removed, so "1.234.56" becomes 1234.56 rather
  # than 1.234.
  def amount_to_f(amount_string)
    whole, _, cents = amount_string.rpartition(".")
    "#{whole.delete('., ')}.#{cents}".to_f
  end

  # @param day [Integer]
  # @param month_name [String] a Romanian month name, any letter case
  # @param year [Integer]
  # @return [Date]
  def parse_date(day, month_name, year)
    month = ROMANIAN_MONTHS.fetch(month_name.downcase)
    Date.new(year, month, day)
  end

  # Joins the non-empty description parts with " | ", in the order
  # above lines, main description, below lines.
  def build_description(main_desc, above_lines, below_lines)
    parts = [*above_lines.map(&:strip), main_desc, *below_lines.map(&:strip)]
    parts.reject(&:empty?).join(" | ")
  end

  # Fallback currency lookup on the date line itself.
  #
  # @return [String, nil] the code captured by CURRENCY_FROM_HEADER, if any
  def extract_currency_from_header(date_line)
    m = date_line.match(CURRENCY_FROM_HEADER)
    m ? m[1] : nil
  end
end
|
|
7
204
|
end
|
|
8
205
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: extras_de_cont
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.1
|
|
4
|
+
version: 1.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Denis Nutiu
|
|
@@ -23,48 +23,11 @@ dependencies:
|
|
|
23
23
|
- - "~>"
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '2.15'
|
|
26
|
-
description:
|
|
26
|
+
description: |
|
|
27
27
|
A simple library which helps you extract transactions from a PDF bank statement.
|
|
28
28
|
Fine tuned for Romanian bank statements.
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
A ruby gem for extracting bank statements from PDFs.
|
|
33
|
-
|
|
34
|
-
## Simple usage
|
|
35
|
-
|
|
36
|
-
Create a PDF parser and print the extracted text:
|
|
37
|
-
|
|
38
|
-
```ruby
|
|
39
|
-
require "bundler/setup"
|
|
40
|
-
require "extras_de_cont"
|
|
41
|
-
|
|
42
|
-
parser = ExtrasDeCont::Parser.new("/home/dnutiu/Documents/tranzactii_revolut.pdf")
|
|
43
|
-
puts parser.text
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
Or, extract all the transactions from a Revolut Bank statement PDF:
|
|
47
|
-
|
|
48
|
-
```ruby
|
|
49
|
-
transactions = ExtrasDeCont.parse(file, bank: :revolut)
|
|
50
|
-
|
|
51
|
-
transactions.each do |t|
|
|
52
|
-
puts "#{t.date}, #{t.description}, #{t.amount}, #{t.currency}"
|
|
53
|
-
end
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
Or use the included entrypoint:
|
|
57
|
-
|
|
58
|
-
```bash
|
|
59
|
-
bundle exec ruby -Ilib bin/main /home/dnutiu/Documents/tranzactii_revolut.pdf
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
Run the Revolut parser test with:
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
ruby -Ilib:test test/revolut_rule_test.rb
|
|
66
|
-
```
|
|
67
|
-
|
|
30
|
+
Repository: https://github.com/dnutiu/extras-de-cont
|
|
68
31
|
email: dnutiu@nuculabs.dev
|
|
69
32
|
executables: []
|
|
70
33
|
extensions: []
|
|
@@ -103,4 +66,3 @@ rubygems_version: 4.0.10
|
|
|
103
66
|
specification_version: 4
|
|
104
67
|
summary: A simple library which helps you extract transactions from a PDF bank statement.
|
|
105
68
|
test_files: []
|
|
106
|
-
...
|