sps_bill 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -1
- data/CHANGELOG +9 -0
- data/Gemfile +1 -2
- data/Gemfile.lock +5 -5
- data/README.rdoc +15 -5
- data/lib/sps_bill.rb +1 -5
- data/lib/sps_bill/bill.rb +12 -9
- data/lib/sps_bill/bill_collection.rb +26 -17
- data/lib/sps_bill/bill_parser.rb +83 -37
- data/lib/sps_bill/shell.rb +4 -2
- data/lib/sps_bill/version.rb +1 -1
- data/scripts/data/all_services.csv.sample +70 -0
- data/scripts/data/all_services.sample.pdf +0 -0
- data/scripts/data/elec_and_water_only.csv.sample +53 -0
- data/scripts/data/elec_and_water_only.sample.pdf +0 -0
- data/scripts/full_analysis.R +191 -0
- data/spec/support/bill_examples.rb +1 -1
- data/spec/support/pdf_samples_helper.rb +2 -2
- data/spec/unit/bill_collection_spec.rb +6 -6
- data/spec/unit/shell_spec.rb +1 -1
- data/sps_bill.gemspec +11 -14
- metadata +27 -38
- data/lib/pdf/object_hash.rb +0 -39
- data/lib/pdf/positional_text_receiver.rb +0 -16
- data/lib/pdf/structured_reader.rb +0 -108
- data/lib/pdf/textangle.rb +0 -27
- data/spec/fixtures/pdf_samples/junk_prefix.pdf +0 -71
- data/spec/unit/pdf/object_hash_spec.rb +0 -15
data/.travis.yml
CHANGED
data/CHANGELOG
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Version 0.1.1 Release: 1st August 2012
|
2
|
+
==================================================
|
3
|
+
* update to use pdf-reader-turtletext 0.2.2
|
4
|
+
* convert bill parsing to use the more idiomatic bounding_box
|
5
|
+
syntax available in pdf-reader-turtletext
|
6
|
+
|
7
|
+
Version 0.1.0 Release: 20th July 2012
|
8
|
+
==================================================
|
9
|
+
* Initial packaging and release
|
data/Gemfile
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
gem 'pdf-reader', '
|
3
|
+
gem 'pdf-reader-turtletext', '~> 0.2.2'
|
4
4
|
gem 'getoptions', '~> 0.3'
|
5
5
|
|
6
6
|
group :development do
|
7
7
|
gem 'bundler', '~> 1.1.4'
|
8
8
|
gem 'jeweler', '~> 1.6.4'
|
9
|
-
gem 'rcov', '>= 0'
|
10
9
|
end
|
11
10
|
|
12
11
|
group :development, :test do
|
data/Gemfile.lock
CHANGED
@@ -3,7 +3,7 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
Ascii85 (1.0.1)
|
5
5
|
diff-lcs (1.1.3)
|
6
|
-
ffi (1.0
|
6
|
+
ffi (1.1.0)
|
7
7
|
getoptions (0.3)
|
8
8
|
git (1.2.5)
|
9
9
|
guard (1.2.3)
|
@@ -15,7 +15,7 @@ GEM
|
|
15
15
|
bundler (~> 1.0)
|
16
16
|
git (>= 1.2.5)
|
17
17
|
rake
|
18
|
-
json (1.
|
18
|
+
json (1.7.3)
|
19
19
|
listen (0.4.7)
|
20
20
|
rb-fchange (~> 0.0.5)
|
21
21
|
rb-fsevent (~> 0.9.1)
|
@@ -23,13 +23,14 @@ GEM
|
|
23
23
|
pdf-reader (1.1.1)
|
24
24
|
Ascii85 (~> 1.0.0)
|
25
25
|
ruby-rc4
|
26
|
+
pdf-reader-turtletext (0.2.2)
|
27
|
+
pdf-reader (= 1.1.1)
|
26
28
|
rake (0.9.2.2)
|
27
29
|
rb-fchange (0.0.5)
|
28
30
|
ffi
|
29
31
|
rb-fsevent (0.9.1)
|
30
32
|
rb-inotify (0.8.8)
|
31
33
|
ffi (>= 0.5.0)
|
32
|
-
rcov (0.9.11)
|
33
34
|
rdoc (3.12)
|
34
35
|
json (~> 1.4)
|
35
36
|
rspec (2.8.0)
|
@@ -51,8 +52,7 @@ DEPENDENCIES
|
|
51
52
|
getoptions (~> 0.3)
|
52
53
|
guard-rspec
|
53
54
|
jeweler (~> 1.6.4)
|
54
|
-
pdf-reader (
|
55
|
+
pdf-reader-turtletext (~> 0.2.2)
|
55
56
|
rake (~> 0.9.2.2)
|
56
|
-
rcov
|
57
57
|
rdoc (~> 3.11)
|
58
58
|
rspec (~> 2.8.0)
|
data/README.rdoc
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
= SP Services Bill Scanner {<img src="https://secure.travis-ci.org/tardate/sps_bill_scanner.png" />}[http://travis-ci.org/tardate/sps_bill_scanner]
|
2
2
|
|
3
|
-
Extracts bill details from SP Services PDF bills so that you can, um, do geeky data analysis n'stuff
|
3
|
+
Extracts bill details from SP Services PDF bills so that you can, um, do geeky data analysis n'stuff,
|
4
|
+
and because I loathe data entry!
|
5
|
+
One day we'll have {smart meters}[http://en.wikipedia.org/wiki/Smart_meter]
|
6
|
+
and SP Services will let us download our raw meter data. But until then...
|
4
7
|
|
5
8
|
If you are an SP Services subscriber, download your bills from https://services.spservices.sg
|
6
9
|
|
7
10
|
If you are not an SP Services subscriber, this gem ain't going to be much use for you!
|
8
11
|
|
9
12
|
Some example analysis using {R}[http://www.r-project.org/] is included in the <tt>scripts</tt> folder.
|
13
|
+
The inspiration for hacking away with R comes from reading Sau Sheong's new book
|
14
|
+
{Exploring Everyday Things with Ruby and R}[http://www.bookjetty.com/books/1449315151/exploring-data-learning-everyday]. Check it out!
|
10
15
|
|
11
16
|
== Requirements and Known Limitations
|
12
17
|
|
@@ -53,11 +58,14 @@ Here's the basic outline:
|
|
53
58
|
- copy this to <tt>spec/fixtures/personal_pdf_samples/expectations.yml</tt>
|
54
59
|
- enter in the details that describe each bill you have added
|
55
60
|
- now when you run <tt>rake</tt> it will also verify the data extracted from your
|
56
|
-
bills using expectations.yml
|
61
|
+
bills using <tt>expectations.yml</tt>
|
57
62
|
|
58
63
|
Feel free to get in touch or discuss in the github issues area if you are trying to help but run
|
59
64
|
into problems with this!
|
60
65
|
|
66
|
+
If you are more interested in the data analytics, I'm keen to add more interesting R scripts to the collection.
|
67
|
+
Your contributions are most welcome.
|
68
|
+
|
61
69
|
== Installation
|
62
70
|
|
63
71
|
gem install sps_bill
|
@@ -152,14 +160,16 @@ in the <tt>scripts</tt> folder.
|
|
152
160
|
=== sample data and analysis
|
153
161
|
|
154
162
|
[data/all_services.csv.sample] sample CSV data for a year's worth of elec, gas, and water
|
155
|
-
[data/all_services.sample.pdf] PDF analysis produced by
|
163
|
+
[data/all_services.sample.pdf] PDF analysis produced by <tt>full_analysis.R</tt> using
|
164
|
+
the <tt>all_services.csv.sample</tt> data set.
|
156
165
|
[data/elec_and_water_only.csv.sample] sample CSV data for a year's worth of elec and water
|
157
|
-
[data/elec_and_water_only.sample.pdf] PDF analysis produced by
|
166
|
+
[data/elec_and_water_only.sample.pdf] PDF analysis produced by <tt>full_analysis.R</tt> using
|
167
|
+
the <tt>elec_and_water_only.csv.sample</tt> data set.
|
158
168
|
|
159
169
|
=== example run
|
160
170
|
|
161
171
|
./scan_all_bills.sh ../path_to_my_bills/*.pdf > my_bill_data.csv
|
162
|
-
./full_analysis.R
|
172
|
+
./full_analysis.R my_bill_data.csv
|
163
173
|
|
164
174
|
This will have produced an analysis of all your bills in <tt>full_analysis.pdf</tt>.
|
165
175
|
|
data/lib/sps_bill.rb
CHANGED
data/lib/sps_bill/bill.rb
CHANGED
@@ -10,15 +10,18 @@ class SpsBill::Bill
|
|
10
10
|
|
11
11
|
# accessors for the various bill components
|
12
12
|
#
|
13
|
-
|
14
|
-
# [{ kwh: float, rate: float, amount: float }]
|
15
|
-
# gas_usage charges is an array of hashed values:
|
16
|
-
# [{ kwh: float, rate: float, amount: float }]
|
17
|
-
# water_usage charges is an array of hashed values:
|
18
|
-
# [{ cubic_m: float, rate: float, amount: float }]
|
19
|
-
#
|
13
|
+
|
20
14
|
attr_reader :account_number,:total_amount,:invoice_date,:invoice_month
|
21
|
-
|
15
|
+
|
16
|
+
# electricity_usage is an array of hashed values:
|
17
|
+
# [{ kwh: float, rate: float, amount: float }]
|
18
|
+
attr_reader :electricity_usage
|
19
|
+
# gas_usage is an array of hashed values:
|
20
|
+
# [{ kwh: float, rate: float, amount: float }]
|
21
|
+
attr_reader :gas_usage
|
22
|
+
# water_usage is an array of hashed values:
|
23
|
+
# [{ cubic_m: float, rate: float, amount: float }]
|
24
|
+
attr_reader :water_usage
|
22
25
|
|
23
26
|
# +source+ is a file name or stream-like object
|
24
27
|
def initialize(source)
|
@@ -28,7 +31,7 @@ class SpsBill::Bill
|
|
28
31
|
|
29
32
|
# Returns the PDF reader instance
|
30
33
|
def reader
|
31
|
-
@reader ||= PDF::
|
34
|
+
@reader ||= PDF::Reader::Turtletext.new(source_file) if source_file
|
32
35
|
end
|
33
36
|
|
34
37
|
# Return a pretty(-ish) text format of the core bill details
|
@@ -12,8 +12,8 @@ class SpsBill::BillCollection < Array
|
|
12
12
|
|
13
13
|
# Returns an array of Bill objects for PDF files matching +path_spec+.
|
14
14
|
# +path_spec+ may be either:
|
15
|
-
#
|
16
|
-
#
|
15
|
+
# - an array of filenames e.g. ['data/file1.pdf','file2.pdf']
|
16
|
+
# - or a single file or path spec e.g. './somepath/file1.pdf' or './somepath/*.pdf'
|
17
17
|
def load(path_spec)
|
18
18
|
path_spec = Dir[path_spec] unless path_spec.class <= Array
|
19
19
|
path_spec.each_with_object(new) do |filename,memo|
|
@@ -23,30 +23,33 @@ class SpsBill::BillCollection < Array
|
|
23
23
|
|
24
24
|
end
|
25
25
|
|
26
|
+
# Returns the suitable array of headers for +dataset_selector+
|
26
27
|
def headers(dataset_selector)
|
27
28
|
case dataset_selector
|
28
29
|
when :total_amounts
|
29
|
-
|
30
|
+
%w(invoice_month amount)
|
30
31
|
when :electricity_usages
|
31
|
-
|
32
|
+
%w(invoice_month kwh rate amount)
|
32
33
|
when :gas_usages
|
33
|
-
|
34
|
+
%w(invoice_month kwh rate amount)
|
34
35
|
when :water_usages
|
35
|
-
|
36
|
+
%w(invoice_month cubic_m rate amount)
|
36
37
|
when :all_data
|
37
|
-
|
38
|
+
%w(invoice_month measure kwh cubic_m rate amount)
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
# Returns
|
42
|
-
#
|
42
|
+
# Returns an array of all data by month
|
43
|
+
# [[month,measure,kwh,cubic_m,rate,amount]]
|
43
44
|
# measure: total_charges,electricity,gas,water
|
44
45
|
def all_data
|
45
46
|
total_amounts(:all) + electricity_usages(:all) + gas_usages(:all) + water_usages(:all)
|
46
47
|
end
|
47
48
|
|
48
|
-
# Returns
|
49
|
-
#
|
49
|
+
# Returns an array of total bill amounts by month
|
50
|
+
# [[month,amount]]
|
51
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
52
|
+
# else returns a normalised sparse array that is common to all data sets
|
50
53
|
def total_amounts(style=:solo)
|
51
54
|
each_with_object([]) do |bill,memo|
|
52
55
|
if style==:solo
|
@@ -57,8 +60,10 @@ class SpsBill::BillCollection < Array
|
|
57
60
|
end
|
58
61
|
end
|
59
62
|
|
60
|
-
# Returns
|
61
|
-
#
|
63
|
+
# Returns an array of electricity_usages by month
|
64
|
+
# [[month,kwh,rate,amount]]
|
65
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
66
|
+
# else returns a normalised sparse array that is common to all data sets
|
62
67
|
def electricity_usages(style=:solo)
|
63
68
|
each_with_object([]) do |bill,memo|
|
64
69
|
bill.electricity_usage.each do |usage|
|
@@ -71,8 +76,10 @@ class SpsBill::BillCollection < Array
|
|
71
76
|
end
|
72
77
|
end
|
73
78
|
|
74
|
-
# Returns
|
75
|
-
#
|
79
|
+
# Returns an array of gas_usages by month
|
80
|
+
# [[month,kwh,rate,amount]]
|
81
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
82
|
+
# else returns a normalised sparse array that is common to all data sets
|
76
83
|
def gas_usages(style=:solo)
|
77
84
|
each_with_object([]) do |bill,memo|
|
78
85
|
bill.gas_usage.each do |usage|
|
@@ -85,8 +92,10 @@ class SpsBill::BillCollection < Array
|
|
85
92
|
end
|
86
93
|
end
|
87
94
|
|
88
|
-
# Returns
|
89
|
-
#
|
95
|
+
# Returns an array of water_usages by month
|
96
|
+
# [[month,cubic_m,rate,amount]]
|
97
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
98
|
+
# else returns a normalised sparse array that is common to all data sets
|
90
99
|
def water_usages(style=:solo)
|
91
100
|
each_with_object([]) do |bill,memo|
|
92
101
|
bill.water_usage.each do |usage|
|
data/lib/sps_bill/bill_parser.rb
CHANGED
@@ -3,9 +3,12 @@ require 'date'
|
|
3
3
|
# all the bill scanning and parsing intelligence
|
4
4
|
module SpsBill::BillParser
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
ELECTRICITY_SERVICE_HEADER = /Electricity Services/i
|
7
|
+
ELECTRICITY_SERVICE_FOOTER = /Gas Services|Water Services/i
|
8
|
+
GAS_SERVICE_HEADER = /Gas Services/i
|
9
|
+
GAS_SERVICE_FOOTER = /Water Services/i
|
10
|
+
WATER_SERVICE_HEADER = /Water Services/i
|
11
|
+
WATER_SERVICE_FOOTER = /Waterborne Fee/i
|
9
12
|
|
10
13
|
# Returns a collection of parser errors
|
11
14
|
def errors
|
@@ -26,66 +29,109 @@ module SpsBill::BillParser
|
|
26
29
|
|
27
30
|
# Command: extracts the account number
|
28
31
|
def parse_account_number
|
29
|
-
|
32
|
+
region = reader.bounding_box do
|
33
|
+
exclusive!
|
34
|
+
below 'Dated'
|
35
|
+
above 'Type'
|
36
|
+
right_of 'Account No'
|
37
|
+
end
|
38
|
+
# text will be returned like this:
|
39
|
+
# [[":", "8123123123"]]
|
40
|
+
@account_number = region.text.flatten.last
|
30
41
|
end
|
31
42
|
|
32
43
|
# Command: extracts the total amount due for the current month
|
33
44
|
def parse_total_amount
|
34
|
-
|
35
|
-
|
36
|
-
|
45
|
+
region = reader.bounding_box do
|
46
|
+
inclusive!
|
47
|
+
below /^Total Current Charges due on/
|
48
|
+
above /^Total Current Charges due on/
|
49
|
+
right_of /^Total Current Charges due on/
|
50
|
+
left_of 400.0
|
37
51
|
end
|
52
|
+
# text will be returned like this:
|
53
|
+
# [["Total Current Charges due on 14 Jun 2011 (Tue)", "251.44"]]
|
54
|
+
@total_amount = region.text.flatten.last.to_f
|
38
55
|
end
|
39
56
|
|
40
57
|
# Command: extracts the invoice date
|
41
58
|
def parse_invoice_date
|
42
|
-
|
43
|
-
|
44
|
-
|
59
|
+
region = reader.bounding_box do
|
60
|
+
inclusive!
|
61
|
+
below 'Dated'
|
62
|
+
above 'Dated'
|
63
|
+
right_of 'Dated'
|
45
64
|
end
|
65
|
+
# text will be returned like this:
|
66
|
+
# [["Dated", "31", "May", "2011"]]
|
67
|
+
date_string = region.text.flatten.slice(1..3).join('-')
|
68
|
+
@invoice_date = Date.parse(date_string)
|
46
69
|
end
|
47
70
|
|
48
71
|
# Command: extracts the invoice month (as Date, set to 1st of the month)
|
49
72
|
def parse_invoice_month
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
73
|
+
region = reader.bounding_box do
|
74
|
+
inclusive!
|
75
|
+
below 'Dated'
|
76
|
+
above 'Dated'
|
77
|
+
end
|
78
|
+
# text will be returned like this:
|
79
|
+
# [["May", "11", "Bill", "Dated", "31", "May", "2011"]]
|
80
|
+
date_array = ['01'] + region.text.flatten.slice(0..1)
|
81
|
+
if (yy = date_array[2]).length == 2
|
82
|
+
date_array[2] = "20#{yy}" # WARNING: converting 2-digit date. Assumed to be 21st C
|
54
83
|
end
|
84
|
+
@invoice_month = Date.parse(date_array.join('-'))
|
55
85
|
end
|
56
86
|
|
57
|
-
# Command: extracts an array of electricity usage charges. Each
|
58
|
-
#
|
87
|
+
# Command: extracts an array of electricity usage charges. Each element is a Hash:
|
88
|
+
# { kwh: float, rate: float, amount: float }
|
59
89
|
def parse_electricity_usage
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
90
|
+
region = reader.bounding_box do
|
91
|
+
exclusive!
|
92
|
+
below ELECTRICITY_SERVICE_HEADER
|
93
|
+
above ELECTRICITY_SERVICE_FOOTER
|
94
|
+
right_of 240.0
|
95
|
+
left_of 450.0
|
96
|
+
end
|
97
|
+
# text will be returned like this:
|
98
|
+
# [["4 kWh", "0.2410", "0.97"], ["616 kWh", "0.2558", "157.57"]]
|
99
|
+
@electricity_usage = unless (raw_data = region.text).empty?
|
100
|
+
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
67
101
|
end
|
68
102
|
end
|
69
103
|
|
70
|
-
# Command: extracts an array of gas usage charges. Each
|
71
|
-
#
|
104
|
+
# Command: extracts an array of gas usage charges. Each element is a Hash:
|
105
|
+
# { kwh: float, rate: float, amount: float }
|
72
106
|
def parse_gas_usage
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
107
|
+
region = reader.bounding_box do
|
108
|
+
exclusive!
|
109
|
+
below GAS_SERVICE_HEADER
|
110
|
+
above GAS_SERVICE_FOOTER
|
111
|
+
right_of 240.0
|
112
|
+
left_of 450.0
|
113
|
+
end
|
114
|
+
# text will be returned like this:
|
115
|
+
# [["4 kWh", "0.2410", "0.97"], ["616 kWh", "0.2558", "157.57"]]
|
116
|
+
@gas_usage = unless (raw_data = region.text).empty?
|
117
|
+
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
78
118
|
end
|
79
119
|
end
|
80
120
|
|
81
|
-
# Command: extracts an array of water usage charges. Each
|
82
|
-
#
|
121
|
+
# Command: extracts an array of water usage charges. Each element is a Hash:
|
122
|
+
# { cubic_m: float, rate: float, amount: float }
|
83
123
|
def parse_water_usage
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
124
|
+
region = reader.bounding_box do
|
125
|
+
exclusive!
|
126
|
+
below WATER_SERVICE_HEADER
|
127
|
+
above WATER_SERVICE_FOOTER
|
128
|
+
right_of 240.0
|
129
|
+
left_of 450.0
|
130
|
+
end
|
131
|
+
# text will be returned like this:
|
132
|
+
# [["36.1 Cu M", "1.1700", "42.24"], ["-3.0 Cu M", "1.4000", "-4.20"]]
|
133
|
+
@water_usage = unless (raw_data = region.text).empty?
|
134
|
+
raw_data.map{|l| {:cubic_m => l[0].gsub(/cu m/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
89
135
|
end
|
90
136
|
end
|
91
137
|
|
data/lib/sps_bill/shell.rb
CHANGED
@@ -12,13 +12,15 @@ SP Services Bill Scanner v#{SpsBill::Version::STRING}
|
|
12
12
|
===================================
|
13
13
|
|
14
14
|
Usage:
|
15
|
-
sps_bill [options]
|
15
|
+
sps_bill [options] file-spec
|
16
16
|
|
17
17
|
Command Options
|
18
18
|
-r | --raw raw data format (without headers)
|
19
19
|
-c | --csv output in CSV format (default)
|
20
20
|
-d= | --data=[charges,electricity,gas,water,all]
|
21
21
|
|
22
|
+
file-spec is a path to the PDF bill(s) to read.
|
23
|
+
|
22
24
|
EOS
|
23
25
|
end
|
24
26
|
|
@@ -29,7 +31,7 @@ Command Options
|
|
29
31
|
end
|
30
32
|
|
31
33
|
def run
|
32
|
-
if options[:help]
|
34
|
+
if options[:help] or fileset.empty?
|
33
35
|
self.class.usage
|
34
36
|
return
|
35
37
|
end
|