sps_bill 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -1
- data/CHANGELOG +9 -0
- data/Gemfile +1 -2
- data/Gemfile.lock +5 -5
- data/README.rdoc +15 -5
- data/lib/sps_bill.rb +1 -5
- data/lib/sps_bill/bill.rb +12 -9
- data/lib/sps_bill/bill_collection.rb +26 -17
- data/lib/sps_bill/bill_parser.rb +83 -37
- data/lib/sps_bill/shell.rb +4 -2
- data/lib/sps_bill/version.rb +1 -1
- data/scripts/data/all_services.csv.sample +70 -0
- data/scripts/data/all_services.sample.pdf +0 -0
- data/scripts/data/elec_and_water_only.csv.sample +53 -0
- data/scripts/data/elec_and_water_only.sample.pdf +0 -0
- data/scripts/full_analysis.R +191 -0
- data/spec/support/bill_examples.rb +1 -1
- data/spec/support/pdf_samples_helper.rb +2 -2
- data/spec/unit/bill_collection_spec.rb +6 -6
- data/spec/unit/shell_spec.rb +1 -1
- data/sps_bill.gemspec +11 -14
- metadata +27 -38
- data/lib/pdf/object_hash.rb +0 -39
- data/lib/pdf/positional_text_receiver.rb +0 -16
- data/lib/pdf/structured_reader.rb +0 -108
- data/lib/pdf/textangle.rb +0 -27
- data/spec/fixtures/pdf_samples/junk_prefix.pdf +0 -71
- data/spec/unit/pdf/object_hash_spec.rb +0 -15
data/.travis.yml
CHANGED
data/CHANGELOG
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Version 0.1.1 Release: 1st August 2012
|
2
|
+
==================================================
|
3
|
+
* update to use pdf-reader-turtletext 0.2.2
|
4
|
+
* convert bill parsing to use the more idiomatic bounding_box
|
5
|
+
syntax available in pdf-reader-turtletext
|
6
|
+
|
7
|
+
Version 0.1.0 Release: 20th July 2012
|
8
|
+
==================================================
|
9
|
+
* Initial packaging and release
|
data/Gemfile
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
gem 'pdf-reader', '
|
3
|
+
gem 'pdf-reader-turtletext', '~> 0.2.2'
|
4
4
|
gem 'getoptions', '~> 0.3'
|
5
5
|
|
6
6
|
group :development do
|
7
7
|
gem 'bundler', '~> 1.1.4'
|
8
8
|
gem 'jeweler', '~> 1.6.4'
|
9
|
-
gem 'rcov', '>= 0'
|
10
9
|
end
|
11
10
|
|
12
11
|
group :development, :test do
|
data/Gemfile.lock
CHANGED
@@ -3,7 +3,7 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
Ascii85 (1.0.1)
|
5
5
|
diff-lcs (1.1.3)
|
6
|
-
ffi (1.0
|
6
|
+
ffi (1.1.0)
|
7
7
|
getoptions (0.3)
|
8
8
|
git (1.2.5)
|
9
9
|
guard (1.2.3)
|
@@ -15,7 +15,7 @@ GEM
|
|
15
15
|
bundler (~> 1.0)
|
16
16
|
git (>= 1.2.5)
|
17
17
|
rake
|
18
|
-
json (1.
|
18
|
+
json (1.7.3)
|
19
19
|
listen (0.4.7)
|
20
20
|
rb-fchange (~> 0.0.5)
|
21
21
|
rb-fsevent (~> 0.9.1)
|
@@ -23,13 +23,14 @@ GEM
|
|
23
23
|
pdf-reader (1.1.1)
|
24
24
|
Ascii85 (~> 1.0.0)
|
25
25
|
ruby-rc4
|
26
|
+
pdf-reader-turtletext (0.2.2)
|
27
|
+
pdf-reader (= 1.1.1)
|
26
28
|
rake (0.9.2.2)
|
27
29
|
rb-fchange (0.0.5)
|
28
30
|
ffi
|
29
31
|
rb-fsevent (0.9.1)
|
30
32
|
rb-inotify (0.8.8)
|
31
33
|
ffi (>= 0.5.0)
|
32
|
-
rcov (0.9.11)
|
33
34
|
rdoc (3.12)
|
34
35
|
json (~> 1.4)
|
35
36
|
rspec (2.8.0)
|
@@ -51,8 +52,7 @@ DEPENDENCIES
|
|
51
52
|
getoptions (~> 0.3)
|
52
53
|
guard-rspec
|
53
54
|
jeweler (~> 1.6.4)
|
54
|
-
pdf-reader (
|
55
|
+
pdf-reader-turtletext (~> 0.2.2)
|
55
56
|
rake (~> 0.9.2.2)
|
56
|
-
rcov
|
57
57
|
rdoc (~> 3.11)
|
58
58
|
rspec (~> 2.8.0)
|
data/README.rdoc
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
= SP Services Bill Scanner {<img src="https://secure.travis-ci.org/tardate/sps_bill_scanner.png" />}[http://travis-ci.org/tardate/sps_bill_scanner]
|
2
2
|
|
3
|
-
Extracts bill details from SP Services PDF bills so that you can, um, do geeky data analysis n'stuff
|
3
|
+
Extracts bill details from SP Services PDF bills so that you can, um, do geeky data analysis n'stuff,
|
4
|
+
and because I loathe data entry!
|
5
|
+
One day we'll have {smart meters}[http://en.wikipedia.org/wiki/Smart_meter]
|
6
|
+
and SP Services will let us download our raw meter data. But until then...
|
4
7
|
|
5
8
|
If you are an SP Services subscriber, download your bills from https://services.spservices.sg
|
6
9
|
|
7
10
|
If you are not an SP Services subscriber, this gem ain't going to be much use for you!
|
8
11
|
|
9
12
|
Some example analysis using {R}[http://www.r-project.org/] is included in the <tt>scripts</tt> folder.
|
13
|
+
The inspiration for hacking away with R comes from reading Sau Sheong's new book
|
14
|
+
{Exploring Everyday Things with Ruby and R}[http://www.bookjetty.com/books/1449315151/exploring-data-learning-everyday]. Check it out!
|
10
15
|
|
11
16
|
== Requirements and Known Limitations
|
12
17
|
|
@@ -53,11 +58,14 @@ Here's the basic outline:
|
|
53
58
|
- copy this to <tt>spec/fixtures/personal_pdf_samples/expectations.yml</tt>
|
54
59
|
- enter in the details that describe each bill you have added
|
55
60
|
- now when you run <tt>rake</tt> it will also verify the data extracted from your
|
56
|
-
bills using expectations.yml
|
61
|
+
bills using <tt>expectations.yml</tt>
|
57
62
|
|
58
63
|
Feel free to get in touch or discuss in the github issues area if you are trying to help but run
|
59
64
|
into problems with this!
|
60
65
|
|
66
|
+
If you are more interested in the data analytics, I'm keen to add more interesting R scripts to the collection.
|
67
|
+
Your contributions are most welcome.
|
68
|
+
|
61
69
|
== Installation
|
62
70
|
|
63
71
|
gem install sps_bill
|
@@ -152,14 +160,16 @@ in the <tt>scripts</tt> folder.
|
|
152
160
|
=== sample data and analysis
|
153
161
|
|
154
162
|
[data/all_services.csv.sample] sample CSV data for a year's worth of elec, gas, and water
|
155
|
-
[data/all_services.sample.pdf] PDF analysis produced by
|
163
|
+
[data/all_services.sample.pdf] PDF analysis produced by <tt>full_analysis.R</tt> using
|
164
|
+
the <tt>all_services.csv.sample</tt> data set.
|
156
165
|
[data/elec_and_water_only.csv.sample] sample CSV data for a year's worth of elec and water
|
157
|
-
[data/elec_and_water_only.sample.pdf] PDF analysis produced by
|
166
|
+
[data/elec_and_water_only.sample.pdf] PDF analysis produced by <tt>full_analysis.R</tt> using
|
167
|
+
the <tt>elec_and_water_only.csv.sample</tt> data set.
|
158
168
|
|
159
169
|
=== example run
|
160
170
|
|
161
171
|
./scan_all_bills.sh ../path_to_my_bills/*.pdf > my_bill_data.csv
|
162
|
-
./full_analysis.R
|
172
|
+
./full_analysis.R my_bill_data.csv
|
163
173
|
|
164
174
|
This will have produced an analysis of all your bills in <tt>full_analysis.pdf</tt>.
|
165
175
|
|
data/lib/sps_bill.rb
CHANGED
data/lib/sps_bill/bill.rb
CHANGED
@@ -10,15 +10,18 @@ class SpsBill::Bill
|
|
10
10
|
|
11
11
|
# accessors for the various bill components
|
12
12
|
#
|
13
|
-
|
14
|
-
# [{ kwh: float, rate: float, amount: float }]
|
15
|
-
# gas_usage charges is an array of hashed values:
|
16
|
-
# [{ kwh: float, rate: float, amount: float }]
|
17
|
-
# water_usage charges is an array of hashed values:
|
18
|
-
# [{ cubic_m: float, rate: float, amount: float }]
|
19
|
-
#
|
13
|
+
|
20
14
|
attr_reader :account_number,:total_amount,:invoice_date,:invoice_month
|
21
|
-
|
15
|
+
|
16
|
+
# electricity_usage is an array of hashed values:
|
17
|
+
# [{ kwh: float, rate: float, amount: float }]
|
18
|
+
attr_reader :electricity_usage
|
19
|
+
# gas_usage is an array of hashed values:
|
20
|
+
# [{ kwh: float, rate: float, amount: float }]
|
21
|
+
attr_reader :gas_usage
|
22
|
+
# water_usage is an array of hashed values:
|
23
|
+
# [{ cubic_m: float, rate: float, amount: float }]
|
24
|
+
attr_reader :water_usage
|
22
25
|
|
23
26
|
# +source+ is a file name or stream-like object
|
24
27
|
def initialize(source)
|
@@ -28,7 +31,7 @@ class SpsBill::Bill
|
|
28
31
|
|
29
32
|
# Returns the PDF reader instance
|
30
33
|
def reader
|
31
|
-
@reader ||= PDF::
|
34
|
+
@reader ||= PDF::Reader::Turtletext.new(source_file) if source_file
|
32
35
|
end
|
33
36
|
|
34
37
|
# Return a pretty(-ish) text format of the core bill details
|
@@ -12,8 +12,8 @@ class SpsBill::BillCollection < Array
|
|
12
12
|
|
13
13
|
# Returns an array of Bill objects for PDF files matching +path_spec+.
|
14
14
|
# +path_spec+ may be either:
|
15
|
-
#
|
16
|
-
#
|
15
|
+
# - an array of filenames e.g. ['data/file1.pdf','file2.pdf']
|
16
|
+
# - or a single file or path spec e.g. './somepath/file1.pdf' or './somepath/*.pdf'
|
17
17
|
def load(path_spec)
|
18
18
|
path_spec = Dir[path_spec] unless path_spec.class <= Array
|
19
19
|
path_spec.each_with_object(new) do |filename,memo|
|
@@ -23,30 +23,33 @@ class SpsBill::BillCollection < Array
|
|
23
23
|
|
24
24
|
end
|
25
25
|
|
26
|
+
# Returns the suitable array of headers for +dataset_selector+
|
26
27
|
def headers(dataset_selector)
|
27
28
|
case dataset_selector
|
28
29
|
when :total_amounts
|
29
|
-
|
30
|
+
%w(invoice_month amount)
|
30
31
|
when :electricity_usages
|
31
|
-
|
32
|
+
%w(invoice_month kwh rate amount)
|
32
33
|
when :gas_usages
|
33
|
-
|
34
|
+
%w(invoice_month kwh rate amount)
|
34
35
|
when :water_usages
|
35
|
-
|
36
|
+
%w(invoice_month cubic_m rate amount)
|
36
37
|
when :all_data
|
37
|
-
|
38
|
+
%w(invoice_month measure kwh cubic_m rate amount)
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
|
-
# Returns
|
42
|
-
#
|
42
|
+
# Returns an array of all data by month
|
43
|
+
# [[month,measure,kwh,cubic_m,rate,amount]]
|
43
44
|
# measure: total_charges,electricity,gas,water
|
44
45
|
def all_data
|
45
46
|
total_amounts(:all) + electricity_usages(:all) + gas_usages(:all) + water_usages(:all)
|
46
47
|
end
|
47
48
|
|
48
|
-
# Returns
|
49
|
-
#
|
49
|
+
# Returns an array of total bill amounts by month
|
50
|
+
# [[month,amount]]
|
51
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
52
|
+
# else returns a normalised sparse array that is common to all data sets
|
50
53
|
def total_amounts(style=:solo)
|
51
54
|
each_with_object([]) do |bill,memo|
|
52
55
|
if style==:solo
|
@@ -57,8 +60,10 @@ class SpsBill::BillCollection < Array
|
|
57
60
|
end
|
58
61
|
end
|
59
62
|
|
60
|
-
# Returns
|
61
|
-
#
|
63
|
+
# Returns an array of electricity_usages by month
|
64
|
+
# [[month,kwh,rate,amount]]
|
65
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
66
|
+
# else returns a normalised sparse array that is common to all data sets
|
62
67
|
def electricity_usages(style=:solo)
|
63
68
|
each_with_object([]) do |bill,memo|
|
64
69
|
bill.electricity_usage.each do |usage|
|
@@ -71,8 +76,10 @@ class SpsBill::BillCollection < Array
|
|
71
76
|
end
|
72
77
|
end
|
73
78
|
|
74
|
-
# Returns
|
75
|
-
#
|
79
|
+
# Returns an array of gas_usages by month
|
80
|
+
# [[month,kwh,rate,amount]]
|
81
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
82
|
+
# else returns a normalised sparse array that is common to all data sets
|
76
83
|
def gas_usages(style=:solo)
|
77
84
|
each_with_object([]) do |bill,memo|
|
78
85
|
bill.gas_usage.each do |usage|
|
@@ -85,8 +92,10 @@ class SpsBill::BillCollection < Array
|
|
85
92
|
end
|
86
93
|
end
|
87
94
|
|
88
|
-
# Returns
|
89
|
-
#
|
95
|
+
# Returns an array of water_usages by month
|
96
|
+
# [[month,cubic_m,rate,amount]]
|
97
|
+
# when +style+ is :solo, returns minimal array to describe this data set in isolation,
|
98
|
+
# else returns a normalised sparse array that is common to all data sets
|
90
99
|
def water_usages(style=:solo)
|
91
100
|
each_with_object([]) do |bill,memo|
|
92
101
|
bill.water_usage.each do |usage|
|
data/lib/sps_bill/bill_parser.rb
CHANGED
@@ -3,9 +3,12 @@ require 'date'
|
|
3
3
|
# all the bill scanning and parsing intelligence
|
4
4
|
module SpsBill::BillParser
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
ELECTRICITY_SERVICE_HEADER = /Electricity Services/i
|
7
|
+
ELECTRICITY_SERVICE_FOOTER = /Gas Services|Water Services/i
|
8
|
+
GAS_SERVICE_HEADER = /Gas Services/i
|
9
|
+
GAS_SERVICE_FOOTER = /Water Services/i
|
10
|
+
WATER_SERVICE_HEADER = /Water Services/i
|
11
|
+
WATER_SERVICE_FOOTER = /Waterborne Fee/i
|
9
12
|
|
10
13
|
# Returns a collection of parser errors
|
11
14
|
def errors
|
@@ -26,66 +29,109 @@ module SpsBill::BillParser
|
|
26
29
|
|
27
30
|
# Command: extracts the account number
|
28
31
|
def parse_account_number
|
29
|
-
|
32
|
+
region = reader.bounding_box do
|
33
|
+
exclusive!
|
34
|
+
below 'Dated'
|
35
|
+
above 'Type'
|
36
|
+
right_of 'Account No'
|
37
|
+
end
|
38
|
+
# text will be returned like this:
|
39
|
+
# [[":", "8123123123"]]
|
40
|
+
@account_number = region.text.flatten.last
|
30
41
|
end
|
31
42
|
|
32
43
|
# Command: extracts the total amount due for the current month
|
33
44
|
def parse_total_amount
|
34
|
-
|
35
|
-
|
36
|
-
|
45
|
+
region = reader.bounding_box do
|
46
|
+
inclusive!
|
47
|
+
below /^Total Current Charges due on/
|
48
|
+
above /^Total Current Charges due on/
|
49
|
+
right_of /^Total Current Charges due on/
|
50
|
+
left_of 400.0
|
37
51
|
end
|
52
|
+
# text will be returned like this:
|
53
|
+
# [["Total Current Charges due on 14 Jun 2011 (Tue)", "251.44"]]
|
54
|
+
@total_amount = region.text.flatten.last.to_f
|
38
55
|
end
|
39
56
|
|
40
57
|
# Command: extracts the invoice date
|
41
58
|
def parse_invoice_date
|
42
|
-
|
43
|
-
|
44
|
-
|
59
|
+
region = reader.bounding_box do
|
60
|
+
inclusive!
|
61
|
+
below 'Dated'
|
62
|
+
above 'Dated'
|
63
|
+
right_of 'Dated'
|
45
64
|
end
|
65
|
+
# text will be returned like this:
|
66
|
+
# [["Dated", "31", "May", "2011"]]
|
67
|
+
date_string = region.text.flatten.slice(1..3).join('-')
|
68
|
+
@invoice_date = Date.parse(date_string)
|
46
69
|
end
|
47
70
|
|
48
71
|
# Command: extracts the invoice month (as Date, set to 1st of the month)
|
49
72
|
def parse_invoice_month
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
73
|
+
region = reader.bounding_box do
|
74
|
+
inclusive!
|
75
|
+
below 'Dated'
|
76
|
+
above 'Dated'
|
77
|
+
end
|
78
|
+
# text will be returned like this:
|
79
|
+
# [["May", "11", "Bill", "Dated", "31", "May", "2011"]]
|
80
|
+
date_array = ['01'] + region.text.flatten.slice(0..1)
|
81
|
+
if (yy = date_array[2]).length == 2
|
82
|
+
date_array[2] = "20#{yy}" # WARNING: converting 2-digit date. Assumed to be 21st C
|
54
83
|
end
|
84
|
+
@invoice_month = Date.parse(date_array.join('-'))
|
55
85
|
end
|
56
86
|
|
57
|
-
# Command: extracts an array of electricity usage charges. Each
|
58
|
-
#
|
87
|
+
# Command: extracts an array of electricity usage charges. Each element is a Hash:
|
88
|
+
# { kwh: float, rate: float, amount: float }
|
59
89
|
def parse_electricity_usage
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
90
|
+
region = reader.bounding_box do
|
91
|
+
exclusive!
|
92
|
+
below ELECTRICITY_SERVICE_HEADER
|
93
|
+
above ELECTRICITY_SERVICE_FOOTER
|
94
|
+
right_of 240.0
|
95
|
+
left_of 450.0
|
96
|
+
end
|
97
|
+
# text will be returned like this:
|
98
|
+
# [["4 kWh", "0.2410", "0.97"], ["616 kWh", "0.2558", "157.57"]]
|
99
|
+
@electricity_usage = unless (raw_data = region.text).empty?
|
100
|
+
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
67
101
|
end
|
68
102
|
end
|
69
103
|
|
70
|
-
# Command: extracts an array of gas usage charges. Each
|
71
|
-
#
|
104
|
+
# Command: extracts an array of gas usage charges. Each element is a Hash:
|
105
|
+
# { kwh: float, rate: float, amount: float }
|
72
106
|
def parse_gas_usage
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
107
|
+
region = reader.bounding_box do
|
108
|
+
exclusive!
|
109
|
+
below GAS_SERVICE_HEADER
|
110
|
+
above GAS_SERVICE_FOOTER
|
111
|
+
right_of 240.0
|
112
|
+
left_of 450.0
|
113
|
+
end
|
114
|
+
# text will be returned like this:
|
115
|
+
# [["4 kWh", "0.2410", "0.97"], ["616 kWh", "0.2558", "157.57"]]
|
116
|
+
@gas_usage = unless (raw_data = region.text).empty?
|
117
|
+
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
78
118
|
end
|
79
119
|
end
|
80
120
|
|
81
|
-
# Command: extracts an array of water usage charges. Each
|
82
|
-
#
|
121
|
+
# Command: extracts an array of water usage charges. Each element is a Hash:
|
122
|
+
# { cubic_m: float, rate: float, amount: float }
|
83
123
|
def parse_water_usage
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
124
|
+
region = reader.bounding_box do
|
125
|
+
exclusive!
|
126
|
+
below WATER_SERVICE_HEADER
|
127
|
+
above WATER_SERVICE_FOOTER
|
128
|
+
right_of 240.0
|
129
|
+
left_of 450.0
|
130
|
+
end
|
131
|
+
# text will be returned like this:
|
132
|
+
# [["36.1 Cu M", "1.1700", "42.24"], ["-3.0 Cu M", "1.4000", "-4.20"]]
|
133
|
+
@water_usage = unless (raw_data = region.text).empty?
|
134
|
+
raw_data.map{|l| {:cubic_m => l[0].gsub(/cu m/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
|
89
135
|
end
|
90
136
|
end
|
91
137
|
|
data/lib/sps_bill/shell.rb
CHANGED
@@ -12,13 +12,15 @@ SP Services Bill Scanner v#{SpsBill::Version::STRING}
|
|
12
12
|
===================================
|
13
13
|
|
14
14
|
Usage:
|
15
|
-
sps_bill [options]
|
15
|
+
sps_bill [options] file-spec
|
16
16
|
|
17
17
|
Command Options
|
18
18
|
-r | --raw raw data format (without headers)
|
19
19
|
-c | --csv output in CSV format (default)
|
20
20
|
-d= | --data=[charges,electricity,gas,water,all]
|
21
21
|
|
22
|
+
file-spec is a path to the PDF bill(s) to read.
|
23
|
+
|
22
24
|
EOS
|
23
25
|
end
|
24
26
|
|
@@ -29,7 +31,7 @@ Command Options
|
|
29
31
|
end
|
30
32
|
|
31
33
|
def run
|
32
|
-
if options[:help]
|
34
|
+
if options[:help] or fileset.empty?
|
33
35
|
self.class.usage
|
34
36
|
return
|
35
37
|
end
|