sps_bill 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.rvmrc +1 -0
- data/.travis.yml +3 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +58 -0
- data/Guardfile +10 -0
- data/LICENSE +20 -0
- data/README.rdoc +180 -0
- data/Rakefile +49 -0
- data/bin/sps_bill +12 -0
- data/lib/pdf/object_hash.rb +39 -0
- data/lib/pdf/positional_text_receiver.rb +16 -0
- data/lib/pdf/structured_reader.rb +108 -0
- data/lib/pdf/textangle.rb +27 -0
- data/lib/sps_bill/bill.rb +58 -0
- data/lib/sps_bill/bill_collection.rb +102 -0
- data/lib/sps_bill/bill_parser.rb +92 -0
- data/lib/sps_bill/shell.rb +71 -0
- data/lib/sps_bill/version.rb +9 -0
- data/lib/sps_bill.rb +13 -0
- data/scripts/data/.gitkeep +0 -0
- data/scripts/scan_all_bills.sh +20 -0
- data/spec/fixtures/pdf_samples/.gitkeep +0 -0
- data/spec/fixtures/pdf_samples/junk_prefix.pdf +71 -0
- data/spec/fixtures/personal_pdf_samples/.gitkeep +0 -0
- data/spec/fixtures/personal_pdf_samples/expectations.yml.sample +48 -0
- data/spec/integration/personal_samples_spec.rb +74 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/support/bill_examples.rb +31 -0
- data/spec/support/pdf_samples_helper.rb +37 -0
- data/spec/unit/bill_collection_spec.rb +169 -0
- data/spec/unit/bill_spec.rb +22 -0
- data/spec/unit/pdf/object_hash_spec.rb +15 -0
- data/spec/unit/shell_spec.rb +62 -0
- data/sps_bill.gemspec +100 -0
- metadata +184 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.3@sps_bill_scanner --create
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Gem dependencies for sps_bill.
# Gems are fetched over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# runtime dependencies
gem 'pdf-reader', '1.1.1'
gem 'getoptions', '~> 0.3'

# packaging and coverage tooling
group :development do
  gem 'bundler', '~> 1.1.4'
  gem 'jeweler', '~> 1.6.4'
  gem 'rcov', '>= 0'
end

group :development, :test do
  gem 'rake', '~> 0.9.2.2'
  # RSpec 2 is loaded with `require 'rspec'` ('spec' was the RSpec 1 entry file)
  gem 'rspec', '~> 2.8.0', :require => 'rspec'
  gem 'rdoc', '~> 3.11'
  # guard for auto-running tests
  gem 'guard-rspec'
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
Ascii85 (1.0.1)
|
5
|
+
diff-lcs (1.1.3)
|
6
|
+
ffi (1.0.11)
|
7
|
+
getoptions (0.3)
|
8
|
+
git (1.2.5)
|
9
|
+
guard (1.2.3)
|
10
|
+
listen (>= 0.4.2)
|
11
|
+
thor (>= 0.14.6)
|
12
|
+
guard-rspec (1.2.0)
|
13
|
+
guard (>= 1.1)
|
14
|
+
jeweler (1.6.4)
|
15
|
+
bundler (~> 1.0)
|
16
|
+
git (>= 1.2.5)
|
17
|
+
rake
|
18
|
+
json (1.6.4)
|
19
|
+
listen (0.4.7)
|
20
|
+
rb-fchange (~> 0.0.5)
|
21
|
+
rb-fsevent (~> 0.9.1)
|
22
|
+
rb-inotify (~> 0.8.8)
|
23
|
+
pdf-reader (1.1.1)
|
24
|
+
Ascii85 (~> 1.0.0)
|
25
|
+
ruby-rc4
|
26
|
+
rake (0.9.2.2)
|
27
|
+
rb-fchange (0.0.5)
|
28
|
+
ffi
|
29
|
+
rb-fsevent (0.9.1)
|
30
|
+
rb-inotify (0.8.8)
|
31
|
+
ffi (>= 0.5.0)
|
32
|
+
rcov (0.9.11)
|
33
|
+
rdoc (3.12)
|
34
|
+
json (~> 1.4)
|
35
|
+
rspec (2.8.0)
|
36
|
+
rspec-core (~> 2.8.0)
|
37
|
+
rspec-expectations (~> 2.8.0)
|
38
|
+
rspec-mocks (~> 2.8.0)
|
39
|
+
rspec-core (2.8.0)
|
40
|
+
rspec-expectations (2.8.0)
|
41
|
+
diff-lcs (~> 1.1.2)
|
42
|
+
rspec-mocks (2.8.0)
|
43
|
+
ruby-rc4 (0.1.5)
|
44
|
+
thor (0.15.4)
|
45
|
+
|
46
|
+
PLATFORMS
|
47
|
+
ruby
|
48
|
+
|
49
|
+
DEPENDENCIES
|
50
|
+
bundler (~> 1.1.4)
|
51
|
+
getoptions (~> 0.3)
|
52
|
+
guard-rspec
|
53
|
+
jeweler (~> 1.6.4)
|
54
|
+
pdf-reader (= 1.1.1)
|
55
|
+
rake (~> 0.9.2.2)
|
56
|
+
rcov
|
57
|
+
rdoc (~> 3.11)
|
58
|
+
rspec (~> 2.8.0)
|
data/Guardfile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Guard configuration for running the RSpec examples automatically.
# More info at https://github.com/guard/guard#readme

rspec_guard_options = {
  :version        => 2,
  :all_on_start   => true,
  :all_after_pass => false
}

guard('rspec', rspec_guard_options) do
  watch(%r{^spec/.+_spec\.rb$})
  watch('spec/spec_helper.rb') { "spec" }

  ## we're not watching source files
end
|
10
|
+
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Paul Gallagher
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
= SP Services Bill Scanner {<img src="https://secure.travis-ci.org/tardate/sps_bill_scanner.png" />}[http://travis-ci.org/tardate/sps_bill_scanner]
|
2
|
+
|
3
|
+
Extracts bill details from SP Services PDF bills so that you can, um, do geeky data analysis n'stuff.
|
4
|
+
|
5
|
+
If you are an SP Services subscriber, download your bills from https://services.spservices.sg
|
6
|
+
|
7
|
+
If you are not an SP Services subscriber, this gem ain't going to be much use for you!
|
8
|
+
|
9
|
+
Some example analysis using {R}[http://www.r-project.org/] is included in the <tt>scripts</tt> folder.
|
10
|
+
|
11
|
+
== Requirements and Known Limitations
|
12
|
+
|
13
|
+
* Requires Ruby 1.9 (1.8 compatibility not tested or assured)
|
14
|
+
* R is used for data analysis examples - this is optional
|
15
|
+
* Currently it does not handle multi-property bills.
|
16
|
+
* May not handle extensive multi-page bills for a single property correctly.
|
17
|
+
|
18
|
+
If you _do_ come across bills that this gem can't read correctly, your help to get it fixed is greatly needed:
|
19
|
+
either submit a fix yourself, or report the problem at https://github.com/tardate/sps_bill_scanner/issues
|
20
|
+
|
21
|
+
== Help Required: Test this with your own bills
|
22
|
+
|
23
|
+
Unfortunately, there isn't a definitive set of bill examples that make it possible to ensure that
|
24
|
+
this gem will work for every bill issued by SP Services. And since bills are in PDF format,
|
25
|
+
extracting the data in a structured manner is a task fraught with pitfalls.
|
26
|
+
|
27
|
+
I have tested this with my own bills (going back over a year) but that definitely doesn't mean it will
|
28
|
+
work for others.
|
29
|
+
|
30
|
+
So I need help from others who are willing to test with their own bills. I'm not asking for your bills
|
31
|
+
as that raises privacy concerns, and I definitely don't want real bills committed to the git repository.
|
32
|
+
|
33
|
+
Instead, I have set up the tests in a way that should make it easy for you to test with your own bills.
|
34
|
+
|
35
|
+
Here's the basic outline:
|
36
|
+
|
37
|
+
* First, make sure tests are running green for you 'as-is':
|
38
|
+
|
39
|
+
- fork or clone the repo
|
40
|
+
- <tt>bundle</tt> will install development dependencies
|
41
|
+
- <tt>rake</tt> will run the tests - they should all be OK
|
42
|
+
|
43
|
+
* Get your own PDF bills from https://services.spservices.sg
|
44
|
+
|
45
|
+
- put them in <tt>spec/fixtures/personal_pdf_samples</tt>
|
46
|
+
- NB: these are ignored by git so you won't accidentally commit them
|
47
|
+
|
48
|
+
* At this point you can run the tests with <tt>rake</tt> and it will do a very basic
|
49
|
+
check of the PDFs you have added
|
50
|
+
* To run complete checks to ensure all the data is being extracted correctly:
|
51
|
+
|
52
|
+
- see the doc in <tt>spec/fixtures/personal_pdf_samples/expectations.yml.sample</tt>
|
53
|
+
- copy this to <tt>spec/fixtures/personal_pdf_samples/expectations.yml</tt>
|
54
|
+
- enter in the details that describe each bill you have added
|
55
|
+
- now when you run <tt>rake</tt> it will also verify the data extracted from your
|
56
|
+
bills using expectations.yml
|
57
|
+
|
58
|
+
Feel free to get in touch or discuss in the github issues area if you are trying to help but run
|
59
|
+
into problems with this!
|
60
|
+
|
61
|
+
== Installation
|
62
|
+
|
63
|
+
gem install sps_bill
|
64
|
+
|
65
|
+
== Command Line Usage
|
66
|
+
|
67
|
+
Once the gem is installed, use <tt>sps_bill</tt> at the command line to interact with the library manually.
|
68
|
+
|
69
|
+
To get help on command options:
|
70
|
+
|
71
|
+
$ sps_bill -h
|
72
|
+
|
73
|
+
For example: to extract all data in CSV format from a set of PDF bills:
|
74
|
+
|
75
|
+
$ sps_bill --data=all ./path_to/my_bills*.pdf
|
76
|
+
|
77
|
+
== Programmatic Usage
|
78
|
+
|
79
|
+
You can use the gem from your own scripts or applications. There are just two classes you really need
|
80
|
+
to understand:
|
81
|
+
|
82
|
+
* SpsBill::BillCollection is an Array-like class that represents a collection of bills.
|
83
|
+
- the <tt>load</tt> method is used to initialise it given a path or array of filenames
|
84
|
+
- a range of collection methods are provided to extract sets of data (e.g. <tt>electricity_usages</tt>)
|
85
|
+
* SpsBill::Bill represents an individual bill
|
86
|
+
- initialised given a file name
|
87
|
+
- provides a range of accessors to get at individual data elements (e.g. <tt>electricity_usage</tt>)
|
88
|
+
|
89
|
+
To load a collection of bills:
|
90
|
+
|
91
|
+
> require 'sps_bill'
|
92
|
+
> bills = SpsBill::BillCollection.load('./my_bills/*.pdf')
|
93
|
+
> bills.total_amounts
|
94
|
+
=> [["2011-10-01", 168.86], ["2011-11-01", 196.46], ["2011-12-01", 176.54]]
|
95
|
+
> bills.electricity_usages
|
96
|
+
=> [["2011-10-01", 14.0, 0.2728, 3.82], ["2011-10-01", 444.0, 0.2698, 119.79],
|
97
|
+
["2011-11-01", 2.0, 0.2728, 0.54], ["2011-11-01", 537.0, 0.2698, 144.88],
|
98
|
+
["2011-12-01", 482.0, 0.2698, 130.04]]
|
99
|
+
> bills.gas_usages
|
100
|
+
=> [["2011-10-01", 12.0, 0.1961, 2.35], ["2011-11-01", 12.0, 0.2117, 2.54], ["2011-12-01", 12.0, 0.2117, 2.54]]
|
101
|
+
> bills.water_usages
|
102
|
+
=> [["2011-10-01", 8.4, 1.17, 9.83], ["2011-11-01", 11.4, 1.17, 13.34], ["2011-12-01", 9.6, 1.17, 11.23]]
|
103
|
+
|
104
|
+
|
105
|
+
To load and examine a specific bill:
|
106
|
+
|
107
|
+
> require 'sps_bill'
|
108
|
+
> pdf_bill_file = "./my_latest_bill.pdf"
|
109
|
+
> bill = SpsBill::Bill.new(pdf_bill_file)
|
110
|
+
> bill.account_number
|
111
|
+
8123123123
|
112
|
+
> bill.total_amount
|
113
|
+
251.44
|
114
|
+
> bill.invoice_date
|
115
|
+
2011-05-31
|
116
|
+
> bill.invoice_month
|
117
|
+
2011-05-01
|
118
|
+
> bill.electricity_usage
|
119
|
+
[{:kwh=>4.0, :rate=>0.241, :amount=>0.97},{:kwh=>616.0, :rate=>0.2558, :amount=>157.57}]
|
120
|
+
> bill.gas_usage
|
121
|
+
[{:kwh=>18.0, :rate=>0.1799, :amount=>3.24}]
|
122
|
+
> bill.water_usage
|
123
|
+
[{:cubic_m=>36.1, :rate=>1.17, :amount=>42.24},{:cubic_m=>-3.0, :rate=>1.4, :amount=>-4.2}]
|
124
|
+
> bill.to_s
|
125
|
+
Account number: 8123123123
|
126
|
+
Invoice date : 2011-10-31
|
127
|
+
Service month : 2011-10-01
|
128
|
+
Total bill : $168.86
|
129
|
+
|
130
|
+
Electricity Usage
|
131
|
+
-----------------
|
132
|
+
[{:kwh=>14.0, :rate=>0.2728, :amount=>3.82}, {:kwh=>444.0, :rate=>0.2698, :amount=>119.79}]
|
133
|
+
|
134
|
+
Gas Usage
|
135
|
+
---------
|
136
|
+
[{:kwh=>12.0, :rate=>0.1961, :amount=>2.35}]
|
137
|
+
|
138
|
+
Water Usage
|
139
|
+
-----------
|
140
|
+
[{:cubic_m=>8.4, :rate=>1.17, :amount=>9.83}]
|
141
|
+
|
142
|
+
== Data Analysis with R
|
143
|
+
|
144
|
+
Some examples of bill data and analysis using {R}[http://www.r-project.org/] are included
|
145
|
+
in the <tt>scripts</tt> folder.
|
146
|
+
|
147
|
+
=== sample scripts
|
148
|
+
|
149
|
+
[scripts/scan_all_bills.sh] an example script to scan a set of bills and produce a csv file for analysis
|
150
|
+
[scripts/full_analysis.R] an example script that prepares a one-page PDF summary analysis
|
151
|
+
|
152
|
+
=== sample data and analysis
|
153
|
+
|
154
|
+
[data/all_services.csv.sample] sample CSV data for a year's worth of elec, gas, and water
|
155
|
+
[data/all_services.sample.pdf] PDF analysis produced by this script for all_services.csv.sample
|
156
|
+
[data/elec_and_water_only.csv.sample] sample CSV data for a year's worth of elec and water
|
157
|
+
[data/elec_and_water_only.sample.pdf] PDF analysis produced by this script for elec_and_water_only.csv.sample
|
158
|
+
|
159
|
+
=== example run
|
160
|
+
|
161
|
+
./scan_all_bills.sh ../path_to_my_bills/*.pdf > my_bill_data.csv
|
162
|
+
./full_analysis.R data_file.csv my_bill_data.csv
|
163
|
+
|
164
|
+
This will have produced an analysis of all your bills in <tt>full_analysis.pdf</tt>.
|
165
|
+
|
166
|
+
== Contributing to sps_bill_scanner
|
167
|
+
|
168
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
169
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
|
170
|
+
* Fork the project
|
171
|
+
* Start a feature/bugfix branch
|
172
|
+
* Commit and push until you are happy with your contribution
|
173
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
174
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
175
|
+
|
176
|
+
== Copyright
|
177
|
+
|
178
|
+
Copyright (c) 2012 Paul Gallagher. See LICENSE for
|
179
|
+
further details.
|
180
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Set up the default and development gem groups; when Bundler reports a
# problem, print it with a hint and exit using Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'rake'
require 'rspec'
require 'rspec/core/rake_task'

# make lib/ loadable so the gem version constant can be read below
$LOAD_PATH.unshift('lib')
require 'sps_bill/version'

# --- gem packaging tasks (jeweler) ---
require 'jeweler'
Jeweler::Tasks.new do |gemspec|
  # gemspec is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gemspec.name        = "sps_bill"
  gemspec.version     = SpsBill::Version::STRING
  gemspec.summary     = "SP Services PDF bill structured data reader"
  gemspec.description = "a library that can read SP Services PDF bills and extract and summarize the bill details"
  gemspec.homepage    = "https://github.com/tardate/sps_bill_scanner"
  gemspec.license     = "MIT"
  gemspec.authors     = ["Paul Gallagher"]
  gemspec.email       = "gallagher.paul@gmail.com"
  gemspec.files.exclude 'pkg/*'
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# --- test task ---
desc "Run all RSpec test examples"
RSpec::Core::RakeTask.new do |rspec_task|
  rspec_task.pattern    = 'spec/**/*_spec.rb'
  rspec_task.rspec_opts = ["-c", "-f progress"]
end

task :default => :spec

# --- documentation task ---
require 'rdoc/task'
RDoc::Task.new do |doc_task|
  doc_task.title    = "sps_bill #{SpsBill::Version::STRING}"
  doc_task.main     = "README.rdoc"
  doc_task.rdoc_dir = 'rdoc'
  doc_task.rdoc_files.include('README*', 'lib/**/*.rb')
end
|
data/bin/sps_bill
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: parses the command-line options declared in
# SpsBill::Shell::OPTIONS and hands them to SpsBill::Shell to run.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'rubygems'
require 'sps_bill'
require 'getoptions'

begin
  SpsBill::Shell.new(GetOptions.new(SpsBill::Shell::OPTIONS)).run
rescue StandardError => e
  # Only ordinary errors are reported here; SystemExit and Interrupt are left
  # alone so that `exit` and Ctrl-C behave normally.
  STDERR.puts e
  # signal the failure to the calling shell
  exit 1
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
class PDF::Reader
|
2
|
+
class ObjectHash
|
3
|
+
|
4
|
+
# Resolves +input+ to an IO-like object.
# An object responding to both +seek+ and +read+ is returned untouched; the
# name of an existing file is loaded via +read_with_quirks+.
# Raises ArgumentError for anything else.
def extract_io_from(input)
  return input if input.respond_to?(:seek) && input.respond_to?(:read)
  return read_with_quirks(input) if File.file?(input.to_s)

  raise ArgumentError, "input must be an IO-like object or a filename"
end
|
13
|
+
|
14
|
+
# Load file as a StringIO stream, accounting for invalid format
# where additional characters exist in the file before the %PDF start of file.
# +input+ is converted with +to_s+ and opened in binary mode.
# Raises ArgumentError when +pdf_offset+ finds no PDF start marker.
def read_with_quirks(input)
  # block form guarantees the file handle is closed, even when we raise
  File.open(input.to_s, "rb") do |stream|
    ofs = pdf_offset(stream)
    raise ArgumentError, "invalid file format" unless ofs

    # skip any junk prefix and keep the remainder in memory
    stream.seek(ofs)
    StringIO.new(stream.read)
  end
end
|
25
|
+
private :read_with_quirks
|
26
|
+
|
27
|
+
# Returns the offset of the PDF document in the +stream+.
# Checks the first 50 bytes of the stream for a '%' character and returns its
# offset (0..49), or nil if no PDF stream is detected there - including when
# the stream is empty or shorter than 50 bytes.
def pdf_offset(stream)
  stream.rewind
  # read with a length returns nil at end of stream, hence the fallback
  header = stream.read(50) || ''
  header.index('%')
end
|
37
|
+
private :pdf_offset
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# A page text receiver that stores each shown string under its position:
#   { y_position: { x_position: content } }
class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver

  # override content accessor to expose the positional hash as-is
  attr_reader :content

  # record text that is drawn on the page
  # Raises PDF::Reader::MalformedPDFError when no current font is set.
  def show_text(string) # Tj
    font = @state.current_font
    raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

    x, y = @state.trm_transform(0,0)
    row = (@content[y] ||= {})
    row[x] = font.to_utf8(string)
  end

end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# Reads text content from a PDF together with its position on the page.
# The hairy part is +fuzzed_y+, which clusters text with nearly equal
# y positions so that content on the same visual line can be extracted
# with correct alignment.
#
class PDF::StructuredReader
  attr_reader :reader

  # +source+ is a file name or stream-like object
  def initialize(source)
    @reader = PDF::Reader.new(source)
  end

  # Returns positional (with fuzzed y positioning) text content collection as a hash:
  #   { y_position: { x_position: content}}
  # The result is cached per +page+.
  def content(page=1)
    @content ||= []
    @content[page] ||= fuzzed_y(precise_content(page))
  end

  # Returns a hash with fuzzed y positioning:
  #   { fuzzed_y_position: { x_position: content}}
  # Given +input+ as a hash:
  #   { y_position: { x_position: content}}
  # y values that fall within +precision+ points of another will be clustered
  # under the first (lowest) y value of the cluster.
  def fuzzed_y(input,precision=3)
    input.keys.sort.each_with_object({}) do |precise_y, clustered|
      anchor_y = clustered.keys.find { |known_y| (known_y - precise_y).abs < precision } || precise_y
      (clustered[anchor_y] ||= {}).merge!(input[precise_y])
    end
  end

  # Returns positional text content collection as a hash with precise x,y positioning:
  #   { y_position: { x_position: content}}
  # The result is cached per +page+.
  def precise_content(page=1)
    @precise_content ||= []
    @precise_content[page] ||= load_content(page)
  end

  # Returns an array of rows of text elements in the bounding box.
  # Rows are ordered by descending y, elements within a row by ascending x;
  # rows with no element inside the box are left out.
  def text_in_rect(xmin,xmax,ymin,ymax,page=1)
    text_map = content(page)
    ys_in_box = text_map.keys.select { |y| y >= ymin && y <= ymax }.sort.reverse
    rows = ys_in_box.map do |y|
      line = text_map[y]
      line.keys.select { |x| x >= xmin && x <= xmax }.sort.map { |x| line[x] }
    end
    rows.reject(&:empty?)
  end

  # Returns the position {x: val, y: val } of +text+ on +page+,
  # or nil when it is not found.
  # +text+ may be a string (exact match required) or a Regexp
  def text_position(text,page=1)
    by_pattern = text.is_a?(Regexp)
    located = content(page).map do |y, line|
      hit = nil
      if by_pattern
        # the last matching element of the line wins
        line.each { |x, value| hit = x if value =~ text }
      else
        # [x, text] pair; flattened below so x still lands at index 1
        hit = line.rassoc(text)
      end
      [y, hit] if hit
    end
    coords = located.compact.flatten
    { :x => coords[1], :y => coords[0] } unless coords.empty?
  end

  # WIP - not using Textangle yet for text extraction.
  # Ideal usage is something like this:
  #
  #   textangle = reader.bounding_box do
  #     page 1
  #     below "Electricity Services"
  #     above "Gas Services by City Gas Pte Ltd"
  #     right_of 240.0
  #     left_of "Total ($)"
  #   end
  #   textangle.text
  #
  def bounding_box(&block)
    PDF::Reader::Textangle.new(self,&block)
  end

  private

  # Walks +page+ with a positional text receiver and returns what it collected
  def load_content(page)
    receiver = PDF::Reader::PositionalTextReceiver.new
    reader.page(page).walk(receiver)
    receiver.content
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# A DSL syntax for describing a rectangular region of text to extract.
# WIP - not using this yet
#
# Intended usage:
#
#   textangle = PDF::Reader::Textangle.new(reader) do
#     page 1
#     below "Electricity Services"
#     above "Gas Services by City Gas Pte Ltd"
#     right_of 240.0
#     left_of "Total ($)"
#   end
#   textangle.text
#
# NOTE(review): only attribute writers are declared below, so the bare-word
# calls in the example presumably still need matching methods - confirm
# before relying on the block form.
#
class PDF::Reader::Textangle
  attr_reader :reader

  attr_writer :page
  attr_writer :above
  attr_writer :below
  attr_writer :left_of
  attr_writer :right_of

  # +structured_reader+ is a PDF::StructuredReader
  # An optional block is evaluated in the context of the new instance.
  def initialize(structured_reader,&block)
    @reader = structured_reader
    return unless block

    instance_eval(&block)
  end

  # Not implemented yet; currently returns nil.
  def text
    # TODO
  end

end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# SpsBill::Bill represents an individual SP Services PDF bill
#
# A bill is built from a file name (or stream-like object) and parsed
# immediately; the parsed details are then available through read-only
# accessors (e.g. <tt>electricity_usage</tt>).
#
class SpsBill::Bill
  include SpsBill::BillParser

  # the file name or stream-like object the bill was created from
  attr_reader :source_file

  # headline bill details
  attr_reader :account_number
  attr_reader :total_amount
  attr_reader :invoice_date
  attr_reader :invoice_month

  # usage charges, each an array of hashed values:
  #   electricity_usage: [{ kwh: float, rate: float, amount: float }]
  #   gas_usage:         [{ kwh: float, rate: float, amount: float }]
  #   water_usage:       [{ cubic_m: float, rate: float, amount: float }]
  attr_reader :electricity_usage
  attr_reader :gas_usage
  attr_reader :water_usage

  # +source+ is a file name or stream-like object
  def initialize(source)
    @source_file = source
    do_complete_parse
  end

  # Returns the PDF reader instance (created on first use),
  # or nil when there is no source file
  def reader
    return unless source_file

    @reader ||= PDF::StructuredReader.new(source_file)
  end

  # Return a pretty(-ish) text format of the core bill details
  def to_s
    %(
Account number: #{account_number}
Invoice date : #{invoice_date}
Service month : #{invoice_month}
Total bill : $#{total_amount}

Electricity Usage
-----------------
#{electricity_usage}

Gas Usage
---------
#{gas_usage}

Water Usage
-----------
#{water_usage}

)
  end

end
|