sps_bill 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -1
- data/CHANGELOG +9 -0
- data/Gemfile +1 -2
- data/Gemfile.lock +5 -5
- data/README.rdoc +15 -5
- data/lib/sps_bill.rb +1 -5
- data/lib/sps_bill/bill.rb +12 -9
- data/lib/sps_bill/bill_collection.rb +26 -17
- data/lib/sps_bill/bill_parser.rb +83 -37
- data/lib/sps_bill/shell.rb +4 -2
- data/lib/sps_bill/version.rb +1 -1
- data/scripts/data/all_services.csv.sample +70 -0
- data/scripts/data/all_services.sample.pdf +0 -0
- data/scripts/data/elec_and_water_only.csv.sample +53 -0
- data/scripts/data/elec_and_water_only.sample.pdf +0 -0
- data/scripts/full_analysis.R +191 -0
- data/spec/support/bill_examples.rb +1 -1
- data/spec/support/pdf_samples_helper.rb +2 -2
- data/spec/unit/bill_collection_spec.rb +6 -6
- data/spec/unit/shell_spec.rb +1 -1
- data/sps_bill.gemspec +11 -14
- metadata +27 -38
- data/lib/pdf/object_hash.rb +0 -39
- data/lib/pdf/positional_text_receiver.rb +0 -16
- data/lib/pdf/structured_reader.rb +0 -108
- data/lib/pdf/textangle.rb +0 -27
- data/spec/fixtures/pdf_samples/junk_prefix.pdf +0 -71
- data/spec/unit/pdf/object_hash_spec.rb +0 -15
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sps_bill
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,22 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name: pdf-reader
|
16
|
-
requirement: &
|
15
|
+
name: pdf-reader-turtletext
|
16
|
+
requirement: &70331993763820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 0.2.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70331993763820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: getoptions
|
27
|
-
requirement: &
|
27
|
+
requirement: &70331993762960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.3'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70331993762960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70331993762360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.1.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70331993762360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: jeweler
|
49
|
-
requirement: &
|
49
|
+
requirement: &70331993761820 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,21 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.6.4
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
58
|
-
- !ruby/object:Gem::Dependency
|
59
|
-
name: rcov
|
60
|
-
requirement: &70313136419920 !ruby/object:Gem::Requirement
|
61
|
-
none: false
|
62
|
-
requirements:
|
63
|
-
- - ! '>='
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
version: '0'
|
66
|
-
type: :development
|
67
|
-
prerelease: false
|
68
|
-
version_requirements: *70313136419920
|
57
|
+
version_requirements: *70331993761820
|
69
58
|
- !ruby/object:Gem::Dependency
|
70
59
|
name: rake
|
71
|
-
requirement: &
|
60
|
+
requirement: &70331993760540 !ruby/object:Gem::Requirement
|
72
61
|
none: false
|
73
62
|
requirements:
|
74
63
|
- - ~>
|
@@ -76,10 +65,10 @@ dependencies:
|
|
76
65
|
version: 0.9.2.2
|
77
66
|
type: :development
|
78
67
|
prerelease: false
|
79
|
-
version_requirements: *
|
68
|
+
version_requirements: *70331993760540
|
80
69
|
- !ruby/object:Gem::Dependency
|
81
70
|
name: rspec
|
82
|
-
requirement: &
|
71
|
+
requirement: &70331993759960 !ruby/object:Gem::Requirement
|
83
72
|
none: false
|
84
73
|
requirements:
|
85
74
|
- - ~>
|
@@ -87,10 +76,10 @@ dependencies:
|
|
87
76
|
version: 2.8.0
|
88
77
|
type: :development
|
89
78
|
prerelease: false
|
90
|
-
version_requirements: *
|
79
|
+
version_requirements: *70331993759960
|
91
80
|
- !ruby/object:Gem::Dependency
|
92
81
|
name: rdoc
|
93
|
-
requirement: &
|
82
|
+
requirement: &70331993743820 !ruby/object:Gem::Requirement
|
94
83
|
none: false
|
95
84
|
requirements:
|
96
85
|
- - ~>
|
@@ -98,10 +87,10 @@ dependencies:
|
|
98
87
|
version: '3.11'
|
99
88
|
type: :development
|
100
89
|
prerelease: false
|
101
|
-
version_requirements: *
|
90
|
+
version_requirements: *70331993743820
|
102
91
|
- !ruby/object:Gem::Dependency
|
103
92
|
name: guard-rspec
|
104
|
-
requirement: &
|
93
|
+
requirement: &70331993743100 !ruby/object:Gem::Requirement
|
105
94
|
none: false
|
106
95
|
requirements:
|
107
96
|
- - ! '>='
|
@@ -109,7 +98,7 @@ dependencies:
|
|
109
98
|
version: '0'
|
110
99
|
type: :development
|
111
100
|
prerelease: false
|
112
|
-
version_requirements: *
|
101
|
+
version_requirements: *70331993743100
|
113
102
|
description: a library that can read SP Services PDF bills and extract and summarize
|
114
103
|
the bill details
|
115
104
|
email: gallagher.paul@gmail.com
|
@@ -124,6 +113,7 @@ files:
|
|
124
113
|
- .rspec
|
125
114
|
- .rvmrc
|
126
115
|
- .travis.yml
|
116
|
+
- CHANGELOG
|
127
117
|
- Gemfile
|
128
118
|
- Gemfile.lock
|
129
119
|
- Guardfile
|
@@ -131,10 +121,6 @@ files:
|
|
131
121
|
- README.rdoc
|
132
122
|
- Rakefile
|
133
123
|
- bin/sps_bill
|
134
|
-
- lib/pdf/object_hash.rb
|
135
|
-
- lib/pdf/positional_text_receiver.rb
|
136
|
-
- lib/pdf/structured_reader.rb
|
137
|
-
- lib/pdf/textangle.rb
|
138
124
|
- lib/sps_bill.rb
|
139
125
|
- lib/sps_bill/bill.rb
|
140
126
|
- lib/sps_bill/bill_collection.rb
|
@@ -142,9 +128,13 @@ files:
|
|
142
128
|
- lib/sps_bill/shell.rb
|
143
129
|
- lib/sps_bill/version.rb
|
144
130
|
- scripts/data/.gitkeep
|
131
|
+
- scripts/data/all_services.csv.sample
|
132
|
+
- scripts/data/all_services.sample.pdf
|
133
|
+
- scripts/data/elec_and_water_only.csv.sample
|
134
|
+
- scripts/data/elec_and_water_only.sample.pdf
|
135
|
+
- scripts/full_analysis.R
|
145
136
|
- scripts/scan_all_bills.sh
|
146
137
|
- spec/fixtures/pdf_samples/.gitkeep
|
147
|
-
- spec/fixtures/pdf_samples/junk_prefix.pdf
|
148
138
|
- spec/fixtures/personal_pdf_samples/.gitkeep
|
149
139
|
- spec/fixtures/personal_pdf_samples/expectations.yml.sample
|
150
140
|
- spec/integration/personal_samples_spec.rb
|
@@ -153,7 +143,6 @@ files:
|
|
153
143
|
- spec/support/pdf_samples_helper.rb
|
154
144
|
- spec/unit/bill_collection_spec.rb
|
155
145
|
- spec/unit/bill_spec.rb
|
156
|
-
- spec/unit/pdf/object_hash_spec.rb
|
157
146
|
- spec/unit/shell_spec.rb
|
158
147
|
- sps_bill.gemspec
|
159
148
|
homepage: https://github.com/tardate/sps_bill_scanner
|
data/lib/pdf/object_hash.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
class PDF::Reader
|
2
|
-
class ObjectHash
|
3
|
-
|
4
|
-
def extract_io_from(input)
|
5
|
-
if input.respond_to?(:seek) && input.respond_to?(:read)
|
6
|
-
input
|
7
|
-
elsif File.file?(input.to_s)
|
8
|
-
read_with_quirks(input)
|
9
|
-
else
|
10
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
# Load file as a StringIO stream, accounting for invalid format
|
15
|
-
# where additional characters exist in the file before the %PDF start of file
|
16
|
-
def read_with_quirks(input)
|
17
|
-
stream = File.open(input.to_s, "rb")
|
18
|
-
if ofs = pdf_offset(stream)
|
19
|
-
stream.seek(ofs)
|
20
|
-
StringIO.new(stream.read)
|
21
|
-
else
|
22
|
-
raise ArgumentError, "invalid file format"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
private :read_with_quirks
|
26
|
-
|
27
|
-
# Returns the offset of the PDF document in the +stream+.
|
28
|
-
# Checks up to 50 chars into the file, returns nil if no PDF stream detected.
|
29
|
-
def pdf_offset(stream)
|
30
|
-
stream.rewind
|
31
|
-
ofs = stream.pos
|
32
|
-
until (c = stream.readchar) == '%' || c == 37 || ofs > 50
|
33
|
-
ofs += 1
|
34
|
-
end
|
35
|
-
ofs < 50 ? ofs : nil
|
36
|
-
end
|
37
|
-
private :pdf_offset
|
38
|
-
end
|
39
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
|
2
|
-
|
3
|
-
# record text that is drawn on the page
|
4
|
-
def show_text(string) # Tj
|
5
|
-
raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
|
6
|
-
newx, newy = @state.trm_transform(0,0)
|
7
|
-
@content[newy] ||= {}
|
8
|
-
@content[newy][newx] = @state.current_font.to_utf8(string)
|
9
|
-
end
|
10
|
-
|
11
|
-
# override content accessor
|
12
|
-
def content
|
13
|
-
@content
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
@@ -1,108 +0,0 @@
|
|
1
|
-
# Class for reading structured text content
|
2
|
-
# This is the one that is a bit hairy - specifically check the fuzzed_y usage
|
3
|
-
# which attempts to align text content in the PDF so it can be extracted
|
4
|
-
# with correct alignment.
|
5
|
-
#
|
6
|
-
class PDF::StructuredReader
|
7
|
-
attr_reader :reader
|
8
|
-
|
9
|
-
# +source+ is a file name or stream-like object
|
10
|
-
def initialize(source)
|
11
|
-
@reader = PDF::Reader.new(source)
|
12
|
-
end
|
13
|
-
|
14
|
-
# Returns positional (with fuzzed y positioning) text content collection as a hash:
|
15
|
-
# { y_position: { x_position: content}}
|
16
|
-
def content(page=1)
|
17
|
-
@content ||= []
|
18
|
-
if @content[page]
|
19
|
-
@content[page]
|
20
|
-
else
|
21
|
-
@content[page] = fuzzed_y(precise_content(page))
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# Returns a hash with fuzzed y positioning:
|
26
|
-
# { fuzzed_y_position: { x_position: content}}
|
27
|
-
# Given +input+ as a hash:
|
28
|
-
# { y_position: { x_position: content}}
|
29
|
-
# y values that fall within +precision+ points of another will be clustered
|
30
|
-
def fuzzed_y(input,precision=3)
|
31
|
-
output = {}
|
32
|
-
input.keys.sort.each do |precise_y|
|
33
|
-
# matching_y = (precise_y / 5.0).truncate * 5.0
|
34
|
-
matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < precision }.first || precise_y
|
35
|
-
output[matching_y] ||= {}
|
36
|
-
output[matching_y].merge!(input[precise_y])
|
37
|
-
end
|
38
|
-
output
|
39
|
-
end
|
40
|
-
|
41
|
-
# Returns positional text content collection as a hash with precise x,y positioning:
|
42
|
-
# { y_position: { x_position: content}}
|
43
|
-
def precise_content(page=1)
|
44
|
-
@precise_content ||= []
|
45
|
-
if @precise_content[page]
|
46
|
-
@precise_content[page]
|
47
|
-
else
|
48
|
-
@precise_content[page] = load_content(page)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# Returns an array of text elements in the bounding box
|
53
|
-
def text_in_rect(xmin,xmax,ymin,ymax,page=1)
|
54
|
-
text_map = content(page)
|
55
|
-
box = []
|
56
|
-
text_map.keys.sort.reverse.each do |y|
|
57
|
-
if y >= ymin && y<= ymax
|
58
|
-
row = []
|
59
|
-
text_map[y].keys.sort.each do |x|
|
60
|
-
if x >= xmin && x<= xmax
|
61
|
-
row << text_map[y][x]
|
62
|
-
end
|
63
|
-
end
|
64
|
-
box << row unless row.empty?
|
65
|
-
end
|
66
|
-
end
|
67
|
-
box
|
68
|
-
end
|
69
|
-
|
70
|
-
# Returns the position {x: val, y: val } of +text+ on +page+
|
71
|
-
# +text+ may be a string (exact match required) or a Regexp
|
72
|
-
def text_position(text,page=1)
|
73
|
-
item = if text.class <= Regexp
|
74
|
-
content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
|
75
|
-
else
|
76
|
-
content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
77
|
-
end
|
78
|
-
item = item.compact.flatten
|
79
|
-
unless item.empty?
|
80
|
-
{ :x => item[1], :y => item[0] }
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
# WIP - not using Textangle yet for text extraction.
|
85
|
-
# Ideal usage is something like this:
|
86
|
-
#
|
87
|
-
# textangle = reader.bounding_box do
|
88
|
-
# page 1
|
89
|
-
# below "Electricity Services"
|
90
|
-
# above "Gas Services by City Gas Pte Ltd"
|
91
|
-
# right_of 240.0
|
92
|
-
# left_of "Total ($)"
|
93
|
-
# end
|
94
|
-
# textangle.text
|
95
|
-
#
|
96
|
-
def bounding_box(&block)
|
97
|
-
PDF::Reader::Textangle.new(self,&block)
|
98
|
-
end
|
99
|
-
|
100
|
-
private
|
101
|
-
|
102
|
-
def load_content(page)
|
103
|
-
receiver = PDF::Reader::PositionalTextReceiver.new
|
104
|
-
reader.page(page).walk(receiver)
|
105
|
-
receiver.content
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|
data/lib/pdf/textangle.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
# A DSL syntax for text extraction.
|
2
|
-
# WIP - not using this yet
|
3
|
-
#
|
4
|
-
# textangle = PDF::Reader::Textangle.new(reader) do
|
5
|
-
# page 1
|
6
|
-
# below "Electricity Services"
|
7
|
-
# above "Gas Services by City Gas Pte Ltd"
|
8
|
-
# right_of 240.0
|
9
|
-
# left_of "Total ($)"
|
10
|
-
# end
|
11
|
-
# textangle.text
|
12
|
-
#
|
13
|
-
class PDF::Reader::Textangle
|
14
|
-
attr_reader :reader
|
15
|
-
attr_writer :page,:above,:below,:left_of,:right_of
|
16
|
-
|
17
|
-
# +structured_reader+ is a PDF::StructuredReader
|
18
|
-
def initialize(structured_reader,&block)
|
19
|
-
@reader = structured_reader
|
20
|
-
instance_eval( &block ) if block
|
21
|
-
end
|
22
|
-
|
23
|
-
def text
|
24
|
-
# TODO
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
@@ -1,71 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<head></head>
|
3
|
-
%PDF-1.3
|
4
|
-
%����
|
5
|
-
1 0 obj
|
6
|
-
<< /Creator <feff0050007200610077006e>
|
7
|
-
/Producer <feff0050007200610077006e>
|
8
|
-
>>
|
9
|
-
endobj
|
10
|
-
2 0 obj
|
11
|
-
<< /Type /Catalog
|
12
|
-
/Pages 3 0 R
|
13
|
-
>>
|
14
|
-
endobj
|
15
|
-
3 0 obj
|
16
|
-
<< /Type /Pages
|
17
|
-
/Count 1
|
18
|
-
/Kids [5 0 R]
|
19
|
-
>>
|
20
|
-
endobj
|
21
|
-
4 0 obj
|
22
|
-
<< /Length 157
|
23
|
-
>>
|
24
|
-
stream
|
25
|
-
q
|
26
|
-
|
27
|
-
BT
|
28
|
-
36 747.384 Td
|
29
|
-
/F1.0 12 Tf
|
30
|
-
[<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
|
31
|
-
ET
|
32
|
-
|
33
|
-
Q
|
34
|
-
|
35
|
-
endstream
|
36
|
-
endobj
|
37
|
-
5 0 obj
|
38
|
-
<< /Type /Page
|
39
|
-
/Parent 3 0 R
|
40
|
-
/MediaBox [0 0 612.0 792.0]
|
41
|
-
/Contents 4 0 R
|
42
|
-
/Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
|
43
|
-
/Font << /F1.0 6 0 R
|
44
|
-
>>
|
45
|
-
>>
|
46
|
-
>>
|
47
|
-
endobj
|
48
|
-
6 0 obj
|
49
|
-
<< /Type /Font
|
50
|
-
/Subtype /Type1
|
51
|
-
/BaseFont /Helvetica
|
52
|
-
/Encoding /WinAnsiEncoding
|
53
|
-
>>
|
54
|
-
endobj
|
55
|
-
xref
|
56
|
-
0 7
|
57
|
-
0000000000 65535 f
|
58
|
-
0000000015 00000 n
|
59
|
-
0000000109 00000 n
|
60
|
-
0000000158 00000 n
|
61
|
-
0000000215 00000 n
|
62
|
-
0000000423 00000 n
|
63
|
-
0000000601 00000 n
|
64
|
-
trailer
|
65
|
-
<< /Size 7
|
66
|
-
/Root 2 0 R
|
67
|
-
/Info 1 0 R
|
68
|
-
>>
|
69
|
-
startxref
|
70
|
-
698
|
71
|
-
%%EOF
|
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
include PdfSamplesHelper
|
3
|
-
|
4
|
-
describe PDF::Reader::ObjectHash do
|
5
|
-
|
6
|
-
context "when there is a junk prefix" do
|
7
|
-
let(:sample_name) { junk_prefix_pdf_sample_name }
|
8
|
-
let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
|
9
|
-
let(:stream) { object_hash.instance_variable_get(:@io) }
|
10
|
-
before { stream.rewind }
|
11
|
-
subject { stream.read(4) }
|
12
|
-
it { should eql("%PDF") }
|
13
|
-
end
|
14
|
-
|
15
|
-
end
|