sps_bill 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -1
- data/CHANGELOG +9 -0
- data/Gemfile +1 -2
- data/Gemfile.lock +5 -5
- data/README.rdoc +15 -5
- data/lib/sps_bill.rb +1 -5
- data/lib/sps_bill/bill.rb +12 -9
- data/lib/sps_bill/bill_collection.rb +26 -17
- data/lib/sps_bill/bill_parser.rb +83 -37
- data/lib/sps_bill/shell.rb +4 -2
- data/lib/sps_bill/version.rb +1 -1
- data/scripts/data/all_services.csv.sample +70 -0
- data/scripts/data/all_services.sample.pdf +0 -0
- data/scripts/data/elec_and_water_only.csv.sample +53 -0
- data/scripts/data/elec_and_water_only.sample.pdf +0 -0
- data/scripts/full_analysis.R +191 -0
- data/spec/support/bill_examples.rb +1 -1
- data/spec/support/pdf_samples_helper.rb +2 -2
- data/spec/unit/bill_collection_spec.rb +6 -6
- data/spec/unit/shell_spec.rb +1 -1
- data/sps_bill.gemspec +11 -14
- metadata +27 -38
- data/lib/pdf/object_hash.rb +0 -39
- data/lib/pdf/positional_text_receiver.rb +0 -16
- data/lib/pdf/structured_reader.rb +0 -108
- data/lib/pdf/textangle.rb +0 -27
- data/spec/fixtures/pdf_samples/junk_prefix.pdf +0 -71
- data/spec/unit/pdf/object_hash_spec.rb +0 -15
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sps_bill
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,22 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name: pdf-reader
|
16
|
-
requirement: &
|
15
|
+
name: pdf-reader-turtletext
|
16
|
+
requirement: &70331993763820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 0.2.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70331993763820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: getoptions
|
27
|
-
requirement: &
|
27
|
+
requirement: &70331993762960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.3'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70331993762960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70331993762360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.1.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70331993762360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: jeweler
|
49
|
-
requirement: &
|
49
|
+
requirement: &70331993761820 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,21 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.6.4
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
58
|
-
- !ruby/object:Gem::Dependency
|
59
|
-
name: rcov
|
60
|
-
requirement: &70313136419920 !ruby/object:Gem::Requirement
|
61
|
-
none: false
|
62
|
-
requirements:
|
63
|
-
- - ! '>='
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
version: '0'
|
66
|
-
type: :development
|
67
|
-
prerelease: false
|
68
|
-
version_requirements: *70313136419920
|
57
|
+
version_requirements: *70331993761820
|
69
58
|
- !ruby/object:Gem::Dependency
|
70
59
|
name: rake
|
71
|
-
requirement: &
|
60
|
+
requirement: &70331993760540 !ruby/object:Gem::Requirement
|
72
61
|
none: false
|
73
62
|
requirements:
|
74
63
|
- - ~>
|
@@ -76,10 +65,10 @@ dependencies:
|
|
76
65
|
version: 0.9.2.2
|
77
66
|
type: :development
|
78
67
|
prerelease: false
|
79
|
-
version_requirements: *
|
68
|
+
version_requirements: *70331993760540
|
80
69
|
- !ruby/object:Gem::Dependency
|
81
70
|
name: rspec
|
82
|
-
requirement: &
|
71
|
+
requirement: &70331993759960 !ruby/object:Gem::Requirement
|
83
72
|
none: false
|
84
73
|
requirements:
|
85
74
|
- - ~>
|
@@ -87,10 +76,10 @@ dependencies:
|
|
87
76
|
version: 2.8.0
|
88
77
|
type: :development
|
89
78
|
prerelease: false
|
90
|
-
version_requirements: *
|
79
|
+
version_requirements: *70331993759960
|
91
80
|
- !ruby/object:Gem::Dependency
|
92
81
|
name: rdoc
|
93
|
-
requirement: &
|
82
|
+
requirement: &70331993743820 !ruby/object:Gem::Requirement
|
94
83
|
none: false
|
95
84
|
requirements:
|
96
85
|
- - ~>
|
@@ -98,10 +87,10 @@ dependencies:
|
|
98
87
|
version: '3.11'
|
99
88
|
type: :development
|
100
89
|
prerelease: false
|
101
|
-
version_requirements: *
|
90
|
+
version_requirements: *70331993743820
|
102
91
|
- !ruby/object:Gem::Dependency
|
103
92
|
name: guard-rspec
|
104
|
-
requirement: &
|
93
|
+
requirement: &70331993743100 !ruby/object:Gem::Requirement
|
105
94
|
none: false
|
106
95
|
requirements:
|
107
96
|
- - ! '>='
|
@@ -109,7 +98,7 @@ dependencies:
|
|
109
98
|
version: '0'
|
110
99
|
type: :development
|
111
100
|
prerelease: false
|
112
|
-
version_requirements: *
|
101
|
+
version_requirements: *70331993743100
|
113
102
|
description: a library that can read SP Services PDF bills and extract and summarize
|
114
103
|
the bill details
|
115
104
|
email: gallagher.paul@gmail.com
|
@@ -124,6 +113,7 @@ files:
|
|
124
113
|
- .rspec
|
125
114
|
- .rvmrc
|
126
115
|
- .travis.yml
|
116
|
+
- CHANGELOG
|
127
117
|
- Gemfile
|
128
118
|
- Gemfile.lock
|
129
119
|
- Guardfile
|
@@ -131,10 +121,6 @@ files:
|
|
131
121
|
- README.rdoc
|
132
122
|
- Rakefile
|
133
123
|
- bin/sps_bill
|
134
|
-
- lib/pdf/object_hash.rb
|
135
|
-
- lib/pdf/positional_text_receiver.rb
|
136
|
-
- lib/pdf/structured_reader.rb
|
137
|
-
- lib/pdf/textangle.rb
|
138
124
|
- lib/sps_bill.rb
|
139
125
|
- lib/sps_bill/bill.rb
|
140
126
|
- lib/sps_bill/bill_collection.rb
|
@@ -142,9 +128,13 @@ files:
|
|
142
128
|
- lib/sps_bill/shell.rb
|
143
129
|
- lib/sps_bill/version.rb
|
144
130
|
- scripts/data/.gitkeep
|
131
|
+
- scripts/data/all_services.csv.sample
|
132
|
+
- scripts/data/all_services.sample.pdf
|
133
|
+
- scripts/data/elec_and_water_only.csv.sample
|
134
|
+
- scripts/data/elec_and_water_only.sample.pdf
|
135
|
+
- scripts/full_analysis.R
|
145
136
|
- scripts/scan_all_bills.sh
|
146
137
|
- spec/fixtures/pdf_samples/.gitkeep
|
147
|
-
- spec/fixtures/pdf_samples/junk_prefix.pdf
|
148
138
|
- spec/fixtures/personal_pdf_samples/.gitkeep
|
149
139
|
- spec/fixtures/personal_pdf_samples/expectations.yml.sample
|
150
140
|
- spec/integration/personal_samples_spec.rb
|
@@ -153,7 +143,6 @@ files:
|
|
153
143
|
- spec/support/pdf_samples_helper.rb
|
154
144
|
- spec/unit/bill_collection_spec.rb
|
155
145
|
- spec/unit/bill_spec.rb
|
156
|
-
- spec/unit/pdf/object_hash_spec.rb
|
157
146
|
- spec/unit/shell_spec.rb
|
158
147
|
- sps_bill.gemspec
|
159
148
|
homepage: https://github.com/tardate/sps_bill_scanner
|
data/lib/pdf/object_hash.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
class PDF::Reader
|
2
|
-
class ObjectHash
|
3
|
-
|
4
|
-
def extract_io_from(input)
|
5
|
-
if input.respond_to?(:seek) && input.respond_to?(:read)
|
6
|
-
input
|
7
|
-
elsif File.file?(input.to_s)
|
8
|
-
read_with_quirks(input)
|
9
|
-
else
|
10
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
# Load file as a StringIO stream, accounting for invalid format
|
15
|
-
# where additional characters exist in the file before the %PDF start of file
|
16
|
-
def read_with_quirks(input)
|
17
|
-
stream = File.open(input.to_s, "rb")
|
18
|
-
if ofs = pdf_offset(stream)
|
19
|
-
stream.seek(ofs)
|
20
|
-
StringIO.new(stream.read)
|
21
|
-
else
|
22
|
-
raise ArgumentError, "invalid file format"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
private :read_with_quirks
|
26
|
-
|
27
|
-
# Returns the offset of the PDF document in the +stream+.
|
28
|
-
# Checks up to 50 chars into the file, returns nil if no PDF stream is detected.
|
29
|
-
def pdf_offset(stream)
|
30
|
-
stream.rewind
|
31
|
-
ofs = stream.pos
|
32
|
-
until (c = stream.readchar) == '%' || c == 37 || ofs > 50
|
33
|
-
ofs += 1
|
34
|
-
end
|
35
|
-
ofs < 50 ? ofs : nil
|
36
|
-
end
|
37
|
-
private :pdf_offset
|
38
|
-
end
|
39
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
|
2
|
-
|
3
|
-
# record text that is drawn on the page
|
4
|
-
def show_text(string) # Tj
|
5
|
-
raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
|
6
|
-
newx, newy = @state.trm_transform(0,0)
|
7
|
-
@content[newy] ||= {}
|
8
|
-
@content[newy][newx] = @state.current_font.to_utf8(string)
|
9
|
-
end
|
10
|
-
|
11
|
-
# override content accessor
|
12
|
-
def content
|
13
|
-
@content
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
@@ -1,108 +0,0 @@
|
|
1
|
-
# Class for reading structured text content
|
2
|
-
# This is the one that is a bit hairy - specifically check the fuzzed_y usage
|
3
|
-
# which attempts to align text content in the PDF so it can be extracted
|
4
|
-
# with correct alignment.
|
5
|
-
#
|
6
|
-
class PDF::StructuredReader
|
7
|
-
attr_reader :reader
|
8
|
-
|
9
|
-
# +source+ is a file name or stream-like object
|
10
|
-
def initialize(source)
|
11
|
-
@reader = PDF::Reader.new(source)
|
12
|
-
end
|
13
|
-
|
14
|
-
# Returns positional (with fuzzed y positioning) text content collection as a hash:
|
15
|
-
# { y_position: { x_position: content}}
|
16
|
-
def content(page=1)
|
17
|
-
@content ||= []
|
18
|
-
if @content[page]
|
19
|
-
@content[page]
|
20
|
-
else
|
21
|
-
@content[page] = fuzzed_y(precise_content(page))
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# Returns a hash with fuzzed y positioning:
|
26
|
-
# { fuzzed_y_position: { x_position: content}}
|
27
|
-
# Given +input+ as a hash:
|
28
|
-
# { y_position: { x_position: content}}
|
29
|
-
# y values that fall within +precision+ points of another will be clustered
|
30
|
-
def fuzzed_y(input,precision=3)
|
31
|
-
output = {}
|
32
|
-
input.keys.sort.each do |precise_y|
|
33
|
-
# matching_y = (precise_y / 5.0).truncate * 5.0
|
34
|
-
matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < precision }.first || precise_y
|
35
|
-
output[matching_y] ||= {}
|
36
|
-
output[matching_y].merge!(input[precise_y])
|
37
|
-
end
|
38
|
-
output
|
39
|
-
end
|
40
|
-
|
41
|
-
# Returns positional text content collection as a hash with precise x,y positioning:
|
42
|
-
# { y_position: { x_position: content}}
|
43
|
-
def precise_content(page=1)
|
44
|
-
@precise_content ||= []
|
45
|
-
if @precise_content[page]
|
46
|
-
@precise_content[page]
|
47
|
-
else
|
48
|
-
@precise_content[page] = load_content(page)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# Returns an array of text elements in the bounding box
|
53
|
-
def text_in_rect(xmin,xmax,ymin,ymax,page=1)
|
54
|
-
text_map = content(page)
|
55
|
-
box = []
|
56
|
-
text_map.keys.sort.reverse.each do |y|
|
57
|
-
if y >= ymin && y<= ymax
|
58
|
-
row = []
|
59
|
-
text_map[y].keys.sort.each do |x|
|
60
|
-
if x >= xmin && x<= xmax
|
61
|
-
row << text_map[y][x]
|
62
|
-
end
|
63
|
-
end
|
64
|
-
box << row unless row.empty?
|
65
|
-
end
|
66
|
-
end
|
67
|
-
box
|
68
|
-
end
|
69
|
-
|
70
|
-
# Returns the position {x: val, y: val } of +text+ on +page+
|
71
|
-
# +text+ may be a string (exact match required) or a Regexp
|
72
|
-
def text_position(text,page=1)
|
73
|
-
item = if text.class <= Regexp
|
74
|
-
content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
|
75
|
-
else
|
76
|
-
content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
77
|
-
end
|
78
|
-
item = item.compact.flatten
|
79
|
-
unless item.empty?
|
80
|
-
{ :x => item[1], :y => item[0] }
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
# WIP - not using Textangle yet for text extraction.
|
85
|
-
# Ideal usage is something like this:
|
86
|
-
#
|
87
|
-
# textangle = reader.bounding_box do
|
88
|
-
# page 1
|
89
|
-
# below "Electricity Services"
|
90
|
-
# above "Gas Services by City Gas Pte Ltd"
|
91
|
-
# right_of 240.0
|
92
|
-
# left_of "Total ($)"
|
93
|
-
# end
|
94
|
-
# textangle.text
|
95
|
-
#
|
96
|
-
def bounding_box(&block)
|
97
|
-
PDF::Reader::Textangle.new(self,&block)
|
98
|
-
end
|
99
|
-
|
100
|
-
private
|
101
|
-
|
102
|
-
def load_content(page)
|
103
|
-
receiver = PDF::Reader::PositionalTextReceiver.new
|
104
|
-
reader.page(page).walk(receiver)
|
105
|
-
receiver.content
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|
data/lib/pdf/textangle.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
# A DSL syntax for text extraction.
|
2
|
-
# WIP - not using this yet
|
3
|
-
#
|
4
|
-
# textangle = PDF::Reader::Textangle.new(reader) do
|
5
|
-
# page 1
|
6
|
-
# below "Electricity Services"
|
7
|
-
# above "Gas Services by City Gas Pte Ltd"
|
8
|
-
# right_of 240.0
|
9
|
-
# left_of "Total ($)"
|
10
|
-
# end
|
11
|
-
# textangle.text
|
12
|
-
#
|
13
|
-
class PDF::Reader::Textangle
|
14
|
-
attr_reader :reader
|
15
|
-
attr_writer :page,:above,:below,:left_of,:right_of
|
16
|
-
|
17
|
-
# +structured_reader+ is a PDF::StructuredReader
|
18
|
-
def initialize(structured_reader,&block)
|
19
|
-
@reader = structured_reader
|
20
|
-
instance_eval( &block ) if block
|
21
|
-
end
|
22
|
-
|
23
|
-
def text
|
24
|
-
# TODO
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
@@ -1,71 +0,0 @@
|
|
1
|
-
<html>
|
2
|
-
<head></head>
|
3
|
-
%PDF-1.3
|
4
|
-
%����
|
5
|
-
1 0 obj
|
6
|
-
<< /Creator <feff0050007200610077006e>
|
7
|
-
/Producer <feff0050007200610077006e>
|
8
|
-
>>
|
9
|
-
endobj
|
10
|
-
2 0 obj
|
11
|
-
<< /Type /Catalog
|
12
|
-
/Pages 3 0 R
|
13
|
-
>>
|
14
|
-
endobj
|
15
|
-
3 0 obj
|
16
|
-
<< /Type /Pages
|
17
|
-
/Count 1
|
18
|
-
/Kids [5 0 R]
|
19
|
-
>>
|
20
|
-
endobj
|
21
|
-
4 0 obj
|
22
|
-
<< /Length 157
|
23
|
-
>>
|
24
|
-
stream
|
25
|
-
q
|
26
|
-
|
27
|
-
BT
|
28
|
-
36 747.384 Td
|
29
|
-
/F1.0 12 Tf
|
30
|
-
[<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
|
31
|
-
ET
|
32
|
-
|
33
|
-
Q
|
34
|
-
|
35
|
-
endstream
|
36
|
-
endobj
|
37
|
-
5 0 obj
|
38
|
-
<< /Type /Page
|
39
|
-
/Parent 3 0 R
|
40
|
-
/MediaBox [0 0 612.0 792.0]
|
41
|
-
/Contents 4 0 R
|
42
|
-
/Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
|
43
|
-
/Font << /F1.0 6 0 R
|
44
|
-
>>
|
45
|
-
>>
|
46
|
-
>>
|
47
|
-
endobj
|
48
|
-
6 0 obj
|
49
|
-
<< /Type /Font
|
50
|
-
/Subtype /Type1
|
51
|
-
/BaseFont /Helvetica
|
52
|
-
/Encoding /WinAnsiEncoding
|
53
|
-
>>
|
54
|
-
endobj
|
55
|
-
xref
|
56
|
-
0 7
|
57
|
-
0000000000 65535 f
|
58
|
-
0000000015 00000 n
|
59
|
-
0000000109 00000 n
|
60
|
-
0000000158 00000 n
|
61
|
-
0000000215 00000 n
|
62
|
-
0000000423 00000 n
|
63
|
-
0000000601 00000 n
|
64
|
-
trailer
|
65
|
-
<< /Size 7
|
66
|
-
/Root 2 0 R
|
67
|
-
/Info 1 0 R
|
68
|
-
>>
|
69
|
-
startxref
|
70
|
-
698
|
71
|
-
%%EOF
|
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
include PdfSamplesHelper
|
3
|
-
|
4
|
-
describe PDF::Reader::ObjectHash do
|
5
|
-
|
6
|
-
context "when there is a junk prefix" do
|
7
|
-
let(:sample_name) { junk_prefix_pdf_sample_name }
|
8
|
-
let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
|
9
|
-
let(:stream) { object_hash.instance_variable_get(:@io) }
|
10
|
-
before { stream.rewind }
|
11
|
-
subject { stream.read(4) }
|
12
|
-
it { should eql("%PDF") }
|
13
|
-
end
|
14
|
-
|
15
|
-
end
|