sps_bill 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sps_bill
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-20 00:00:00.000000000 Z
12
+ date: 2012-08-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: pdf-reader
16
- requirement: &70313136422520 !ruby/object:Gem::Requirement
15
+ name: pdf-reader-turtletext
16
+ requirement: &70331993763820 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - =
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 1.1.1
21
+ version: 0.2.2
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70313136422520
24
+ version_requirements: *70331993763820
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: getoptions
27
- requirement: &70313136421900 !ruby/object:Gem::Requirement
27
+ requirement: &70331993762960 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.3'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70313136421900
35
+ version_requirements: *70331993762960
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &70313136421060 !ruby/object:Gem::Requirement
38
+ requirement: &70331993762360 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.1.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70313136421060
46
+ version_requirements: *70331993762360
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: jeweler
49
- requirement: &70313136420480 !ruby/object:Gem::Requirement
49
+ requirement: &70331993761820 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,21 +54,10 @@ dependencies:
54
54
  version: 1.6.4
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70313136420480
58
- - !ruby/object:Gem::Dependency
59
- name: rcov
60
- requirement: &70313136419920 !ruby/object:Gem::Requirement
61
- none: false
62
- requirements:
63
- - - ! '>='
64
- - !ruby/object:Gem::Version
65
- version: '0'
66
- type: :development
67
- prerelease: false
68
- version_requirements: *70313136419920
57
+ version_requirements: *70331993761820
69
58
  - !ruby/object:Gem::Dependency
70
59
  name: rake
71
- requirement: &70313136419140 !ruby/object:Gem::Requirement
60
+ requirement: &70331993760540 !ruby/object:Gem::Requirement
72
61
  none: false
73
62
  requirements:
74
63
  - - ~>
@@ -76,10 +65,10 @@ dependencies:
76
65
  version: 0.9.2.2
77
66
  type: :development
78
67
  prerelease: false
79
- version_requirements: *70313136419140
68
+ version_requirements: *70331993760540
80
69
  - !ruby/object:Gem::Dependency
81
70
  name: rspec
82
- requirement: &70313136418160 !ruby/object:Gem::Requirement
71
+ requirement: &70331993759960 !ruby/object:Gem::Requirement
83
72
  none: false
84
73
  requirements:
85
74
  - - ~>
@@ -87,10 +76,10 @@ dependencies:
87
76
  version: 2.8.0
88
77
  type: :development
89
78
  prerelease: false
90
- version_requirements: *70313136418160
79
+ version_requirements: *70331993759960
91
80
  - !ruby/object:Gem::Dependency
92
81
  name: rdoc
93
- requirement: &70313136417580 !ruby/object:Gem::Requirement
82
+ requirement: &70331993743820 !ruby/object:Gem::Requirement
94
83
  none: false
95
84
  requirements:
96
85
  - - ~>
@@ -98,10 +87,10 @@ dependencies:
98
87
  version: '3.11'
99
88
  type: :development
100
89
  prerelease: false
101
- version_requirements: *70313136417580
90
+ version_requirements: *70331993743820
102
91
  - !ruby/object:Gem::Dependency
103
92
  name: guard-rspec
104
- requirement: &70313136417060 !ruby/object:Gem::Requirement
93
+ requirement: &70331993743100 !ruby/object:Gem::Requirement
105
94
  none: false
106
95
  requirements:
107
96
  - - ! '>='
@@ -109,7 +98,7 @@ dependencies:
109
98
  version: '0'
110
99
  type: :development
111
100
  prerelease: false
112
- version_requirements: *70313136417060
101
+ version_requirements: *70331993743100
113
102
  description: a library that can read SP Services PDF bills and extract and summarize
114
103
  the bill details
115
104
  email: gallagher.paul@gmail.com
@@ -124,6 +113,7 @@ files:
124
113
  - .rspec
125
114
  - .rvmrc
126
115
  - .travis.yml
116
+ - CHANGELOG
127
117
  - Gemfile
128
118
  - Gemfile.lock
129
119
  - Guardfile
@@ -131,10 +121,6 @@ files:
131
121
  - README.rdoc
132
122
  - Rakefile
133
123
  - bin/sps_bill
134
- - lib/pdf/object_hash.rb
135
- - lib/pdf/positional_text_receiver.rb
136
- - lib/pdf/structured_reader.rb
137
- - lib/pdf/textangle.rb
138
124
  - lib/sps_bill.rb
139
125
  - lib/sps_bill/bill.rb
140
126
  - lib/sps_bill/bill_collection.rb
@@ -142,9 +128,13 @@ files:
142
128
  - lib/sps_bill/shell.rb
143
129
  - lib/sps_bill/version.rb
144
130
  - scripts/data/.gitkeep
131
+ - scripts/data/all_services.csv.sample
132
+ - scripts/data/all_services.sample.pdf
133
+ - scripts/data/elec_and_water_only.csv.sample
134
+ - scripts/data/elec_and_water_only.sample.pdf
135
+ - scripts/full_analysis.R
145
136
  - scripts/scan_all_bills.sh
146
137
  - spec/fixtures/pdf_samples/.gitkeep
147
- - spec/fixtures/pdf_samples/junk_prefix.pdf
148
138
  - spec/fixtures/personal_pdf_samples/.gitkeep
149
139
  - spec/fixtures/personal_pdf_samples/expectations.yml.sample
150
140
  - spec/integration/personal_samples_spec.rb
@@ -153,7 +143,6 @@ files:
153
143
  - spec/support/pdf_samples_helper.rb
154
144
  - spec/unit/bill_collection_spec.rb
155
145
  - spec/unit/bill_spec.rb
156
- - spec/unit/pdf/object_hash_spec.rb
157
146
  - spec/unit/shell_spec.rb
158
147
  - sps_bill.gemspec
159
148
  homepage: https://github.com/tardate/sps_bill_scanner
@@ -1,39 +0,0 @@
1
- class PDF::Reader
2
- class ObjectHash
3
-
4
- def extract_io_from(input)
5
- if input.respond_to?(:seek) && input.respond_to?(:read)
6
- input
7
- elsif File.file?(input.to_s)
8
- read_with_quirks(input)
9
- else
10
- raise ArgumentError, "input must be an IO-like object or a filename"
11
- end
12
- end
13
-
14
- # Load file as a StringIO stream, accounting for invalid format
15
- # where additional characters exist in the file before the %PDF start of file
16
- def read_with_quirks(input)
17
- stream = File.open(input.to_s, "rb")
18
- if ofs = pdf_offset(stream)
19
- stream.seek(ofs)
20
- StringIO.new(stream.read)
21
- else
22
- raise ArgumentError, "invalid file format"
23
- end
24
- end
25
- private :read_with_quirks
26
-
27
- # Returns the offset of the PDF document in the +stream+.
28
- # Checks up to 50 chars into the file, returns nil if no PDF stream detected.
29
- def pdf_offset(stream)
30
- stream.rewind
31
- ofs = stream.pos
32
- until (c = stream.readchar) == '%' || c == 37 || ofs > 50
33
- ofs += 1
34
- end
35
- ofs < 50 ? ofs : nil
36
- end
37
- private :pdf_offset
38
- end
39
- end
@@ -1,16 +0,0 @@
1
- class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
2
-
3
- # record text that is drawn on the page
4
- def show_text(string) # Tj
5
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
6
- newx, newy = @state.trm_transform(0,0)
7
- @content[newy] ||= {}
8
- @content[newy][newx] = @state.current_font.to_utf8(string)
9
- end
10
-
11
- # override content accessor
12
- def content
13
- @content
14
- end
15
-
16
- end
@@ -1,108 +0,0 @@
1
- # Class for reading structured text content
2
- # This is the one that is a bit hairy - specifically check the fuzzed_y usage
3
- # which attempts to align text content in the PDF so it can be extracted
4
- # with correct alignment.
5
- #
6
- class PDF::StructuredReader
7
- attr_reader :reader
8
-
9
- # +source+ is a file name or stream-like object
10
- def initialize(source)
11
- @reader = PDF::Reader.new(source)
12
- end
13
-
14
- # Returns positional (with fuzzed y positioning) text content collection as a hash:
15
- # { y_position: { x_position: content}}
16
- def content(page=1)
17
- @content ||= []
18
- if @content[page]
19
- @content[page]
20
- else
21
- @content[page] = fuzzed_y(precise_content(page))
22
- end
23
- end
24
-
25
- # Returns a hash with fuzzed y positioning:
26
- # { fuzzed_y_position: { x_position: content}}
27
- # Given +input+ as a hash:
28
- # { y_position: { x_position: content}}
29
- # y values that fall within +precision+ points of another will be clustered
30
- def fuzzed_y(input,precision=3)
31
- output = {}
32
- input.keys.sort.each do |precise_y|
33
- # matching_y = (precise_y / 5.0).truncate * 5.0
34
- matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < precision }.first || precise_y
35
- output[matching_y] ||= {}
36
- output[matching_y].merge!(input[precise_y])
37
- end
38
- output
39
- end
40
-
41
- # Returns positional text content collection as a hash with precise x,y positioning:
42
- # { y_position: { x_position: content}}
43
- def precise_content(page=1)
44
- @precise_content ||= []
45
- if @precise_content[page]
46
- @precise_content[page]
47
- else
48
- @precise_content[page] = load_content(page)
49
- end
50
- end
51
-
52
- # Returns an array of text elements in the bounding box
53
- def text_in_rect(xmin,xmax,ymin,ymax,page=1)
54
- text_map = content(page)
55
- box = []
56
- text_map.keys.sort.reverse.each do |y|
57
- if y >= ymin && y<= ymax
58
- row = []
59
- text_map[y].keys.sort.each do |x|
60
- if x >= xmin && x<= xmax
61
- row << text_map[y][x]
62
- end
63
- end
64
- box << row unless row.empty?
65
- end
66
- end
67
- box
68
- end
69
-
70
- # Returns the position {x: val, y: val } of +text+ on +page+
71
- # +text+ may be a string (exact match required) or a Regexp
72
- def text_position(text,page=1)
73
- item = if text.class <= Regexp
74
- content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
75
- else
76
- content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
77
- end
78
- item = item.compact.flatten
79
- unless item.empty?
80
- { :x => item[1], :y => item[0] }
81
- end
82
- end
83
-
84
- # WIP - not using Textangle yet for text extraction.
85
- # Ideal usage is something like this:
86
- #
87
- # textangle = reader.bounding_box do
88
- # page 1
89
- # below "Electricity Services"
90
- # above "Gas Services by City Gas Pte Ltd"
91
- # right_of 240.0
92
- # left_of "Total ($)"
93
- # end
94
- # textangle.text
95
- #
96
- def bounding_box(&block)
97
- PDF::Reader::Textangle.new(self,&block)
98
- end
99
-
100
- private
101
-
102
- def load_content(page)
103
- receiver = PDF::Reader::PositionalTextReceiver.new
104
- reader.page(page).walk(receiver)
105
- receiver.content
106
- end
107
-
108
- end
data/lib/pdf/textangle.rb DELETED
@@ -1,27 +0,0 @@
1
- # A DSL syntax for text extraction.
2
- # WIP - not using this yet
3
- #
4
- # textangle = PDF::Reader::Textangle.new(reader) do
5
- # page 1
6
- # below "Electricity Services"
7
- # above "Gas Services by City Gas Pte Ltd"
8
- # right_of 240.0
9
- # left_of "Total ($)"
10
- # end
11
- # textangle.text
12
- #
13
- class PDF::Reader::Textangle
14
- attr_reader :reader
15
- attr_writer :page,:above,:below,:left_of,:right_of
16
-
17
- # +structured_reader+ is a PDF::StructuredReader
18
- def initialize(structured_reader,&block)
19
- @reader = structured_reader
20
- instance_eval( &block ) if block
21
- end
22
-
23
- def text
24
- # TODO
25
- end
26
-
27
- end
@@ -1,71 +0,0 @@
1
- <html>
2
- <head></head>
3
- %PDF-1.3
4
- %����
5
- 1 0 obj
6
- << /Creator <feff0050007200610077006e>
7
- /Producer <feff0050007200610077006e>
8
- >>
9
- endobj
10
- 2 0 obj
11
- << /Type /Catalog
12
- /Pages 3 0 R
13
- >>
14
- endobj
15
- 3 0 obj
16
- << /Type /Pages
17
- /Count 1
18
- /Kids [5 0 R]
19
- >>
20
- endobj
21
- 4 0 obj
22
- << /Length 157
23
- >>
24
- stream
25
- q
26
-
27
- BT
28
- 36 747.384 Td
29
- /F1.0 12 Tf
30
- [<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
31
- ET
32
-
33
- Q
34
-
35
- endstream
36
- endobj
37
- 5 0 obj
38
- << /Type /Page
39
- /Parent 3 0 R
40
- /MediaBox [0 0 612.0 792.0]
41
- /Contents 4 0 R
42
- /Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
43
- /Font << /F1.0 6 0 R
44
- >>
45
- >>
46
- >>
47
- endobj
48
- 6 0 obj
49
- << /Type /Font
50
- /Subtype /Type1
51
- /BaseFont /Helvetica
52
- /Encoding /WinAnsiEncoding
53
- >>
54
- endobj
55
- xref
56
- 0 7
57
- 0000000000 65535 f
58
- 0000000015 00000 n
59
- 0000000109 00000 n
60
- 0000000158 00000 n
61
- 0000000215 00000 n
62
- 0000000423 00000 n
63
- 0000000601 00000 n
64
- trailer
65
- << /Size 7
66
- /Root 2 0 R
67
- /Info 1 0 R
68
- >>
69
- startxref
70
- 698
71
- %%EOF
@@ -1,15 +0,0 @@
1
- require 'spec_helper'
2
- include PdfSamplesHelper
3
-
4
- describe PDF::Reader::ObjectHash do
5
-
6
- context "when there is a junk prefix" do
7
- let(:sample_name) { junk_prefix_pdf_sample_name }
8
- let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
9
- let(:stream) { object_hash.instance_variable_get(:@io) }
10
- before { stream.rewind }
11
- subject { stream.read(4) }
12
- it { should eql("%PDF") }
13
- end
14
-
15
- end