sps_bill 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sps_bill
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-20 00:00:00.000000000 Z
12
+ date: 2012-08-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: pdf-reader
16
- requirement: &70313136422520 !ruby/object:Gem::Requirement
15
+ name: pdf-reader-turtletext
16
+ requirement: &70331993763820 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - =
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 1.1.1
21
+ version: 0.2.2
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70313136422520
24
+ version_requirements: *70331993763820
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: getoptions
27
- requirement: &70313136421900 !ruby/object:Gem::Requirement
27
+ requirement: &70331993762960 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.3'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70313136421900
35
+ version_requirements: *70331993762960
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &70313136421060 !ruby/object:Gem::Requirement
38
+ requirement: &70331993762360 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.1.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70313136421060
46
+ version_requirements: *70331993762360
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: jeweler
49
- requirement: &70313136420480 !ruby/object:Gem::Requirement
49
+ requirement: &70331993761820 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,21 +54,10 @@ dependencies:
54
54
  version: 1.6.4
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70313136420480
58
- - !ruby/object:Gem::Dependency
59
- name: rcov
60
- requirement: &70313136419920 !ruby/object:Gem::Requirement
61
- none: false
62
- requirements:
63
- - - ! '>='
64
- - !ruby/object:Gem::Version
65
- version: '0'
66
- type: :development
67
- prerelease: false
68
- version_requirements: *70313136419920
57
+ version_requirements: *70331993761820
69
58
  - !ruby/object:Gem::Dependency
70
59
  name: rake
71
- requirement: &70313136419140 !ruby/object:Gem::Requirement
60
+ requirement: &70331993760540 !ruby/object:Gem::Requirement
72
61
  none: false
73
62
  requirements:
74
63
  - - ~>
@@ -76,10 +65,10 @@ dependencies:
76
65
  version: 0.9.2.2
77
66
  type: :development
78
67
  prerelease: false
79
- version_requirements: *70313136419140
68
+ version_requirements: *70331993760540
80
69
  - !ruby/object:Gem::Dependency
81
70
  name: rspec
82
- requirement: &70313136418160 !ruby/object:Gem::Requirement
71
+ requirement: &70331993759960 !ruby/object:Gem::Requirement
83
72
  none: false
84
73
  requirements:
85
74
  - - ~>
@@ -87,10 +76,10 @@ dependencies:
87
76
  version: 2.8.0
88
77
  type: :development
89
78
  prerelease: false
90
- version_requirements: *70313136418160
79
+ version_requirements: *70331993759960
91
80
  - !ruby/object:Gem::Dependency
92
81
  name: rdoc
93
- requirement: &70313136417580 !ruby/object:Gem::Requirement
82
+ requirement: &70331993743820 !ruby/object:Gem::Requirement
94
83
  none: false
95
84
  requirements:
96
85
  - - ~>
@@ -98,10 +87,10 @@ dependencies:
98
87
  version: '3.11'
99
88
  type: :development
100
89
  prerelease: false
101
- version_requirements: *70313136417580
90
+ version_requirements: *70331993743820
102
91
  - !ruby/object:Gem::Dependency
103
92
  name: guard-rspec
104
- requirement: &70313136417060 !ruby/object:Gem::Requirement
93
+ requirement: &70331993743100 !ruby/object:Gem::Requirement
105
94
  none: false
106
95
  requirements:
107
96
  - - ! '>='
@@ -109,7 +98,7 @@ dependencies:
109
98
  version: '0'
110
99
  type: :development
111
100
  prerelease: false
112
- version_requirements: *70313136417060
101
+ version_requirements: *70331993743100
113
102
  description: a library that can read SP Services PDF bills and extract and summarize
114
103
  the bill details
115
104
  email: gallagher.paul@gmail.com
@@ -124,6 +113,7 @@ files:
124
113
  - .rspec
125
114
  - .rvmrc
126
115
  - .travis.yml
116
+ - CHANGELOG
127
117
  - Gemfile
128
118
  - Gemfile.lock
129
119
  - Guardfile
@@ -131,10 +121,6 @@ files:
131
121
  - README.rdoc
132
122
  - Rakefile
133
123
  - bin/sps_bill
134
- - lib/pdf/object_hash.rb
135
- - lib/pdf/positional_text_receiver.rb
136
- - lib/pdf/structured_reader.rb
137
- - lib/pdf/textangle.rb
138
124
  - lib/sps_bill.rb
139
125
  - lib/sps_bill/bill.rb
140
126
  - lib/sps_bill/bill_collection.rb
@@ -142,9 +128,13 @@ files:
142
128
  - lib/sps_bill/shell.rb
143
129
  - lib/sps_bill/version.rb
144
130
  - scripts/data/.gitkeep
131
+ - scripts/data/all_services.csv.sample
132
+ - scripts/data/all_services.sample.pdf
133
+ - scripts/data/elec_and_water_only.csv.sample
134
+ - scripts/data/elec_and_water_only.sample.pdf
135
+ - scripts/full_analysis.R
145
136
  - scripts/scan_all_bills.sh
146
137
  - spec/fixtures/pdf_samples/.gitkeep
147
- - spec/fixtures/pdf_samples/junk_prefix.pdf
148
138
  - spec/fixtures/personal_pdf_samples/.gitkeep
149
139
  - spec/fixtures/personal_pdf_samples/expectations.yml.sample
150
140
  - spec/integration/personal_samples_spec.rb
@@ -153,7 +143,6 @@ files:
153
143
  - spec/support/pdf_samples_helper.rb
154
144
  - spec/unit/bill_collection_spec.rb
155
145
  - spec/unit/bill_spec.rb
156
- - spec/unit/pdf/object_hash_spec.rb
157
146
  - spec/unit/shell_spec.rb
158
147
  - sps_bill.gemspec
159
148
  homepage: https://github.com/tardate/sps_bill_scanner
@@ -1,39 +0,0 @@
1
- class PDF::Reader
2
- class ObjectHash
3
-
4
- def extract_io_from(input)
5
- if input.respond_to?(:seek) && input.respond_to?(:read)
6
- input
7
- elsif File.file?(input.to_s)
8
- read_with_quirks(input)
9
- else
10
- raise ArgumentError, "input must be an IO-like object or a filename"
11
- end
12
- end
13
-
14
- # Load file as a StringIO stream, accounting for invalid format
15
- # where additional characters exist in the file before the %PDF start of file
16
- def read_with_quirks(input)
17
- stream = File.open(input.to_s, "rb")
18
- if ofs = pdf_offset(stream)
19
- stream.seek(ofs)
20
- StringIO.new(stream.read)
21
- else
22
- raise ArgumentError, "invalid file format"
23
- end
24
- end
25
- private :read_with_quirks
26
-
27
- # Returns the offset of the PDF document in the +stream+.
28
- # Checks up to 50 chars into the file, returns nil if no PDF stream detected.
29
- def pdf_offset(stream)
30
- stream.rewind
31
- ofs = stream.pos
32
- until (c = stream.readchar) == '%' || c == 37 || ofs > 50
33
- ofs += 1
34
- end
35
- ofs < 50 ? ofs : nil
36
- end
37
- private :pdf_offset
38
- end
39
- end
@@ -1,16 +0,0 @@
1
- class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
2
-
3
- # record text that is drawn on the page
4
- def show_text(string) # Tj
5
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
6
- newx, newy = @state.trm_transform(0,0)
7
- @content[newy] ||= {}
8
- @content[newy][newx] = @state.current_font.to_utf8(string)
9
- end
10
-
11
- # override content accessor
12
- def content
13
- @content
14
- end
15
-
16
- end
@@ -1,108 +0,0 @@
1
- # Class for reading structured text content
2
- # This is the one that is a bit hairy - specifically check the fuzzed_y usage
3
- # which attempts to align text content in the PDF so it can be extracted
4
- # with correct alignment.
5
- #
6
- class PDF::StructuredReader
7
- attr_reader :reader
8
-
9
- # +source+ is a file name or stream-like object
10
- def initialize(source)
11
- @reader = PDF::Reader.new(source)
12
- end
13
-
14
- # Returns positional (with fuzzed y positioning) text content collection as a hash:
15
- # { y_position: { x_position: content}}
16
- def content(page=1)
17
- @content ||= []
18
- if @content[page]
19
- @content[page]
20
- else
21
- @content[page] = fuzzed_y(precise_content(page))
22
- end
23
- end
24
-
25
- # Returns a hash with fuzzed y positioning:
26
- # { fuzzed_y_position: { x_position: content}}
27
- # Given +input+ as a hash:
28
- # { y_position: { x_position: content}}
29
- # y values that fall within +precision+ points of another will be clustered
30
- def fuzzed_y(input,precision=3)
31
- output = {}
32
- input.keys.sort.each do |precise_y|
33
- # matching_y = (precise_y / 5.0).truncate * 5.0
34
- matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < precision }.first || precise_y
35
- output[matching_y] ||= {}
36
- output[matching_y].merge!(input[precise_y])
37
- end
38
- output
39
- end
40
-
41
- # Returns positional text content collection as a hash with precise x,y positioning:
42
- # { y_position: { x_position: content}}
43
- def precise_content(page=1)
44
- @precise_content ||= []
45
- if @precise_content[page]
46
- @precise_content[page]
47
- else
48
- @precise_content[page] = load_content(page)
49
- end
50
- end
51
-
52
- # Returns an array of text elements in the bounding box
53
- def text_in_rect(xmin,xmax,ymin,ymax,page=1)
54
- text_map = content(page)
55
- box = []
56
- text_map.keys.sort.reverse.each do |y|
57
- if y >= ymin && y<= ymax
58
- row = []
59
- text_map[y].keys.sort.each do |x|
60
- if x >= xmin && x<= xmax
61
- row << text_map[y][x]
62
- end
63
- end
64
- box << row unless row.empty?
65
- end
66
- end
67
- box
68
- end
69
-
70
- # Returns the position {x: val, y: val } of +text+ on +page+
71
- # +text+ may be a string (exact match required) or a Regexp
72
- def text_position(text,page=1)
73
- item = if text.class <= Regexp
74
- content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
75
- else
76
- content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
77
- end
78
- item = item.compact.flatten
79
- unless item.empty?
80
- { :x => item[1], :y => item[0] }
81
- end
82
- end
83
-
84
- # WIP - not using Textangle yet for text extraction.
85
- # Ideal usage is something like this:
86
- #
87
- # textangle = reader.bounding_box do
88
- # page 1
89
- # below "Electricity Services"
90
- # above "Gas Services by City Gas Pte Ltd"
91
- # right_of 240.0
92
- # left_of "Total ($)"
93
- # end
94
- # textangle.text
95
- #
96
- def bounding_box(&block)
97
- PDF::Reader::Textangle.new(self,&block)
98
- end
99
-
100
- private
101
-
102
- def load_content(page)
103
- receiver = PDF::Reader::PositionalTextReceiver.new
104
- reader.page(page).walk(receiver)
105
- receiver.content
106
- end
107
-
108
- end
data/lib/pdf/textangle.rb DELETED
@@ -1,27 +0,0 @@
1
- # A DSL syntax for text extraction.
2
- # WIP - not using this yet
3
- #
4
- # textangle = PDF::Reader::Textangle.new(reader) do
5
- # page 1
6
- # below "Electricity Services"
7
- # above "Gas Services by City Gas Pte Ltd"
8
- # right_of 240.0
9
- # left_of "Total ($)"
10
- # end
11
- # textangle.text
12
- #
13
- class PDF::Reader::Textangle
14
- attr_reader :reader
15
- attr_writer :page,:above,:below,:left_of,:right_of
16
-
17
- # +structured_reader+ is a PDF::StructuredReader
18
- def initialize(structured_reader,&block)
19
- @reader = structured_reader
20
- instance_eval( &block ) if block
21
- end
22
-
23
- def text
24
- # TODO
25
- end
26
-
27
- end
@@ -1,71 +0,0 @@
1
- <html>
2
- <head></head>
3
- %PDF-1.3
4
- %����
5
- 1 0 obj
6
- << /Creator <feff0050007200610077006e>
7
- /Producer <feff0050007200610077006e>
8
- >>
9
- endobj
10
- 2 0 obj
11
- << /Type /Catalog
12
- /Pages 3 0 R
13
- >>
14
- endobj
15
- 3 0 obj
16
- << /Type /Pages
17
- /Count 1
18
- /Kids [5 0 R]
19
- >>
20
- endobj
21
- 4 0 obj
22
- << /Length 157
23
- >>
24
- stream
25
- q
26
-
27
- BT
28
- 36 747.384 Td
29
- /F1.0 12 Tf
30
- [<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
31
- ET
32
-
33
- Q
34
-
35
- endstream
36
- endobj
37
- 5 0 obj
38
- << /Type /Page
39
- /Parent 3 0 R
40
- /MediaBox [0 0 612.0 792.0]
41
- /Contents 4 0 R
42
- /Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
43
- /Font << /F1.0 6 0 R
44
- >>
45
- >>
46
- >>
47
- endobj
48
- 6 0 obj
49
- << /Type /Font
50
- /Subtype /Type1
51
- /BaseFont /Helvetica
52
- /Encoding /WinAnsiEncoding
53
- >>
54
- endobj
55
- xref
56
- 0 7
57
- 0000000000 65535 f
58
- 0000000015 00000 n
59
- 0000000109 00000 n
60
- 0000000158 00000 n
61
- 0000000215 00000 n
62
- 0000000423 00000 n
63
- 0000000601 00000 n
64
- trailer
65
- << /Size 7
66
- /Root 2 0 R
67
- /Info 1 0 R
68
- >>
69
- startxref
70
- 698
71
- %%EOF
@@ -1,15 +0,0 @@
1
- require 'spec_helper'
2
- include PdfSamplesHelper
3
-
4
- describe PDF::Reader::ObjectHash do
5
-
6
- context "when there is a junk prefix" do
7
- let(:sample_name) { junk_prefix_pdf_sample_name }
8
- let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
9
- let(:stream) { object_hash.instance_variable_get(:@io) }
10
- before { stream.rewind }
11
- subject { stream.read(4) }
12
- it { should eql("%PDF") }
13
- end
14
-
15
- end