pdf-reader-turtletext 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.3@pdf-reader-turtletext --create
@@ -0,0 +1,3 @@
1
+ # These are specific configuration settings required for travis-ci
2
+ # see http://travis-ci.org/tardate/pdf-reader-turtletext
3
+ rvm: 1.9.3
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem 'pdf-reader', '1.1.1'
4
+
5
+ group :development do
6
+ gem 'bundler', '~> 1.1.4'
7
+ gem 'jeweler', '~> 1.6.4'
8
+ end
9
+
10
+ group :development, :test do
11
+ gem 'rake', '~> 0.9.2.2'
12
+ gem 'rspec', '~> 2.8.0', :require => 'spec'
13
+ gem 'rdoc', '~> 3.11'
14
+ # prawn for generating PDFs for tests
15
+ gem 'prawn', '~> 0.12.0'
16
+ # guard for auto-running tests
17
+ gem 'guard-rspec', '~> 1.2.0'
18
+ end
@@ -0,0 +1,59 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ Ascii85 (1.0.1)
5
+ diff-lcs (1.1.3)
6
+ ffi (1.1.0)
7
+ git (1.2.5)
8
+ guard (1.2.3)
9
+ listen (>= 0.4.2)
10
+ thor (>= 0.14.6)
11
+ guard-rspec (1.2.0)
12
+ guard (>= 1.1)
13
+ jeweler (1.6.4)
14
+ bundler (~> 1.0)
15
+ git (>= 1.2.5)
16
+ rake
17
+ json (1.7.3)
18
+ listen (0.4.7)
19
+ rb-fchange (~> 0.0.5)
20
+ rb-fsevent (~> 0.9.1)
21
+ rb-inotify (~> 0.8.8)
22
+ pdf-reader (1.1.1)
23
+ Ascii85 (~> 1.0.0)
24
+ ruby-rc4
25
+ prawn (0.12.0)
26
+ pdf-reader (>= 0.9.0)
27
+ ttfunk (~> 1.0.2)
28
+ rake (0.9.2.2)
29
+ rb-fchange (0.0.5)
30
+ ffi
31
+ rb-fsevent (0.9.1)
32
+ rb-inotify (0.8.8)
33
+ ffi (>= 0.5.0)
34
+ rdoc (3.12)
35
+ json (~> 1.4)
36
+ rspec (2.8.0)
37
+ rspec-core (~> 2.8.0)
38
+ rspec-expectations (~> 2.8.0)
39
+ rspec-mocks (~> 2.8.0)
40
+ rspec-core (2.8.0)
41
+ rspec-expectations (2.8.0)
42
+ diff-lcs (~> 1.1.2)
43
+ rspec-mocks (2.8.0)
44
+ ruby-rc4 (0.1.5)
45
+ thor (0.15.4)
46
+ ttfunk (1.0.3)
47
+
48
+ PLATFORMS
49
+ ruby
50
+
51
+ DEPENDENCIES
52
+ bundler (~> 1.1.4)
53
+ guard-rspec (~> 1.2.0)
54
+ jeweler (~> 1.6.4)
55
+ pdf-reader (= 1.1.1)
56
+ prawn (~> 0.12.0)
57
+ rake (~> 0.9.2.2)
58
+ rdoc (~> 3.11)
59
+ rspec (~> 2.8.0)
@@ -0,0 +1,10 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard 'rspec', :version => 2, :all_on_start => true, :all_after_pass => false do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch('spec/spec_helper.rb') { "spec" }
7
+
8
+ ## we're not watching source files
9
+ end
10
+
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Paul Gallagher
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,56 @@
1
+ = PDF::Reader::Turtletext {<img src="https://secure.travis-ci.org/tardate/pdf-reader-turtletext.png" />}[http://travis-ci.org/tardate/pdf-reader-turtletext]
2
+
3
+ PDF::Reader::Turtletext is an extension for the most excellent {PDF::Reader}[https://github.com/yob/pdf-reader] gem.
4
+
5
+ The aim of Turtletext is to provide simple and convenient methods for extracting PDF text content and
6
+ converting it to structured data - even when there is no explicit structure in the original PDF source.
7
+
8
+ A typical use is to extract details from utility bills that are provided in PDF format, to open up the data
9
+ for analysis and other secondary uses.
10
+
11
+ For an example of how this works in practice, see the
12
+ {sps_bill}[https://github.com/tardate/sps_bill_scanner/] gem
13
+ (which is in fact the project where the original ideas for Turtletext gestated).
14
+
15
+ == Requirements and Known Limitations
16
+
17
+ * currently only tested with Ruby 1.9
18
+ * fixed dependency on PDF::Reader v 1.1.1
19
+
20
+ == Installation
21
+
22
+ gem install pdf-reader-turtletext
23
+
24
+ == Usage
25
+
26
+ === PDF::Reader::Turtletext
27
+
28
+ Provides a range of methods to extract structured text from a PDF file,
29
+ such as <tt>text_position</tt> and <tt>text_in_region</tt>.
30
+
31
+ A typical usage:
32
+
33
+ reader = PDF::Reader::Turtletext.new(pdf_filename)
34
+ page = 1
35
+ heading_position = reader.text_position(/transaction table/i)
36
+ next_section = reader.text_position(/transaction summary/i)
37
+ transaction_rows = reader.text_in_region(
38
+ heading_position[:x], 900,
39
+ heading_position[:y] + 1, next_section[:y] - 1
40
+ )
41
+
42
+
43
+ == Contributing to PDF::Reader::Turtletext
44
+
45
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
46
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
47
+ * Fork the project
48
+ * Start a feature/bugfix branch
49
+ * Commit and push until you are happy with your contribution
50
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
51
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
52
+
53
+ == Copyright
54
+
55
+ Copyright (c) 2012 Paul Gallagher. See LICENSE for further details.
56
+
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+ require 'rspec'
14
+ require 'rspec/core/rake_task'
15
+
16
+ $LOAD_PATH.unshift('lib')
17
+ require 'pdf/reader/turtletext/version'
18
+
19
+ require 'jeweler'
20
+ Jeweler::Tasks.new do |gem|
21
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
22
+ gem.name = "pdf-reader-turtletext"
23
+ gem.version = PDF::Reader::Turtletext::Version::STRING
24
+ gem.homepage = "https://github.com/tardate/pdf-reader-turtletext"
25
+ gem.license = "MIT"
26
+ gem.summary = %Q{PDF structured text reader}
27
+ gem.description = %Q{a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like.}
28
+ gem.email = "gallagher.paul@gmail.com"
29
+ gem.authors = ["Paul Gallagher"]
30
+ gem.files.exclude 'pkg/*'
31
+ # dependencies defined in Gemfile
32
+ end
33
+ Jeweler::RubygemsDotOrgTasks.new
34
+
35
+ desc "Run all RSpec test examples"
36
+ RSpec::Core::RakeTask.new do |spec|
37
+ spec.rspec_opts = ["-c", "-f progress"]
38
+ spec.pattern = 'spec/**/*_spec.rb'
39
+ end
40
+
41
+ task :default => :spec
42
+
43
+ require 'rdoc/task'
44
+ RDoc::Task.new do |rdoc|
45
+ rdoc.main = "README.rdoc"
46
+ rdoc.rdoc_dir = 'rdoc'
47
+ rdoc.title = "pdf-reader-turtletext #{PDF::Reader::Turtletext::Version::STRING}"
48
+ rdoc.rdoc_files.include('README*', 'lib/**/*.rb')
49
+ end
50
+
51
+ desc "Generate sample PDFs for tests"
52
+ task :make_pdf_samples do |t|
53
+ require Pathname.new(File.dirname(__FILE__)).join('spec','support','pdf_samples_helper')
54
+ include PdfSamplesHelper
55
+ make_pdf_samples
56
+ end
57
+
@@ -0,0 +1,7 @@
1
+ require 'pdf-reader'
2
+ require 'pdf/reader/patch/object_hash'
3
+ require 'pdf/reader/positional_text_receiver'
4
+
5
+ require 'pdf/reader/turtletext'
6
+ require 'pdf/reader/turtletext/version'
7
+ require 'pdf/reader/turtletext/textangle'
@@ -0,0 +1,47 @@
1
+ # This monkey-patches pdf-reader to allow it to read PDFs that have junk characters that appear
2
+ # in the file before the start of the PDF stream.
3
+ # (this is quite commonly an html head block - I suspect a bug in the Adobe or other software used
4
+ # to serve the bills)
5
+ #
6
+ # The patch has been contributed back to the pdf-reader project (https://github.com/yob/pdf-reader/pull/54)
7
+ # and has already been merged on master. When it shows up in a release of the pdf-reader gem
8
+ # we can trash this patch.
9
+ #
10
+ class PDF::Reader::ObjectHash
11
+
12
+ def extract_io_from(input)
13
+ if input.respond_to?(:seek) && input.respond_to?(:read)
14
+ input
15
+ elsif File.file?(input.to_s)
16
+ read_with_quirks(input)
17
+ else
18
+ raise ArgumentError, "input must be an IO-like object or a filename"
19
+ end
20
+ end
21
+
22
+ # Load file as a StringIO stream, accounting for invalid format
23
+ # where additional characters exist in the file before the %PDF start of file
24
+ def read_with_quirks(input)
25
+ stream = File.open(input.to_s, "rb")
26
+ if ofs = pdf_offset(stream)
27
+ stream.seek(ofs)
28
+ StringIO.new(stream.read)
29
+ else
30
+ raise ArgumentError, "invalid file format"
31
+ end
32
+ end
33
+ private :read_with_quirks
34
+
35
+ # Returns the offset of the PDF document in the +stream+.
36
+ # Checks up to 50 chars into the file, returns nil if no PDF stream detected.
37
+ def pdf_offset(stream)
38
+ stream.rewind
39
+ ofs = stream.pos
40
+ until (c = stream.readchar) == '%' || c == 37 || ofs > 50
41
+ ofs += 1
42
+ end
43
+ ofs < 50 ? ofs : nil
44
+ end
45
+ private :pdf_offset
46
+
47
+ end
@@ -0,0 +1,31 @@
1
+ # Receiver to access positional (x,y) text content from a PDF
2
+ #
3
+ # Typical usage:
4
+ #
5
+ # reader = PDF::Reader.new(filename)
6
+ # receiver = PDF::Reader::PositionalTextReceiver.new
7
+ # reader.page(page).walk(receiver)
8
+ # receiver.content
9
+ #
10
+ class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
11
+
12
+ # record text that is drawn on the page
13
+ def show_text(string) # Tj
14
+ raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
15
+ newx, newy = @state.trm_transform(0,0)
16
+ @content[newy] ||= {}
17
+ @content[newy][newx] ||= ''
18
+ @content[newy][newx] << @state.current_font.to_utf8(string)
19
+ end
20
+
21
+ # Override the PageTextReceiver content accessor.
22
+ # Returns a hash of positional text:
23
+ # {
24
+ # y_coord=>{x_coord=>text, x_coord=>text },
25
+ # y_coord=>{x_coord=>text, x_coord=>text }
26
+ # }
27
+ def content
28
+ @content
29
+ end
30
+
31
+ end
@@ -0,0 +1,131 @@
1
+ # Class for reading structured text content
2
+ #
3
+ # Typical usage:
4
+ #
5
+ # reader = PDF::Reader::Turtletext.new(pdf_filename)
6
+ # page = 1
7
+ # heading_position = reader.text_position(/transaction table/i)
8
+ # next_section = reader.text_position(/transaction summary/i)
9
+ # transaction_rows = reader.text_in_region(
10
+ # heading_position[:x], 900,
11
+ # heading_position[:y] + 1, next_section[:y] - 1
12
+ # )
13
+ #
14
+ class PDF::Reader::Turtletext
15
+ attr_reader :reader
16
+ attr_reader :options
17
+
18
+ # +source+ is a file name or stream-like object
19
+ def initialize(source, options={})
20
+ @options = options
21
+ @reader = PDF::Reader.new(source)
22
+ end
23
+
24
+ # Returns the precision required in y positions.
25
+ # This is the fuzz range for interpreting y positions.
26
+ # Lines with y positions +/- +y_precision+ will be merged together.
27
+ # This helps align text correctly which may visually appear on the same line, but is actually
28
+ # off by a few pixels.
29
+ def y_precision
30
+ options[:y_precision] ||= 3
31
+ end
32
+
33
+ # Returns positional (with fuzzed y positioning) text content collection as a hash:
34
+ # { y_position: { x_position: content}}
35
+ def content(page=1)
36
+ @content ||= []
37
+ if @content[page]
38
+ @content[page]
39
+ else
40
+ @content[page] = fuzzed_y(precise_content(page))
41
+ end
42
+ end
43
+
44
+ # Returns a hash with fuzzed positioning:
45
+ # { fuzzed_y_position: { x_position: content}}
46
+ # Given +input+ as a hash:
47
+ # { y_position: { x_position: content}}
48
+ # Fuzz factors: +y_precision+
49
+ def fuzzed_y(input)
50
+ output = {}
51
+ input.keys.sort.each do |precise_y|
52
+ # matching_y = (precise_y / 5.0).truncate * 5.0
53
+ matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
54
+ output[matching_y] ||= {}
55
+ output[matching_y].merge!(input[precise_y])
56
+ end
57
+ output
58
+ end
59
+
60
+ # Returns positional text content collection as a hash with precise x,y positioning:
61
+ # { y_position: { x_position: content}}
62
+ def precise_content(page=1)
63
+ @precise_content ||= []
64
+ if @precise_content[page]
65
+ @precise_content[page]
66
+ else
67
+ @precise_content[page] = load_content(page)
68
+ end
69
+ end
70
+
71
+ # Returns an array of text elements found within the x,y limits,
72
+ # Each line of text found is returned as an array element.
73
+ # Each line of text is an array of the separate text elements found on that line.
74
+ # [["first line first text", "first line last text"],["second line text"]]
75
+ def text_in_region(xmin,xmax,ymin,ymax,page=1)
76
+ text_map = content(page)
77
+ box = []
78
+ text_map.keys.sort.reverse.each do |y|
79
+ if y >= ymin && y<= ymax
80
+ row = []
81
+ text_map[y].keys.sort.each do |x|
82
+ if x >= xmin && x<= xmax
83
+ row << text_map[y][x]
84
+ end
85
+ end
86
+ box << row unless row.empty?
87
+ end
88
+ end
89
+ box
90
+ end
91
+
92
+ # Returns the position of +text+ on +page+
93
+ # {x: val, y: val }
94
+ # +text+ may be a string (exact match required) or a Regexp
95
+ def text_position(text,page=1)
96
+ item = if text.class <= Regexp
97
+ content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
98
+ else
99
+ content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
100
+ end
101
+ item = item.compact.flatten
102
+ unless item.empty?
103
+ { :x => item[1], :y => item[0] }
104
+ end
105
+ end
106
+
107
+ # WIP - not using Textangle yet for text extraction.
108
+ # Ideal usage is something like this:
109
+ #
110
+ # textangle = reader.bounding_box do
111
+ # page 1
112
+ # below "Electricity Services"
113
+ # above "Gas Services by City Gas Pte Ltd"
114
+ # right_of 240.0
115
+ # left_of "Total ($)"
116
+ # end
117
+ # textangle.text
118
+ #
119
+ def bounding_box(&block)
120
+ PDF::Reader::Turtletext::Textangle.new(self,&block)
121
+ end
122
+
123
+ private
124
+
125
+ def load_content(page)
126
+ receiver = PDF::Reader::PositionalTextReceiver.new
127
+ reader.page(page).walk(receiver)
128
+ receiver.content
129
+ end
130
+
131
+ end
@@ -0,0 +1,27 @@
1
+ # A DSL syntax for text extraction.
2
+ # WIP - not using this yet
3
+ #
4
+ # textangle = PDF::Reader::Turtletext::Textangle.new(reader) do
5
+ # page 1
6
+ # below "Electricity Services"
7
+ # above "Gas Services by City Gas Pte Ltd"
8
+ # right_of 240.0
9
+ # left_of "Total ($)"
10
+ # end
11
+ # textangle.text
12
+ #
13
+ class PDF::Reader::Turtletext::Textangle
14
+ attr_reader :reader
15
+ attr_writer :page,:above,:below,:left_of,:right_of
16
+
17
+ # +structured_reader+ is a PDF::Reader::Turtletext
18
+ def initialize(structured_reader,&block)
19
+ @reader = structured_reader
20
+ instance_eval( &block ) if block
21
+ end
22
+
23
+ def text
24
+ # TODO
25
+ end
26
+
27
+ end
@@ -0,0 +1,13 @@
1
+ module PDF
2
+ class Reader
3
+ class Turtletext
4
+ class Version
5
+ MAJOR = 0
6
+ MINOR = 1
7
+ PATCH = 0
8
+
9
+ STRING = [MAJOR, MINOR, PATCH].compact.join('.')
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,87 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "pdf-reader-turtletext"
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Paul Gallagher"]
12
+ s.date = "2012-07-22"
13
+ s.description = "a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like."
14
+ s.email = "gallagher.paul@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".rspec",
21
+ ".rvmrc",
22
+ ".travis.yml",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "Guardfile",
26
+ "LICENSE",
27
+ "README.rdoc",
28
+ "Rakefile",
29
+ "lib/pdf-reader-turtletext.rb",
30
+ "lib/pdf/reader/patch/object_hash.rb",
31
+ "lib/pdf/reader/positional_text_receiver.rb",
32
+ "lib/pdf/reader/turtletext.rb",
33
+ "lib/pdf/reader/turtletext/textangle.rb",
34
+ "lib/pdf/reader/turtletext/version.rb",
35
+ "pdf-reader-turtletext.gemspec",
36
+ "spec/fixtures/pdf_samples/.gitkeep",
37
+ "spec/fixtures/pdf_samples/hello_world.pdf",
38
+ "spec/fixtures/pdf_samples/junk_prefix.pdf",
39
+ "spec/integration/pdf_samples_spec.rb",
40
+ "spec/spec_helper.rb",
41
+ "spec/support/pdf_samples_helper.rb",
42
+ "spec/unit/reader/patch/object_hash_spec.rb",
43
+ "spec/unit/reader/positional_text_receiver_spec.rb",
44
+ "spec/unit/reader/turtletext/textangle_spec.rb",
45
+ "spec/unit/reader/turtletext/turtletext_spec.rb",
46
+ "spec/unit/reader/turtletext/version_spec.rb"
47
+ ]
48
+ s.homepage = "https://github.com/tardate/pdf-reader-turtletext"
49
+ s.licenses = ["MIT"]
50
+ s.require_paths = ["lib"]
51
+ s.rubygems_version = "1.8.15"
52
+ s.summary = "PDF structured text reader"
53
+
54
+ if s.respond_to? :specification_version then
55
+ s.specification_version = 3
56
+
57
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
58
+ s.add_runtime_dependency(%q<pdf-reader>, ["= 1.1.1"])
59
+ s.add_development_dependency(%q<bundler>, ["~> 1.1.4"])
60
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
61
+ s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
62
+ s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
63
+ s.add_development_dependency(%q<rdoc>, ["~> 3.11"])
64
+ s.add_development_dependency(%q<prawn>, ["~> 0.12.0"])
65
+ s.add_development_dependency(%q<guard-rspec>, ["~> 1.2.0"])
66
+ else
67
+ s.add_dependency(%q<pdf-reader>, ["= 1.1.1"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.1.4"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
70
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
71
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
72
+ s.add_dependency(%q<rdoc>, ["~> 3.11"])
73
+ s.add_dependency(%q<prawn>, ["~> 0.12.0"])
74
+ s.add_dependency(%q<guard-rspec>, ["~> 1.2.0"])
75
+ end
76
+ else
77
+ s.add_dependency(%q<pdf-reader>, ["= 1.1.1"])
78
+ s.add_dependency(%q<bundler>, ["~> 1.1.4"])
79
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
80
+ s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
81
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
82
+ s.add_dependency(%q<rdoc>, ["~> 3.11"])
83
+ s.add_dependency(%q<prawn>, ["~> 0.12.0"])
84
+ s.add_dependency(%q<guard-rspec>, ["~> 1.2.0"])
85
+ end
86
+ end
87
+
File without changes
@@ -0,0 +1,69 @@
1
+ %PDF-1.3
2
+ %����
3
+ 1 0 obj
4
+ << /Creator <feff0050007200610077006e>
5
+ /Producer <feff0050007200610077006e>
6
+ >>
7
+ endobj
8
+ 2 0 obj
9
+ << /Type /Catalog
10
+ /Pages 3 0 R
11
+ >>
12
+ endobj
13
+ 3 0 obj
14
+ << /Type /Pages
15
+ /Count 1
16
+ /Kids [5 0 R]
17
+ >>
18
+ endobj
19
+ 4 0 obj
20
+ << /Length 81
21
+ >>
22
+ stream
23
+ q
24
+
25
+ BT
26
+ 36 747.384 Td
27
+ /F1.0 12 Tf
28
+ [<48656c6c6f2057> 30 <6f72> -15 <6c64>] TJ
29
+ ET
30
+
31
+ Q
32
+
33
+ endstream
34
+ endobj
35
+ 5 0 obj
36
+ << /Type /Page
37
+ /Parent 3 0 R
38
+ /MediaBox [0 0 612.0 792.0]
39
+ /Contents 4 0 R
40
+ /Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
41
+ /Font << /F1.0 6 0 R
42
+ >>
43
+ >>
44
+ >>
45
+ endobj
46
+ 6 0 obj
47
+ << /Type /Font
48
+ /Subtype /Type1
49
+ /BaseFont /Helvetica
50
+ /Encoding /WinAnsiEncoding
51
+ >>
52
+ endobj
53
+ xref
54
+ 0 7
55
+ 0000000000 65535 f
56
+ 0000000015 00000 n
57
+ 0000000109 00000 n
58
+ 0000000158 00000 n
59
+ 0000000215 00000 n
60
+ 0000000346 00000 n
61
+ 0000000524 00000 n
62
+ trailer
63
+ << /Size 7
64
+ /Root 2 0 R
65
+ /Info 1 0 R
66
+ >>
67
+ startxref
68
+ 621
69
+ %%EOF
@@ -0,0 +1,71 @@
1
+ <html>
2
+ <head></head>
3
+ %PDF-1.3
4
+ %����
5
+ 1 0 obj
6
+ << /Creator <feff0050007200610077006e>
7
+ /Producer <feff0050007200610077006e>
8
+ >>
9
+ endobj
10
+ 2 0 obj
11
+ << /Type /Catalog
12
+ /Pages 3 0 R
13
+ >>
14
+ endobj
15
+ 3 0 obj
16
+ << /Type /Pages
17
+ /Count 1
18
+ /Kids [5 0 R]
19
+ >>
20
+ endobj
21
+ 4 0 obj
22
+ << /Length 157
23
+ >>
24
+ stream
25
+ q
26
+
27
+ BT
28
+ 36 747.384 Td
29
+ /F1.0 12 Tf
30
+ [<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
31
+ ET
32
+
33
+ Q
34
+
35
+ endstream
36
+ endobj
37
+ 5 0 obj
38
+ << /Type /Page
39
+ /Parent 3 0 R
40
+ /MediaBox [0 0 612.0 792.0]
41
+ /Contents 4 0 R
42
+ /Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
43
+ /Font << /F1.0 6 0 R
44
+ >>
45
+ >>
46
+ >>
47
+ endobj
48
+ 6 0 obj
49
+ << /Type /Font
50
+ /Subtype /Type1
51
+ /BaseFont /Helvetica
52
+ /Encoding /WinAnsiEncoding
53
+ >>
54
+ endobj
55
+ xref
56
+ 0 7
57
+ 0000000000 65535 f
58
+ 0000000015 00000 n
59
+ 0000000109 00000 n
60
+ 0000000158 00000 n
61
+ 0000000215 00000 n
62
+ 0000000423 00000 n
63
+ 0000000601 00000 n
64
+ trailer
65
+ << /Size 7
66
+ /Root 2 0 R
67
+ /Info 1 0 R
68
+ >>
69
+ startxref
70
+ 698
71
+ %%EOF
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+ include PdfSamplesHelper
3
+
4
+ describe "PDF Samples" do
5
+
6
+
7
+ end
@@ -0,0 +1,24 @@
1
+ require 'pdf-reader-turtletext'
2
+
3
+ # Requires supporting files with custom matchers and macros, etc,
4
+ # in ./support/ and its subdirectories.
5
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
6
+
7
+ RSpec.configure do |config|
8
+ # == Mock Framework
9
+ #
10
+ # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
11
+ #
12
+ # config.mock_with :mocha
13
+ # config.mock_with :flexmock
14
+ # config.mock_with :rr
15
+ config.mock_with :rspec
16
+
17
+ # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures
18
+ # config.fixture_path = "#{::Rails.root}/spec/fixtures"
19
+
20
+ # If you're not using ActiveRecord, or you'd prefer not to run each of your
21
+ # examples within a transaction, remove the following line or assign false
22
+ # instead of true.
23
+ # config.use_transactional_fixtures = true
24
+ end
@@ -0,0 +1,43 @@
1
+ require 'pathname'
2
+ require 'yaml'
3
+
4
+ module PdfSamplesHelper
5
+
6
+ def pdf_samples_path
7
+ Pathname.new(File.dirname(__FILE__)).join('..','fixtures','pdf_samples')
8
+ end
9
+
10
+ def pdf_sample(sample_name)
11
+ pdf_samples_path.join(sample_name)
12
+ end
13
+
14
+ def pdf_sample_names
15
+ Dir[pdf_samples_path.join("*.pdf")]
16
+ end
17
+
18
+ def pdf_sample_expectations_path
19
+ pdf_samples_path.join('expectations.yml')
20
+ end
21
+
22
+ def pdf_sample_expectations
23
+ begin
24
+ YAML.load_file pdf_sample_expectations_path
25
+ rescue
26
+ []
27
+ end
28
+ end
29
+
30
+ def make_pdf_samples
31
+ require 'prawn'
32
+ puts "Making PDF samples for tests.."
33
+ make_sample_hello_world
34
+ end
35
+
36
+ def make_sample_hello_world
37
+ filename = pdf_sample('hello_world.pdf')
38
+ Prawn::Document.generate filename do
39
+ text "Hello World"
40
+ end
41
+ puts "Created: #{filename}"
42
+ end
43
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+ include PdfSamplesHelper
3
+
4
+ describe PDF::Reader::ObjectHash do
5
+
6
+ context "when there is a junk prefix" do
7
+ let(:sample_name) { pdf_sample('junk_prefix.pdf') }
8
+ let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
9
+ let(:stream) { object_hash.instance_variable_get(:@io) }
10
+ before { stream.rewind }
11
+ subject { stream.read(4) }
12
+ it { should eql("%PDF") }
13
+ end
14
+
15
+ end
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+ include PdfSamplesHelper
3
+
4
+ describe PDF::Reader::PositionalTextReceiver do
5
+ let(:resource_class) { PDF::Reader::PositionalTextReceiver }
6
+
7
+ let(:reader) { PDF::Reader.new(source) }
8
+ let(:receiver) { resource_class.new }
9
+ let(:page) { 1 }
10
+
11
+ before do
12
+ reader.page(page).walk(receiver)
13
+ end
14
+
15
+ {
16
+ 'junk_prefix.pdf' => {747.384=>{36.0=>"This PDF contains junk before the %-PDF marker"}},
17
+ 'hello_world.pdf' => {747.384=>{36.0=>"Hello World"}}
18
+ }.each do |sample_file,expected_page_content|
19
+ describe "#content for #{sample_file}" do
20
+ let(:source) { pdf_sample(sample_file) }
21
+ subject { receiver.content }
22
+ it { should eql(expected_page_content) }
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe PDF::Reader::Turtletext::Textangle do
4
+ let(:resource_class) { PDF::Reader::Turtletext::Textangle }
5
+
6
+ end
@@ -0,0 +1,152 @@
1
+ require 'spec_helper'
2
+
3
+ describe PDF::Reader::Turtletext do
4
+ let(:resource_class) { PDF::Reader::Turtletext }
5
+
6
+ let(:source) { nil } # we're just going to mock the PDF source here
7
+ let(:structured_reader) { resource_class.new(source,options) }
8
+ let(:options) { {} }
9
+
10
+ describe "#reader" do
11
+ subject { structured_reader.reader}
12
+ it { should be_a(PDF::Reader) }
13
+ end
14
+
15
+ describe "#y_precision" do
16
+ subject { structured_reader.y_precision}
17
+ context "default" do
18
+ it { should eql(3) }
19
+ end
20
+ context "when set with options" do
21
+ let(:expected) { 5 }
22
+ let(:options) { { :y_precision => expected } }
23
+ it { should eql(expected) }
24
+ end
25
+ end
26
+
27
+ context "with mocked source content" do
28
+ let(:page) { 1 }
29
+ before do
30
+ structured_reader.should_receive(:load_content).with(page).and_return(given_page_content)
31
+ end
32
+
33
+ {
34
+ :with_simple_text => {
35
+ :source_page_content => {10.0=>{10.0=>"a first bit of text"}},
36
+ :expected_precise_content => {10.0=>{10.0=>"a first bit of text"}},
37
+ :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text"}}
38
+ },
39
+ :with_widely_separated_text => {
40
+ :source_page_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
41
+ :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
42
+ :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}}
43
+ },
44
+ :with_unsorted_y_text => {
45
+ :source_page_content => {20.0=>{10.0=>"a first bit of text"},10.0=>{20.0=>"a second bit of text"}},
46
+ :expected_precise_content => {20.0=>{10.0=>"a first bit of text"},10.0=>{20.0=>"a second bit of text"}},
47
+ :expected_fuzzed_content => {10.0=>{20.0=>"a second bit of text"},20.0=>{10.0=>"a first bit of text"}}
48
+ },
49
+ :with_fuzzed_y_text => {
50
+ :source_page_content => {10.0=>{10.0=>"a first bit of text"},12.0=>{12.0=>"a second bit of text"}},
51
+ :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},12.0=>{12.0=>"a second bit of text"}},
52
+ :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text",12.0=>"a second bit of text"}}
53
+ },
54
+ :with_widely_separated_fuzzed_y_text => {
55
+ :y_precision => 25,
56
+ :source_page_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
57
+ :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
58
+ :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text",20.0=>"a second bit of text"}}
59
+ }
60
+ }.each do |test_name,test_expectations|
61
+ context test_name do
62
+ let(:given_page_content) { test_expectations[:source_page_content] }
63
+ let(:options) {
64
+ if (y_precision = test_expectations[:y_precision]) && y_precision != :default
65
+ { :y_precision => y_precision }
66
+ else
67
+ {}
68
+ end
69
+ }
70
+
71
+ describe "#content" do
72
+ subject { structured_reader.content(page) }
73
+ it { should eql(test_expectations[:expected_fuzzed_content]) }
74
+ end
75
+
76
+ describe "#precise_content" do
77
+ subject { structured_reader.precise_content(page) }
78
+ it { should eql(test_expectations[:expected_precise_content]) }
79
+ end
80
+
81
+ end
82
+ end
83
+
84
+ describe "#text_in_region" do
85
+ {
86
+ :with_single_text => {
87
+ :source_page_content => {10.0=>{10.0=>"a first bit of text"}},
88
+ :xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100,
89
+ :expected_text => [["a first bit of text"]]
90
+ },
91
+ :with_single_line_text => {
92
+ :source_page_content => {
93
+ 10.0=>{10.0=>"first line ignored"},
94
+ 30.0=>{10.0=>"first part found", 20.0=>"last part found"},
95
+ 70.0=>{10.0=>"last line ignored"}
96
+ },
97
+ :xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
98
+ :expected_text => [["first part found", "last part found"]]
99
+ },
100
+ :with_multi_line_text => {
101
+ :source_page_content => {
102
+ 10.0=>{10.0=>"first line ignored"},
103
+ 30.0=>{10.0=>"first line first part found", 20.0=>"first line last part found"},
104
+ 40.0=>{10.0=>"last line first part found", 20.0=>"last line last part found"},
105
+ 70.0=>{10.0=>"last line ignored"}
106
+ },
107
+ :xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
108
+ :expected_text => [
109
+ ["last line first part found", "last line last part found"],
110
+ ["first line first part found", "first line last part found"]
111
+ ]
112
+ }
113
+ }.each do |test_name,test_expectations|
114
+ context test_name do
115
+ let(:given_page_content) { test_expectations[:source_page_content] }
116
+ let(:xmin) { test_expectations[:xmin] }
117
+ let(:xmax) { test_expectations[:xmax] }
118
+ let(:ymin) { test_expectations[:ymin] }
119
+ let(:ymax) { test_expectations[:ymax] }
120
+ let(:expected_text) { test_expectations[:expected_text] }
121
+ subject { structured_reader.text_in_region(xmin,xmax,ymin,ymax,page) }
122
+ it { should eql(expected_text) }
123
+ end
124
+ end
125
+ end
126
+
127
+ describe "#text_position" do
128
+ let(:given_page_content) { {
129
+ 10.0=>{10.0=>"crunchy bacon"},
130
+ 30.0=>{15.0=>"bacon on kimchi noodles", 25.0=>"heaven"},
131
+ 40.0=>{30.0=>"turkey bacon", 35.0=>"fraud"},
132
+ 70.0=>{40.0=>"smoked and streaky da bomb"}
133
+ } }
134
+ {
135
+ :with_simple_match => { :match_term => 'turkey bacon', :expected_position => {:x=>30.0, :y=>40.0} },
136
+ :with_match_along_line => { :match_term => 'heaven', :expected_position => {:x=>25.0, :y=>30.0} },
137
+ :with_regex_match => { :match_term => /kimchi/, :expected_position => {:x=>15.0, :y=>30.0} },
138
+ :with_regex_multi_matches_first => { :match_term => /turkey|crunchy/, :expected_position => {:x=>10.0, :y=>10.0} }
139
+ }.each do |test_name,test_expectations|
140
+ context test_name do
141
+ let(:match_term) { test_expectations[:match_term] }
142
+ let(:expected_position) { test_expectations[:expected_position] }
143
+ subject { structured_reader.text_position(match_term,page) }
144
+ it { should eql(expected_position) }
145
+ end
146
+ end
147
+ end
148
+
149
+
150
+ end
151
+
152
+ end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+
3
+ describe PDF::Reader::Turtletext::Version do
4
+ let(:resource_class) { PDF::Reader::Turtletext::Version }
5
+
6
+ it { resource_class::MAJOR.should be_a(Fixnum) }
7
+ it { resource_class::MINOR.should be_a(Fixnum) }
8
+ it { resource_class::PATCH.should be_a(Fixnum) }
9
+
10
+ describe "##STRING" do
11
+ subject { resource_class::STRING }
12
+ it { should match(/\d+\.\d+\.\d+/)}
13
+ end
14
+ end
metadata ADDED
@@ -0,0 +1,163 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-reader-turtletext
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Paul Gallagher
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: pdf-reader
16
+ requirement: &70193556628420 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - =
20
+ - !ruby/object:Gem::Version
21
+ version: 1.1.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70193556628420
25
+ - !ruby/object:Gem::Dependency
26
+ name: bundler
27
+ requirement: &70193556627700 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.1.4
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70193556627700
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &70193556626800 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 1.6.4
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70193556626800
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: &70193556626300 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 0.9.2.2
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70193556626300
58
+ - !ruby/object:Gem::Dependency
59
+ name: rspec
60
+ requirement: &70193556625680 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 2.8.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70193556625680
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdoc
71
+ requirement: &70193556624820 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: '3.11'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70193556624820
80
+ - !ruby/object:Gem::Dependency
81
+ name: prawn
82
+ requirement: &70193556623960 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: 0.12.0
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *70193556623960
91
+ - !ruby/object:Gem::Dependency
92
+ name: guard-rspec
93
+ requirement: &70193556623440 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ~>
97
+ - !ruby/object:Gem::Version
98
+ version: 1.2.0
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *70193556623440
102
+ description: a library that can read structured and positional text from PDFs. Ideal
103
+ for assembling structured data from invoices and the like.
104
+ email: gallagher.paul@gmail.com
105
+ executables: []
106
+ extensions: []
107
+ extra_rdoc_files:
108
+ - LICENSE
109
+ - README.rdoc
110
+ files:
111
+ - .rspec
112
+ - .rvmrc
113
+ - .travis.yml
114
+ - Gemfile
115
+ - Gemfile.lock
116
+ - Guardfile
117
+ - LICENSE
118
+ - README.rdoc
119
+ - Rakefile
120
+ - lib/pdf-reader-turtletext.rb
121
+ - lib/pdf/reader/patch/object_hash.rb
122
+ - lib/pdf/reader/positional_text_receiver.rb
123
+ - lib/pdf/reader/turtletext.rb
124
+ - lib/pdf/reader/turtletext/textangle.rb
125
+ - lib/pdf/reader/turtletext/version.rb
126
+ - pdf-reader-turtletext.gemspec
127
+ - spec/fixtures/pdf_samples/.gitkeep
128
+ - spec/fixtures/pdf_samples/hello_world.pdf
129
+ - spec/fixtures/pdf_samples/junk_prefix.pdf
130
+ - spec/integration/pdf_samples_spec.rb
131
+ - spec/spec_helper.rb
132
+ - spec/support/pdf_samples_helper.rb
133
+ - spec/unit/reader/patch/object_hash_spec.rb
134
+ - spec/unit/reader/positional_text_receiver_spec.rb
135
+ - spec/unit/reader/turtletext/textangle_spec.rb
136
+ - spec/unit/reader/turtletext/turtletext_spec.rb
137
+ - spec/unit/reader/turtletext/version_spec.rb
138
+ homepage: https://github.com/tardate/pdf-reader-turtletext
139
+ licenses:
140
+ - MIT
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ none: false
147
+ requirements:
148
+ - - ! '>='
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ requirements: []
158
+ rubyforge_project:
159
+ rubygems_version: 1.8.15
160
+ signing_key:
161
+ specification_version: 3
162
+ summary: PDF structured text reader
163
+ test_files: []