RubyGems - pdf-reader-turtletext - Versions diffs - 0.1.0 - Mend

pdf-reader-turtletext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data/.rspec +1 -0
data/.rvmrc +1 -0
data/.travis.yml +3 -0
data/Gemfile +18 -0
data/Gemfile.lock +59 -0
data/Guardfile +10 -0
data/LICENSE +20 -0
data/README.rdoc +56 -0
data/Rakefile +57 -0
data/lib/pdf-reader-turtletext.rb +7 -0
data/lib/pdf/reader/patch/object_hash.rb +47 -0
data/lib/pdf/reader/positional_text_receiver.rb +31 -0
data/lib/pdf/reader/turtletext.rb +131 -0
data/lib/pdf/reader/turtletext/textangle.rb +27 -0
data/lib/pdf/reader/turtletext/version.rb +13 -0
data/pdf-reader-turtletext.gemspec +87 -0
data/spec/fixtures/pdf_samples/.gitkeep +0 -0
data/spec/fixtures/pdf_samples/hello_world.pdf +69 -0
data/spec/fixtures/pdf_samples/junk_prefix.pdf +71 -0
data/spec/integration/pdf_samples_spec.rb +7 -0
data/spec/spec_helper.rb +24 -0
data/spec/support/pdf_samples_helper.rb +43 -0
data/spec/unit/reader/patch/object_hash_spec.rb +15 -0
data/spec/unit/reader/positional_text_receiver_spec.rb +26 -0
data/spec/unit/reader/turtletext/textangle_spec.rb +6 -0
data/spec/unit/reader/turtletext/turtletext_spec.rb +152 -0
data/spec/unit/reader/turtletext/version_spec.rb +14 -0
metadata +163 -0

data/.rspec ADDED

	@@ -0,0 +1 @@
1	+ --color

data/.rvmrc ADDED

	@@ -0,0 +1 @@
1	+ rvm use 1.9.3@pdf-reader-turtletext --create

data/.travis.yml ADDED

@@ -0,0 +1,3 @@
+# These are specific configuration settings required for travis-ci
+# see http://travis-ci.org/tardate/pdf-reader-turtletext
+rvm: 1.9.3

data/Gemfile ADDED

@@ -0,0 +1,18 @@
+source "http://rubygems.org"
+gem 'pdf-reader', '1.1.1'
+group :development do
+  gem 'bundler', '~> 1.1.4'
+  gem 'jeweler', '~> 1.6.4'
+end
+group :development, :test do
+  gem 'rake', '~> 0.9.2.2'
+  gem 'rspec', '~> 2.8.0', :require => 'spec'
+  gem 'rdoc', '~> 3.11'
+  # prawn for generating PDFs for tests
+  gem 'prawn', '~> 0.12.0'
+  # guard for auto-running tests
+  gem 'guard-rspec', '~> 1.2.0'
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,59 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    Ascii85 (1.0.1)
+    diff-lcs (1.1.3)
+    ffi (1.1.0)
+    git (1.2.5)
+    guard (1.2.3)
+      listen (>= 0.4.2)
+      thor (>= 0.14.6)
+    guard-rspec (1.2.0)
+      guard (>= 1.1)
+    jeweler (1.6.4)
+      bundler (~> 1.0)
+      git (>= 1.2.5)
+      rake
+    json (1.7.3)
+    listen (0.4.7)
+      rb-fchange (~> 0.0.5)
+      rb-fsevent (~> 0.9.1)
+      rb-inotify (~> 0.8.8)
+    pdf-reader (1.1.1)
+      Ascii85 (~> 1.0.0)
+      ruby-rc4
+    prawn (0.12.0)
+      pdf-reader (>= 0.9.0)
+      ttfunk (~> 1.0.2)
+    rake (0.9.2.2)
+    rb-fchange (0.0.5)
+      ffi
+    rb-fsevent (0.9.1)
+    rb-inotify (0.8.8)
+      ffi (>= 0.5.0)
+    rdoc (3.12)
+      json (~> 1.4)
+    rspec (2.8.0)
+      rspec-core (~> 2.8.0)
+      rspec-expectations (~> 2.8.0)
+      rspec-mocks (~> 2.8.0)
+    rspec-core (2.8.0)
+    rspec-expectations (2.8.0)
+      diff-lcs (~> 1.1.2)
+    rspec-mocks (2.8.0)
+    ruby-rc4 (0.1.5)
+    thor (0.15.4)
+    ttfunk (1.0.3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.1.4)
+  guard-rspec (~> 1.2.0)
+  jeweler (~> 1.6.4)
+  pdf-reader (= 1.1.1)
+  prawn (~> 0.12.0)
+  rake (~> 0.9.2.2)
+  rdoc (~> 3.11)
+  rspec (~> 2.8.0)

data/Guardfile ADDED

@@ -0,0 +1,10 @@
+# A sample Guardfile
+# More info at https://github.com/guard/guard#readme
+guard 'rspec', :version => 2, :all_on_start => true, :all_after_pass => false do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch('spec/spec_helper.rb')  { "spec" }
+  ## we're not watching source files
+end

data/LICENSE ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2012 Paul Gallagher
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,56 @@
+= PDF::Reader::Turtletext {<img src="https://secure.travis-ci.org/tardate/pdf-reader-turtletext.png" />}[http://travis-ci.org/tardate/pdf-reader-turtletext]
+PDF::Reader::Turtletext is an extension for the most excellent {PDF::Reader}[https://github.com/yob/pdf-reader] gem.
+The aim of Turtletext is to provide simple and convenient methods for extracting PDF text content and
+converting it to structured data - even when there is no explicit structure in the original PDF source.
+A typical use is to extract details from utility bills that are provided in PDF format, to open up the data
+for analysis and other secondary uses.
+For an example of how this works in practice, see the
+{sps_bill}[https://github.com/tardate/sps_bill_scanner/] gem
+(which is in fact the project where the original ideas for Turtletext gestated).
+== Requirements and Known Limitations
+* currently only tested with Ruby 1.9
+* fixed dependency on PDF::Reader v 1.1.1
+== Installation
+  gem install pdf-reader-turtletext
+== Usage
+=== PDF::Reader::Turtletext
+Provides a range of methods to extract structured text from a PDF file,
+such as <tt>text_position</tt> and <tt>text_in_region</tt>.
+A typical usage:
+  reader = PDF::Reader::Turtletext.new(pdf_filename)
+  page = 1
+  heading_position = reader.text_position(/transaction table/i)
+  next_section = reader.text_position(/transaction summary/i)
+  transaction_rows = reader.text_in_region(
+    heading_position[:x], 900,
+    heading_position[:y] + 1, next_section[:y] - 1
+  )
+== Contributing to PDF::Reader::Turtletext
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
+== Copyright
+Copyright (c) 2012 Paul Gallagher. See LICENSE for further details.

data/Rakefile ADDED

@@ -0,0 +1,57 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'rspec'
+require 'rspec/core/rake_task'
+$LOAD_PATH.unshift('lib')
+require 'pdf/reader/turtletext/version'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "pdf-reader-turtletext"
+  gem.version = PDF::Reader::Turtletext::Version::STRING
+  gem.homepage = "https://github.com/tardate/pdf-reader-turtletext"
+  gem.license = "MIT"
+  gem.summary = %Q{PDF structured text reader}
+  gem.description = %Q{a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like.}
+  gem.email = "gallagher.paul@gmail.com"
+  gem.authors = ["Paul Gallagher"]
+  gem.files.exclude 'pkg/*'
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+desc "Run all RSpec test examples"
+RSpec::Core::RakeTask.new do |spec|
+  spec.rspec_opts = ["-c", "-f progress"]
+  spec.pattern = 'spec/**/*_spec.rb'
+end
+task :default => :spec
+require 'rdoc/task'
+RDoc::Task.new do |rdoc|
+  rdoc.main = "README.rdoc"
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "pdf-reader-turtletext #{PDF::Reader::Turtletext::Version::STRING}"
+  rdoc.rdoc_files.include('README*', 'lib/**/*.rb')
+end
+desc "Generate sample PDFs for tests"
+task :make_pdf_samples do |t|
+  require Pathname.new(File.dirname(__FILE__)).join('spec','support','pdf_samples_helper')
+  include PdfSamplesHelper
+  make_pdf_samples
+end

data/lib/pdf-reader-turtletext.rb ADDED

@@ -0,0 +1,7 @@
+require 'pdf-reader'
+require 'pdf/reader/patch/object_hash'
+require 'pdf/reader/positional_text_receiver'
+require 'pdf/reader/turtletext'
+require 'pdf/reader/turtletext/version'
+require 'pdf/reader/turtletext/textangle'

data/lib/pdf/reader/patch/object_hash.rb ADDED

@@ -0,0 +1,47 @@
+# This monkey-patches pdf-reader to allow it to read PDFs that have junk characters that appear
+# in the file before the start of the PDF stream.
+# (this is quite commonly an html head block - I suspect a bug in the Adobe or other software used
+# to serve the bills)
+#
+# The patch has been contributed back to the pdf-reader project (https://github.com/yob/pdf-reader/pull/54)
+# and has already been merged on master. When it shows up in a release of the pdf-reader gem
+# we can trash this patch.
+#
+class PDF::Reader::ObjectHash
+  def extract_io_from(input)
+    if input.respond_to?(:seek) && input.respond_to?(:read)
+      input
+    elsif File.file?(input.to_s)
+      read_with_quirks(input)
+    else
+      raise ArgumentError, "input must be an IO-like object or a filename"
+    end
+  end
+  # Load file as a StringIO stream, accounting for invalid format
+  # where additional characters exist in the file before the %PDF start of file
+  def read_with_quirks(input)
+    stream = File.open(input.to_s, "rb")
+    if ofs = pdf_offset(stream)
+      stream.seek(ofs)
+      StringIO.new(stream.read)
+    else
+      raise ArgumentError, "invalid file format"
+    end
+  end
+  private :read_with_quirks
+  # Returns the offset of the PDF document in the +stream+.
+  # Checks up to 50 chars into the file; returns nil if no PDF stream is detected.
+  def pdf_offset(stream)
+    stream.rewind
+    ofs = stream.pos
+    until (c = stream.readchar) == '%' || c == 37 || ofs > 50
+      ofs += 1
+    end
+    ofs < 50 ? ofs : nil
+  end
+  private :pdf_offset
+end

data/lib/pdf/reader/positional_text_receiver.rb ADDED

@@ -0,0 +1,31 @@
+# Receiver to access positional (x,y) text content from a PDF
+#
+# Typical usage:
+#
+#   reader = PDF::Reader.new(filename)
+#   receiver = PDF::Reader::PositionalTextReceiver.new
+#   reader.page(page).walk(receiver)
+#   receiver.content
+#
+class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
+  # record text that is drawn on the page
+  def show_text(string) # Tj
+    raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
+    newx, newy = @state.trm_transform(0,0)
+    @content[newy] ||= {}
+    @content[newy][newx] ||= ''
+    @content[newy][newx] << @state.current_font.to_utf8(string)
+  end
+  # Overrides the PageTextReceiver content accessor.
+  # Returns a hash of positional text:
+  #   {
+  #     y_coord=>{x_coord=>text, x_coord=>text },
+  #     y_coord=>{x_coord=>text, x_coord=>text }
+  #   }
+  def content
+    @content
+  end
+end

data/lib/pdf/reader/turtletext.rb ADDED

@@ -0,0 +1,131 @@
+# Class for reading structured text content
+#
+# Typical usage:
+#
+#   reader = PDF::Reader::Turtletext.new(pdf_filename)
+#   page = 1
+#   heading_position = reader.text_position(/transaction table/i)
+#   next_section = reader.text_position(/transaction summary/i)
+#   transaction_rows = reader.text_in_region(
+#     heading_position[:x], 900,
+#     heading_position[:y] + 1, next_section[:y] - 1
+#   )
+#
+class PDF::Reader::Turtletext
+  attr_reader :reader
+  attr_reader :options
+  # +source+ is a file name or stream-like object
+  def initialize(source, options={})
+    @options = options
+    @reader = PDF::Reader.new(source)
+  end
+  # Returns the precision required in y positions.
+  # This is the fuzz range for interpreting y positions.
+  # Lines with y positions +/- +y_precision+ will be merged together.
+  # This helps align text correctly which may visually appear on the same line, but is actually
+  # off by a few pixels.
+  def y_precision
+    options[:y_precision] ||= 3
+  end
+  # Returns positional (with fuzzed y positioning) text content collection as a hash:
+  #   { y_position: { x_position: content}}
+  def content(page=1)
+    @content ||= []
+    if @content[page]
+      @content[page]
+    else
+      @content[page] = fuzzed_y(precise_content(page))
+    end
+  end
+  # Returns a hash with fuzzed positioning:
+  #   { fuzzed_y_position: { x_position: content}}
+  # Given +input+ as a hash:
+  #   { y_position: { x_position: content}}
+  # Fuzz factors: +y_precision+
+  def fuzzed_y(input)
+    output = {}
+    input.keys.sort.each do |precise_y|
+      # matching_y = (precise_y / 5.0).truncate * 5.0
+      matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
+      output[matching_y] ||= {}
+      output[matching_y].merge!(input[precise_y])
+    end
+    output
+  end
+  # Returns positional text content collection as a hash with precise x,y positioning:
+  #   { y_position: { x_position: content}}
+  def precise_content(page=1)
+    @precise_content ||= []
+    if @precise_content[page]
+      @precise_content[page]
+    else
+      @precise_content[page] = load_content(page)
+    end
+  end
+  # Returns an array of text elements found within the x,y limits.
+  # Each line of text found is returned as an array element.
+  # Each line of text is an array of the separate text elements found on that line.
+  #   [["first line first text", "first line last text"],["second line text"]]
+  def text_in_region(xmin,xmax,ymin,ymax,page=1)
+    text_map = content(page)
+    box = []
+    text_map.keys.sort.reverse.each do |y|
+      if y >= ymin && y<= ymax
+        row = []
+        text_map[y].keys.sort.each do |x|
+          if x >= xmin && x<= xmax
+            row << text_map[y][x]
+          end
+        end
+        box << row unless row.empty?
+      end
+    end
+    box
+  end
+  # Returns the position of +text+ on +page+
+  #   {x: val, y: val }
+  # +text+ may be a string (exact match required) or a Regexp
+  def text_position(text,page=1)
+    item = if text.class <= Regexp
+      content(page).map {|k,v| if x = v.reduce(nil){|memo,vv|  memo = (vv[1] =~ text) ? vv[0] : memo  } ; [k,x] ; end }
+    else
+      content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
+    end
+    item = item.compact.flatten
+    unless item.empty?
+      { :x => item[1], :y => item[0] }
+    end
+  end
+  # WIP - not using Textangle yet for text extraction.
+  # Ideal usage is something like this:
+  #
+  # textangle = reader.bounding_box do
+  #   page 1
+  #   below "Electricity Services"
+  #   above "Gas Services by City Gas Pte Ltd"
+  #   right_of 240.0
+  #   left_of "Total ($)"
+  # end
+  # textangle.text
+  #
+  def bounding_box(&block)
+    PDF::Reader::Turtletext::Textangle.new(self,&block)
+  end
+  private
+    def load_content(page)
+      receiver = PDF::Reader::PositionalTextReceiver.new
+      reader.page(page).walk(receiver)
+      receiver.content
+    end
+end

data/lib/pdf/reader/turtletext/textangle.rb ADDED

@@ -0,0 +1,27 @@
+# A DSL syntax for text extraction.
+# WIP - not using this yet
+#
+# textangle = PDF::Reader::Turtletext::Textangle.new(reader) do
+#   page 1
+#   below "Electricity Services"
+#   above "Gas Services by City Gas Pte Ltd"
+#   right_of 240.0
+#   left_of "Total ($)"
+# end
+# textangle.text
+#
+class PDF::Reader::Turtletext::Textangle
+  attr_reader :reader
+  attr_writer :page,:above,:below,:left_of,:right_of
+  # +structured_reader+ is a PDF::Reader::Turtletext
+  def initialize(structured_reader,&block)
+    @reader = structured_reader
+    instance_eval( &block ) if block
+  end
+  def text
+    # TODO
+  end
+end

data/lib/pdf/reader/turtletext/version.rb ADDED

@@ -0,0 +1,13 @@
+module PDF
+  class Reader
+    class Turtletext
+      class Version
+        MAJOR = 0
+        MINOR = 1
+        PATCH = 0
+        STRING = [MAJOR, MINOR, PATCH].compact.join('.')
+      end
+    end
+  end
+end

data/pdf-reader-turtletext.gemspec ADDED

@@ -0,0 +1,87 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = "pdf-reader-turtletext"
+  s.version = "0.1.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Paul Gallagher"]
+  s.date = "2012-07-22"
+  s.description = "a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like."
+  s.email = "gallagher.paul@gmail.com"
+  s.extra_rdoc_files = [
+    "LICENSE",
+    "README.rdoc"
+  ]
+  s.files = [
+    ".rspec",
+    ".rvmrc",
+    ".travis.yml",
+    "Gemfile",
+    "Gemfile.lock",
+    "Guardfile",
+    "LICENSE",
+    "README.rdoc",
+    "Rakefile",
+    "lib/pdf-reader-turtletext.rb",
+    "lib/pdf/reader/patch/object_hash.rb",
+    "lib/pdf/reader/positional_text_receiver.rb",
+    "lib/pdf/reader/turtletext.rb",
+    "lib/pdf/reader/turtletext/textangle.rb",
+    "lib/pdf/reader/turtletext/version.rb",
+    "pdf-reader-turtletext.gemspec",
+    "spec/fixtures/pdf_samples/.gitkeep",
+    "spec/fixtures/pdf_samples/hello_world.pdf",
+    "spec/fixtures/pdf_samples/junk_prefix.pdf",
+    "spec/integration/pdf_samples_spec.rb",
+    "spec/spec_helper.rb",
+    "spec/support/pdf_samples_helper.rb",
+    "spec/unit/reader/patch/object_hash_spec.rb",
+    "spec/unit/reader/positional_text_receiver_spec.rb",
+    "spec/unit/reader/turtletext/textangle_spec.rb",
+    "spec/unit/reader/turtletext/turtletext_spec.rb",
+    "spec/unit/reader/turtletext/version_spec.rb"
+  ]
+  s.homepage = "https://github.com/tardate/pdf-reader-turtletext"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "1.8.15"
+  s.summary = "PDF structured text reader"
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<pdf-reader>, ["= 1.1.1"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.1.4"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
+      s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_development_dependency(%q<rdoc>, ["~> 3.11"])
+      s.add_development_dependency(%q<prawn>, ["~> 0.12.0"])
+      s.add_development_dependency(%q<guard-rspec>, ["~> 1.2.0"])
+    else
+      s.add_dependency(%q<pdf-reader>, ["= 1.1.1"])
+      s.add_dependency(%q<bundler>, ["~> 1.1.4"])
+      s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
+      s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
+      s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_dependency(%q<rdoc>, ["~> 3.11"])
+      s.add_dependency(%q<prawn>, ["~> 0.12.0"])
+      s.add_dependency(%q<guard-rspec>, ["~> 1.2.0"])
+    end
+  else
+    s.add_dependency(%q<pdf-reader>, ["= 1.1.1"])
+    s.add_dependency(%q<bundler>, ["~> 1.1.4"])
+    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
+    s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
+    s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+    s.add_dependency(%q<rdoc>, ["~> 3.11"])
+    s.add_dependency(%q<prawn>, ["~> 0.12.0"])
+    s.add_dependency(%q<guard-rspec>, ["~> 1.2.0"])
+  end
+end

data/spec/fixtures/pdf_samples/.gitkeep ADDED

File without changes

data/spec/fixtures/pdf_samples/hello_world.pdf ADDED

@@ -0,0 +1,69 @@
+%PDF-1.3
+%����
+1 0 obj
+<< /Creator <feff0050007200610077006e>
+/Producer <feff0050007200610077006e>
+>>
+endobj
+2 0 obj
+<< /Type /Catalog
+/Pages 3 0 R
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Count 1
+/Kids [5 0 R]
+>>
+endobj
+4 0 obj
+<< /Length 81
+>>
+stream
+q
+BT
+36 747.384 Td
+/F1.0 12 Tf
+[<48656c6c6f2057> 30 <6f72> -15 <6c64>] TJ
+ET
+Q
+endstream
+endobj
+5 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [0 0 612.0 792.0]
+/Contents 4 0 R
+/Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
+/Font << /F1.0 6 0 R
+>>
+>>
+>>
+endobj
+6 0 obj
+<< /Type /Font
+/Subtype /Type1
+/BaseFont /Helvetica
+/Encoding /WinAnsiEncoding
+>>
+endobj
+xref
+0 7
+0000000000 65535 f
+0000000015 00000 n
+0000000109 00000 n
+0000000158 00000 n
+0000000215 00000 n
+0000000346 00000 n
+0000000524 00000 n
+trailer
+<< /Size 7
+/Root 2 0 R
+/Info 1 0 R
+>>
+startxref
+621
+%%EOF

data/spec/fixtures/pdf_samples/junk_prefix.pdf ADDED

@@ -0,0 +1,71 @@
+<html>
+<head></head>
+%PDF-1.3
+%����
+1 0 obj
+<< /Creator <feff0050007200610077006e>
+/Producer <feff0050007200610077006e>
+>>
+endobj
+2 0 obj
+<< /Type /Catalog
+/Pages 3 0 R
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Count 1
+/Kids [5 0 R]
+>>
+endobj
+4 0 obj
+<< /Length 157
+>>
+stream
+q
+BT
+36 747.384 Td
+/F1.0 12 Tf
+[<546869732050444620636f6e7461696e73206a756e6b20626566> 30 <6f72652074686520252d504446206d6172> -15 <6b> 20 <6572>] TJ
+ET
+Q
+endstream
+endobj
+5 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [0 0 612.0 792.0]
+/Contents 4 0 R
+/Resources << /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]
+/Font << /F1.0 6 0 R
+>>
+>>
+>>
+endobj
+6 0 obj
+<< /Type /Font
+/Subtype /Type1
+/BaseFont /Helvetica
+/Encoding /WinAnsiEncoding
+>>
+endobj
+xref
+0 7
+0000000000 65535 f
+0000000015 00000 n
+0000000109 00000 n
+0000000158 00000 n
+0000000215 00000 n
+0000000423 00000 n
+0000000601 00000 n
+trailer
+<< /Size 7
+/Root 2 0 R
+/Info 1 0 R
+>>
+startxref
+698
+%%EOF

data/spec/integration/pdf_samples_spec.rb ADDED

@@ -0,0 +1,7 @@
+require 'spec_helper'
+include PdfSamplesHelper
+describe "PDF Samples" do
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,24 @@
+require 'pdf-reader-turtletext'
+# Requires supporting files with custom matchers and macros, etc,
+# in ./support/ and its subdirectories.
+Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+RSpec.configure do |config|
+  # == Mock Framework
+  #
+  # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
+  #
+  # config.mock_with :mocha
+  # config.mock_with :flexmock
+  # config.mock_with :rr
+  config.mock_with :rspec
+  # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures
+  # config.fixture_path = "#{::Rails.root}/spec/fixtures"
+  # If you're not using ActiveRecord, or you'd prefer not to run each of your
+  # examples within a transaction, remove the following line or assign false
+  # instead of true.
+  # config.use_transactional_fixtures = true
+end

data/spec/support/pdf_samples_helper.rb ADDED

@@ -0,0 +1,43 @@
+require 'pathname'
+require 'yaml'
+module PdfSamplesHelper
+  def pdf_samples_path
+    Pathname.new(File.dirname(__FILE__)).join('..','fixtures','pdf_samples')
+  end
+  def pdf_sample(sample_name)
+    pdf_samples_path.join(sample_name)
+  end
+  def pdf_sample_names
+    Dir[pdf_samples_path.join("*.pdf")]
+  end
+  def pdf_sample_expectations_path
+    pdf_samples_path.join('expectations.yml')
+  end
+  def pdf_sample_expectations
+    begin
+      YAML.load_file pdf_sample_expectations_path
+    rescue
+      []
+    end
+  end
+  def make_pdf_samples
+    require 'prawn'
+    puts "Making PDF samples for tests.."
+    make_sample_hello_world
+  end
+  def make_sample_hello_world
+    filename = pdf_sample('hello_world.pdf')
+    Prawn::Document.generate filename do
+      text "Hello World"
+    end
+    puts "Created: #{filename}"
+  end
+end

data/spec/unit/reader/patch/object_hash_spec.rb ADDED

@@ -0,0 +1,15 @@
+require 'spec_helper'
+include PdfSamplesHelper
+describe PDF::Reader::ObjectHash do
+  context "when there is a junk prefix" do
+    let(:sample_name) { pdf_sample('junk_prefix.pdf') }
+    let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
+    let(:stream) { object_hash.instance_variable_get(:@io) }
+    before { stream.rewind }
+    subject { stream.read(4) }
+    it { should eql("%PDF") }
+  end
+end

data/spec/unit/reader/positional_text_receiver_spec.rb ADDED

@@ -0,0 +1,26 @@
+require 'spec_helper'
+include PdfSamplesHelper
+describe PDF::Reader::PositionalTextReceiver do
+  let(:resource_class) { PDF::Reader::PositionalTextReceiver }
+  let(:reader) { PDF::Reader.new(source) }
+  let(:receiver) { resource_class.new }
+  let(:page) { 1 }
+  before do
+    reader.page(page).walk(receiver)
+  end
+  {
+    'junk_prefix.pdf' => {747.384=>{36.0=>"This PDF contains junk before the %-PDF marker"}},
+    'hello_world.pdf' => {747.384=>{36.0=>"Hello World"}}
+  }.each do |sample_file,expected_page_content|
+    describe "#content for #{sample_file}" do
+      let(:source) { pdf_sample(sample_file) }
+      subject { receiver.content }
+      it { should eql(expected_page_content) }
+    end
+  end
+end

data/spec/unit/reader/turtletext/textangle_spec.rb ADDED

@@ -0,0 +1,6 @@
+require 'spec_helper'
+describe PDF::Reader::Turtletext::Textangle do
+  let(:resource_class) { PDF::Reader::Turtletext::Textangle }
+end

data/spec/unit/reader/turtletext/turtletext_spec.rb ADDED

@@ -0,0 +1,152 @@
+require 'spec_helper'
+describe PDF::Reader::Turtletext do
+  let(:resource_class) { PDF::Reader::Turtletext }
+  let(:source) { nil } # we're just going to mock the PDF source here
+  let(:structured_reader) { resource_class.new(source,options) }
+  let(:options) { {} }
+  describe "#reader" do
+    subject { structured_reader.reader}
+    it { should be_a(PDF::Reader) }
+  end
+  describe "#y_precision" do
+    subject { structured_reader.y_precision}
+    context "default" do
+      it { should eql(3) }
+    end
+    context "when set with options" do
+      let(:expected) { 5 }
+      let(:options) { { :y_precision => expected } }
+      it { should eql(expected) }
+    end
+  end
+  context "with mocked source content" do
+    let(:page) { 1 }
+    before do
+      structured_reader.should_receive(:load_content).with(page).and_return(given_page_content)
+    end
+    {
+      :with_simple_text => {
+        :source_page_content => {10.0=>{10.0=>"a first bit of text"}},
+        :expected_precise_content => {10.0=>{10.0=>"a first bit of text"}},
+        :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text"}}
+      },
+      :with_widely_separated_text => {
+        :source_page_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
+        :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
+        :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}}
+      },
+      :with_unsorted_y_text => {
+        :source_page_content => {20.0=>{10.0=>"a first bit of text"},10.0=>{20.0=>"a second bit of text"}},
+        :expected_precise_content => {20.0=>{10.0=>"a first bit of text"},10.0=>{20.0=>"a second bit of text"}},
+        :expected_fuzzed_content => {10.0=>{20.0=>"a second bit of text"},20.0=>{10.0=>"a first bit of text"}}
+      },
+      :with_fuzzed_y_text => {
+        :source_page_content => {10.0=>{10.0=>"a first bit of text"},12.0=>{12.0=>"a second bit of text"}},
+        :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},12.0=>{12.0=>"a second bit of text"}},
+        :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text",12.0=>"a second bit of text"}}
+      },
+      :with_widely_separated_fuzzed_y_text => {
+        :y_precision => 25,
+        :source_page_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
+        :expected_precise_content => {10.0=>{10.0=>"a first bit of text"},20.0=>{20.0=>"a second bit of text"}},
+        :expected_fuzzed_content => {10.0=>{10.0=>"a first bit of text",20.0=>"a second bit of text"}}
+      }
+    }.each do |test_name,test_expectations|
+      context test_name do
+        let(:given_page_content) { test_expectations[:source_page_content] }
+        let(:options) {
+          if (y_precision = test_expectations[:y_precision]) && y_precision != :default
+            { :y_precision => y_precision }
+          else
+            {}
+          end
+        }
+        describe "#content" do
+          subject { structured_reader.content(page) }
+          it { should eql(test_expectations[:expected_fuzzed_content]) }
+        end
+        describe "#precise_content" do
+          subject { structured_reader.precise_content(page) }
+          it { should eql(test_expectations[:expected_precise_content]) }
+        end
+      end
+    end
+    describe "#text_in_region" do
+      {
+        :with_single_text => {
+          :source_page_content => {10.0=>{10.0=>"a first bit of text"}},
+          :xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100,
+          :expected_text => [["a first bit of text"]]
+        },
+        :with_single_line_text => {
+          :source_page_content => {
+            10.0=>{10.0=>"first line ignored"},
+            30.0=>{10.0=>"first part found", 20.0=>"last part found"},
+            70.0=>{10.0=>"last line ignored"}
+          },
+          :xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
+          :expected_text => [["first part found", "last part found"]]
+        },
+        :with_multi_line_text => {
+          :source_page_content => {
+            10.0=>{10.0=>"first line ignored"},
+            30.0=>{10.0=>"first line first part found", 20.0=>"first line last part found"},
+            40.0=>{10.0=>"last line first part found", 20.0=>"last line last part found"},
+            70.0=>{10.0=>"last line ignored"}
+          },
+          :xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
+          :expected_text => [
+            ["last line first part found", "last line last part found"],
+            ["first line first part found", "first line last part found"]
+          ]
+        }
+      }.each do |test_name,test_expectations|
+        context test_name do
+          let(:given_page_content) { test_expectations[:source_page_content] }
+          let(:xmin) { test_expectations[:xmin] }
+          let(:xmax) { test_expectations[:xmax] }
+          let(:ymin) { test_expectations[:ymin] }
+          let(:ymax) { test_expectations[:ymax] }
+          let(:expected_text) { test_expectations[:expected_text] }
+          subject { structured_reader.text_in_region(xmin,xmax,ymin,ymax,page) }
+          it { should eql(expected_text) }
+        end
+      end
+    end
+    describe "#text_position" do
+      let(:given_page_content) { {
+        10.0=>{10.0=>"crunchy bacon"},
+        30.0=>{15.0=>"bacon on kimchi noodles", 25.0=>"heaven"},
+        40.0=>{30.0=>"turkey bacon", 35.0=>"fraud"},
+        70.0=>{40.0=>"smoked and streaky da bomb"}
+      } }
+      {
+        :with_simple_match => { :match_term => 'turkey bacon', :expected_position => {:x=>30.0, :y=>40.0} },
+        :with_match_along_line => { :match_term => 'heaven', :expected_position => {:x=>25.0, :y=>30.0} },
+        :with_regex_match => { :match_term => /kimchi/, :expected_position => {:x=>15.0, :y=>30.0} },
+        :with_regex_multi_matches_first => { :match_term => /turkey|crunchy/, :expected_position => {:x=>10.0, :y=>10.0} }
+      }.each do |test_name,test_expectations|
+        context test_name do
+          let(:match_term) { test_expectations[:match_term] }
+          let(:expected_position) { test_expectations[:expected_position] }
+          subject { structured_reader.text_position(match_term,page) }
+          it { should eql(expected_position) }
+        end
+      end
+    end
+  end
+end

data/spec/unit/reader/turtletext/version_spec.rb ADDED

@@ -0,0 +1,14 @@
+require 'spec_helper'
+describe PDF::Reader::Turtletext::Version do
+  let(:resource_class) { PDF::Reader::Turtletext::Version }
+  it { resource_class::MAJOR.should be_a(Fixnum) }
+  it { resource_class::MINOR.should be_a(Fixnum) }
+  it { resource_class::PATCH.should be_a(Fixnum) }
+  describe "##STRING" do
+    subject { resource_class::STRING }
+    it { should match(/\d+\.\d+\.\d+/)}
+  end
+end

metadata ADDED

@@ -0,0 +1,163 @@
+--- !ruby/object:Gem::Specification
+name: pdf-reader-turtletext
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Paul Gallagher
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-07-22 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: pdf-reader
+  requirement: &70193556628420 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
+        version: 1.1.1
+  type: :runtime
+  prerelease: false
+  version_requirements: *70193556628420
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: &70193556627700 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.1.4
+  type: :development
+  prerelease: false
+  version_requirements: *70193556627700
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: &70193556626800 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.6.4
+  type: :development
+  prerelease: false
+  version_requirements: *70193556626800
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: &70193556626300 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.9.2.2
+  type: :development
+  prerelease: false
+  version_requirements: *70193556626300
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &70193556625680 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :development
+  prerelease: false
+  version_requirements: *70193556625680
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: &70193556624820 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.11'
+  type: :development
+  prerelease: false
+  version_requirements: *70193556624820
+- !ruby/object:Gem::Dependency
+  name: prawn
+  requirement: &70193556623960 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.12.0
+  type: :development
+  prerelease: false
+  version_requirements: *70193556623960
+- !ruby/object:Gem::Dependency
+  name: guard-rspec
+  requirement: &70193556623440 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.0
+  type: :development
+  prerelease: false
+  version_requirements: *70193556623440
+description: a library that can read structured and positional text from PDFs. Ideal
+  for assembling structured data from invoices and the like.
+email: gallagher.paul@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- .rspec
+- .rvmrc
+- .travis.yml
+- Gemfile
+- Gemfile.lock
+- Guardfile
+- LICENSE
+- README.rdoc
+- Rakefile
+- lib/pdf-reader-turtletext.rb
+- lib/pdf/reader/patch/object_hash.rb
+- lib/pdf/reader/positional_text_receiver.rb
+- lib/pdf/reader/turtletext.rb
+- lib/pdf/reader/turtletext/textangle.rb
+- lib/pdf/reader/turtletext/version.rb
+- pdf-reader-turtletext.gemspec
+- spec/fixtures/pdf_samples/.gitkeep
+- spec/fixtures/pdf_samples/hello_world.pdf
+- spec/fixtures/pdf_samples/junk_prefix.pdf
+- spec/integration/pdf_samples_spec.rb
+- spec/spec_helper.rb
+- spec/support/pdf_samples_helper.rb
+- spec/unit/reader/patch/object_hash_spec.rb
+- spec/unit/reader/positional_text_receiver_spec.rb
+- spec/unit/reader/turtletext/textangle_spec.rb
+- spec/unit/reader/turtletext/turtletext_spec.rb
+- spec/unit/reader/turtletext/version_spec.rb
+homepage: https://github.com/tardate/pdf-reader-turtletext
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.15
+signing_key:
+specification_version: 3
+summary: PDF structured text reader
+test_files: []