RubyGems - hocr_turtletext - Versions diffs - 0.1.1 - Mend

hocr_turtletext 0.1.1

Files changed (17) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +3 -0
data/.travis.yml +7 -0
data/Gemfile +6 -0
data/Gemfile.lock +39 -0
data/LICENSE.txt +21 -0
data/README.md +168 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/hocr_turtletext.gemspec +43 -0
data/lib/hocr_turtletext.rb +3 -0
data/lib/hocr_turtletext/reader.rb +155 -0
data/lib/hocr_turtletext/textangle.rb +117 -0
data/lib/hocr_turtletext/version.rb +3 -0
metadata +123 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: da1d48bb9c9bbf8e723a10d039603670bba89ba5
+  data.tar.gz: 00f89918601c84176ccac11b41ae38a5f829f183
+SHA512:
+  metadata.gz: f6c50bfb48f0483673b13a963535e0a783554a4c5cdbd55aca911f47cd084c512242e9c3868f46a1948d52128575d5efbfec8f1ce60f78fa2ad81f282abe61e2
+  data.tar.gz: 27727e5d7a6f640b0152dcc0695228ff21bd9cafd7d5f3fcde798e5c86a7f5d89e1c5ba8b455cbca737291c9cce80128e062903f764c54cb17e16ccc96baa84e

data/.gitignore ADDED

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status
+/.idea
+*.iml

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.travis.yml ADDED

@@ -0,0 +1,7 @@
+---
+# Travis CI configuration for the gem's spec suite.
+sudo: false
+language: ruby
+# Cache installed gems between builds.
+cache: bundler
+rvm:
+  - 2.4.4
+# Same Bundler version as recorded under BUNDLED WITH in Gemfile.lock.
+before_install: gem install bundler -v 1.16.3

data/Gemfile ADDED

@@ -0,0 +1,6 @@
+source "https://rubygems.org"
+git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
+# Specify your gem's dependencies in hocr_turtletext.gemspec
+gemspec

data/Gemfile.lock ADDED

@@ -0,0 +1,39 @@
+PATH
+  remote: .
+  specs:
+    hocr_turtletext (0.1.0)
+      nokogiri (~> 1.10, >= 1.10.7)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.3)
+    mini_portile2 (2.4.0)
+    nokogiri (1.10.7)
+      mini_portile2 (~> 2.4.0)
+    rake (10.5.0)
+    rspec (3.7.0)
+      rspec-core (~> 3.7.0)
+      rspec-expectations (~> 3.7.0)
+      rspec-mocks (~> 3.7.0)
+    rspec-core (3.7.1)
+      rspec-support (~> 3.7.0)
+    rspec-expectations (3.7.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.7.0)
+    rspec-mocks (3.7.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.7.0)
+    rspec-support (3.7.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.16)
+  hocr_turtletext!
+  rake (~> 10.0)
+  rspec (~> 3.0)
+BUNDLED WITH
+   1.16.3

data/LICENSE.txt ADDED

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2020 Sue Zheng Hao
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,168 @@
+# HocrTurtletext
+Heavily inspired by [PDF::Reader::Turtletext](https://github.com/tardate/pdf-reader-turtletext), HocrTurtletext provides convenient methods to extract content from a hOCR file. hOCR output is commonly produced by OCR software such as tesseract-ocr.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'hocr_turtletext'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install hocr_turtletext
+## Usage
+### Instantiate HocrTurtletext
+Typical usage:
+```ruby
+hocr_path = '/tmp/page1.hocr'
+options = { :y_precision => 7 }
+reader = HocrTurtletext::Reader.new(hocr_path, options)
+```
+Options:
+`x_whitespace_threshold` (default 30): Words on the same line whose horizontal gap (x distance) is no more than this threshold will be concatenated with a space. Try increasing this value if words/letters that are supposed to belong together are separated.
+`y_precision` (default 3): Rows of text whose y positions differ by less than `y_precision` will be merged into one row. Try increasing this value if words that are supposed to be on the same row are detected as separate rows.
+### Extract text within a region described in relation to other text
+This method works nearly identically to its counterpart from PDF::Reader::Turtletext.
+The main difference is that we are not dealing with multiple pages in our hOCR input, so
+there is no need to support page selection.
+Given that we know the text we want to find is relatively positioned (for example)
+below a certain bit of text, to the left of another, and above some other text, use
+the `bounding_box` method to describe the region and extract the matching text.
+```
+  textangle = reader.bounding_box do
+    below /electricity/i
+    above 10
+    right_of 240.0
+    left_of "Total ($)"
+  end
+  textangle.text
+  => [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
+```
+The range of methods that can be used within the `bounding_box` block are all optional, and include:
+- `inclusive` - whether region selection should be inclusive or exclusive of the specified positions
+  (default is false).
+- `below` - a string, regex or number that describes the upper limit of the text box
+  (default is top border of the page).
+- `above` - a string, regex or number that describes the lower limit of the text box
+  (default is bottom border of the page).
+- `left_of` - a string, regex or number that describes the right limit of the text box
+  (default is right border of the page).
+- `right_of` - a string, regex or number that describes the left limit of the text box
+  (default is left border of the page).
+Note that `left_of` and `right_of` constraints do *not* need to be within the vertical
+range of the box being described.
+For example, you could use an element in the page header to describe the `left_of` limit
+for a table at the bottom of the page, if it has the correct alignment needed to describe your text region.
+Similarly, `above` and `below` constraints do *not* need to be within the horizontal
+range of the box being described.
+### Using a block parameter with the `bounding_box` method
+An explicit block parameter may be used with the `bounding_box` method:
+```
+  textangle = reader.bounding_box do |r|
+    r.below /electricity/i
+    r.left_of "Total ($)"
+  end
+  textangle.text
+  => [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
+```
+### How to describe an inclusive `bounding_box` region
+By default, the `bounding_box` method makes exclusive selection (i.e. not including the
+region limits).
+To specify an inclusive region, use the `inclusive!` command:
+```ruby
+  textangle = reader.bounding_box do
+    inclusive!
+    below /electricity/i
+    left_of "Total ($)"
+  end
+```
+Alternatively, set `inclusive` to true:
+```ruby
+  textangle = reader.bounding_box do
+    inclusive true
+    below /electricity/i
+    left_of "Total ($)"
+  end
+```
+Or with a block parameter, you may also assign `inclusive` to true:
+```ruby
+  textangle = reader.bounding_box do |r|
+    r.inclusive = true
+    r.below /electricity/i
+    r.left_of "Total ($)"
+  end
+```
+### Extract text for a region with known positional co-ordinates
+If you know (or can calculate) the x,y positions of the required text region, you can extract the region's text using the `text_in_region` method.
+```
+  text = reader.text_in_region(
+    10,   # minimum x (left-most)
+    900,  # maximum x (right-most)
+    200,  # minimum y (top-most)
+    400,  # maximum y (bottom-most)
+    false # inclusive of x/y position if true (default false)
+  )
+  => [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
+```
+Note that the x,y origin is at the **top-left**.
+This differs from how it works in PDF::Reader::Turtletext, where the origin
+was bottom-left of the page.
+### How to find the x,y co-ordinate of a specific text element
+If you are doing low-level text extraction with `text_in_region` for example,
+it is usually necessary to locate specific text to provide a positional reference.
+Use the `text_position` method to locate text by exact or partial match.
+It returns a Hash of x/y co-ordinates that is the top-left corner of the text.
+```
+  text_by_exact_match = reader.text_position("Transaction Table")
+  => { :x => 10.0, :y => 600.0 }
+  text_by_regex_match = reader.text_position(/transaction summary/i)
+  => { :x => 10.0, :y => 300.0 }
+```
+Note: in the case of multiple matches, only the first match is returned.
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+- Check issue tracker if someone is working on what you plan to work on
+- Fork project
+- Create new branch
+- Make changes in new branch
+- Submit pull request
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+## Special Thanks
+- Paul Gallagher, creator of the [PDF::Reader::Turtletext](https://github.com/tardate/pdf-reader-turtletext) gem, from which large sections of this gem were copied or adapted.

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Starts an IRB session with the hocr_turtletext gem already loaded.
+require "bundler/setup"
+require "hocr_turtletext"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/hocr_turtletext.gemspec ADDED

@@ -0,0 +1,43 @@
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'hocr_turtletext/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'hocr_turtletext'
+  spec.version       = HocrTurtletext::VERSION
+  spec.authors       = ['Sue Zheng Hao']
+  spec.summary       = 'Reads structured text from hOCR input.'
+  spec.description   = <<-DESC
+  Parses hOCR input and provides methods to access text in a structured manner. Typical use
+  cases include parsing formatted text from a hOCR file produced by running a document
+  through OCR.
+  DESC
+  spec.homepage      = 'https://github.com/emmeryn/hocr-turtletext'
+  spec.license       = 'MIT'
+  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+  # to allow pushing to a single host or delete this section to allow pushing to any host.
+  # if spec.respond_to?(:metadata)
+  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
+  # else
+  #   raise 'RubyGems 2.0 or newer is required to protect against ' \
+  #     'public gem pushes.'
+  # end
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+  spec.add_development_dependency 'bundler', '~> 1.16'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_runtime_dependency 'nokogiri', '~> 1.10', '>= 1.10.7'
+end

data/lib/hocr_turtletext.rb ADDED

@@ -0,0 +1,3 @@
+require 'nokogiri'
+require 'hocr_turtletext/version'
+require 'hocr_turtletext/reader'

data/lib/hocr_turtletext/reader.rb ADDED

@@ -0,0 +1,155 @@
+# pdf-reader-turtletext methods such as text_in_region, text_position and
+# fuzzed_y method modified from the original at https://github.com/tardate/pdf-reader-turtletext
+class HocrTurtletext::Reader
+  def initialize(hocr_path, options = {})
+    @hocr_path = hocr_path
+    @options = options
+  end
+  def content
+    hocr_content = File.read(@hocr_path)
+    lines = precise_content(hocr_content)
+    pos_hash = to_pos_hash(lines)
+    fuzzed_y(pos_hash)
+  end
+  def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
+    return [] unless xmin && xmax && ymin && ymax
+    text_map = content
+    box = []
+    text_map.each do |y,text_row|
+      if inclusive ? (y >= ymin && y <= ymax) : (y > ymin && y < ymax)
+        row = []
+        text_row.each do |x,element|
+          if inclusive ? (x >= xmin && x <= xmax) : (x > xmin && x < xmax)
+            row << element
+          end
+        end
+        box << row unless row.empty?
+      end
+    end
+    box
+  end
+  def text_position(text)
+    item = if text.class <= Regexp
+             content.map do |k,v|
+               if x = v.reduce(nil){|memo,vv|  memo = (vv[1] =~ text) ? vv[0] : memo  }
+                 [k,x]
+               end
+             end
+           else
+             content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
+           end
+    item = item.compact.flatten
+    unless item.empty?
+      { :x => item[1], :y => item[0] }
+    end
+  end
+  def bounding_box(&block)
+    HocrTurtletext::Textangle.new(self,&block)
+  end
+  private
+  def x_whitespace_threshold
+    @options[:x_whitespace_threshold] ||= 30
+  end
+  def y_precision
+    @options[:y_precision] ||= 3
+  end
+  def fuzzed_y(input)
+    output = []
+    input.keys.sort.each do |precise_y|
+      matching_y = output.map(&:first).select { |new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
+      y_index = output.index{ |y| y.first == matching_y }
+      new_row_content = input[precise_y].to_a
+      if y_index
+        row_content = output[y_index].last
+        row_content += new_row_content
+        output[y_index] = [matching_y,row_content.sort{ |a,b| a.first <=> b.first }]
+      else
+        output << [matching_y,new_row_content.sort{ |a,b| a.first <=> b.first }]
+      end
+    end
+    output
+  end
+  def precise_content(hocr_content)
+    html = Nokogiri::HTML(hocr_content)
+    lines = []
+    html.css('span.ocr_line').map do |line|
+      chunks = chunks_from_processed_ocr_line(line)
+      lines.concat(chunks)
+    end
+    lines
+  end
+  def chunks_from_processed_ocr_line(ocr_line)
+    pos_info_line = add_positional_info_to_line(ocr_line)
+    sorted_pos_info_line = sort_words_in_line(pos_info_line)
+    concat_words_in_line(sorted_pos_info_line)
+  end
+  def add_positional_info_to_line(ocr_line)
+    ocr_line.css('span.ocrx_word, span.ocr_word').map do |word|
+      word_attributes = word.attributes['title'].value.to_s
+                            .delete(';').split(' ')
+      info(word, word_attributes)
+    end
+  end
+  def sort_words_in_line(pos_info_line)
+    # sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
+    pos_info_line.sort_by { |word| word[:x_start] }
+    pos_info_line.slice_when do |x, y|
+      y[:x_start] - x[:x_end] > x_whitespace_threshold
+    end.to_a
+  end
+  def concat_words_in_line(sorted_pos_info_line)
+    chunks = []
+    # merge all words in each chunk
+    sorted_pos_info_line.each do |chunk|
+      sentence = nil
+      chunk.each do |word|
+        if sentence.nil?
+          sentence = word
+        else
+          sentence[:word] = "#{sentence[:word]} #{word[:word]}"
+          sentence[:x_end] = word[:x_end]
+        end
+      end
+      chunks.push sentence
+    end
+    chunks
+  end
+  def to_pos_hash(lines)
+    lines.sort_by { |line| line[:y_start] }
+    pos_hash = {}
+    lines.each do |run|
+      pos_hash[run[:y_start]] ||= {}
+      pos_hash[run[:y_start]][run[:x_start]] ||= ''
+      pos_hash[run[:y_start]][run[:x_start]] << run[:word]
+    end
+    pos_hash
+  end
+  def info(word, data)
+    {
+        word: word.text,
+        x_start: data[1].to_i,
+        y_start: data[2].to_i,
+        x_end: data[3].to_i,
+        y_end: data[4].to_i
+    }
+  end
+end

data/lib/hocr_turtletext/textangle.rb ADDED

@@ -0,0 +1,117 @@
+# A DSL syntax for text extraction.
+# Modified from the original at https://github.com/tardate/pdf-reader-turtletext
+class HocrTurtletext::Textangle
+  attr_reader :reader
+  # +hocr_turtletext_reader+ is a HocrTurtletext::Reader
+  def initialize(hocr_turtletext_reader,&block)
+    @reader = hocr_turtletext_reader
+    @inclusive = false
+    if block_given?
+      if block.arity == 1
+        yield self
+      else
+        instance_eval &block
+      end
+    end
+  end
+  attr_writer :inclusive
+  def inclusive(*args)
+    if value = args.first
+      @inclusive = value
+    end
+    @inclusive
+  end
+  # Command: sets +inclusive true
+  def inclusive!
+    @inclusive = true
+  end
+  # Command: sets +inclusive false
+  def exclusive!
+    @inclusive = false
+  end
+  attr_writer :above
+  def above(*args)
+    if value = args.first
+      @above = value
+    end
+    @above
+  end
+  attr_writer :below
+  def below(*args)
+    if value = args.first
+      @below = value
+    end
+    @below
+  end
+  attr_writer :left_of
+  def left_of(*args)
+    if value = args.first
+      @left_of = value
+    end
+    @left_of
+  end
+  attr_writer :right_of
+  def right_of(*args)
+    if value = args.first
+      @right_of = value
+    end
+    @right_of
+  end
+  # Returns the text array found within the defined region.
+  # Each line of text is an array of the separate text elements found on that line.
+  #   [["first line first text", "first line last text"],["second line text"]]
+  def text
+    return unless reader
+    xmin = if right_of
+             if [Integer,Float].include?(right_of.class)
+               right_of
+             elsif xy = reader.text_position(right_of)
+               xy[:x]
+             end
+           else
+             0
+           end
+    xmax = if left_of
+             if [Integer,Float].include?(left_of.class)
+               left_of
+             elsif xy = reader.text_position(left_of)
+               xy[:x]
+             end
+           else
+             99999 # TODO: figure out the actual limit?
+           end
+    ymax = if above
+             if [Integer,Float].include?(above.class)
+               above
+             elsif xy = reader.text_position(above)
+               xy[:y]
+             end
+           else
+             99999
+           end
+    ymin = if below
+             if [Integer,Float].include?(below.class)
+               below
+             elsif xy = reader.text_position(below)
+               xy[:y]
+             end
+           else
+             0 # TODO: figure out the actual limit?
+           end
+    reader.text_in_region(xmin,xmax,ymin,ymax,inclusive)
+  end
+end

data/lib/hocr_turtletext/version.rb ADDED

@@ -0,0 +1,3 @@
+# Top-level namespace of the hocr_turtletext gem.
+module HocrTurtletext
+  # Version of the gem; hocr_turtletext.gemspec uses it as spec.version.
+  VERSION = '0.1.1'.freeze
+end

metadata ADDED

@@ -0,0 +1,123 @@
+--- !ruby/object:Gem::Specification
+name: hocr_turtletext
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+platform: ruby
+authors:
+- Sue Zheng Hao
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2020-01-24 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.10'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.10.7
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.10'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.10.7
+description: |2
+    Parses hOCR input and provides methods to access text in a structured manner. Typical use
+    cases include parsing formatted text from a hOCR file produced by running a document
+    through OCR.
+email:
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- hocr_turtletext.gemspec
+- lib/hocr_turtletext.rb
+- lib/hocr_turtletext/reader.rb
+- lib/hocr_turtletext/textangle.rb
+- lib/hocr_turtletext/version.rb
+homepage: https://github.com/emmeryn/hocr-turtletext
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.14.1
+signing_key:
+specification_version: 4
+summary: Reads structured text from hOCR input.
+test_files: []