RubyGems - hocr_turtletext - Versions diffs - 0.1.2 → 0.1.3 - Mend

hocr_turtletext 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/README.md +0 -6
data/lib/hocr_turtletext/reader.rb +58 -72
data/lib/hocr_turtletext/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 23ed7d4b8c32b028f82675d42d9a8b2625692276
-  data.tar.gz: c0e778642e9c0fe39a72116797ea39223cbf17bb
+  metadata.gz: 39a62a3746924d1fee867013bc7a03f361d805bf
+  data.tar.gz: ee01353babbe3f3d4fcb78b84f40fcd1d2fea4e7
 SHA512:
-  metadata.gz: ec2e4c2fc85a3b241afff470cb820c0b907301be78cd4f2d28e67c60509337bf75ba8c7cd1a17c592fb2e3234c96803b0c3db5247d50cbbefd2445fbb6068149
-  data.tar.gz: 53dd315bc5c91df9d2da50a18385f64b243bd852c4d77b7cf88d6dc0431f54e3f6a441be71ddf6906d324d76f9e61b13bb551a35fda0a017bcda97a304d97bda
+  metadata.gz: c62c8d2a3c6a1c7aa7a31185d92e5a9903471f345ec72aeb56b56a2c95953478aa713011a6ebf8483c20d26b940cbbdd0d248524c37c1aa467d79f7a827641d0
+  data.tar.gz: f2942de656083773a4132fc2a84ead1cb5b670b4cb4b5dc5ce1026eb7d1a472b0035d1e5afbfa635236b860aa91f620468851fde5bf0179347a5fd4f9acd1a2f

data/README.md CHANGED

@@ -146,12 +146,6 @@ It returns a Hash of x/y co-ordinates that is the bottom-left corner of the text
 ```
 Note: in the case of multiple matches, only the first match is returned.
-## Development
-After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
-To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 ## Contributing
 - Check issue tracker if someone is working on what you plan to work on

data/lib/hocr_turtletext/reader.rb CHANGED

@@ -10,12 +10,14 @@ class HocrTurtletext::Reader
   def content
     hocr_content = File.read(@hocr_path)
-    lines = precise_content(hocr_content)
-    pos_hash = to_pos_hash(lines)
-    fuzzed_y(pos_hash)
+    html = Nokogiri::HTML(hocr_content)
+    pos_info_words = extract_words_from_html(html)
+    pos_hash = to_pos_hash pos_info_words
+    fuzzed_y = fuzzed_y(pos_hash)
+    concat_words_in_lines(fuzzed_y)
   end
-  def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
+  def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
     return [] unless xmin && xmax && ymin && ymax
     text_map = content
     box = []
@@ -37,12 +39,12 @@ class HocrTurtletext::Reader
   def text_position(text)
     item = if text.class <= Regexp
              content.map do |k,v|
-               if x = v.reduce(nil){|memo,vv|  memo = (vv[1] =~ text) ? vv[0] : memo  }
+               if x = v.reduce(nil){ |memo,vv|  memo = (vv[1] =~ text) ? vv[0] : memo }
                  [k,x]
                end
              end
            else
-             content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
+             content.map { |k,v| if x = v.rassoc(text) ; [k,x] ; end }
            end
     item = item.compact.flatten
     unless item.empty?
@@ -51,7 +53,7 @@ class HocrTurtletext::Reader
   end
   def bounding_box(&block)
-    HocrTurtletext::Textangle.new(self,&block)
+    HocrTurtletext::Textangle.new(self, &block)
   end
   private
@@ -64,10 +66,37 @@ class HocrTurtletext::Reader
     @options[:y_precision] ||= 3
   end
+  def extract_words_from_html(html)
+    pos_info_words = []
+    html.css('span.ocrx_word, span.ocr_word')
+        .reject { |word| word.text.strip.empty? }
+        .each do |word|
+      word_attributes = word.attributes['title'].value.to_s
+                            .delete(';').split(' ')
+      pos_info_word = word_info(word, word_attributes)
+      pos_info_words.push pos_info_word
+    end
+    pos_info_words
+  end
+  def to_pos_hash(lines)
+    lines.sort_by { |line| line[:y_start] }
+    pos_hash = {}
+    lines.each do |run|
+      pos_hash[run[:y_start]] ||= {}
+      pos_hash[run[:y_start]][run[:x_start]] = run
+    end
+    pos_hash
+  end
   def fuzzed_y(input)
     output = []
     input.keys.sort.each do |precise_y|
-      matching_y = output.map(&:first).select { |new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
+      matching_y = output.map(&:first)
+                       .select { |new_y| (new_y - precise_y).abs < y_precision }
+                       .first || precise_y
       y_index = output.index{ |y| y.first == matching_y }
       new_row_content = input[precise_y].to_a
       if y_index
@@ -81,77 +110,34 @@ class HocrTurtletext::Reader
     output
   end
-  def precise_content(hocr_content)
-    html = Nokogiri::HTML(hocr_content)
-    lines = []
-    html.css('span.ocr_line').map do |line|
-      chunks = chunks_from_processed_ocr_line(line)
-      lines.concat(chunks)
-    end
-    lines
-  end
-  def chunks_from_processed_ocr_line(ocr_line)
-    pos_info_line = add_positional_info_to_line(ocr_line)
-    sorted_pos_info_line = sort_words_in_line(pos_info_line)
-    concat_words_in_line(sorted_pos_info_line)
-  end
-  def add_positional_info_to_line(ocr_line)
-    ocr_line.css('span.ocrx_word, span.ocr_word')
-            .reject { |word| word.text.strip.empty? }
-            .map do |word|
-      word_attributes = word.attributes['title'].value.to_s
-                            .delete(';').split(' ')
-      info(word, word_attributes)
-    end
-  end
-  def sort_words_in_line(pos_info_line)
-    # sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
-    pos_info_line.sort_by { |word| word[:x_start] }
-    pos_info_line.slice_when do |x, y|
-      y[:x_start] - x[:x_end] > x_whitespace_threshold
-    end.to_a
-  end
-  def concat_words_in_line(sorted_pos_info_line)
-    chunks = []
-    # merge all words in each chunk
-    sorted_pos_info_line.each do |chunk|
-      sentence = nil
-      chunk.each do |word|
-        if sentence.nil?
-          sentence = word
+  def concat_words_in_lines(fuzzed_y)
+    fuzzed_y.map do |line|
+      x_pos_keyed_words = line[1]
+      concatenated_words = []
+      x_pos_keyed_words.each do |x_pos_keyed_word|
+        word_hash = x_pos_keyed_word[1]
+        if concatenated_words.empty? ||
+            word_hash[:x_start] - concatenated_words.last[:x_end] > x_whitespace_threshold
+          concatenated_words.push word_hash
         else
-          sentence[:word] = "#{sentence[:word]} #{word[:word]}"
-          sentence[:x_end] = word[:x_end]
+          concatenated_words.last[:word] = "#{concatenated_words.last[:word]} #{word_hash[:word]}"
+          concatenated_words.last[:x_end] = word_hash[:x_end]
         end
       end
-      chunks.push sentence
-    end
-    chunks
-  end
-  def to_pos_hash(lines)
-    lines.sort_by { |line| line[:y_start] }
-    pos_hash = {}
-    lines.each do |run|
-      pos_hash[run[:y_start]] ||= {}
-      pos_hash[run[:y_start]][run[:x_start]] ||= ''
-      pos_hash[run[:y_start]][run[:x_start]] << run[:word]
+      line[1] = concatenated_words.map! do |word_hash|
+        [word_hash[:x_start], word_hash[:word]]
+      end
+      line
     end
-    pos_hash
   end
-  def info(word, data)
+  def word_info(word, data)
     {
-        word: word.text,
-        x_start: data[1].to_i,
-        y_start: data[2].to_i,
-        x_end: data[3].to_i,
-        y_end: data[4].to_i
+      word: word.text,
+      x_start: data[1].to_i,
+      y_start: data[2].to_i,
+      x_end: data[3].to_i,
+      y_end: data[4].to_i
     }
   end
 end

data/lib/hocr_turtletext/version.rb CHANGED

@@ -1,3 +1,3 @@
 module HocrTurtletext
-  VERSION = '0.1.2'.freeze
+  VERSION = '0.1.3'.freeze
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: hocr_turtletext
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Sue Zheng Hao
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-01-24 00:00:00.000000000 Z
+date: 2020-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler