hocr_turtletext 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 23ed7d4b8c32b028f82675d42d9a8b2625692276
4
- data.tar.gz: c0e778642e9c0fe39a72116797ea39223cbf17bb
3
+ metadata.gz: 39a62a3746924d1fee867013bc7a03f361d805bf
4
+ data.tar.gz: ee01353babbe3f3d4fcb78b84f40fcd1d2fea4e7
5
5
  SHA512:
6
- metadata.gz: ec2e4c2fc85a3b241afff470cb820c0b907301be78cd4f2d28e67c60509337bf75ba8c7cd1a17c592fb2e3234c96803b0c3db5247d50cbbefd2445fbb6068149
7
- data.tar.gz: 53dd315bc5c91df9d2da50a18385f64b243bd852c4d77b7cf88d6dc0431f54e3f6a441be71ddf6906d324d76f9e61b13bb551a35fda0a017bcda97a304d97bda
6
+ metadata.gz: c62c8d2a3c6a1c7aa7a31185d92e5a9903471f345ec72aeb56b56a2c95953478aa713011a6ebf8483c20d26b940cbbdd0d248524c37c1aa467d79f7a827641d0
7
+ data.tar.gz: f2942de656083773a4132fc2a84ead1cb5b670b4cb4b5dc5ce1026eb7d1a472b0035d1e5afbfa635236b860aa91f620468851fde5bf0179347a5fd4f9acd1a2f
data/README.md CHANGED
@@ -146,12 +146,6 @@ It returns a Hash of x/y co-ordinates that is the bottom-left corner of the text
146
146
  ```
147
147
  Note: in the case of multiple matches, only the first match is returned.
148
148
 
149
- ## Development
150
-
151
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
152
-
153
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
154
-
155
149
  ## Contributing
156
150
 
157
151
  - Check issue tracker if someone is working on what you plan to work on
@@ -10,12 +10,14 @@ class HocrTurtletext::Reader
10
10
 
11
11
  def content
12
12
  hocr_content = File.read(@hocr_path)
13
- lines = precise_content(hocr_content)
14
- pos_hash = to_pos_hash(lines)
15
- fuzzed_y(pos_hash)
13
+ html = Nokogiri::HTML(hocr_content)
14
+ pos_info_words = extract_words_from_html(html)
15
+ pos_hash = to_pos_hash pos_info_words
16
+ fuzzed_y = fuzzed_y(pos_hash)
17
+ concat_words_in_lines(fuzzed_y)
16
18
  end
17
19
 
18
- def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
20
+ def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
19
21
  return [] unless xmin && xmax && ymin && ymax
20
22
  text_map = content
21
23
  box = []
@@ -37,12 +39,12 @@ class HocrTurtletext::Reader
37
39
  def text_position(text)
38
40
  item = if text.class <= Regexp
39
41
  content.map do |k,v|
40
- if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
42
+ if x = v.reduce(nil){ |memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
41
43
  [k,x]
42
44
  end
43
45
  end
44
46
  else
45
- content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
47
+ content.map { |k,v| if x = v.rassoc(text) ; [k,x] ; end }
46
48
  end
47
49
  item = item.compact.flatten
48
50
  unless item.empty?
@@ -51,7 +53,7 @@ class HocrTurtletext::Reader
51
53
  end
52
54
 
53
55
  def bounding_box(&block)
54
- HocrTurtletext::Textangle.new(self,&block)
56
+ HocrTurtletext::Textangle.new(self, &block)
55
57
  end
56
58
 
57
59
  private
@@ -64,10 +66,37 @@ class HocrTurtletext::Reader
64
66
  @options[:y_precision] ||= 3
65
67
  end
66
68
 
69
+ def extract_words_from_html(html)
70
+ pos_info_words = []
71
+
72
+ html.css('span.ocrx_word, span.ocr_word')
73
+ .reject { |word| word.text.strip.empty? }
74
+ .each do |word|
75
+ word_attributes = word.attributes['title'].value.to_s
76
+ .delete(';').split(' ')
77
+ pos_info_word = word_info(word, word_attributes)
78
+ pos_info_words.push pos_info_word
79
+ end
80
+ pos_info_words
81
+ end
82
+
83
+ def to_pos_hash(lines)
84
+ lines.sort_by { |line| line[:y_start] }
85
+
86
+ pos_hash = {}
87
+ lines.each do |run|
88
+ pos_hash[run[:y_start]] ||= {}
89
+ pos_hash[run[:y_start]][run[:x_start]] = run
90
+ end
91
+ pos_hash
92
+ end
93
+
67
94
  def fuzzed_y(input)
68
95
  output = []
69
96
  input.keys.sort.each do |precise_y|
70
- matching_y = output.map(&:first).select { |new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
97
+ matching_y = output.map(&:first)
98
+ .select { |new_y| (new_y - precise_y).abs < y_precision }
99
+ .first || precise_y
71
100
  y_index = output.index{ |y| y.first == matching_y }
72
101
  new_row_content = input[precise_y].to_a
73
102
  if y_index
@@ -81,77 +110,34 @@ class HocrTurtletext::Reader
81
110
  output
82
111
  end
83
112
 
84
- def precise_content(hocr_content)
85
- html = Nokogiri::HTML(hocr_content)
86
- lines = []
87
- html.css('span.ocr_line').map do |line|
88
- chunks = chunks_from_processed_ocr_line(line)
89
- lines.concat(chunks)
90
- end
91
- lines
92
- end
93
-
94
- def chunks_from_processed_ocr_line(ocr_line)
95
- pos_info_line = add_positional_info_to_line(ocr_line)
96
- sorted_pos_info_line = sort_words_in_line(pos_info_line)
97
- concat_words_in_line(sorted_pos_info_line)
98
- end
99
-
100
- def add_positional_info_to_line(ocr_line)
101
- ocr_line.css('span.ocrx_word, span.ocr_word')
102
- .reject { |word| word.text.strip.empty? }
103
- .map do |word|
104
- word_attributes = word.attributes['title'].value.to_s
105
- .delete(';').split(' ')
106
- info(word, word_attributes)
107
- end
108
- end
109
-
110
- def sort_words_in_line(pos_info_line)
111
- # sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
112
- pos_info_line.sort_by { |word| word[:x_start] }
113
- pos_info_line.slice_when do |x, y|
114
- y[:x_start] - x[:x_end] > x_whitespace_threshold
115
- end.to_a
116
- end
117
-
118
- def concat_words_in_line(sorted_pos_info_line)
119
- chunks = []
120
- # merge all words in each chunk
121
- sorted_pos_info_line.each do |chunk|
122
- sentence = nil
123
- chunk.each do |word|
124
- if sentence.nil?
125
- sentence = word
113
+ def concat_words_in_lines(fuzzed_y)
114
+ fuzzed_y.map do |line|
115
+ x_pos_keyed_words = line[1]
116
+ concatenated_words = []
117
+ x_pos_keyed_words.each do |x_pos_keyed_word|
118
+ word_hash = x_pos_keyed_word[1]
119
+ if concatenated_words.empty? ||
120
+ word_hash[:x_start] - concatenated_words.last[:x_end] > x_whitespace_threshold
121
+ concatenated_words.push word_hash
126
122
  else
127
- sentence[:word] = "#{sentence[:word]} #{word[:word]}"
128
- sentence[:x_end] = word[:x_end]
123
+ concatenated_words.last[:word] = "#{concatenated_words.last[:word]} #{word_hash[:word]}"
124
+ concatenated_words.last[:x_end] = word_hash[:x_end]
129
125
  end
130
126
  end
131
- chunks.push sentence
132
- end
133
- chunks
134
- end
135
-
136
- def to_pos_hash(lines)
137
- lines.sort_by { |line| line[:y_start] }
138
-
139
- pos_hash = {}
140
- lines.each do |run|
141
- pos_hash[run[:y_start]] ||= {}
142
- pos_hash[run[:y_start]][run[:x_start]] ||= ''
143
- pos_hash[run[:y_start]][run[:x_start]] << run[:word]
127
+ line[1] = concatenated_words.map! do |word_hash|
128
+ [word_hash[:x_start], word_hash[:word]]
129
+ end
130
+ line
144
131
  end
145
- pos_hash
146
132
  end
147
133
 
148
- def info(word, data)
134
+ def word_info(word, data)
149
135
  {
150
- word: word.text,
151
- x_start: data[1].to_i,
152
- y_start: data[2].to_i,
153
- x_end: data[3].to_i,
154
- y_end: data[4].to_i
136
+ word: word.text,
137
+ x_start: data[1].to_i,
138
+ y_start: data[2].to_i,
139
+ x_end: data[3].to_i,
140
+ y_end: data[4].to_i
155
141
  }
156
142
  end
157
143
  end
@@ -1,3 +1,3 @@
1
1
  module HocrTurtletext
2
- VERSION = '0.1.2'.freeze
2
+ VERSION = '0.1.3'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hocr_turtletext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sue Zheng Hao
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-24 00:00:00.000000000 Z
11
+ date: 2020-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler