hocr_turtletext 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 23ed7d4b8c32b028f82675d42d9a8b2625692276
4
- data.tar.gz: c0e778642e9c0fe39a72116797ea39223cbf17bb
3
+ metadata.gz: 39a62a3746924d1fee867013bc7a03f361d805bf
4
+ data.tar.gz: ee01353babbe3f3d4fcb78b84f40fcd1d2fea4e7
5
5
  SHA512:
6
- metadata.gz: ec2e4c2fc85a3b241afff470cb820c0b907301be78cd4f2d28e67c60509337bf75ba8c7cd1a17c592fb2e3234c96803b0c3db5247d50cbbefd2445fbb6068149
7
- data.tar.gz: 53dd315bc5c91df9d2da50a18385f64b243bd852c4d77b7cf88d6dc0431f54e3f6a441be71ddf6906d324d76f9e61b13bb551a35fda0a017bcda97a304d97bda
6
+ metadata.gz: c62c8d2a3c6a1c7aa7a31185d92e5a9903471f345ec72aeb56b56a2c95953478aa713011a6ebf8483c20d26b940cbbdd0d248524c37c1aa467d79f7a827641d0
7
+ data.tar.gz: f2942de656083773a4132fc2a84ead1cb5b670b4cb4b5dc5ce1026eb7d1a472b0035d1e5afbfa635236b860aa91f620468851fde5bf0179347a5fd4f9acd1a2f
data/README.md CHANGED
@@ -146,12 +146,6 @@ It returns a Hash of x/y co-ordinates that is the bottom-left corner of the text
146
146
  ```
147
147
  Note: in the case of multiple matches, only the first match is returned.
148
148
 
149
- ## Development
150
-
151
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
152
-
153
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
154
-
155
149
  ## Contributing
156
150
 
157
151
  - Check issue tracker if someone is working on what you plan to work on
@@ -10,12 +10,14 @@ class HocrTurtletext::Reader
10
10
 
11
11
  def content
12
12
  hocr_content = File.read(@hocr_path)
13
- lines = precise_content(hocr_content)
14
- pos_hash = to_pos_hash(lines)
15
- fuzzed_y(pos_hash)
13
+ html = Nokogiri::HTML(hocr_content)
14
+ pos_info_words = extract_words_from_html(html)
15
+ pos_hash = to_pos_hash pos_info_words
16
+ fuzzed_y = fuzzed_y(pos_hash)
17
+ concat_words_in_lines(fuzzed_y)
16
18
  end
17
19
 
18
- def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
20
+ def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
19
21
  return [] unless xmin && xmax && ymin && ymax
20
22
  text_map = content
21
23
  box = []
@@ -37,12 +39,12 @@ class HocrTurtletext::Reader
37
39
  def text_position(text)
38
40
  item = if text.class <= Regexp
39
41
  content.map do |k,v|
40
- if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
42
+ if x = v.reduce(nil){ |memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
41
43
  [k,x]
42
44
  end
43
45
  end
44
46
  else
45
- content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
47
+ content.map { |k,v| if x = v.rassoc(text) ; [k,x] ; end }
46
48
  end
47
49
  item = item.compact.flatten
48
50
  unless item.empty?
@@ -51,7 +53,7 @@ class HocrTurtletext::Reader
51
53
  end
52
54
 
53
55
  def bounding_box(&block)
54
- HocrTurtletext::Textangle.new(self,&block)
56
+ HocrTurtletext::Textangle.new(self, &block)
55
57
  end
56
58
 
57
59
  private
@@ -64,10 +66,37 @@ class HocrTurtletext::Reader
64
66
  @options[:y_precision] ||= 3
65
67
  end
66
68
 
69
+ def extract_words_from_html(html)
70
+ pos_info_words = []
71
+
72
+ html.css('span.ocrx_word, span.ocr_word')
73
+ .reject { |word| word.text.strip.empty? }
74
+ .each do |word|
75
+ word_attributes = word.attributes['title'].value.to_s
76
+ .delete(';').split(' ')
77
+ pos_info_word = word_info(word, word_attributes)
78
+ pos_info_words.push pos_info_word
79
+ end
80
+ pos_info_words
81
+ end
82
+
83
+ def to_pos_hash(lines)
84
+ lines.sort_by { |line| line[:y_start] }
85
+
86
+ pos_hash = {}
87
+ lines.each do |run|
88
+ pos_hash[run[:y_start]] ||= {}
89
+ pos_hash[run[:y_start]][run[:x_start]] = run
90
+ end
91
+ pos_hash
92
+ end
93
+
67
94
  def fuzzed_y(input)
68
95
  output = []
69
96
  input.keys.sort.each do |precise_y|
70
- matching_y = output.map(&:first).select { |new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
97
+ matching_y = output.map(&:first)
98
+ .select { |new_y| (new_y - precise_y).abs < y_precision }
99
+ .first || precise_y
71
100
  y_index = output.index{ |y| y.first == matching_y }
72
101
  new_row_content = input[precise_y].to_a
73
102
  if y_index
@@ -81,77 +110,34 @@ class HocrTurtletext::Reader
81
110
  output
82
111
  end
83
112
 
84
- def precise_content(hocr_content)
85
- html = Nokogiri::HTML(hocr_content)
86
- lines = []
87
- html.css('span.ocr_line').map do |line|
88
- chunks = chunks_from_processed_ocr_line(line)
89
- lines.concat(chunks)
90
- end
91
- lines
92
- end
93
-
94
- def chunks_from_processed_ocr_line(ocr_line)
95
- pos_info_line = add_positional_info_to_line(ocr_line)
96
- sorted_pos_info_line = sort_words_in_line(pos_info_line)
97
- concat_words_in_line(sorted_pos_info_line)
98
- end
99
-
100
- def add_positional_info_to_line(ocr_line)
101
- ocr_line.css('span.ocrx_word, span.ocr_word')
102
- .reject { |word| word.text.strip.empty? }
103
- .map do |word|
104
- word_attributes = word.attributes['title'].value.to_s
105
- .delete(';').split(' ')
106
- info(word, word_attributes)
107
- end
108
- end
109
-
110
- def sort_words_in_line(pos_info_line)
111
- # sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
112
- pos_info_line.sort_by { |word| word[:x_start] }
113
- pos_info_line.slice_when do |x, y|
114
- y[:x_start] - x[:x_end] > x_whitespace_threshold
115
- end.to_a
116
- end
117
-
118
- def concat_words_in_line(sorted_pos_info_line)
119
- chunks = []
120
- # merge all words in each chunk
121
- sorted_pos_info_line.each do |chunk|
122
- sentence = nil
123
- chunk.each do |word|
124
- if sentence.nil?
125
- sentence = word
113
+ def concat_words_in_lines(fuzzed_y)
114
+ fuzzed_y.map do |line|
115
+ x_pos_keyed_words = line[1]
116
+ concatenated_words = []
117
+ x_pos_keyed_words.each do |x_pos_keyed_word|
118
+ word_hash = x_pos_keyed_word[1]
119
+ if concatenated_words.empty? ||
120
+ word_hash[:x_start] - concatenated_words.last[:x_end] > x_whitespace_threshold
121
+ concatenated_words.push word_hash
126
122
  else
127
- sentence[:word] = "#{sentence[:word]} #{word[:word]}"
128
- sentence[:x_end] = word[:x_end]
123
+ concatenated_words.last[:word] = "#{concatenated_words.last[:word]} #{word_hash[:word]}"
124
+ concatenated_words.last[:x_end] = word_hash[:x_end]
129
125
  end
130
126
  end
131
- chunks.push sentence
132
- end
133
- chunks
134
- end
135
-
136
- def to_pos_hash(lines)
137
- lines.sort_by { |line| line[:y_start] }
138
-
139
- pos_hash = {}
140
- lines.each do |run|
141
- pos_hash[run[:y_start]] ||= {}
142
- pos_hash[run[:y_start]][run[:x_start]] ||= ''
143
- pos_hash[run[:y_start]][run[:x_start]] << run[:word]
127
+ line[1] = concatenated_words.map! do |word_hash|
128
+ [word_hash[:x_start], word_hash[:word]]
129
+ end
130
+ line
144
131
  end
145
- pos_hash
146
132
  end
147
133
 
148
- def info(word, data)
134
+ def word_info(word, data)
149
135
  {
150
- word: word.text,
151
- x_start: data[1].to_i,
152
- y_start: data[2].to_i,
153
- x_end: data[3].to_i,
154
- y_end: data[4].to_i
136
+ word: word.text,
137
+ x_start: data[1].to_i,
138
+ y_start: data[2].to_i,
139
+ x_end: data[3].to_i,
140
+ y_end: data[4].to_i
155
141
  }
156
142
  end
157
143
  end
@@ -1,3 +1,3 @@
1
1
  module HocrTurtletext
2
- VERSION = '0.1.2'.freeze
2
+ VERSION = '0.1.3'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hocr_turtletext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sue Zheng Hao
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-01-24 00:00:00.000000000 Z
11
+ date: 2020-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler