hocr_turtletext 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +0 -6
- data/lib/hocr_turtletext/reader.rb +58 -72
- data/lib/hocr_turtletext/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39a62a3746924d1fee867013bc7a03f361d805bf
|
4
|
+
data.tar.gz: ee01353babbe3f3d4fcb78b84f40fcd1d2fea4e7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c62c8d2a3c6a1c7aa7a31185d92e5a9903471f345ec72aeb56b56a2c95953478aa713011a6ebf8483c20d26b940cbbdd0d248524c37c1aa467d79f7a827641d0
|
7
|
+
data.tar.gz: f2942de656083773a4132fc2a84ead1cb5b670b4cb4b5dc5ce1026eb7d1a472b0035d1e5afbfa635236b860aa91f620468851fde5bf0179347a5fd4f9acd1a2f
|
data/README.md
CHANGED
@@ -146,12 +146,6 @@ It returns a Hash of x/y co-ordinates that is the bottom-left corner of the text
|
|
146
146
|
```
|
147
147
|
Note: in the case of multiple matches, only the first match is returned.
|
148
148
|
|
149
|
-
## Development
|
150
|
-
|
151
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
152
|
-
|
153
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
154
|
-
|
155
149
|
## Contributing
|
156
150
|
|
157
151
|
- Check issue tracker if someone is working on what you plan to work on
|
data/lib/hocr_turtletext/reader.rb
CHANGED
@@ -10,12 +10,14 @@ class HocrTurtletext::Reader
|
|
10
10
|
|
11
11
|
def content
|
12
12
|
hocr_content = File.read(@hocr_path)
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
html = Nokogiri::HTML(hocr_content)
|
14
|
+
pos_info_words = extract_words_from_html(html)
|
15
|
+
pos_hash = to_pos_hash pos_info_words
|
16
|
+
fuzzed_y = fuzzed_y(pos_hash)
|
17
|
+
concat_words_in_lines(fuzzed_y)
|
16
18
|
end
|
17
19
|
|
18
|
-
def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
|
20
|
+
def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
|
19
21
|
return [] unless xmin && xmax && ymin && ymax
|
20
22
|
text_map = content
|
21
23
|
box = []
|
@@ -37,12 +39,12 @@ class HocrTurtletext::Reader
|
|
37
39
|
def text_position(text)
|
38
40
|
item = if text.class <= Regexp
|
39
41
|
content.map do |k,v|
|
40
|
-
if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo}
|
42
|
+
if x = v.reduce(nil){ |memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
|
41
43
|
[k,x]
|
42
44
|
end
|
43
45
|
end
|
44
46
|
else
|
45
|
-
content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
47
|
+
content.map { |k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
46
48
|
end
|
47
49
|
item = item.compact.flatten
|
48
50
|
unless item.empty?
|
@@ -51,7 +53,7 @@ class HocrTurtletext::Reader
|
|
51
53
|
end
|
52
54
|
|
53
55
|
def bounding_box(&block)
|
54
|
-
HocrTurtletext::Textangle.new(self,&block)
|
56
|
+
HocrTurtletext::Textangle.new(self, &block)
|
55
57
|
end
|
56
58
|
|
57
59
|
private
|
@@ -64,10 +66,37 @@ class HocrTurtletext::Reader
|
|
64
66
|
@options[:y_precision] ||= 3
|
65
67
|
end
|
66
68
|
|
69
|
+
def extract_words_from_html(html)
|
70
|
+
pos_info_words = []
|
71
|
+
|
72
|
+
html.css('span.ocrx_word, span.ocr_word')
|
73
|
+
.reject { |word| word.text.strip.empty? }
|
74
|
+
.each do |word|
|
75
|
+
word_attributes = word.attributes['title'].value.to_s
|
76
|
+
.delete(';').split(' ')
|
77
|
+
pos_info_word = word_info(word, word_attributes)
|
78
|
+
pos_info_words.push pos_info_word
|
79
|
+
end
|
80
|
+
pos_info_words
|
81
|
+
end
|
82
|
+
|
83
|
+
def to_pos_hash(lines)
|
84
|
+
lines.sort_by { |line| line[:y_start] }
|
85
|
+
|
86
|
+
pos_hash = {}
|
87
|
+
lines.each do |run|
|
88
|
+
pos_hash[run[:y_start]] ||= {}
|
89
|
+
pos_hash[run[:y_start]][run[:x_start]] = run
|
90
|
+
end
|
91
|
+
pos_hash
|
92
|
+
end
|
93
|
+
|
67
94
|
def fuzzed_y(input)
|
68
95
|
output = []
|
69
96
|
input.keys.sort.each do |precise_y|
|
70
|
-
matching_y = output.map(&:first).select { |new_y| (new_y - precise_y).abs < y_precision }.first || precise_y
|
97
|
+
matching_y = output.map(&:first)
|
98
|
+
.select { |new_y| (new_y - precise_y).abs < y_precision }
|
99
|
+
.first || precise_y
|
71
100
|
y_index = output.index{ |y| y.first == matching_y }
|
72
101
|
new_row_content = input[precise_y].to_a
|
73
102
|
if y_index
|
@@ -81,77 +110,34 @@ class HocrTurtletext::Reader
|
|
81
110
|
output
|
82
111
|
end
|
83
112
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
def chunks_from_processed_ocr_line(ocr_line)
|
95
|
-
pos_info_line = add_positional_info_to_line(ocr_line)
|
96
|
-
sorted_pos_info_line = sort_words_in_line(pos_info_line)
|
97
|
-
concat_words_in_line(sorted_pos_info_line)
|
98
|
-
end
|
99
|
-
|
100
|
-
def add_positional_info_to_line(ocr_line)
|
101
|
-
ocr_line.css('span.ocrx_word, span.ocr_word')
|
102
|
-
.reject { |word| word.text.strip.empty? }
|
103
|
-
.map do |word|
|
104
|
-
word_attributes = word.attributes['title'].value.to_s
|
105
|
-
.delete(';').split(' ')
|
106
|
-
info(word, word_attributes)
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def sort_words_in_line(pos_info_line)
|
111
|
-
# sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
|
112
|
-
pos_info_line.sort_by { |word| word[:x_start] }
|
113
|
-
pos_info_line.slice_when do |x, y|
|
114
|
-
y[:x_start] - x[:x_end] > x_whitespace_threshold
|
115
|
-
end.to_a
|
116
|
-
end
|
117
|
-
|
118
|
-
def concat_words_in_line(sorted_pos_info_line)
|
119
|
-
chunks = []
|
120
|
-
# merge all words in each chunk
|
121
|
-
sorted_pos_info_line.each do |chunk|
|
122
|
-
sentence = nil
|
123
|
-
chunk.each do |word|
|
124
|
-
if sentence.nil?
|
125
|
-
sentence = word
|
113
|
+
def concat_words_in_lines(fuzzed_y)
|
114
|
+
fuzzed_y.map do |line|
|
115
|
+
x_pos_keyed_words = line[1]
|
116
|
+
concatenated_words = []
|
117
|
+
x_pos_keyed_words.each do |x_pos_keyed_word|
|
118
|
+
word_hash = x_pos_keyed_word[1]
|
119
|
+
if concatenated_words.empty? ||
|
120
|
+
word_hash[:x_start] - concatenated_words.last[:x_end] > x_whitespace_threshold
|
121
|
+
concatenated_words.push word_hash
|
126
122
|
else
|
127
|
-
|
128
|
-
|
123
|
+
concatenated_words.last[:word] = "#{concatenated_words.last[:word]} #{word_hash[:word]}"
|
124
|
+
concatenated_words.last[:x_end] = word_hash[:x_end]
|
129
125
|
end
|
130
126
|
end
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
def to_pos_hash(lines)
|
137
|
-
lines.sort_by { |line| line[:y_start] }
|
138
|
-
|
139
|
-
pos_hash = {}
|
140
|
-
lines.each do |run|
|
141
|
-
pos_hash[run[:y_start]] ||= {}
|
142
|
-
pos_hash[run[:y_start]][run[:x_start]] ||= ''
|
143
|
-
pos_hash[run[:y_start]][run[:x_start]] << run[:word]
|
127
|
+
line[1] = concatenated_words.map! do |word_hash|
|
128
|
+
[word_hash[:x_start], word_hash[:word]]
|
129
|
+
end
|
130
|
+
line
|
144
131
|
end
|
145
|
-
pos_hash
|
146
132
|
end
|
147
133
|
|
148
|
-
def info(word, data)
|
134
|
+
def word_info(word, data)
|
149
135
|
{
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
136
|
+
word: word.text,
|
137
|
+
x_start: data[1].to_i,
|
138
|
+
y_start: data[2].to_i,
|
139
|
+
x_end: data[3].to_i,
|
140
|
+
y_end: data[4].to_i
|
155
141
|
}
|
156
142
|
end
|
157
143
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hocr_turtletext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sue Zheng Hao
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|