hocr_turtletext 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -6
- data/lib/hocr_turtletext/reader.rb +58 -72
- data/lib/hocr_turtletext/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39a62a3746924d1fee867013bc7a03f361d805bf
|
4
|
+
data.tar.gz: ee01353babbe3f3d4fcb78b84f40fcd1d2fea4e7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c62c8d2a3c6a1c7aa7a31185d92e5a9903471f345ec72aeb56b56a2c95953478aa713011a6ebf8483c20d26b940cbbdd0d248524c37c1aa467d79f7a827641d0
|
7
|
+
data.tar.gz: f2942de656083773a4132fc2a84ead1cb5b670b4cb4b5dc5ce1026eb7d1a472b0035d1e5afbfa635236b860aa91f620468851fde5bf0179347a5fd4f9acd1a2f
|
data/README.md
CHANGED
@@ -146,12 +146,6 @@ It returns a Hash of x/y co-ordinates that is the bottom-left corner of the text
|
|
146
146
|
```
|
147
147
|
Note: in the case of multiple matches, only the first match is returned.
|
148
148
|
|
149
|
-
## Development
|
150
|
-
|
151
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
152
|
-
|
153
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
154
|
-
|
155
149
|
## Contributing
|
156
150
|
|
157
151
|
- Check issue tracker if someone is working on what you plan to work on
|
@@ -10,12 +10,14 @@ class HocrTurtletext::Reader
|
|
10
10
|
|
11
11
|
def content
|
12
12
|
hocr_content = File.read(@hocr_path)
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
html = Nokogiri::HTML(hocr_content)
|
14
|
+
pos_info_words = extract_words_from_html(html)
|
15
|
+
pos_hash = to_pos_hash pos_info_words
|
16
|
+
fuzzed_y = fuzzed_y(pos_hash)
|
17
|
+
concat_words_in_lines(fuzzed_y)
|
16
18
|
end
|
17
19
|
|
18
|
-
def text_in_region(xmin,xmax,ymin,ymax,inclusive=false)
|
20
|
+
def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
|
19
21
|
return [] unless xmin && xmax && ymin && ymax
|
20
22
|
text_map = content
|
21
23
|
box = []
|
@@ -37,12 +39,12 @@ class HocrTurtletext::Reader
|
|
37
39
|
def text_position(text)
|
38
40
|
item = if text.class <= Regexp
|
39
41
|
content.map do |k,v|
|
40
|
-
if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo
|
42
|
+
if x = v.reduce(nil){ |memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo }
|
41
43
|
[k,x]
|
42
44
|
end
|
43
45
|
end
|
44
46
|
else
|
45
|
-
content.map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
47
|
+
content.map { |k,v| if x = v.rassoc(text) ; [k,x] ; end }
|
46
48
|
end
|
47
49
|
item = item.compact.flatten
|
48
50
|
unless item.empty?
|
@@ -51,7 +53,7 @@ class HocrTurtletext::Reader
|
|
51
53
|
end
|
52
54
|
|
53
55
|
def bounding_box(&block)
|
54
|
-
HocrTurtletext::Textangle.new(self
|
56
|
+
HocrTurtletext::Textangle.new(self, &block)
|
55
57
|
end
|
56
58
|
|
57
59
|
private
|
@@ -64,10 +66,37 @@ class HocrTurtletext::Reader
|
|
64
66
|
@options[:y_precision] ||= 3
|
65
67
|
end
|
66
68
|
|
69
|
+
def extract_words_from_html(html)
|
70
|
+
pos_info_words = []
|
71
|
+
|
72
|
+
html.css('span.ocrx_word, span.ocr_word')
|
73
|
+
.reject { |word| word.text.strip.empty? }
|
74
|
+
.each do |word|
|
75
|
+
word_attributes = word.attributes['title'].value.to_s
|
76
|
+
.delete(';').split(' ')
|
77
|
+
pos_info_word = word_info(word, word_attributes)
|
78
|
+
pos_info_words.push pos_info_word
|
79
|
+
end
|
80
|
+
pos_info_words
|
81
|
+
end
|
82
|
+
|
83
|
+
def to_pos_hash(lines)
|
84
|
+
lines.sort_by { |line| line[:y_start] }
|
85
|
+
|
86
|
+
pos_hash = {}
|
87
|
+
lines.each do |run|
|
88
|
+
pos_hash[run[:y_start]] ||= {}
|
89
|
+
pos_hash[run[:y_start]][run[:x_start]] = run
|
90
|
+
end
|
91
|
+
pos_hash
|
92
|
+
end
|
93
|
+
|
67
94
|
def fuzzed_y(input)
|
68
95
|
output = []
|
69
96
|
input.keys.sort.each do |precise_y|
|
70
|
-
matching_y = output.map(&:first)
|
97
|
+
matching_y = output.map(&:first)
|
98
|
+
.select { |new_y| (new_y - precise_y).abs < y_precision }
|
99
|
+
.first || precise_y
|
71
100
|
y_index = output.index{ |y| y.first == matching_y }
|
72
101
|
new_row_content = input[precise_y].to_a
|
73
102
|
if y_index
|
@@ -81,77 +110,34 @@ class HocrTurtletext::Reader
|
|
81
110
|
output
|
82
111
|
end
|
83
112
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
def chunks_from_processed_ocr_line(ocr_line)
|
95
|
-
pos_info_line = add_positional_info_to_line(ocr_line)
|
96
|
-
sorted_pos_info_line = sort_words_in_line(pos_info_line)
|
97
|
-
concat_words_in_line(sorted_pos_info_line)
|
98
|
-
end
|
99
|
-
|
100
|
-
def add_positional_info_to_line(ocr_line)
|
101
|
-
ocr_line.css('span.ocrx_word, span.ocr_word')
|
102
|
-
.reject { |word| word.text.strip.empty? }
|
103
|
-
.map do |word|
|
104
|
-
word_attributes = word.attributes['title'].value.to_s
|
105
|
-
.delete(';').split(' ')
|
106
|
-
info(word, word_attributes)
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def sort_words_in_line(pos_info_line)
|
111
|
-
# sort word by x value, concat if x2.x_start - x1.x_end < some_x_threshold
|
112
|
-
pos_info_line.sort_by { |word| word[:x_start] }
|
113
|
-
pos_info_line.slice_when do |x, y|
|
114
|
-
y[:x_start] - x[:x_end] > x_whitespace_threshold
|
115
|
-
end.to_a
|
116
|
-
end
|
117
|
-
|
118
|
-
def concat_words_in_line(sorted_pos_info_line)
|
119
|
-
chunks = []
|
120
|
-
# merge all words in each chunk
|
121
|
-
sorted_pos_info_line.each do |chunk|
|
122
|
-
sentence = nil
|
123
|
-
chunk.each do |word|
|
124
|
-
if sentence.nil?
|
125
|
-
sentence = word
|
113
|
+
def concat_words_in_lines(fuzzed_y)
|
114
|
+
fuzzed_y.map do |line|
|
115
|
+
x_pos_keyed_words = line[1]
|
116
|
+
concatenated_words = []
|
117
|
+
x_pos_keyed_words.each do |x_pos_keyed_word|
|
118
|
+
word_hash = x_pos_keyed_word[1]
|
119
|
+
if concatenated_words.empty? ||
|
120
|
+
word_hash[:x_start] - concatenated_words.last[:x_end] > x_whitespace_threshold
|
121
|
+
concatenated_words.push word_hash
|
126
122
|
else
|
127
|
-
|
128
|
-
|
123
|
+
concatenated_words.last[:word] = "#{concatenated_words.last[:word]} #{word_hash[:word]}"
|
124
|
+
concatenated_words.last[:x_end] = word_hash[:x_end]
|
129
125
|
end
|
130
126
|
end
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
def to_pos_hash(lines)
|
137
|
-
lines.sort_by { |line| line[:y_start] }
|
138
|
-
|
139
|
-
pos_hash = {}
|
140
|
-
lines.each do |run|
|
141
|
-
pos_hash[run[:y_start]] ||= {}
|
142
|
-
pos_hash[run[:y_start]][run[:x_start]] ||= ''
|
143
|
-
pos_hash[run[:y_start]][run[:x_start]] << run[:word]
|
127
|
+
line[1] = concatenated_words.map! do |word_hash|
|
128
|
+
[word_hash[:x_start], word_hash[:word]]
|
129
|
+
end
|
130
|
+
line
|
144
131
|
end
|
145
|
-
pos_hash
|
146
132
|
end
|
147
133
|
|
148
|
-
def
|
134
|
+
def word_info(word, data)
|
149
135
|
{
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
136
|
+
word: word.text,
|
137
|
+
x_start: data[1].to_i,
|
138
|
+
y_start: data[2].to_i,
|
139
|
+
x_end: data[3].to_i,
|
140
|
+
y_end: data[4].to_i
|
155
141
|
}
|
156
142
|
end
|
157
143
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hocr_turtletext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sue Zheng Hao
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|