rhocr 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +0 -1
- data/Rakefile +1 -1
- data/lib/hocr_box.rb +1 -3
- data/lib/ocr_document.rb +2 -1
- data/lib/ocr_element.rb +16 -4
- data/lib/ocr_page.rb +7 -16
- data/lib/rhocr.rb +4 -17
- data/rhocr.gemspec +3 -3
- data/spec/hocr_box_spec.rb +19 -1
- data/spec/ocr_element_spec.rb +24 -1
- data/spec/ocr_page_spec.rb +10 -25
- data/spec/rhocr_spec.rb +4 -3
- metadata +5 -5
data/Manifest
CHANGED
data/Rakefile
CHANGED
|
@@ -2,7 +2,7 @@ require 'rubygems'
|
|
|
2
2
|
require 'rake'
|
|
3
3
|
require 'echoe'
|
|
4
4
|
|
|
5
|
-
Echoe.new('rhocr', '0.1.1') do |p|
|
|
5
|
+
Echoe.new('rhocr', '0.1.3') do |p|
|
|
6
6
|
p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
|
|
7
7
|
p.url = "http://github.com/daandi/rhocr"
|
|
8
8
|
p.author = "Andreas Neumann"
|
data/lib/hocr_box.rb
CHANGED
|
@@ -2,15 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
class HOCRBox
|
|
4
4
|
|
|
5
|
-
attr_reader :left, :top, :right, :
|
|
5
|
+
attr_reader :left, :top, :right, :width, :height, :bottom, :coordinates
|
|
6
6
|
def initialize(* coordinates)
|
|
7
7
|
|
|
8
8
|
@left, @top, @right, @bottom = coordinates.flatten.collect { |x| x.to_i}
|
|
9
9
|
|
|
10
10
|
@height = @bottom - @top
|
|
11
11
|
@width = @right - @left
|
|
12
|
-
@upper_left = [ @left, @top]
|
|
13
|
-
@lower_rigth = [ @right, @bottom ]
|
|
14
12
|
@coordinates = [ @left, @top,@right, @bottom ]
|
|
15
13
|
|
|
16
14
|
if left > right || top > bottom then
|
data/lib/ocr_document.rb
CHANGED
data/lib/ocr_element.rb
CHANGED
|
@@ -86,16 +86,28 @@ class OCRElement < HOCRBox
|
|
|
86
86
|
"<span style='color: #{color}'>#{to_s}</span>"
|
|
87
87
|
end
|
|
88
88
|
|
|
89
|
-
def
|
|
89
|
+
def css_class_string
|
|
90
|
+
if @features.empty?
|
|
91
|
+
"#{@ocr_class}"
|
|
92
|
+
else
|
|
93
|
+
"#{@ocr_class}-#{features_to_css_class}"
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def to_image_html(dipslay_class =css_class_string)
|
|
90
98
|
children_html = @children.map {|c| c.to_image_html}.join("")
|
|
91
99
|
"<span class='#{ dipslay_class }' style='#{ to_css_style }' ></span>#{ children_html }"
|
|
92
100
|
end
|
|
93
101
|
|
|
94
|
-
def to_html( display_class =
|
|
102
|
+
def to_html( display_class = css_class_string, style = nil )
|
|
95
103
|
children_html = @children.map {|c| c.to_html}.join("")
|
|
96
104
|
"<span class='#{ display_class }'> #{ children_html } </span>"
|
|
97
105
|
end
|
|
98
106
|
|
|
107
|
+
def features_to_css_class
|
|
108
|
+
@features.uniq.sort.join('_')
|
|
109
|
+
end
|
|
110
|
+
|
|
99
111
|
end
|
|
100
112
|
|
|
101
113
|
class OCRWord < OCRElement
|
|
@@ -109,11 +121,11 @@ class OCRWord < OCRElement
|
|
|
109
121
|
end
|
|
110
122
|
|
|
111
123
|
def to_image_html
|
|
112
|
-
"<span class='#{
|
|
124
|
+
"<span class='#{ css_class_string}' style='#{ to_css_style }'>#{ text }</span>"
|
|
113
125
|
end
|
|
114
126
|
|
|
115
127
|
def to_html
|
|
116
|
-
"<span class='#{
|
|
128
|
+
"<span class='#{ css_class_string }'>#{ text }</span>"
|
|
117
129
|
end
|
|
118
130
|
|
|
119
131
|
end
|
data/lib/ocr_page.rb
CHANGED
|
@@ -52,18 +52,6 @@ class OCRPage < OCRElement
|
|
|
52
52
|
end
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
-
#deprecated
|
|
56
|
-
def lines
|
|
57
|
-
unless @lines then
|
|
58
|
-
@lines = []
|
|
59
|
-
|
|
60
|
-
each_line do |line|
|
|
61
|
-
@lines << line
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
end
|
|
65
|
-
@lines
|
|
66
|
-
end
|
|
67
55
|
|
|
68
56
|
def extract_bbox_ppageno( ocr_html_text_fragment )
|
|
69
57
|
bbox, ppageno = ocr_html_text_fragment.split(';')
|
|
@@ -77,7 +65,7 @@ class OCRPage < OCRElement
|
|
|
77
65
|
end
|
|
78
66
|
|
|
79
67
|
def to_text
|
|
80
|
-
|
|
68
|
+
Enumerator.new(self,:each_line).map {|line| line.to_text}.join("\n")
|
|
81
69
|
end
|
|
82
70
|
|
|
83
71
|
def to_image_html(dipslay_class = @ocr_class)
|
|
@@ -86,13 +74,16 @@ class OCRPage < OCRElement
|
|
|
86
74
|
end
|
|
87
75
|
|
|
88
76
|
def enclosed_words(ocr_box)
|
|
89
|
-
|
|
77
|
+
enum = Enumerator.new(self,:each_enclosed_word,ocr_box)
|
|
78
|
+
enum.inject([]) { |acc,w| acc << w}
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def each_enclosed_word(ocr_box)
|
|
90
82
|
each_word do |w|
|
|
91
83
|
if w.enclosed_by? ocr_box then
|
|
92
|
-
|
|
84
|
+
yield w
|
|
93
85
|
end
|
|
94
86
|
end
|
|
95
|
-
a
|
|
96
87
|
end
|
|
97
88
|
|
|
98
89
|
end
|
data/lib/rhocr.rb
CHANGED
|
@@ -3,29 +3,16 @@
|
|
|
3
3
|
require_relative "ocr_document"
|
|
4
4
|
class RHOCR < OCRDocument
|
|
5
5
|
|
|
6
|
-
attr_reader :words, :lines
|
|
7
|
-
|
|
8
6
|
def add_folder(path)
|
|
9
7
|
add_files Dir[path]
|
|
10
|
-
compute_lines
|
|
11
|
-
compute_words
|
|
12
|
-
self
|
|
13
8
|
end
|
|
14
9
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@words = []
|
|
18
|
-
each_word do |w|
|
|
19
|
-
@words << w
|
|
20
|
-
end
|
|
10
|
+
def words
|
|
11
|
+
Enumerator.new(self,:each_word)
|
|
21
12
|
end
|
|
22
13
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@lines = []
|
|
26
|
-
each_line do |l|
|
|
27
|
-
@lines << l
|
|
28
|
-
end
|
|
14
|
+
def lines
|
|
15
|
+
Enumerator.new(self,:each_line)
|
|
29
16
|
end
|
|
30
17
|
|
|
31
18
|
end
|
data/rhocr.gemspec
CHANGED
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = %q{rhocr}
|
|
5
|
-
s.version = "0.1.1"
|
|
5
|
+
s.version = "0.1.3"
|
|
6
6
|
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
|
8
8
|
s.authors = [%q{Andreas Neumann}]
|
|
9
|
-
s.date = %q{2011-
|
|
9
|
+
s.date = %q{2011-10-10}
|
|
10
10
|
s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
|
11
11
|
s.email = %q{andreas@neumann.biz}
|
|
12
12
|
s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
|
|
13
|
-
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{
|
|
13
|
+
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}, %q{rhocr.gemspec}]
|
|
14
14
|
s.homepage = %q{http://github.com/daandi/rhocr}
|
|
15
15
|
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
|
|
16
16
|
s.require_paths = [%q{lib}]
|
data/spec/hocr_box_spec.rb
CHANGED
|
@@ -12,6 +12,24 @@ describe HOCRBox do
|
|
|
12
12
|
it 'should have coordinates' do
|
|
13
13
|
@box.coordinates.should == [1,2,20,8]
|
|
14
14
|
end
|
|
15
|
+
it 'should have #left' do
|
|
16
|
+
@box.left.should == 1
|
|
17
|
+
end
|
|
18
|
+
it 'should have #right' do
|
|
19
|
+
@box.right.should == 20
|
|
20
|
+
end
|
|
21
|
+
it 'should have #top' do
|
|
22
|
+
@box.top.should == 2
|
|
23
|
+
end
|
|
24
|
+
it 'should have #bottom' do
|
|
25
|
+
@box.bottom.should == 8
|
|
26
|
+
end
|
|
27
|
+
it 'should have height' do
|
|
28
|
+
@box.height == 7
|
|
29
|
+
end
|
|
30
|
+
it 'should have width' do
|
|
31
|
+
@box.width.should == 19
|
|
32
|
+
end
|
|
15
33
|
end
|
|
16
34
|
|
|
17
35
|
describe "#to_s" do
|
|
@@ -91,7 +109,7 @@ describe HOCRBox do
|
|
|
91
109
|
end
|
|
92
110
|
|
|
93
111
|
describe '#top_distance_to(element)' do
|
|
94
|
-
it 'box should be
|
|
112
|
+
it 'box should be 9px below of element' do
|
|
95
113
|
HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
|
|
96
114
|
end
|
|
97
115
|
end
|
data/spec/ocr_element_spec.rb
CHANGED
|
@@ -16,7 +16,7 @@ describe OCRElement do
|
|
|
16
16
|
describe '#initialize and Object' do
|
|
17
17
|
it 'should create an element given ocr_class, children and coordiantes' do
|
|
18
18
|
test_element = OCRElement.new('test', [], %w{10, 11, 20, 21})
|
|
19
|
-
test_element.children.should
|
|
19
|
+
test_element.children.should be_empty
|
|
20
20
|
test_element.coordinates.should == [10,11,20,21]
|
|
21
21
|
test_element.ocr_class.should == 'test'
|
|
22
22
|
end
|
|
@@ -63,6 +63,29 @@ describe OCRElement do
|
|
|
63
63
|
it 'should have a #to_html method' do
|
|
64
64
|
@ocr_element.to_html.should == "<span class='ocr_par'> <span class='ocr_line'> <span class='ocrx_word'>Athenobius,</span><span class='ocrx_word'>Der</span><span class='ocrx_word'>von</span><span class='ocrx_word'>der</span><span class='ocrx_word'>Göttin</span><span class='ocrx_word'>Minerva</span><span class='ocrx_word'>lebt,</span><span class='ocrx_word'>oder:</span><span class='ocrx_word'>Mi»</span> </span><span class='ocr_line'> <span class='ocrx_word'>nerva</span><span class='ocrx_word'>Bogen.</span> </span> </span>"
|
|
65
65
|
end
|
|
66
|
+
|
|
67
|
+
it 'should add features to css class' do
|
|
68
|
+
@ocr_element.lines[0].words[5].features << :test
|
|
69
|
+
@ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word-test'>Minerva</span>"
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'should transform features to an css class #features_to_css_class' do
|
|
73
|
+
test_elem = @ocr_element.lines[0]
|
|
74
|
+
test_elem.features_to_css_class.should == ''
|
|
75
|
+
test_elem.features << :test
|
|
76
|
+
test_elem.features << :stuff
|
|
77
|
+
test_elem.features_to_css_class.should == 'stuff_test'
|
|
78
|
+
test_elem.features << :test
|
|
79
|
+
test_elem.features_to_css_class.should == 'stuff_test'
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
it 'should use plain css class if elemetn has no features #css_class_string' do
|
|
83
|
+
test = @ocr_element.lines[1]
|
|
84
|
+
test.css_class_string.should == 'ocr_line'
|
|
85
|
+
test.features << :abc
|
|
86
|
+
test.css_class_string.should == 'ocr_line-abc'
|
|
87
|
+
end
|
|
88
|
+
|
|
66
89
|
end
|
|
67
90
|
|
|
68
91
|
describe 'ocr_line' do
|
data/spec/ocr_page_spec.rb
CHANGED
|
@@ -46,41 +46,24 @@ describe OCRPage do
|
|
|
46
46
|
|
|
47
47
|
describe '#Iterators' do
|
|
48
48
|
it 'should have a block iterator #each_block' do
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
a << block
|
|
52
|
-
end
|
|
53
|
-
a.length.should == 1
|
|
49
|
+
Enumerator.new(@test_page,:each_block).
|
|
50
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 1
|
|
54
51
|
end
|
|
55
52
|
it 'should have a paragraph iterator #each_paragraph' do
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
a << paragraph
|
|
59
|
-
end
|
|
60
|
-
a.length.should == 12
|
|
53
|
+
Enumerator.new(@test_page,:each_paragraph).
|
|
54
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 12
|
|
61
55
|
end
|
|
62
56
|
it 'should have a line iterator #each_line' do
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
a << line
|
|
66
|
-
end
|
|
67
|
-
a.length.should == 45
|
|
57
|
+
Enumerator.new(@test_page,:each_line).
|
|
58
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 45
|
|
68
59
|
end
|
|
69
60
|
it 'should have a word iterator #each_word' do
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
a << word
|
|
73
|
-
end
|
|
74
|
-
a.length.should == 415
|
|
61
|
+
Enumerator.new(@test_page,:each_word).
|
|
62
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 415
|
|
75
63
|
end
|
|
76
64
|
|
|
77
65
|
end
|
|
78
66
|
|
|
79
|
-
describe 'convinience methods' do
|
|
80
|
-
it 'should have a method #lines' do
|
|
81
|
-
@test_page.lines[5].children.length.should == 11
|
|
82
|
-
end
|
|
83
|
-
end
|
|
84
67
|
|
|
85
68
|
describe ' display methods' do
|
|
86
69
|
|
|
@@ -111,6 +94,8 @@ describe OCRPage do
|
|
|
111
94
|
words = @output_page.enclosed_words( HOCRBox.new 0,0, 300,300 )
|
|
112
95
|
words.length.should == 6
|
|
113
96
|
end
|
|
97
|
+
|
|
98
|
+
it 'should have an #enclosed_word method to iterate over ewords enclosed by given box'
|
|
114
99
|
end
|
|
115
100
|
|
|
116
101
|
end
|
data/spec/rhocr_spec.rb
CHANGED
|
@@ -13,16 +13,17 @@ describe RHOCR do
|
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
before(:each) do
|
|
16
|
-
@rhocr_doc ||= RHOCR.new
|
|
16
|
+
@rhocr_doc ||= RHOCR.new
|
|
17
|
+
@rhocr_doc.add_folder "data/*.html"
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
describe 'methods to iterate' do
|
|
20
21
|
it 'should have lines' do
|
|
21
|
-
@rhocr_doc.lines.length.should == 237
|
|
22
|
+
@rhocr_doc.lines.inject([]){|acc,l| acc <<l}.length.should == 237
|
|
22
23
|
end
|
|
23
24
|
|
|
24
25
|
it 'should have words' do
|
|
25
|
-
@rhocr_doc.words.length.should == 2071
|
|
26
|
+
@rhocr_doc.words.inject([]){|acc,w| acc <<w}.length.should == 2071
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
it 'should support common iterator methods throug enumerable for word an line arrays' do
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rhocr
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.1.3
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2011-
|
|
12
|
+
date: 2011-10-10 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: nokogiri
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &70118176813440 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
@@ -21,7 +21,7 @@ dependencies:
|
|
|
21
21
|
version: '0'
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *70118176813440
|
|
25
25
|
description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
|
26
26
|
email: andreas@neumann.biz
|
|
27
27
|
executables: []
|
|
@@ -53,13 +53,13 @@ files:
|
|
|
53
53
|
- lib/ocr_element.rb
|
|
54
54
|
- lib/ocr_page.rb
|
|
55
55
|
- lib/rhocr.rb
|
|
56
|
-
- rhocr.gemspec
|
|
57
56
|
- spec/hocr_box_spec.rb
|
|
58
57
|
- spec/ocr_document_spec.rb
|
|
59
58
|
- spec/ocr_element_spec.rb
|
|
60
59
|
- spec/ocr_page_spec.rb
|
|
61
60
|
- spec/rhocr_spec.rb
|
|
62
61
|
- test.html
|
|
62
|
+
- rhocr.gemspec
|
|
63
63
|
homepage: http://github.com/daandi/rhocr
|
|
64
64
|
licenses: []
|
|
65
65
|
post_install_message:
|