rhocr 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +0 -1
- data/Rakefile +1 -1
- data/lib/hocr_box.rb +1 -3
- data/lib/ocr_document.rb +2 -1
- data/lib/ocr_element.rb +16 -4
- data/lib/ocr_page.rb +7 -16
- data/lib/rhocr.rb +4 -17
- data/rhocr.gemspec +3 -3
- data/spec/hocr_box_spec.rb +19 -1
- data/spec/ocr_element_spec.rb +24 -1
- data/spec/ocr_page_spec.rb +10 -25
- data/spec/rhocr_spec.rb +4 -3
- metadata +5 -5
data/Manifest
CHANGED
data/Rakefile
CHANGED
@@ -2,7 +2,7 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('rhocr', '0.1.1') do |p|
|
5
|
+
Echoe.new('rhocr', '0.1.3') do |p|
|
6
6
|
p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
|
7
7
|
p.url = "http://github.com/daandi/rhocr"
|
8
8
|
p.author = "Andreas Neumann"
|
data/lib/hocr_box.rb
CHANGED
@@ -2,15 +2,13 @@
|
|
2
2
|
|
3
3
|
class HOCRBox
|
4
4
|
|
5
|
-
attr_reader :left, :top, :right, :
|
5
|
+
attr_reader :left, :top, :right, :width, :height, :bottom, :coordinates
|
6
6
|
def initialize(* coordinates)
|
7
7
|
|
8
8
|
@left, @top, @right, @bottom = coordinates.flatten.collect { |x| x.to_i}
|
9
9
|
|
10
10
|
@height = @bottom - @top
|
11
11
|
@width = @right - @left
|
12
|
-
@upper_left = [ @left, @top]
|
13
|
-
@lower_rigth = [ @right, @bottom ]
|
14
12
|
@coordinates = [ @left, @top,@right, @bottom ]
|
15
13
|
|
16
14
|
if left > right || top > bottom then
|
data/lib/ocr_document.rb
CHANGED
data/lib/ocr_element.rb
CHANGED
@@ -86,16 +86,28 @@ class OCRElement < HOCRBox
|
|
86
86
|
"<span style='color: #{color}'>#{to_s}</span>"
|
87
87
|
end
|
88
88
|
|
89
|
-
def
|
89
|
+
def css_class_string
|
90
|
+
if @features.empty?
|
91
|
+
"#{@ocr_class}"
|
92
|
+
else
|
93
|
+
"#{@ocr_class}-#{features_to_css_class}"
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def to_image_html(dipslay_class =css_class_string)
|
90
98
|
children_html = @children.map {|c| c.to_image_html}.join("")
|
91
99
|
"<span class='#{ dipslay_class }' style='#{ to_css_style }' ></span>#{ children_html }"
|
92
100
|
end
|
93
101
|
|
94
|
-
def to_html( display_class =
|
102
|
+
def to_html( display_class = css_class_string, style = nil )
|
95
103
|
children_html = @children.map {|c| c.to_html}.join("")
|
96
104
|
"<span class='#{ display_class }'> #{ children_html } </span>"
|
97
105
|
end
|
98
106
|
|
107
|
+
def features_to_css_class
|
108
|
+
@features.uniq.sort.join('_')
|
109
|
+
end
|
110
|
+
|
99
111
|
end
|
100
112
|
|
101
113
|
class OCRWord < OCRElement
|
@@ -109,11 +121,11 @@ class OCRWord < OCRElement
|
|
109
121
|
end
|
110
122
|
|
111
123
|
def to_image_html
|
112
|
-
"<span class='#{
|
124
|
+
"<span class='#{ css_class_string}' style='#{ to_css_style }'>#{ text }</span>"
|
113
125
|
end
|
114
126
|
|
115
127
|
def to_html
|
116
|
-
"<span class='#{
|
128
|
+
"<span class='#{ css_class_string }'>#{ text }</span>"
|
117
129
|
end
|
118
130
|
|
119
131
|
end
|
data/lib/ocr_page.rb
CHANGED
@@ -52,18 +52,6 @@ class OCRPage < OCRElement
|
|
52
52
|
end
|
53
53
|
end
|
54
54
|
|
55
|
-
#deprecated
|
56
|
-
def lines
|
57
|
-
unless @lines then
|
58
|
-
@lines = []
|
59
|
-
|
60
|
-
each_line do |line|
|
61
|
-
@lines << line
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
@lines
|
66
|
-
end
|
67
55
|
|
68
56
|
def extract_bbox_ppageno( ocr_html_text_fragment )
|
69
57
|
bbox, ppageno = ocr_html_text_fragment.split(';')
|
@@ -77,7 +65,7 @@ class OCRPage < OCRElement
|
|
77
65
|
end
|
78
66
|
|
79
67
|
def to_text
|
80
|
-
|
68
|
+
Enumerator.new(self,:each_line).map {|line| line.to_text}.join("\n")
|
81
69
|
end
|
82
70
|
|
83
71
|
def to_image_html(dipslay_class = @ocr_class)
|
@@ -86,13 +74,16 @@ class OCRPage < OCRElement
|
|
86
74
|
end
|
87
75
|
|
88
76
|
def enclosed_words(ocr_box)
|
89
|
-
|
77
|
+
enum = Enumerator.new(self,:each_enclosed_word,ocr_box)
|
78
|
+
enum.inject([]) { |acc,w| acc << w}
|
79
|
+
end
|
80
|
+
|
81
|
+
def each_enclosed_word(ocr_box)
|
90
82
|
each_word do |w|
|
91
83
|
if w.enclosed_by? ocr_box then
|
92
|
-
|
84
|
+
yield w
|
93
85
|
end
|
94
86
|
end
|
95
|
-
a
|
96
87
|
end
|
97
88
|
|
98
89
|
end
|
data/lib/rhocr.rb
CHANGED
@@ -3,29 +3,16 @@
|
|
3
3
|
require_relative "ocr_document"
|
4
4
|
class RHOCR < OCRDocument
|
5
5
|
|
6
|
-
attr_reader :words, :lines
|
7
|
-
|
8
6
|
def add_folder(path)
|
9
7
|
add_files Dir[path]
|
10
|
-
compute_lines
|
11
|
-
compute_words
|
12
|
-
self
|
13
8
|
end
|
14
9
|
|
15
|
-
|
16
|
-
|
17
|
-
@words = []
|
18
|
-
each_word do |w|
|
19
|
-
@words << w
|
20
|
-
end
|
10
|
+
def words
|
11
|
+
Enumerator.new(self,:each_word)
|
21
12
|
end
|
22
13
|
|
23
|
-
|
24
|
-
|
25
|
-
@lines = []
|
26
|
-
each_line do |l|
|
27
|
-
@lines << l
|
28
|
-
end
|
14
|
+
def lines
|
15
|
+
Enumerator.new(self,:each_line)
|
29
16
|
end
|
30
17
|
|
31
18
|
end
|
data/rhocr.gemspec
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{rhocr}
|
5
|
-
s.version = "0.1.1"
|
5
|
+
s.version = "0.1.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = [%q{Andreas Neumann}]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-10-10}
|
10
10
|
s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
11
11
|
s.email = %q{andreas@neumann.biz}
|
12
12
|
s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
|
13
|
-
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{
|
13
|
+
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}, %q{rhocr.gemspec}]
|
14
14
|
s.homepage = %q{http://github.com/daandi/rhocr}
|
15
15
|
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
|
16
16
|
s.require_paths = [%q{lib}]
|
data/spec/hocr_box_spec.rb
CHANGED
@@ -12,6 +12,24 @@ describe HOCRBox do
|
|
12
12
|
it 'should have coordinates' do
|
13
13
|
@box.coordinates.should == [1,2,20,8]
|
14
14
|
end
|
15
|
+
it 'should have #left' do
|
16
|
+
@box.left.should == 1
|
17
|
+
end
|
18
|
+
it 'should have #right' do
|
19
|
+
@box.right.should == 20
|
20
|
+
end
|
21
|
+
it 'should have #top' do
|
22
|
+
@box.top.should == 2
|
23
|
+
end
|
24
|
+
it 'should have #bottom' do
|
25
|
+
@box.bottom.should == 8
|
26
|
+
end
|
27
|
+
it 'should have height' do
|
28
|
+
@box.height == 7
|
29
|
+
end
|
30
|
+
it 'should have width' do
|
31
|
+
@box.width.should == 19
|
32
|
+
end
|
15
33
|
end
|
16
34
|
|
17
35
|
describe "#to_s" do
|
@@ -91,7 +109,7 @@ describe HOCRBox do
|
|
91
109
|
end
|
92
110
|
|
93
111
|
describe '#top_distance_to(element)' do
|
94
|
-
it 'box should be
|
112
|
+
it 'box should be 9px below of element' do
|
95
113
|
HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
|
96
114
|
end
|
97
115
|
end
|
data/spec/ocr_element_spec.rb
CHANGED
@@ -16,7 +16,7 @@ describe OCRElement do
|
|
16
16
|
describe '#initialize and Object' do
|
17
17
|
it 'should create an element given ocr_class, children and coordiantes' do
|
18
18
|
test_element = OCRElement.new('test', [], %w{10, 11, 20, 21})
|
19
|
-
test_element.children.should
|
19
|
+
test_element.children.should be_empty
|
20
20
|
test_element.coordinates.should == [10,11,20,21]
|
21
21
|
test_element.ocr_class.should == 'test'
|
22
22
|
end
|
@@ -63,6 +63,29 @@ describe OCRElement do
|
|
63
63
|
it 'should have a #to_html method' do
|
64
64
|
@ocr_element.to_html.should == "<span class='ocr_par'> <span class='ocr_line'> <span class='ocrx_word'>Athenobius,</span><span class='ocrx_word'>Der</span><span class='ocrx_word'>von</span><span class='ocrx_word'>der</span><span class='ocrx_word'>Göttin</span><span class='ocrx_word'>Minerva</span><span class='ocrx_word'>lebt,</span><span class='ocrx_word'>oder:</span><span class='ocrx_word'>Mi»</span> </span><span class='ocr_line'> <span class='ocrx_word'>nerva</span><span class='ocrx_word'>Bogen.</span> </span> </span>"
|
65
65
|
end
|
66
|
+
|
67
|
+
it 'should add features to css class' do
|
68
|
+
@ocr_element.lines[0].words[5].features << :test
|
69
|
+
@ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word-test'>Minerva</span>"
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'should transform features to an css class #features_to_css_class' do
|
73
|
+
test_elem = @ocr_element.lines[0]
|
74
|
+
test_elem.features_to_css_class.should == ''
|
75
|
+
test_elem.features << :test
|
76
|
+
test_elem.features << :stuff
|
77
|
+
test_elem.features_to_css_class.should == 'stuff_test'
|
78
|
+
test_elem.features << :test
|
79
|
+
test_elem.features_to_css_class.should == 'stuff_test'
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'should use plain css class if elemetn has no features #css_class_string' do
|
83
|
+
test = @ocr_element.lines[1]
|
84
|
+
test.css_class_string.should == 'ocr_line'
|
85
|
+
test.features << :abc
|
86
|
+
test.css_class_string.should == 'ocr_line-abc'
|
87
|
+
end
|
88
|
+
|
66
89
|
end
|
67
90
|
|
68
91
|
describe 'ocr_line' do
|
data/spec/ocr_page_spec.rb
CHANGED
@@ -46,41 +46,24 @@ describe OCRPage do
|
|
46
46
|
|
47
47
|
describe '#Iterators' do
|
48
48
|
it 'should have a block iterator #each_block' do
|
49
|
-
|
50
|
-
|
51
|
-
a << block
|
52
|
-
end
|
53
|
-
a.length.should == 1
|
49
|
+
Enumerator.new(@test_page,:each_block).
|
50
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 1
|
54
51
|
end
|
55
52
|
it 'should have a paragraph iterator #each_paragraph' do
|
56
|
-
|
57
|
-
|
58
|
-
a << paragraph
|
59
|
-
end
|
60
|
-
a.length.should == 12
|
53
|
+
Enumerator.new(@test_page,:each_paragraph).
|
54
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 12
|
61
55
|
end
|
62
56
|
it 'should have a line iterator #each_line' do
|
63
|
-
|
64
|
-
|
65
|
-
a << line
|
66
|
-
end
|
67
|
-
a.length.should == 45
|
57
|
+
Enumerator.new(@test_page,:each_line).
|
58
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 45
|
68
59
|
end
|
69
60
|
it 'should have a word iterator #each_word' do
|
70
|
-
|
71
|
-
|
72
|
-
a << word
|
73
|
-
end
|
74
|
-
a.length.should == 415
|
61
|
+
Enumerator.new(@test_page,:each_word).
|
62
|
+
inject([]) {|acc,elem| acc << elem}.length.should == 415
|
75
63
|
end
|
76
64
|
|
77
65
|
end
|
78
66
|
|
79
|
-
describe 'convinience methods' do
|
80
|
-
it 'should have a method #lines' do
|
81
|
-
@test_page.lines[5].children.length.should == 11
|
82
|
-
end
|
83
|
-
end
|
84
67
|
|
85
68
|
describe ' display methods' do
|
86
69
|
|
@@ -111,6 +94,8 @@ describe OCRPage do
|
|
111
94
|
words = @output_page.enclosed_words( HOCRBox.new 0,0, 300,300 )
|
112
95
|
words.length.should == 6
|
113
96
|
end
|
97
|
+
|
98
|
+
it 'should have an #enclosed_word method to iterate over ewords enclosed by given box'
|
114
99
|
end
|
115
100
|
|
116
101
|
end
|
data/spec/rhocr_spec.rb
CHANGED
@@ -13,16 +13,17 @@ describe RHOCR do
|
|
13
13
|
end
|
14
14
|
|
15
15
|
before(:each) do
|
16
|
-
@rhocr_doc ||= RHOCR.new
|
16
|
+
@rhocr_doc ||= RHOCR.new
|
17
|
+
@rhocr_doc.add_folder "data/*.html"
|
17
18
|
end
|
18
19
|
|
19
20
|
describe 'methods to iterate' do
|
20
21
|
it 'should have lines' do
|
21
|
-
@rhocr_doc.lines.length.should == 237
|
22
|
+
@rhocr_doc.lines.inject([]){|acc,l| acc <<l}.length.should == 237
|
22
23
|
end
|
23
24
|
|
24
25
|
it 'should have words' do
|
25
|
-
@rhocr_doc.words.length.should == 2071
|
26
|
+
@rhocr_doc.words.inject([]){|acc,w| acc <<w}.length.should == 2071
|
26
27
|
end
|
27
28
|
|
28
29
|
it 'should support common iterator methods throug enumerable for word an line arrays' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-10-10 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70118176813440 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70118176813440
|
25
25
|
description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
26
26
|
email: andreas@neumann.biz
|
27
27
|
executables: []
|
@@ -53,13 +53,13 @@ files:
|
|
53
53
|
- lib/ocr_element.rb
|
54
54
|
- lib/ocr_page.rb
|
55
55
|
- lib/rhocr.rb
|
56
|
-
- rhocr.gemspec
|
57
56
|
- spec/hocr_box_spec.rb
|
58
57
|
- spec/ocr_document_spec.rb
|
59
58
|
- spec/ocr_element_spec.rb
|
60
59
|
- spec/ocr_page_spec.rb
|
61
60
|
- spec/rhocr_spec.rb
|
62
61
|
- test.html
|
62
|
+
- rhocr.gemspec
|
63
63
|
homepage: http://github.com/daandi/rhocr
|
64
64
|
licenses: []
|
65
65
|
post_install_message:
|