rhocr 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest CHANGED
@@ -17,7 +17,6 @@ lib/ocr_document.rb
17
17
  lib/ocr_element.rb
18
18
  lib/ocr_page.rb
19
19
  lib/rhocr.rb
20
- rhocr.gemspec
21
20
  spec/hocr_box_spec.rb
22
21
  spec/ocr_document_spec.rb
23
22
  spec/ocr_element_spec.rb
data/Rakefile CHANGED
@@ -2,7 +2,7 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('rhocr', '0.1.1') do |p|
5
+ Echoe.new('rhocr', '0.1.3') do |p|
6
6
  p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
7
7
  p.url = "http://github.com/daandi/rhocr"
8
8
  p.author = "Andreas Neumann"
data/lib/hocr_box.rb CHANGED
@@ -2,15 +2,13 @@
2
2
 
3
3
  class HOCRBox
4
4
 
5
- attr_reader :left, :top, :right, :bottom, :upper_left, :lower_right, :coordinates
5
+ attr_reader :left, :top, :right, :width, :height, :bottom, :coordinates
6
6
  def initialize(* coordinates)
7
7
 
8
8
  @left, @top, @right, @bottom = coordinates.flatten.collect { |x| x.to_i}
9
9
 
10
10
  @height = @bottom - @top
11
11
  @width = @right - @left
12
- @upper_left = [ @left, @top]
13
- @lower_rigth = [ @right, @bottom ]
14
12
  @coordinates = [ @left, @top,@right, @bottom ]
15
13
 
16
14
  if left > right || top > bottom then
data/lib/ocr_document.rb CHANGED
@@ -11,7 +11,8 @@ class OCRDocument
11
11
  end
12
12
 
13
13
  def add_pages( list_o_pages )
14
- for file in list_o_pages do
14
+ raise "no files given" if list_o_pages.empty?
15
+ list_o_pages.each do |file|
15
16
  add_page(file)
16
17
  end
17
18
  end
data/lib/ocr_element.rb CHANGED
@@ -86,16 +86,28 @@ class OCRElement < HOCRBox
86
86
  "<span style='color: #{color}'>#{to_s}</span>"
87
87
  end
88
88
 
89
- def to_image_html(dipslay_class = @ocr_class)
89
+ def css_class_string
90
+ if @features.empty?
91
+ "#{@ocr_class}"
92
+ else
93
+ "#{@ocr_class}-#{features_to_css_class}"
94
+ end
95
+ end
96
+
97
+ def to_image_html(dipslay_class =css_class_string)
90
98
  children_html = @children.map {|c| c.to_image_html}.join("")
91
99
  "<span class='#{ dipslay_class }' style='#{ to_css_style }' ></span>#{ children_html }"
92
100
  end
93
101
 
94
- def to_html( display_class = @ocr_class, style = nil )
102
+ def to_html( display_class = css_class_string, style = nil )
95
103
  children_html = @children.map {|c| c.to_html}.join("")
96
104
  "<span class='#{ display_class }'> #{ children_html } </span>"
97
105
  end
98
106
 
107
+ def features_to_css_class
108
+ @features.uniq.sort.join('_')
109
+ end
110
+
99
111
  end
100
112
 
101
113
  class OCRWord < OCRElement
@@ -109,11 +121,11 @@ class OCRWord < OCRElement
109
121
  end
110
122
 
111
123
  def to_image_html
112
- "<span class='#{ @ocr_class }' style='#{ to_css_style }'>#{ text }</span>"
124
+ "<span class='#{ css_class_string}' style='#{ to_css_style }'>#{ text }</span>"
113
125
  end
114
126
 
115
127
  def to_html
116
- "<span class='#{ @ocr_class }'>#{ text }</span>"
128
+ "<span class='#{ css_class_string }'>#{ text }</span>"
117
129
  end
118
130
 
119
131
  end
data/lib/ocr_page.rb CHANGED
@@ -52,18 +52,6 @@ class OCRPage < OCRElement
52
52
  end
53
53
  end
54
54
 
55
- #deprecated
56
- def lines
57
- unless @lines then
58
- @lines = []
59
-
60
- each_line do |line|
61
- @lines << line
62
- end
63
-
64
- end
65
- @lines
66
- end
67
55
 
68
56
  def extract_bbox_ppageno( ocr_html_text_fragment )
69
57
  bbox, ppageno = ocr_html_text_fragment.split(';')
@@ -77,7 +65,7 @@ class OCRPage < OCRElement
77
65
  end
78
66
 
79
67
  def to_text
80
- lines.map {|line| line.to_text}.join("\n")
68
+ Enumerator.new(self,:each_line).map {|line| line.to_text}.join("\n")
81
69
  end
82
70
 
83
71
  def to_image_html(dipslay_class = @ocr_class)
@@ -86,13 +74,16 @@ class OCRPage < OCRElement
86
74
  end
87
75
 
88
76
  def enclosed_words(ocr_box)
89
- a = []
77
+ enum = Enumerator.new(self,:each_enclosed_word,ocr_box)
78
+ enum.inject([]) { |acc,w| acc << w}
79
+ end
80
+
81
+ def each_enclosed_word(ocr_box)
90
82
  each_word do |w|
91
83
  if w.enclosed_by? ocr_box then
92
- a << w
84
+ yield w
93
85
  end
94
86
  end
95
- a
96
87
  end
97
88
 
98
89
  end
data/lib/rhocr.rb CHANGED
@@ -3,29 +3,16 @@
3
3
  require_relative "ocr_document"
4
4
  class RHOCR < OCRDocument
5
5
 
6
- attr_reader :words, :lines
7
-
8
6
  def add_folder(path)
9
7
  add_files Dir[path]
10
- compute_lines
11
- compute_words
12
- self
13
8
  end
14
9
 
15
- #should be called if new pages are added
16
- def compute_words
17
- @words = []
18
- each_word do |w|
19
- @words << w
20
- end
10
+ def words
11
+ Enumerator.new(self,:each_word)
21
12
  end
22
13
 
23
- #should be called if new pages are added
24
- def compute_lines
25
- @lines = []
26
- each_line do |l|
27
- @lines << l
28
- end
14
+ def lines
15
+ Enumerator.new(self,:each_line)
29
16
  end
30
17
 
31
18
  end
data/rhocr.gemspec CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{rhocr}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Andreas Neumann}]
9
- s.date = %q{2011-09-30}
9
+ s.date = %q{2011-10-10}
10
10
  s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
11
11
  s.email = %q{andreas@neumann.biz}
12
12
  s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
- s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
13
+ s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}, %q{rhocr.gemspec}]
14
14
  s.homepage = %q{http://github.com/daandi/rhocr}
15
15
  s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
16
16
  s.require_paths = [%q{lib}]
@@ -12,6 +12,24 @@ describe HOCRBox do
12
12
  it 'should have coordinates' do
13
13
  @box.coordinates.should == [1,2,20,8]
14
14
  end
15
+ it 'should have #left' do
16
+ @box.left.should == 1
17
+ end
18
+ it 'should have #right' do
19
+ @box.right.should == 20
20
+ end
21
+ it 'should have #top' do
22
+ @box.top.should == 2
23
+ end
24
+ it 'should have #bottom' do
25
+ @box.bottom.should == 8
26
+ end
27
+ it 'should have height' do
28
+ @box.height == 7
29
+ end
30
+ it 'should have width' do
31
+ @box.width.should == 19
32
+ end
15
33
  end
16
34
 
17
35
  describe "#to_s" do
@@ -91,7 +109,7 @@ describe HOCRBox do
91
109
  end
92
110
 
93
111
  describe '#top_distance_to(element)' do
94
- it 'box should be 2px below of element' do
112
+ it 'box should be 9px below of element' do
95
113
  HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
96
114
  end
97
115
  end
@@ -16,7 +16,7 @@ describe OCRElement do
16
16
  describe '#initialize and Object' do
17
17
  it 'should create an element given ocr_class, children and coordiantes' do
18
18
  test_element = OCRElement.new('test', [], %w{10, 11, 20, 21})
19
- test_element.children.should == []
19
+ test_element.children.should be_empty
20
20
  test_element.coordinates.should == [10,11,20,21]
21
21
  test_element.ocr_class.should == 'test'
22
22
  end
@@ -63,6 +63,29 @@ describe OCRElement do
63
63
  it 'should have a #to_html method' do
64
64
  @ocr_element.to_html.should == "<span class='ocr_par'> <span class='ocr_line'> <span class='ocrx_word'>Athenobius,</span><span class='ocrx_word'>Der</span><span class='ocrx_word'>von</span><span class='ocrx_word'>der</span><span class='ocrx_word'>Göttin</span><span class='ocrx_word'>Minerva</span><span class='ocrx_word'>lebt,</span><span class='ocrx_word'>oder:</span><span class='ocrx_word'>Mi»</span> </span><span class='ocr_line'> <span class='ocrx_word'>nerva</span><span class='ocrx_word'>Bogen.</span> </span> </span>"
65
65
  end
66
+
67
+ it 'should add features to css class' do
68
+ @ocr_element.lines[0].words[5].features << :test
69
+ @ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word-test'>Minerva</span>"
70
+ end
71
+
72
+ it 'should transform features to an css class #features_to_css_class' do
73
+ test_elem = @ocr_element.lines[0]
74
+ test_elem.features_to_css_class.should == ''
75
+ test_elem.features << :test
76
+ test_elem.features << :stuff
77
+ test_elem.features_to_css_class.should == 'stuff_test'
78
+ test_elem.features << :test
79
+ test_elem.features_to_css_class.should == 'stuff_test'
80
+ end
81
+
82
+ it 'should use plain css class if elemetn has no features #css_class_string' do
83
+ test = @ocr_element.lines[1]
84
+ test.css_class_string.should == 'ocr_line'
85
+ test.features << :abc
86
+ test.css_class_string.should == 'ocr_line-abc'
87
+ end
88
+
66
89
  end
67
90
 
68
91
  describe 'ocr_line' do
@@ -46,41 +46,24 @@ describe OCRPage do
46
46
 
47
47
  describe '#Iterators' do
48
48
  it 'should have a block iterator #each_block' do
49
- a = []
50
- @test_page.each_block do |block|
51
- a << block
52
- end
53
- a.length.should == 1
49
+ Enumerator.new(@test_page,:each_block).
50
+ inject([]) {|acc,elem| acc << elem}.length.should == 1
54
51
  end
55
52
  it 'should have a paragraph iterator #each_paragraph' do
56
- a = []
57
- @test_page.each_paragraph do |paragraph|
58
- a << paragraph
59
- end
60
- a.length.should == 12
53
+ Enumerator.new(@test_page,:each_paragraph).
54
+ inject([]) {|acc,elem| acc << elem}.length.should == 12
61
55
  end
62
56
  it 'should have a line iterator #each_line' do
63
- a = []
64
- @test_page.each_line do |line|
65
- a << line
66
- end
67
- a.length.should == 45
57
+ Enumerator.new(@test_page,:each_line).
58
+ inject([]) {|acc,elem| acc << elem}.length.should == 45
68
59
  end
69
60
  it 'should have a word iterator #each_word' do
70
- a = []
71
- @test_page.each_word do |word|
72
- a << word
73
- end
74
- a.length.should == 415
61
+ Enumerator.new(@test_page,:each_word).
62
+ inject([]) {|acc,elem| acc << elem}.length.should == 415
75
63
  end
76
64
 
77
65
  end
78
66
 
79
- describe 'convinience methods' do
80
- it 'should have a method #lines' do
81
- @test_page.lines[5].children.length.should == 11
82
- end
83
- end
84
67
 
85
68
  describe ' display methods' do
86
69
 
@@ -111,6 +94,8 @@ describe OCRPage do
111
94
  words = @output_page.enclosed_words( HOCRBox.new 0,0, 300,300 )
112
95
  words.length.should == 6
113
96
  end
97
+
98
+ it 'should have an #enclosed_word method to iterate over ewords enclosed by given box'
114
99
  end
115
100
 
116
101
  end
data/spec/rhocr_spec.rb CHANGED
@@ -13,16 +13,17 @@ describe RHOCR do
13
13
  end
14
14
 
15
15
  before(:each) do
16
- @rhocr_doc ||= RHOCR.new.add_folder "data/*.html"
16
+ @rhocr_doc ||= RHOCR.new
17
+ @rhocr_doc.add_folder "data/*.html"
17
18
  end
18
19
 
19
20
  describe 'methods to iterate' do
20
21
  it 'should have lines' do
21
- @rhocr_doc.lines.length.should == 237
22
+ @rhocr_doc.lines.inject([]){|acc,l| acc <<l}.length.should == 237
22
23
  end
23
24
 
24
25
  it 'should have words' do
25
- @rhocr_doc.words.length.should == 2071
26
+ @rhocr_doc.words.inject([]){|acc,w| acc <<w}.length.should == 2071
26
27
  end
27
28
 
28
29
  it 'should support common iterator methods throug enumerable for word an line arrays' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-30 00:00:00.000000000Z
12
+ date: 2011-10-10 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70233340332980 !ruby/object:Gem::Requirement
16
+ requirement: &70118176813440 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70233340332980
24
+ version_requirements: *70118176813440
25
25
  description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
26
26
  email: andreas@neumann.biz
27
27
  executables: []
@@ -53,13 +53,13 @@ files:
53
53
  - lib/ocr_element.rb
54
54
  - lib/ocr_page.rb
55
55
  - lib/rhocr.rb
56
- - rhocr.gemspec
57
56
  - spec/hocr_box_spec.rb
58
57
  - spec/ocr_document_spec.rb
59
58
  - spec/ocr_element_spec.rb
60
59
  - spec/ocr_page_spec.rb
61
60
  - spec/rhocr_spec.rb
62
61
  - test.html
62
+ - rhocr.gemspec
63
63
  homepage: http://github.com/daandi/rhocr
64
64
  licenses: []
65
65
  post_install_message: