rhocr 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest CHANGED
@@ -17,7 +17,6 @@ lib/ocr_document.rb
17
17
  lib/ocr_element.rb
18
18
  lib/ocr_page.rb
19
19
  lib/rhocr.rb
20
- rhocr.gemspec
21
20
  spec/hocr_box_spec.rb
22
21
  spec/ocr_document_spec.rb
23
22
  spec/ocr_element_spec.rb
data/Rakefile CHANGED
@@ -2,7 +2,7 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('rhocr', '0.1.1') do |p|
5
+ Echoe.new('rhocr', '0.1.3') do |p|
6
6
  p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
7
7
  p.url = "http://github.com/daandi/rhocr"
8
8
  p.author = "Andreas Neumann"
data/lib/hocr_box.rb CHANGED
@@ -2,15 +2,13 @@
2
2
 
3
3
  class HOCRBox
4
4
 
5
- attr_reader :left, :top, :right, :bottom, :upper_left, :lower_right, :coordinates
5
+ attr_reader :left, :top, :right, :width, :height, :bottom, :coordinates
6
6
  def initialize(* coordinates)
7
7
 
8
8
  @left, @top, @right, @bottom = coordinates.flatten.collect { |x| x.to_i}
9
9
 
10
10
  @height = @bottom - @top
11
11
  @width = @right - @left
12
- @upper_left = [ @left, @top]
13
- @lower_rigth = [ @right, @bottom ]
14
12
  @coordinates = [ @left, @top,@right, @bottom ]
15
13
 
16
14
  if left > right || top > bottom then
data/lib/ocr_document.rb CHANGED
@@ -11,7 +11,8 @@ class OCRDocument
11
11
  end
12
12
 
13
13
  def add_pages( list_o_pages )
14
- for file in list_o_pages do
14
+ raise "no files given" if list_o_pages.empty?
15
+ list_o_pages.each do |file|
15
16
  add_page(file)
16
17
  end
17
18
  end
data/lib/ocr_element.rb CHANGED
@@ -86,16 +86,28 @@ class OCRElement < HOCRBox
86
86
  "<span style='color: #{color}'>#{to_s}</span>"
87
87
  end
88
88
 
89
- def to_image_html(dipslay_class = @ocr_class)
89
+ def css_class_string
90
+ if @features.empty?
91
+ "#{@ocr_class}"
92
+ else
93
+ "#{@ocr_class}-#{features_to_css_class}"
94
+ end
95
+ end
96
+
97
+ def to_image_html(dipslay_class =css_class_string)
90
98
  children_html = @children.map {|c| c.to_image_html}.join("")
91
99
  "<span class='#{ dipslay_class }' style='#{ to_css_style }' ></span>#{ children_html }"
92
100
  end
93
101
 
94
- def to_html( display_class = @ocr_class, style = nil )
102
+ def to_html( display_class = css_class_string, style = nil )
95
103
  children_html = @children.map {|c| c.to_html}.join("")
96
104
  "<span class='#{ display_class }'> #{ children_html } </span>"
97
105
  end
98
106
 
107
+ def features_to_css_class
108
+ @features.uniq.sort.join('_')
109
+ end
110
+
99
111
  end
100
112
 
101
113
  class OCRWord < OCRElement
@@ -109,11 +121,11 @@ class OCRWord < OCRElement
109
121
  end
110
122
 
111
123
  def to_image_html
112
- "<span class='#{ @ocr_class }' style='#{ to_css_style }'>#{ text }</span>"
124
+ "<span class='#{ css_class_string}' style='#{ to_css_style }'>#{ text }</span>"
113
125
  end
114
126
 
115
127
  def to_html
116
- "<span class='#{ @ocr_class }'>#{ text }</span>"
128
+ "<span class='#{ css_class_string }'>#{ text }</span>"
117
129
  end
118
130
 
119
131
  end
data/lib/ocr_page.rb CHANGED
@@ -52,18 +52,6 @@ class OCRPage < OCRElement
52
52
  end
53
53
  end
54
54
 
55
- #deprecated
56
- def lines
57
- unless @lines then
58
- @lines = []
59
-
60
- each_line do |line|
61
- @lines << line
62
- end
63
-
64
- end
65
- @lines
66
- end
67
55
 
68
56
  def extract_bbox_ppageno( ocr_html_text_fragment )
69
57
  bbox, ppageno = ocr_html_text_fragment.split(';')
@@ -77,7 +65,7 @@ class OCRPage < OCRElement
77
65
  end
78
66
 
79
67
  def to_text
80
- lines.map {|line| line.to_text}.join("\n")
68
+ Enumerator.new(self,:each_line).map {|line| line.to_text}.join("\n")
81
69
  end
82
70
 
83
71
  def to_image_html(dipslay_class = @ocr_class)
@@ -86,13 +74,16 @@ class OCRPage < OCRElement
86
74
  end
87
75
 
88
76
  def enclosed_words(ocr_box)
89
- a = []
77
+ enum = Enumerator.new(self,:each_enclosed_word,ocr_box)
78
+ enum.inject([]) { |acc,w| acc << w}
79
+ end
80
+
81
+ def each_enclosed_word(ocr_box)
90
82
  each_word do |w|
91
83
  if w.enclosed_by? ocr_box then
92
- a << w
84
+ yield w
93
85
  end
94
86
  end
95
- a
96
87
  end
97
88
 
98
89
  end
data/lib/rhocr.rb CHANGED
@@ -3,29 +3,16 @@
3
3
  require_relative "ocr_document"
4
4
  class RHOCR < OCRDocument
5
5
 
6
- attr_reader :words, :lines
7
-
8
6
  def add_folder(path)
9
7
  add_files Dir[path]
10
- compute_lines
11
- compute_words
12
- self
13
8
  end
14
9
 
15
- #should be called if new pages are added
16
- def compute_words
17
- @words = []
18
- each_word do |w|
19
- @words << w
20
- end
10
+ def words
11
+ Enumerator.new(self,:each_word)
21
12
  end
22
13
 
23
- #should be called if new pages are added
24
- def compute_lines
25
- @lines = []
26
- each_line do |l|
27
- @lines << l
28
- end
14
+ def lines
15
+ Enumerator.new(self,:each_line)
29
16
  end
30
17
 
31
18
  end
data/rhocr.gemspec CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{rhocr}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Andreas Neumann}]
9
- s.date = %q{2011-09-30}
9
+ s.date = %q{2011-10-10}
10
10
  s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
11
11
  s.email = %q{andreas@neumann.biz}
12
12
  s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
- s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
13
+ s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}, %q{rhocr.gemspec}]
14
14
  s.homepage = %q{http://github.com/daandi/rhocr}
15
15
  s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
16
16
  s.require_paths = [%q{lib}]
@@ -12,6 +12,24 @@ describe HOCRBox do
12
12
  it 'should have coordinates' do
13
13
  @box.coordinates.should == [1,2,20,8]
14
14
  end
15
+ it 'should have #left' do
16
+ @box.left.should == 1
17
+ end
18
+ it 'should have #right' do
19
+ @box.right.should == 20
20
+ end
21
+ it 'should have #top' do
22
+ @box.top.should == 2
23
+ end
24
+ it 'should have #bottom' do
25
+ @box.bottom.should == 8
26
+ end
27
+ it 'should have height' do
28
+ @box.height == 7
29
+ end
30
+ it 'should have width' do
31
+ @box.width.should == 19
32
+ end
15
33
  end
16
34
 
17
35
  describe "#to_s" do
@@ -91,7 +109,7 @@ describe HOCRBox do
91
109
  end
92
110
 
93
111
  describe '#top_distance_to(element)' do
94
- it 'box should be 2px below of element' do
112
+ it 'box should be 9px below of element' do
95
113
  HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
96
114
  end
97
115
  end
@@ -16,7 +16,7 @@ describe OCRElement do
16
16
  describe '#initialize and Object' do
17
17
  it 'should create an element given ocr_class, children and coordiantes' do
18
18
  test_element = OCRElement.new('test', [], %w{10, 11, 20, 21})
19
- test_element.children.should == []
19
+ test_element.children.should be_empty
20
20
  test_element.coordinates.should == [10,11,20,21]
21
21
  test_element.ocr_class.should == 'test'
22
22
  end
@@ -63,6 +63,29 @@ describe OCRElement do
63
63
  it 'should have a #to_html method' do
64
64
  @ocr_element.to_html.should == "<span class='ocr_par'> <span class='ocr_line'> <span class='ocrx_word'>Athenobius,</span><span class='ocrx_word'>Der</span><span class='ocrx_word'>von</span><span class='ocrx_word'>der</span><span class='ocrx_word'>Göttin</span><span class='ocrx_word'>Minerva</span><span class='ocrx_word'>lebt,</span><span class='ocrx_word'>oder:</span><span class='ocrx_word'>Mi»</span> </span><span class='ocr_line'> <span class='ocrx_word'>nerva</span><span class='ocrx_word'>Bogen.</span> </span> </span>"
65
65
  end
66
+
67
+ it 'should add features to css class' do
68
+ @ocr_element.lines[0].words[5].features << :test
69
+ @ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word-test'>Minerva</span>"
70
+ end
71
+
72
+ it 'should transform features to an css class #features_to_css_class' do
73
+ test_elem = @ocr_element.lines[0]
74
+ test_elem.features_to_css_class.should == ''
75
+ test_elem.features << :test
76
+ test_elem.features << :stuff
77
+ test_elem.features_to_css_class.should == 'stuff_test'
78
+ test_elem.features << :test
79
+ test_elem.features_to_css_class.should == 'stuff_test'
80
+ end
81
+
82
+ it 'should use plain css class if elemetn has no features #css_class_string' do
83
+ test = @ocr_element.lines[1]
84
+ test.css_class_string.should == 'ocr_line'
85
+ test.features << :abc
86
+ test.css_class_string.should == 'ocr_line-abc'
87
+ end
88
+
66
89
  end
67
90
 
68
91
  describe 'ocr_line' do
@@ -46,41 +46,24 @@ describe OCRPage do
46
46
 
47
47
  describe '#Iterators' do
48
48
  it 'should have a block iterator #each_block' do
49
- a = []
50
- @test_page.each_block do |block|
51
- a << block
52
- end
53
- a.length.should == 1
49
+ Enumerator.new(@test_page,:each_block).
50
+ inject([]) {|acc,elem| acc << elem}.length.should == 1
54
51
  end
55
52
  it 'should have a paragraph iterator #each_paragraph' do
56
- a = []
57
- @test_page.each_paragraph do |paragraph|
58
- a << paragraph
59
- end
60
- a.length.should == 12
53
+ Enumerator.new(@test_page,:each_paragraph).
54
+ inject([]) {|acc,elem| acc << elem}.length.should == 12
61
55
  end
62
56
  it 'should have a line iterator #each_line' do
63
- a = []
64
- @test_page.each_line do |line|
65
- a << line
66
- end
67
- a.length.should == 45
57
+ Enumerator.new(@test_page,:each_line).
58
+ inject([]) {|acc,elem| acc << elem}.length.should == 45
68
59
  end
69
60
  it 'should have a word iterator #each_word' do
70
- a = []
71
- @test_page.each_word do |word|
72
- a << word
73
- end
74
- a.length.should == 415
61
+ Enumerator.new(@test_page,:each_word).
62
+ inject([]) {|acc,elem| acc << elem}.length.should == 415
75
63
  end
76
64
 
77
65
  end
78
66
 
79
- describe 'convinience methods' do
80
- it 'should have a method #lines' do
81
- @test_page.lines[5].children.length.should == 11
82
- end
83
- end
84
67
 
85
68
  describe ' display methods' do
86
69
 
@@ -111,6 +94,8 @@ describe OCRPage do
111
94
  words = @output_page.enclosed_words( HOCRBox.new 0,0, 300,300 )
112
95
  words.length.should == 6
113
96
  end
97
+
98
+ it 'should have an #enclosed_word method to iterate over ewords enclosed by given box'
114
99
  end
115
100
 
116
101
  end
data/spec/rhocr_spec.rb CHANGED
@@ -13,16 +13,17 @@ describe RHOCR do
13
13
  end
14
14
 
15
15
  before(:each) do
16
- @rhocr_doc ||= RHOCR.new.add_folder "data/*.html"
16
+ @rhocr_doc ||= RHOCR.new
17
+ @rhocr_doc.add_folder "data/*.html"
17
18
  end
18
19
 
19
20
  describe 'methods to iterate' do
20
21
  it 'should have lines' do
21
- @rhocr_doc.lines.length.should == 237
22
+ @rhocr_doc.lines.inject([]){|acc,l| acc <<l}.length.should == 237
22
23
  end
23
24
 
24
25
  it 'should have words' do
25
- @rhocr_doc.words.length.should == 2071
26
+ @rhocr_doc.words.inject([]){|acc,w| acc <<w}.length.should == 2071
26
27
  end
27
28
 
28
29
  it 'should support common iterator methods throug enumerable for word an line arrays' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-30 00:00:00.000000000Z
12
+ date: 2011-10-10 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70233340332980 !ruby/object:Gem::Requirement
16
+ requirement: &70118176813440 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70233340332980
24
+ version_requirements: *70118176813440
25
25
  description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
26
26
  email: andreas@neumann.biz
27
27
  executables: []
@@ -53,13 +53,13 @@ files:
53
53
  - lib/ocr_element.rb
54
54
  - lib/ocr_page.rb
55
55
  - lib/rhocr.rb
56
- - rhocr.gemspec
57
56
  - spec/hocr_box_spec.rb
58
57
  - spec/ocr_document_spec.rb
59
58
  - spec/ocr_element_spec.rb
60
59
  - spec/ocr_page_spec.rb
61
60
  - spec/rhocr_spec.rb
62
61
  - test.html
62
+ - rhocr.gemspec
63
63
  homepage: http://github.com/daandi/rhocr
64
64
  licenses: []
65
65
  post_install_message: