rhocr 0.1 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest CHANGED
@@ -1,7 +1,6 @@
1
1
  Manifest
2
2
  README
3
3
  Rakefile
4
- TODO.txt
5
4
  data/Seite_Die_Gartenlaube_242.html
6
5
  data/Seite_Tagebuch_H_C_Lang_08.html
7
6
  data/Seite_Tagebuch_H_C_Lang_08.jpg
data/Rakefile CHANGED
@@ -2,8 +2,8 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('rhocr', '0.1') do |p|
6
- p.description = "Manipulate and use OCR data encode in HOCR"
5
+ Echoe.new('rhocr', '0.1.1') do |p|
6
+ p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
7
7
  p.url = "http://github.com/daandi/rhocr"
8
8
  p.author = "Andreas Neumann"
9
9
  p.email = "andreas@neumann.biz"
data/data/test.html CHANGED
@@ -14,7 +14,8 @@
14
14
 
15
15
  <p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
16
16
 
17
- <p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'><span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
17
+ <p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'>
18
+ <span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
18
19
 
19
20
  <p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
20
21
 
data/lib/hocr_box.rb CHANGED
@@ -46,6 +46,14 @@ class HOCRBox
46
46
  other.left_distance_to(self)
47
47
  end
48
48
 
49
+ def top_distance_to(other)
50
+ @top - other.bottom
51
+ end
52
+
53
+ def bottom_distance_to(other)
54
+ other.top_distance_to(self)
55
+ end
56
+
49
57
  def to_s
50
58
  coordinates_to_s
51
59
  end
data/lib/ocr_element.rb CHANGED
@@ -105,7 +105,7 @@ class OCRWord < OCRElement
105
105
  end
106
106
 
107
107
  def to_s
108
- "#{text}[#{@features}]"
108
+ "#{text}:#{coordinates}->#{@features}"
109
109
  end
110
110
 
111
111
  def to_image_html
data/rhocr.gemspec CHANGED
@@ -2,21 +2,21 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{rhocr}
5
- s.version = "0.1"
5
+ s.version = "0.1.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Andreas Neumann}]
9
- s.date = %q{2011-09-08}
10
- s.description = %q{Manipulate and use OCR data encode in HOCR}
9
+ s.date = %q{2011-09-30}
10
+ s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
11
11
  s.email = %q{andreas@neumann.biz}
12
- s.extra_rdoc_files = [%q{README}, %q{TODO.txt}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
- s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{TODO.txt}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
12
+ s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
+ s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
14
14
  s.homepage = %q{http://github.com/daandi/rhocr}
15
15
  s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
16
16
  s.require_paths = [%q{lib}]
17
17
  s.rubyforge_project = %q{rhocr}
18
18
  s.rubygems_version = %q{1.8.6}
19
- s.summary = %q{Manipulate and use OCR data encode in HOCR}
19
+ s.summary = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
20
20
 
21
21
  if s.respond_to? :specification_version then
22
22
  s.specification_version = 3
@@ -90,5 +90,17 @@ describe HOCRBox do
90
90
  end
91
91
  end
92
92
 
93
+ describe '#top_distance_to(element)' do
94
+ it 'box should be 2px below of element' do
95
+ HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
96
+ end
97
+ end
98
+
99
+ describe '#bottom_distance_to(element)' do
100
+ it 'box should be 2px above of element' do
101
+ HOCRBox.new(160,196,1117,232).bottom_distance_to(HOCRBox.new(109,241,206,274)).should == 9
102
+ end
103
+ end
104
+
93
105
 
94
106
  end
@@ -81,6 +81,11 @@ describe OCRElement do
81
81
  it 'special #to_html method' do
82
82
  @ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word'>Minerva</span>"
83
83
  end
84
+ it 'special #to_s method' do
85
+ @ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[]"
86
+ @ocr_element.lines[0].words[5].features << :a_test << :additional
87
+ @ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[:a_test, :additional]"
88
+ end
84
89
  end
85
90
 
86
91
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhocr
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-08 00:00:00.000000000Z
12
+ date: 2011-09-30 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70276550001820 !ruby/object:Gem::Requirement
16
+ requirement: &70233340332980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,14 +21,13 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70276550001820
25
- description: Manipulate and use OCR data encode in HOCR
24
+ version_requirements: *70233340332980
25
+ description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
26
26
  email: andreas@neumann.biz
27
27
  executables: []
28
28
  extensions: []
29
29
  extra_rdoc_files:
30
30
  - README
31
- - TODO.txt
32
31
  - lib/hocr_box.rb
33
32
  - lib/ocr_document.rb
34
33
  - lib/ocr_element.rb
@@ -38,7 +37,6 @@ files:
38
37
  - Manifest
39
38
  - README
40
39
  - Rakefile
41
- - TODO.txt
42
40
  - data/Seite_Die_Gartenlaube_242.html
43
41
  - data/Seite_Tagebuch_H_C_Lang_08.html
44
42
  - data/Seite_Tagebuch_H_C_Lang_08.jpg
@@ -91,5 +89,5 @@ rubyforge_project: rhocr
91
89
  rubygems_version: 1.8.6
92
90
  signing_key:
93
91
  specification_version: 3
94
- summary: Manipulate and use OCR data encode in HOCR
92
+ summary: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
95
93
  test_files: []
data/TODO.txt DELETED
@@ -1,42 +0,0 @@
1
- 1. Implementiert
2
- * HOCR einlesen
3
- * Auf HOCR-Elemente zugreifen
4
- * Als Text ausgeben
5
- * Als formatiertes HTML ausgeben (Stylesheet fehlt)
6
- * Auf Grafik ausgeben (nur ohne Nesting)
7
-
8
- * Erste Tests um Elemente mit Features zu versehen
9
-
10
- 2. geplant
11
- * ABBYY-XML
12
- * Algorithmen um Elemente mit Features zu versehen
13
- * logische Strukturen markieren
14
-
15
- 3. Probleme
16
- * Testdaten, Stabi liefert schlechte Grafiken
17
- * Geschwindigkeit
18
- * Algorithmen, wie integrieren? Sollte modular sein
19
- * Feste Ordnung wird zur Zeit vorausgesetzt
20
-
21
- 4. Bisheriges Vorgehen
22
- => HOCR einlesen, parsen
23
- => OCRElemente nach bestimmten Kriterien mit Features versehen
24
- => Anhand von Features logische Strukturen ermitteln
25
-
26
-
27
-
28
- 5. geplante Schritte
29
- * OCRs in ABBYY-XML von Dokumenten in Antrag anfertigen
30
- * Tests in Ruby-Version
31
- * weitere Algorithmen um Features zu finden
32
-
33
- * Scala/Java - Version
34
- * Uli/Thorsten XML integrieren
35
- * GUI
36
-
37
- 6. Idee:
38
- => Nutzer wählt Bereiche in GUI aus und benennt sie.
39
- => Features der gewählten Bereiche werden ermittelt.
40
- => Anhand der Features werden aus dem gesamten Dokument die logischen Strukturen extrahiert
41
- => Feedbackschleife in GUI mit
42
- Grafik+Marker|Text als HTML mit Marker|