rhocr 0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest CHANGED
@@ -1,7 +1,6 @@
1
1
  Manifest
2
2
  README
3
3
  Rakefile
4
- TODO.txt
5
4
  data/Seite_Die_Gartenlaube_242.html
6
5
  data/Seite_Tagebuch_H_C_Lang_08.html
7
6
  data/Seite_Tagebuch_H_C_Lang_08.jpg
data/Rakefile CHANGED
@@ -2,8 +2,8 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('rhocr', '0.1') do |p|
6
- p.description = "Manipulate and use OCR data encode in HOCR"
5
+ Echoe.new('rhocr', '0.1.1') do |p|
6
+ p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
7
7
  p.url = "http://github.com/daandi/rhocr"
8
8
  p.author = "Andreas Neumann"
9
9
  p.email = "andreas@neumann.biz"
data/data/test.html CHANGED
@@ -14,7 +14,8 @@
14
14
 
15
15
  <p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
16
16
 
17
- <p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'><span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
17
+ <p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'>
18
+ <span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
18
19
 
19
20
  <p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
20
21
 
data/lib/hocr_box.rb CHANGED
@@ -46,6 +46,14 @@ class HOCRBox
46
46
  other.left_distance_to(self)
47
47
  end
48
48
 
49
+ def top_distance_to(other)
50
+ @top - other.bottom
51
+ end
52
+
53
+ def bottom_distance_to(other)
54
+ other.top_distance_to(self)
55
+ end
56
+
49
57
  def to_s
50
58
  coordinates_to_s
51
59
  end
data/lib/ocr_element.rb CHANGED
@@ -105,7 +105,7 @@ class OCRWord < OCRElement
105
105
  end
106
106
 
107
107
  def to_s
108
- "#{text}[#{@features}]"
108
+ "#{text}:#{coordinates}->#{@features}"
109
109
  end
110
110
 
111
111
  def to_image_html
data/rhocr.gemspec CHANGED
@@ -2,21 +2,21 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{rhocr}
5
- s.version = "0.1"
5
+ s.version = "0.1.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = [%q{Andreas Neumann}]
9
- s.date = %q{2011-09-08}
10
- s.description = %q{Manipulate and use OCR data encode in HOCR}
9
+ s.date = %q{2011-09-30}
10
+ s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
11
11
  s.email = %q{andreas@neumann.biz}
12
- s.extra_rdoc_files = [%q{README}, %q{TODO.txt}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
- s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{TODO.txt}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
12
+ s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
+ s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
14
14
  s.homepage = %q{http://github.com/daandi/rhocr}
15
15
  s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
16
16
  s.require_paths = [%q{lib}]
17
17
  s.rubyforge_project = %q{rhocr}
18
18
  s.rubygems_version = %q{1.8.6}
19
- s.summary = %q{Manipulate and use OCR data encode in HOCR}
19
+ s.summary = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
20
20
 
21
21
  if s.respond_to? :specification_version then
22
22
  s.specification_version = 3
@@ -90,5 +90,17 @@ describe HOCRBox do
90
90
  end
91
91
  end
92
92
 
93
+ describe '#top_distance_to(element)' do
94
+ it 'box should be 2px below of element' do
95
+ HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
96
+ end
97
+ end
98
+
99
+ describe '#bottom_distance_to(element)' do
100
+ it 'box should be 2px above of element' do
101
+ HOCRBox.new(160,196,1117,232).bottom_distance_to(HOCRBox.new(109,241,206,274)).should == 9
102
+ end
103
+ end
104
+
93
105
 
94
106
  end
@@ -81,6 +81,11 @@ describe OCRElement do
81
81
  it 'special #to_html method' do
82
82
  @ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word'>Minerva</span>"
83
83
  end
84
+ it 'special #to_s method' do
85
+ @ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[]"
86
+ @ocr_element.lines[0].words[5].features << :a_test << :additional
87
+ @ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[:a_test, :additional]"
88
+ end
84
89
  end
85
90
 
86
91
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhocr
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-08 00:00:00.000000000Z
12
+ date: 2011-09-30 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70276550001820 !ruby/object:Gem::Requirement
16
+ requirement: &70233340332980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,14 +21,13 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70276550001820
25
- description: Manipulate and use OCR data encode in HOCR
24
+ version_requirements: *70233340332980
25
+ description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
26
26
  email: andreas@neumann.biz
27
27
  executables: []
28
28
  extensions: []
29
29
  extra_rdoc_files:
30
30
  - README
31
- - TODO.txt
32
31
  - lib/hocr_box.rb
33
32
  - lib/ocr_document.rb
34
33
  - lib/ocr_element.rb
@@ -38,7 +37,6 @@ files:
38
37
  - Manifest
39
38
  - README
40
39
  - Rakefile
41
- - TODO.txt
42
40
  - data/Seite_Die_Gartenlaube_242.html
43
41
  - data/Seite_Tagebuch_H_C_Lang_08.html
44
42
  - data/Seite_Tagebuch_H_C_Lang_08.jpg
@@ -91,5 +89,5 @@ rubyforge_project: rhocr
91
89
  rubygems_version: 1.8.6
92
90
  signing_key:
93
91
  specification_version: 3
94
- summary: Manipulate and use OCR data encode in HOCR
92
+ summary: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
95
93
  test_files: []
data/TODO.txt DELETED
@@ -1,42 +0,0 @@
1
- 1. Implementiert
2
- * HOCR einlesen
3
- * Auf HOCR-Elemente zugreifen
4
- * Als Text ausgeben
5
- * Als formatiertes HTML ausgeben (Stylesheet fehlt)
6
- * Auf Grafik ausgeben (nur ohne Nesting)
7
-
8
- * Erste Tests um Elemente mit Features zu versehen
9
-
10
- 2. geplant
11
- * ABBYY-XML
12
- * Algorithmen um Elemente mit Features zu versehen
13
- * logische Strukturen markieren
14
-
15
- 3. Probleme
16
- * Testdaten, Stabi liefert schlecht Grafiken
17
- * Geschwindigkeit
18
- * Algorithmen, wie integrieren? Sollte modular sein
19
- * Feste Ordnung wird zur Zeit voraussgesetz
20
-
21
- 4. Bisheriges Vorgehen
22
- => HOCR einlesen, parsen
23
- => OCRElemente nach bestimmten Kriterien mit Features versehen
24
- => Anhand von Features logische Strukturen ermitteln
25
-
26
-
27
-
28
- 5. geplante Schritte
29
- * OCRs in ABBYY-XML von Dokumenten in Antrag anfertigen
30
- * Tests in Ruby-Version
31
- * weitere Algorithmen um Features zu finden
32
-
33
- * Scala/Java - Version
34
- * Uli/Thorsten XML integrieren
35
- * GUI
36
-
37
- 6. Idee:
38
- => Nutzer wählt Bereiche in GUI aus und benennt sie.
39
- => Features der gewählten Bereiche werden ermittelt.
40
- => Anhand der Features werden aus dem gesamten Dokument die logischen Strukturen extrahiert
41
- => Feedbackschleife in GUI mit
42
- Grafik+Marker|Text als HTML mit Marker|