rhocr 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +0 -1
- data/Rakefile +2 -2
- data/data/test.html +2 -1
- data/lib/hocr_box.rb +8 -0
- data/lib/ocr_element.rb +1 -1
- data/rhocr.gemspec +6 -6
- data/spec/hocr_box_spec.rb +12 -0
- data/spec/ocr_element_spec.rb +5 -0
- metadata +6 -8
- data/TODO.txt +0 -42
data/Manifest
CHANGED
data/Rakefile
CHANGED
@@ -2,8 +2,8 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('rhocr', '0.1') do |p|
|
6
|
-
p.description = "Manipulate and use OCR data
|
5
|
+
Echoe.new('rhocr', '0.1.1') do |p|
|
6
|
+
p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
|
7
7
|
p.url = "http://github.com/daandi/rhocr"
|
8
8
|
p.author = "Andreas Neumann"
|
9
9
|
p.email = "andreas@neumann.biz"
|
data/data/test.html
CHANGED
@@ -14,7 +14,8 @@
|
|
14
14
|
|
15
15
|
<p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
|
16
16
|
|
17
|
-
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'
|
17
|
+
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'>
|
18
|
+
<span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
|
18
19
|
|
19
20
|
<p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
|
20
21
|
|
data/lib/hocr_box.rb
CHANGED
data/lib/ocr_element.rb
CHANGED
data/rhocr.gemspec
CHANGED
@@ -2,21 +2,21 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{rhocr}
|
5
|
-
s.version = "0.1"
|
5
|
+
s.version = "0.1.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = [%q{Andreas Neumann}]
|
9
|
-
s.date = %q{2011-09-
|
10
|
-
s.description = %q{Manipulate and use OCR data
|
9
|
+
s.date = %q{2011-09-30}
|
10
|
+
s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
11
11
|
s.email = %q{andreas@neumann.biz}
|
12
|
-
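s.extra_rdoc_files = [%q{README}, %q{TODO.txt}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]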
|
13
|
-
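s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{TODO.txt}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]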
|
12
|
+
s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
|
13
|
+
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
|
14
14
|
s.homepage = %q{http://github.com/daandi/rhocr}
|
15
15
|
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
|
16
16
|
s.require_paths = [%q{lib}]
|
17
17
|
s.rubyforge_project = %q{rhocr}
|
18
18
|
s.rubygems_version = %q{1.8.6}
|
19
|
-
s.summary = %q{Manipulate and use OCR data
|
19
|
+
s.summary = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
20
20
|
|
21
21
|
if s.respond_to? :specification_version then
|
22
22
|
s.specification_version = 3
|
data/spec/hocr_box_spec.rb
CHANGED
@@ -90,5 +90,17 @@ describe HOCRBox do
|
|
90
90
|
end
|
91
91
|
end
|
92
92
|
|
93
|
+
describe '#top_distance_to(element)' do
|
94
|
+
it 'box should be 2px below of element' do
|
95
|
+
HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe '#bottom_distance_to(element)' do
|
100
|
+
it 'box should be 2px above of element' do
|
101
|
+
HOCRBox.new(160,196,1117,232).bottom_distance_to(HOCRBox.new(109,241,206,274)).should == 9
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
93
105
|
|
94
106
|
end
|
data/spec/ocr_element_spec.rb
CHANGED
@@ -81,6 +81,11 @@ describe OCRElement do
|
|
81
81
|
it 'special #to_html method' do
|
82
82
|
@ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word'>Minerva</span>"
|
83
83
|
end
|
84
|
+
it 'special #to_s method' do
|
85
|
+
@ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[]"
|
86
|
+
@ocr_element.lines[0].words[5].features << :a_test << :additional
|
87
|
+
@ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[:a_test, :additional]"
|
88
|
+
end
|
84
89
|
end
|
85
90
|
|
86
91
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-30 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70233340332980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,14 +21,13 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description: Manipulate and use OCR data
|
24
|
+
version_requirements: *70233340332980
|
25
|
+
description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
26
26
|
email: andreas@neumann.biz
|
27
27
|
executables: []
|
28
28
|
extensions: []
|
29
29
|
extra_rdoc_files:
|
30
30
|
- README
|
31
|
-
- TODO.txt
|
32
31
|
- lib/hocr_box.rb
|
33
32
|
- lib/ocr_document.rb
|
34
33
|
- lib/ocr_element.rb
|
@@ -38,7 +37,6 @@ files:
|
|
38
37
|
- Manifest
|
39
38
|
- README
|
40
39
|
- Rakefile
|
41
|
-
- TODO.txt
|
42
40
|
- data/Seite_Die_Gartenlaube_242.html
|
43
41
|
- data/Seite_Tagebuch_H_C_Lang_08.html
|
44
42
|
- data/Seite_Tagebuch_H_C_Lang_08.jpg
|
@@ -91,5 +89,5 @@ rubyforge_project: rhocr
|
|
91
89
|
rubygems_version: 1.8.6
|
92
90
|
signing_key:
|
93
91
|
specification_version: 3
|
94
|
-
summary: Manipulate and use OCR data
|
92
|
+
summary: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
95
93
|
test_files: []
|
data/TODO.txt
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
1. Implementiert
|
2
|
-
* HOCR einlesen
|
3
|
-
* Auf HOCR-Elemente zugreifen
|
4
|
-
* Als Text ausgeben
|
5
|
-
* Als formatiertes HTML ausgeben (Stylesheet fehlt)
|
6
|
-
* Auf Grafik ausgeben (nur ohne Nesting)
|
7
|
-
|
8
|
-
* Erste Tests um Elemente mit Features zu versehen
|
9
|
-
|
10
|
-
2. geplant
|
11
|
-
* ABBYY-XML
|
12
|
-
* Algorithmen um Elemente mit Features zu versehen
|
13
|
-
* logische Strukturen markieren
|
14
|
-
|
15
|
-
3. Probleme
|
16
|
-
* Testdaten, Stabi liefert schlechte Grafiken
|
17
|
-
* Geschwindigkeit
|
18
|
-
* Algorithmen, wie integrieren? Sollte modular sein
|
19
|
-
* Feste Ordnung wird zur Zeit vorausgesetzt
|
20
|
-
|
21
|
-
4. Bisheriges Vorgehen
|
22
|
-
=> HOCR einlesen, parsen
|
23
|
-
=> OCRElemente nach bestimmten Kriterien mit Features versehen
|
24
|
-
=> Anhand von Features logische Strukturen ermitteln
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
5. geplante Schritte
|
29
|
-
* OCRs in ABBYY-XML von Dokumenten in Antrag anfertigen
|
30
|
-
* Tests in Ruby-Version
|
31
|
-
* weitere Algorithmen um Features zu finden
|
32
|
-
|
33
|
-
* Scala/Java - Version
|
34
|
-
* Uli/Thorsten XML integrieren
|
35
|
-
* GUI
|
36
|
-
|
37
|
-
6. Idee:
|
38
|
-
=> Nutzer wählt Bereiche in GUI aus und benennt sie.
|
39
|
-
=> Features der gewählten Bereiche werden ermittelt.
|
40
|
-
=> Anhand der Features werden aus dem gesamten Dokument die logischen Strukturen extrahiert
|
41
|
-
=> Feedbackschleife in GUI mit
|
42
|
-
Grafik+Marker|Text als HTML mit Marker|
|