rhocr 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +0 -1
- data/Rakefile +2 -2
- data/data/test.html +2 -1
- data/lib/hocr_box.rb +8 -0
- data/lib/ocr_element.rb +1 -1
- data/rhocr.gemspec +6 -6
- data/spec/hocr_box_spec.rb +12 -0
- data/spec/ocr_element_spec.rb +5 -0
- metadata +6 -8
- data/TODO.txt +0 -42
data/Manifest
CHANGED
data/Rakefile
CHANGED
|
@@ -2,8 +2,8 @@ require 'rubygems'
|
|
|
2
2
|
require 'rake'
|
|
3
3
|
require 'echoe'
|
|
4
4
|
|
|
5
|
-
Echoe.new('rhocr', '0.1') do |p|
|
|
6
|
-
p.description = "Manipulate and use OCR data
|
|
5
|
+
Echoe.new('rhocr', '0.1.1') do |p|
|
|
6
|
+
p.description = "Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/"
|
|
7
7
|
p.url = "http://github.com/daandi/rhocr"
|
|
8
8
|
p.author = "Andreas Neumann"
|
|
9
9
|
p.email = "andreas@neumann.biz"
|
data/data/test.html
CHANGED
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
|
|
15
15
|
<p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
|
|
16
16
|
|
|
17
|
-
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'
|
|
17
|
+
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'>
|
|
18
|
+
<span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
|
|
18
19
|
|
|
19
20
|
<p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
|
|
20
21
|
|
data/lib/hocr_box.rb
CHANGED
data/lib/ocr_element.rb
CHANGED
data/rhocr.gemspec
CHANGED
|
@@ -2,21 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = %q{rhocr}
|
|
5
|
-
s.version = "0.1"
|
|
5
|
+
s.version = "0.1.1"
|
|
6
6
|
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
|
8
8
|
s.authors = [%q{Andreas Neumann}]
|
|
9
|
-
s.date = %q{2011-09-
|
|
10
|
-
s.description = %q{Manipulate and use OCR data
|
|
9
|
+
s.date = %q{2011-09-30}
|
|
10
|
+
s.description = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
|
11
11
|
s.email = %q{andreas@neumann.biz}
|
|
12
|
-
s.extra_rdoc_files = [%q{README}, %q{
|
|
13
|
-
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{
|
|
12
|
+
s.extra_rdoc_files = [%q{README}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
|
|
13
|
+
s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
|
|
14
14
|
s.homepage = %q{http://github.com/daandi/rhocr}
|
|
15
15
|
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
|
|
16
16
|
s.require_paths = [%q{lib}]
|
|
17
17
|
s.rubyforge_project = %q{rhocr}
|
|
18
18
|
s.rubygems_version = %q{1.8.6}
|
|
19
|
-
s.summary = %q{Manipulate and use OCR data
|
|
19
|
+
s.summary = %q{Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/}
|
|
20
20
|
|
|
21
21
|
if s.respond_to? :specification_version then
|
|
22
22
|
s.specification_version = 3
|
data/spec/hocr_box_spec.rb
CHANGED
|
@@ -90,5 +90,17 @@ describe HOCRBox do
|
|
|
90
90
|
end
|
|
91
91
|
end
|
|
92
92
|
|
|
93
|
+
describe '#top_distance_to(element)' do
|
|
94
|
+
it 'box should be 2px below of element' do
|
|
95
|
+
HOCRBox.new(109,241,206,274).top_distance_to(HOCRBox.new(160,196,1117,232)).should == 9
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
describe '#bottom_distance_to(element)' do
|
|
100
|
+
it 'box should be 2px above of element' do
|
|
101
|
+
HOCRBox.new(160,196,1117,232).bottom_distance_to(HOCRBox.new(109,241,206,274)).should == 9
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
93
105
|
|
|
94
106
|
end
|
data/spec/ocr_element_spec.rb
CHANGED
|
@@ -81,6 +81,11 @@ describe OCRElement do
|
|
|
81
81
|
it 'special #to_html method' do
|
|
82
82
|
@ocr_element.lines[0].words[5].to_html.should == "<span class='ocrx_word'>Minerva</span>"
|
|
83
83
|
end
|
|
84
|
+
it 'special #to_s method' do
|
|
85
|
+
@ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[]"
|
|
86
|
+
@ocr_element.lines[0].words[5].features << :a_test << :additional
|
|
87
|
+
@ocr_element.lines[0].words[5].to_s.should == "Minerva:[707, 112, 857, 140]->[:a_test, :additional]"
|
|
88
|
+
end
|
|
84
89
|
end
|
|
85
90
|
|
|
86
91
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rhocr
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2011-09-
|
|
12
|
+
date: 2011-09-30 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: nokogiri
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &70233340332980 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ! '>='
|
|
@@ -21,14 +21,13 @@ dependencies:
|
|
|
21
21
|
version: '0'
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
25
|
-
description: Manipulate and use OCR data
|
|
24
|
+
version_requirements: *70233340332980
|
|
25
|
+
description: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
|
26
26
|
email: andreas@neumann.biz
|
|
27
27
|
executables: []
|
|
28
28
|
extensions: []
|
|
29
29
|
extra_rdoc_files:
|
|
30
30
|
- README
|
|
31
|
-
- TODO.txt
|
|
32
31
|
- lib/hocr_box.rb
|
|
33
32
|
- lib/ocr_document.rb
|
|
34
33
|
- lib/ocr_element.rb
|
|
@@ -38,7 +37,6 @@ files:
|
|
|
38
37
|
- Manifest
|
|
39
38
|
- README
|
|
40
39
|
- Rakefile
|
|
41
|
-
- TODO.txt
|
|
42
40
|
- data/Seite_Die_Gartenlaube_242.html
|
|
43
41
|
- data/Seite_Tagebuch_H_C_Lang_08.html
|
|
44
42
|
- data/Seite_Tagebuch_H_C_Lang_08.jpg
|
|
@@ -91,5 +89,5 @@ rubyforge_project: rhocr
|
|
|
91
89
|
rubygems_version: 1.8.6
|
|
92
90
|
signing_key:
|
|
93
91
|
specification_version: 3
|
|
94
|
-
summary: Manipulate and use OCR data
|
|
92
|
+
summary: ! 'Manipulate and use OCR data encoded in hOCR-Format see: http://code.google.com/p/hocr-tools/'
|
|
95
93
|
test_files: []
|
data/TODO.txt
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
1. Implementiert
|
|
2
|
-
* HOCR einlesen
|
|
3
|
-
* Auf HOCR-Elemente zugreifen
|
|
4
|
-
* Als Text ausgeben
|
|
5
|
-
* Als formatiertes HTML ausgeben (Stylesheet fehlt)
|
|
6
|
-
* Auf Grafik ausgeben (nur ohne Nesting)
|
|
7
|
-
|
|
8
|
-
* Erste Tests um Elemente mit Features zu versehen
|
|
9
|
-
|
|
10
|
-
2. geplant
|
|
11
|
-
* ABBYY-XML
|
|
12
|
-
* Algorithmen um Elemente mit Features zu versehen
|
|
13
|
-
* logische Strukturen markieren
|
|
14
|
-
|
|
15
|
-
3. Probleme
|
|
16
|
-
* Testdaten, Stabi liefert schlecht Grafiken
|
|
17
|
-
* Geschwindigkeit
|
|
18
|
-
* Algorithmen, wie integrieren? Sollte modular sein
|
|
19
|
-
* Feste Ordnung wird zur Zeit voraussgesetz
|
|
20
|
-
|
|
21
|
-
4. Bisheriges Vorgehen
|
|
22
|
-
=> HOCR einlesen, parsen
|
|
23
|
-
=> OCRElemente nach bestimmten Kriterien mit Features versehen
|
|
24
|
-
=> Anhand von Features logische Strukturen ermitteln
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
5. geplante Schritte
|
|
29
|
-
* OCRs in ABBYY-XML von Dokumenten in Antrag anfertigen
|
|
30
|
-
* Tests in Ruby-Version
|
|
31
|
-
* weitere Algorithmen um Features zu finden
|
|
32
|
-
|
|
33
|
-
* Scala/Java - Version
|
|
34
|
-
* Uli/Thorsten XML integrieren
|
|
35
|
-
* GUI
|
|
36
|
-
|
|
37
|
-
6. Idee:
|
|
38
|
-
=> Nutzer wählt Bereiche in GUI aus und benennt sie.
|
|
39
|
-
=> Features der gewählten Bereiche werden ermittelt.
|
|
40
|
-
=> Anhand der Features werden aus dem gesamten Dokument die logischen Strukturen extrahiert
|
|
41
|
-
=> Feedbackschleife in GUI mit
|
|
42
|
-
Grafik+Marker|Text als HTML mit Marker|
|