rhocr 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +18 -0
- data/README +1 -0
- data/Rakefile +12 -0
- data/data/Seite_Tagebuch_H_C_Lang_08.html +28 -0
- data/example/example_server.rb +29 -0
- data/example/public/OCRTest.css +30 -0
- data/example/public/OCRTest.html +54 -0
- data/example/public/OCRTest_marker.js +83 -0
- data/example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg +0 -0
- data/example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/lib/hocr.rb +1 -0
- data/lib/ocr_box.rb +43 -0
- data/lib/ocr_page.rb +43 -0
- data/lib/ocrx_word.rb +23 -0
- data/rhocr.gemspec +29 -0
- data/rspec/ocr_box_spec.rb +48 -0
- data/rspec/ocr_page_spec.rb +17 -0
- data/rspec/ocrx_word_spec.rb +32 -0
- data/test.rb +8 -0
- metadata +83 -0
data/Manifest
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
README
|
2
|
+
Rakefile
|
3
|
+
data/Seite_Tagebuch_H_C_Lang_08.html
|
4
|
+
example/example_server.rb
|
5
|
+
example/public/OCRTest.css
|
6
|
+
example/public/OCRTest.html
|
7
|
+
example/public/OCRTest_marker.js
|
8
|
+
example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
|
9
|
+
example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
|
10
|
+
lib/hocr.rb
|
11
|
+
lib/ocr_box.rb
|
12
|
+
lib/ocr_page.rb
|
13
|
+
lib/ocrx_word.rb
|
14
|
+
rspec/ocr_box_spec.rb
|
15
|
+
rspec/ocr_page_spec.rb
|
16
|
+
rspec/ocrx_word_spec.rb
|
17
|
+
test.rb
|
18
|
+
Manifest
|
data/README
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Ruby Library to work with OCR-Data in the HOCR-Format.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
4
|
+
|
5
|
+
Echoe.new('rhocr', '0.0.1') do |p|
|
6
|
+
p.description = "Manipulate and use OCR data encode in HOCR"
|
7
|
+
p.url = "http://github.com/daandi/rhocr"
|
8
|
+
p.author = "Andreas Neumann"
|
9
|
+
p.email = "info @nospam@ an-it.com"
|
10
|
+
p.ignore_pattern = ["tmp/*", "script/*"]
|
11
|
+
p.development_dependencies = []
|
12
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>OCR Output</title>
|
5
|
+
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
6
|
+
<meta http-equiv="content-style-type" content="text/css" />
|
7
|
+
<meta name="ocr-capabilities" content="ocr_page ocr_par ocrx_word ocr_line" />
|
8
|
+
<meta name="ocr-system" content="ABBYY fre-8.0.1.1024" />
|
9
|
+
<meta name="ocr-number-of-pages" content="1" />
|
10
|
+
</head>
|
11
|
+
<body bgcolor="#ffffff">
|
12
|
+
<div class="ocr_page" title="bbox 0 0 1709 1709;ppageno 20">
|
13
|
+
|
14
|
+
<div class="ocrx_block" title="bboxnull 111 1472 2270" style="font-size:9pt;font-family:"Arial";font-style:normal"><br>
|
15
|
+
<p class="ocr_par" style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 153 title="bbox 184 115 205 153"><span class="ocrx_word" title="bbox 184 115 205 153">8</span></span> <br></p>
|
16
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 249 title="bbox 264 216 1462 256"><span class="ocrx_word" title="bbox 264 216 333 248">Den</span> <span class="ocrx_word" title="bbox 356 216 402 248">20.</span> <span class="ocrx_word" title="bbox 426 216 620 249">Novembris</span> <span class="ocrx_word" title="bbox 643 218 698 249">bin</span> <span class="ocrx_word" title="bbox 720 218 758 256">ich</span> <span class="ocrx_word" title="bbox 781 217 917 248">widrum</span> <span class="ocrx_word" title="bbox 940 217 973 256">uf</span> <span class="ocrx_word" title="bbox 997 217 1220 256">Schaffhausen</span> <span class="ocrx_word" title="bbox 1245 226 1376 256">gezogen</span> <span class="ocrx_word" title="bbox 1399 218 1462 249">und</span></span> <br><span class="ocr_line" baseline= 293 title="bbox 187 261 1464 301"><span class="ocrx_word" title="bbox 187 262 242 299">bey</span> <span class="ocrx_word" title="bbox 262 261 405 292">Meinem</span> <span class="ocrx_word" title="bbox 426 261 546 299">Herren</span> <span class="ocrx_word" title="bbox 567 263 748 294">verblieben</span> <span class="ocrx_word" title="bbox 769 263 820 293">bis</span> <span class="ocrx_word" title="bbox 843 263 876 300">uf</span> <span class="ocrx_word" title="bbox 898 263 943 294">10.</span> <span class="ocrx_word" title="bbox 963 262 1131 300">Dezember</span> <span class="ocrx_word" title="bbox 1153 264 1239 301">1620,</span> <span class="ocrx_word" title="bbox 1260 264 1318 294">wie</span> <span class="ocrx_word" title="bbox 1337 263 1464 300">hiervor</span></span> <br><span class="ocr_line" baseline= 338 title="bbox 187 306 1464 346"><span class="ocrx_word" title="bbox 187 306 337 344">gemeldet</span> <span class="ocrx_word" title="bbox 366 306 410 344">ist,</span> <span class="ocrx_word" title="bbox 440 307 502 338">und</span> <span 
class="ocrx_word" title="bbox 533 306 588 338">bin</span> <span class="ocrx_word" title="bbox 616 307 650 346">uf</span> <span class="ocrx_word" title="bbox 679 308 737 338">den</span> <span class="ocrx_word" title="bbox 768 308 813 339">17.</span> <span class="ocrx_word" title="bbox 843 307 1011 345">Dezember</span> <span class="ocrx_word" title="bbox 1042 307 1118 338">1620</span> <span class="ocrx_word" title="bbox 1147 308 1283 339">widrum</span> <span class="ocrx_word" title="bbox 1313 308 1400 344">haim</span> <span class="ocrx_word" title="bbox 1430 307 1464 344">uf</span></span> <br><span class="ocr_line" baseline= 383 title="bbox 186 351 1462 391"><span class="ocrx_word" title="bbox 186 351 273 389">Mne</span> <span class="ocrx_word" title="bbox 303 351 441 382">kommen</span> <span class="ocrx_word" title="bbox 471 352 534 382">und</span> <span class="ocrx_word" title="bbox 565 352 652 383">allda</span> <span class="ocrx_word" title="bbox 681 352 863 383">verblieben</span> <span class="ocrx_word" title="bbox 893 352 943 382">bis</span> <span class="ocrx_word" title="bbox 973 352 1028 389">auf</span> <span class="ocrx_word" title="bbox 1058 352 1116 383">den</span> <span class="ocrx_word" title="bbox 1145 354 1172 384">9.</span> <span class="ocrx_word" title="bbox 1201 353 1373 391">Februarii</span> <span class="ocrx_word" title="bbox 1404 352 1462 383">Ao.</span></span> <br><span class="ocr_line" baseline= 427 title="bbox 187 396 1463 435"><span class="ocrx_word" title="bbox 187 396 272 435">1621,</span> <span class="ocrx_word" title="bbox 294 397 331 428">do</span> <span class="ocrx_word" title="bbox 355 396 410 427">bin</span> <span class="ocrx_word" title="bbox 433 404 491 433">gen</span> <span class="ocrx_word" title="bbox 513 396 734 435">Memmingen</span> <span class="ocrx_word" title="bbox 756 405 859 435">zogen,</span> <span class="ocrx_word" title="bbox 883 397 926 427">im</span> <span class="ocrx_word" title="bbox 950 396 1075 428">Namen</span> 
<span class="ocrx_word" title="bbox 1099 398 1223 435">Gottes,</span> <span class="ocrx_word" title="bbox 1246 398 1304 428">mit</span> <span class="ocrx_word" title="bbox 1328 396 1463 435">Maifter</span></span> <br><span class="ocr_line" baseline= 472 title="bbox 188 441 1463 480"><span class="ocrx_word" title="bbox 188 441 379 473">Jeronimus</span> <span class="ocrx_word" title="bbox 414 441 571 480">Andreae,</span> <span class="ocrx_word" title="bbox 605 442 677 473">umb</span> <span class="ocrx_word" title="bbox 714 442 806 473">einen</span> <span class="ocrx_word" title="bbox 841 441 990 479">ehrlichen</span> <span class="ocrx_word" title="bbox 1024 441 1159 480">Maifter</span> <span class="ocrx_word" title="bbox 1193 450 1229 480">zu</span> <span class="ocrx_word" title="bbox 1264 442 1369 480">fechen,</span> <span class="ocrx_word" title="bbox 1404 442 1463 472">das</span></span> <br><span class="ocr_line" baseline= 517 title="bbox 185 485 1463 525"><span class="ocrx_word" title="bbox 185 485 584 524">Tuochfchererhandtwerck</span> <span class="ocrx_word" title="bbox 607 495 642 525">zu</span> <span class="ocrx_word" title="bbox 665 487 785 524">lernen,</span> <span class="ocrx_word" title="bbox 810 486 911 517">damit</span> <span class="ocrx_word" title="bbox 936 486 974 523">ich</span> <span class="ocrx_word" title="bbox 1000 487 1072 516">weit</span> <span class="ocrx_word" title="bbox 1095 487 1150 524">hin</span> <span class="ocrx_word" title="bbox 1176 487 1237 517">und</span> <span class="ocrx_word" title="bbox 1263 487 1360 517">wider</span> <span class="ocrx_word" title="bbox 1384 486 1463 523">ohne</span></span> <br><span class="ocr_line" baseline= 561 title="bbox 185 530 1463 569"><span class="ocrx_word" title="bbox 185 530 259 562">Gelt</span> <span class="ocrx_word" title="bbox 281 530 398 568">Raifen</span> <span class="ocrx_word" title="bbox 421 530 483 560">und</span> <span class="ocrx_word" title="bbox 505 530 724 569">Fortkommen</span> 
<span class="ocrx_word" title="bbox 747 531 810 561">und</span> <span class="ocrx_word" title="bbox 835 531 898 568">also</span> <span class="ocrx_word" title="bbox 920 531 968 561">die</span> <span class="ocrx_word" title="bbox 992 530 1115 562">Länder</span> <span class="ocrx_word" title="bbox 1139 531 1217 569">ohne</span> <span class="ocrx_word" title="bbox 1240 530 1391 568">Unkosten</span> <span class="ocrx_word" title="bbox 1414 530 1463 561">be-</span></span> <br><span class="ocr_line" baseline= 606 title="bbox 184 575 1465 614"><span class="ocrx_word" title="bbox 184 575 271 613">sehen</span> <span class="ocrx_word" title="bbox 295 575 397 606">könde.</span> <span class="ocrx_word" title="bbox 430 575 500 613">Hab</span> <span class="ocrx_word" title="bbox 524 575 596 613">mich</span> <span class="ocrx_word" title="bbox 621 576 694 608">alda</span> <span class="ocrx_word" title="bbox 717 576 911 614">versprochen</span> <span class="ocrx_word" title="bbox 935 576 1006 612">nach</span> <span class="ocrx_word" title="bbox 1031 576 1239 614">gewohnheit,</span> <span class="ocrx_word" title="bbox 1261 576 1279 606">2</span> <span class="ocrx_word" title="bbox 1303 575 1388 613">Jahr</span> <span class="ocrx_word" title="bbox 1412 583 1465 613">zue</span></span> <br><span class="ocr_line" baseline= 652 title="bbox 187 619 1464 659"><span class="ocrx_word" title="bbox 187 619 321 658">Maifter</span> <span class="ocrx_word" title="bbox 346 620 449 658">Georg</span> <span class="ocrx_word" title="bbox 474 619 641 658">Schillern.</span> <span class="ocrx_word" title="bbox 686 621 756 659">Hab</span> <span class="ocrx_word" title="bbox 783 620 879 659">Jhme</span> <span class="ocrx_word" title="bbox 905 620 1097 659">versprochen</span> <span class="ocrx_word" title="bbox 1123 622 1161 652">40</span> <span class="ocrx_word" title="bbox 1185 621 1219 659">fl.</span> <span class="ocrx_word" title="bbox 1245 620 1404 659">Lehrlohn</span> <span class="ocrx_word" 
title="bbox 1429 628 1464 659">zu</span></span> <br><span class="ocr_line" baseline= 695 title="bbox 184 665 289 702"><span class="ocrx_word" title="bbox 184 665 289 702">geben.</span></span> <br></p>
|
17
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 750 title="bbox 264 718 1463 759"><span class="ocrx_word" title="bbox 264 719 349 750">Dato</span> <span class="ocrx_word" title="bbox 369 719 427 750">den</span> <span class="ocrx_word" title="bbox 448 718 493 749">10.</span> <span class="ocrx_word" title="bbox 512 719 677 758">February</span> <span class="ocrx_word" title="bbox 698 720 800 758">haben</span> <span class="ocrx_word" title="bbox 820 720 879 751">wir</span> <span class="ocrx_word" title="bbox 899 720 954 757">bey</span> <span class="ocrx_word" title="bbox 975 720 1041 750">dem</span> <span class="ocrx_word" title="bbox 1062 719 1194 759">Weißen</span> <span class="ocrx_word" title="bbox 1213 720 1302 757">Oxen</span> <span class="ocrx_word" title="bbox 1323 719 1463 758">verzehrt</span></span> <br><span class="ocr_line" baseline= 795 title="bbox 186 763 1463 803"><span class="ocrx_word" title="bbox 186 764 204 794">6</span> <span class="ocrx_word" title="bbox 228 764 262 802">fl.</span> <span class="ocrx_word" title="bbox 286 765 304 794">7</span> <span class="ocrx_word" title="bbox 330 763 400 802">batz.</span> <span class="ocrx_word" title="bbox 436 763 550 795">Daran</span> <span class="ocrx_word" title="bbox 575 764 629 802">hat</span> <span class="ocrx_word" title="bbox 655 765 708 796">der</span> <span class="ocrx_word" title="bbox 734 764 869 803">Maifter</span> <span class="ocrx_word" title="bbox 894 765 911 795">3</span> <span class="ocrx_word" title="bbox 936 764 970 802">fl.</span> <span class="ocrx_word" title="bbox 995 764 1058 802">zalt</span> <span class="ocrx_word" title="bbox 1084 765 1147 796">und</span> <span class="ocrx_word" title="bbox 1173 765 1226 802">Ich</span> <span class="ocrx_word" title="bbox 1251 765 1310 795">das</span> <span class="ocrx_word" title="bbox 1336 764 1463 
801">Uebrig.</span></span> <br><span class="ocr_line" baseline= 840 title="bbox 185 808 1463 848"><span class="ocrx_word" title="bbox 185 808 256 840">Und</span> <span class="ocrx_word" title="bbox 282 808 337 840">bin</span> <span class="ocrx_word" title="bbox 364 808 403 846">ich</span> <span class="ocrx_word" title="bbox 431 808 465 846">uf</span> <span class="ocrx_word" title="bbox 491 808 549 840">den</span> <span class="ocrx_word" title="bbox 576 809 623 841">20.</span> <span class="ocrx_word" title="bbox 649 809 816 848">February</span> <span class="ocrx_word" title="bbox 843 810 919 839">1621</span> <span class="ocrx_word" title="bbox 947 809 1002 846">bey</span> <span class="ocrx_word" title="bbox 1031 809 1127 846">Jhme</span> <span class="ocrx_word" title="bbox 1154 809 1384 848">eingestanden.</span> <span class="ocrx_word" title="bbox 1422 808 1463 846">Uf</span></span> <br><span class="ocr_line" baseline= 884 title="bbox 185 853 1462 892"><span class="ocrx_word" title="bbox 185 854 222 884">2?.</span> <span class="ocrx_word" title="bbox 246 854 306 885">dto.</span> <span class="ocrx_word" title="bbox 330 854 364 884">in</span> <span class="ocrx_word" title="bbox 388 854 440 885">der</span> <span class="ocrx_word" title="bbox 464 853 564 892">Zunft</span> <span class="ocrx_word" title="bbox 588 854 828 892">eingeschrieben</span> <span class="ocrx_word" title="bbox 851 854 987 885">worden.</span> <span class="ocrx_word" title="bbox 1020 854 1104 885">Dato</span> <span class="ocrx_word" title="bbox 1129 862 1197 892">zum</span> <span class="ocrx_word" title="bbox 1219 854 1352 892">Weihen</span> <span class="ocrx_word" title="bbox 1374 854 1462 891">Oxen</span></span> <br><span class="ocr_line" baseline= 929 title="bbox 185 897 951 937"><span class="ocrx_word" title="bbox 185 898 324 936">verzehrt</span> <span class="ocrx_word" title="bbox 345 899 363 928">6</span> <span class="ocrx_word" title="bbox 384 897 429 937">fl.,</span> <span class="ocrx_word" 
title="bbox 451 898 512 936">hab</span> <span class="ocrx_word" title="bbox 533 899 572 937">ich</span> <span class="ocrx_word" title="bbox 593 899 708 937">halben</span> <span class="ocrx_word" title="bbox 730 899 822 937">Theil</span> <span class="ocrx_word" title="bbox 846 898 951 936">geben.</span></span> <br></p>
|
18
|
+
<p class="ocr_par" leftIndent=10600 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1001 title="bbox 625 970 1016 1003"><span class="ocrx_word" title="bbox 625 971 733 1003">Gott</span> <span class="ocrx_word" title="bbox 751 970 865 1002">Gebe</span> <span class="ocrx_word" title="bbox 883 970 1016 1002">Gnad.</span></span> <br></p>
|
19
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1054 title="bbox 265 1023 1464 1062"><span class="ocrx_word" title="bbox 265 1023 454 1061">Sambstag,</span> <span class="ocrx_word" title="bbox 474 1023 532 1055">den</span> <span class="ocrx_word" title="bbox 553 1024 598 1054">18.</span> <span class="ocrx_word" title="bbox 620 1024 751 1062">Augusti</span> <span class="ocrx_word" title="bbox 773 1024 850 1054">1621</span> <span class="ocrx_word" title="bbox 871 1024 906 1062">ist</span> <span class="ocrx_word" title="bbox 927 1024 1021 1055">Mein</span> <span class="ocrx_word" title="bbox 1040 1023 1243 1061">Lehrmeister</span> <span class="ocrx_word" title="bbox 1265 1024 1344 1061">Jerg</span> <span class="ocrx_word" title="bbox 1364 1023 1464 1061">Schik-</span></span> <br><span class="ocr_line" baseline= 1099 title="bbox 187 1067 1464 1106"><span class="ocrx_word" title="bbox 187 1068 232 1099">ler</span> <span class="ocrx_word" title="bbox 260 1069 294 1099">in</span> <span class="ocrx_word" title="bbox 321 1068 398 1099">Gott</span> <span class="ocrx_word" title="bbox 425 1067 572 1106">seeliglich</span> <span class="ocrx_word" title="bbox 600 1068 792 1106">entschlafen.</span> <span class="ocrx_word" title="bbox 829 1069 906 1100">Gott</span> <span class="ocrx_word" title="bbox 933 1069 985 1100">der</span> <span class="ocrx_word" title="bbox 1013 1067 1194 1106">Allmechtig</span> <span class="ocrx_word" title="bbox 1221 1068 1341 1105">verleih</span> <span class="ocrx_word" title="bbox 1368 1068 1464 1105">Jhme</span></span> <br><span class="ocr_line" baseline= 1144 title="bbox 188 1112 1464 1152"><span class="ocrx_word" title="bbox 188 1113 255 1144">eine</span> <span class="ocrx_word" title="bbox 274 1112 417 1150">fröhliche</span> <span class="ocrx_word" title="bbox 437 1112 666 1152">Auferstehung</span> <span 
class="ocrx_word" title="bbox 687 1114 750 1144">und</span> <span class="ocrx_word" title="bbox 769 1121 832 1144">uns</span> <span class="ocrx_word" title="bbox 853 1112 949 1151">allen,</span> <span class="ocrx_word" title="bbox 968 1113 1069 1144">einem</span> <span class="ocrx_word" title="bbox 1089 1112 1192 1145">Jeden</span> <span class="ocrx_word" title="bbox 1211 1121 1246 1151">zu</span> <span class="ocrx_word" title="bbox 1265 1113 1363 1151">seiner</span> <span class="ocrx_word" title="bbox 1384 1112 1464 1151">Zeit,</span></span> <br><span class="ocr_line" baseline= 1188 title="bbox 186 1157 1463 1197"><span class="ocrx_word" title="bbox 186 1158 236 1189">ein</span> <span class="ocrx_word" title="bbox 261 1158 390 1195">Seeligs</span> <span class="ocrx_word" title="bbox 415 1158 494 1189">End.</span> <span class="ocrx_word" title="bbox 531 1158 640 1190">Amen.</span> <span class="ocrx_word" title="bbox 676 1158 725 1197">Ist</span> <span class="ocrx_word" title="bbox 750 1159 783 1189">in</span> <span class="ocrx_word" title="bbox 809 1158 913 1189">Allem</span> <span class="ocrx_word" title="bbox 938 1158 956 1187">8</span> <span class="ocrx_word" title="bbox 981 1157 1109 1195">Wochen</span> <span class="ocrx_word" title="bbox 1132 1158 1150 1188">2</span> <span class="ocrx_word" title="bbox 1174 1158 1243 1195">Tag</span> <span class="ocrx_word" title="bbox 1268 1158 1358 1188">krank</span> <span class="ocrx_word" title="bbox 1383 1157 1463 1195">gele¬</span></span> <br><span class="ocr_line" baseline= 1233 title="bbox 185 1202 1463 1241"><span class="ocrx_word" title="bbox 185 1211 253 1241">gen,</span> <span class="ocrx_word" title="bbox 278 1202 471 1240">unterdessen</span> <span class="ocrx_word" title="bbox 496 1203 556 1240">hab</span> <span class="ocrx_word" title="bbox 582 1203 621 1240">ich</span> <span class="ocrx_word" title="bbox 645 1204 694 1234">die</span> <span class="ocrx_word" title="bbox 718 1203 902 1240">Werckstatt,</span> 
<span class="ocrx_word" title="bbox 928 1202 1069 1240">Gottlob,</span> <span class="ocrx_word" title="bbox 1096 1203 1147 1233">als</span> <span class="ocrx_word" title="bbox 1173 1203 1222 1233">ein</span> <span class="ocrx_word" title="bbox 1249 1202 1351 1240">Gesell</span> <span class="ocrx_word" title="bbox 1377 1202 1463 1240">füeh-</span></span> <br><span class="ocr_line" baseline= 1278 title="bbox 185 1247 1314 1285"><span class="ocrx_word" title="bbox 185 1255 240 1278">ren</span> <span class="ocrx_word" title="bbox 264 1247 396 1285">müeffen</span> <span class="ocrx_word" title="bbox 421 1247 484 1277">und</span> <span class="ocrx_word" title="bbox 508 1248 562 1285">hat</span> <span class="ocrx_word" title="bbox 585 1249 646 1280">mir</span> <span class="ocrx_word" title="bbox 670 1247 803 1279">Gottlob</span> <span class="ocrx_word" title="bbox 826 1247 902 1277">eben</span> <span class="ocrx_word" title="bbox 927 1247 1009 1284">wohl</span> <span class="ocrx_word" title="bbox 1033 1247 1134 1278">damit</span> <span class="ocrx_word" title="bbox 1159 1247 1314 1285">gelungen</span></span> <br></p>
|
20
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1323 title="bbox 266 1291 1464 1331"><span class="ocrx_word" title="bbox 266 1291 331 1330">Auf</span> <span class="ocrx_word" title="bbox 354 1292 413 1323">den</span> <span class="ocrx_word" title="bbox 438 1291 534 1330">ersten</span> <span class="ocrx_word" title="bbox 558 1292 690 1331">Augusti</span> <span class="ocrx_word" title="bbox 715 1293 791 1323">1621</span> <span class="ocrx_word" title="bbox 814 1292 875 1329">hab</span> <span class="ocrx_word" title="bbox 899 1292 937 1330">ich</span> <span class="ocrx_word" title="bbox 960 1291 1032 1330">mich</span> <span class="ocrx_word" title="bbox 1056 1299 1090 1330">zu</span> <span class="ocrx_word" title="bbox 1114 1292 1236 1330">Herren</span> <span class="ocrx_word" title="bbox 1259 1292 1335 1324">Veit</span> <span class="ocrx_word" title="bbox 1357 1291 1464 1329">Schal-</span></span> <br><span class="ocr_line" baseline= 1367 title="bbox 187 1336 1174 1375"><span class="ocrx_word" title="bbox 187 1337 239 1368">ken</span> <span class="ocrx_word" title="bbox 265 1336 468 1375">versprochen,</span> <span class="ocrx_word" title="bbox 496 1336 554 1368">das</span> <span class="ocrx_word" title="bbox 580 1337 754 1374">Handwerk</span> <span class="ocrx_word" title="bbox 780 1337 927 1368">vollends</span> <span class="ocrx_word" title="bbox 953 1336 1174 1374">auszulernen.</span></span> <br></p>
|
21
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1412 title="bbox 265 1381 1466 1420"><span class="ocrx_word" title="bbox 265 1382 336 1413">Und</span> <span class="ocrx_word" title="bbox 356 1382 416 1419">hab</span> <span class="ocrx_word" title="bbox 435 1381 530 1419">Jhme</span> <span class="ocrx_word" title="bbox 551 1382 586 1412">18</span> <span class="ocrx_word" title="bbox 605 1383 638 1420">fl.</span> <span class="ocrx_word" title="bbox 658 1382 764 1420">geben.</span> <span class="ocrx_word" title="bbox 795 1381 879 1413">Aber</span> <span class="ocrx_word" title="bbox 907 1382 975 1412">dem</span> <span class="ocrx_word" title="bbox 1003 1382 1134 1419">vorigen</span> <span class="ocrx_word" title="bbox 1162 1381 1297 1419">Maifter</span> <span class="ocrx_word" title="bbox 1326 1381 1466 1419">(dieweil</span></span> <br><span class="ocr_line" baseline= 1457 title="bbox 187 1425 1464 1465"><span class="ocrx_word" title="bbox 187 1426 235 1457">bei</span> <span class="ocrx_word" title="bbox 258 1426 311 1457">der</span> <span class="ocrx_word" title="bbox 333 1425 533 1463">Wittfrauen</span> <span class="ocrx_word" title="bbox 556 1427 635 1464">nicht</span> <span class="ocrx_word" title="bbox 659 1427 832 1457">auslernen</span> <span class="ocrx_word" title="bbox 857 1426 989 1465">könden)</span> <span class="ocrx_word" title="bbox 1015 1434 1076 1456">nur</span> <span class="ocrx_word" title="bbox 1100 1427 1138 1456">20</span> <span class="ocrx_word" title="bbox 1161 1426 1195 1464">fl.</span> <span class="ocrx_word" title="bbox 1230 1426 1316 1458">Dato</span> <span class="ocrx_word" title="bbox 1339 1426 1400 1464">hab</span> <span class="ocrx_word" title="bbox 1424 1426 1464 1462">ich</span></span> <br><span class="ocr_line" baseline= 1502 title="bbox 185 1471 585 1510"><span class="ocrx_word" title="bbox 
185 1479 255 1509">zum</span> <span class="ocrx_word" title="bbox 276 1471 376 1509">besten</span> <span class="ocrx_word" title="bbox 395 1471 492 1509">geben</span> <span class="ocrx_word" title="bbox 512 1472 531 1502">2</span> <span class="ocrx_word" title="bbox 551 1471 585 1510">fl.</span></span> <br></p>
|
22
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1555 title="bbox 266 1524 1464 1563"><span class="ocrx_word" title="bbox 266 1525 338 1556">Adi.</span> <span class="ocrx_word" title="bbox 355 1525 447 1562">Juny</span> <span class="ocrx_word" title="bbox 465 1524 541 1555">1622</span> <span class="ocrx_word" title="bbox 560 1526 648 1563">(weil</span> <span class="ocrx_word" title="bbox 665 1534 697 1556">es</span> <span class="ocrx_word" title="bbox 714 1526 763 1556">die</span> <span class="ocrx_word" title="bbox 780 1525 978 1563">gelegenhait</span> <span class="ocrx_word" title="bbox 995 1524 1112 1562">sonsten</span> <span class="ocrx_word" title="bbox 1127 1525 1239 1563">geben)</span> <span class="ocrx_word" title="bbox 1259 1525 1315 1555">bin</span> <span class="ocrx_word" title="bbox 1331 1524 1384 1561">Ich</span> <span class="ocrx_word" title="bbox 1402 1532 1464 1554">von</span></span> <br><span class="ocr_line" baseline= 1600 title="bbox 188 1569 1464 1608"><span class="ocrx_word" title="bbox 188 1569 322 1607">Maifter</span> <span class="ocrx_word" title="bbox 346 1570 467 1607">Caspar</span> <span class="ocrx_word" title="bbox 491 1569 626 1608">Müller,</span> <span class="ocrx_word" title="bbox 652 1570 827 1608">Schleifern</span> <span class="ocrx_word" title="bbox 854 1578 889 1607">zu</span> <span class="ocrx_word" title="bbox 913 1569 1144 1607">Memmingen,</span> <span class="ocrx_word" title="bbox 1170 1577 1239 1607">zum</span> <span class="ocrx_word" title="bbox 1265 1569 1390 1607">Gfellen</span> <span class="ocrx_word" title="bbox 1416 1576 1464 1606">ge-</span></span> <br><span class="ocr_line" baseline= 1644 title="bbox 187 1613 1464 1652"><span class="ocrx_word" title="bbox 187 1614 282 1652">macht</span> <span class="ocrx_word" title="bbox 311 1614 373 1645">und</span> <span class="ocrx_word" 
title="bbox 402 1613 644 1651">aufgenommen</span> <span class="ocrx_word" title="bbox 672 1615 808 1645">worden.</span> <span class="ocrx_word" title="bbox 846 1614 964 1651">Sampt</span> <span class="ocrx_word" title="bbox 995 1613 1098 1645">Lucas</span> <span class="ocrx_word" title="bbox 1126 1613 1248 1651">Hursich</span> <span class="ocrx_word" title="bbox 1277 1613 1340 1644">und</span> <span class="ocrx_word" title="bbox 1368 1613 1464 1651">Peter</span></span> <br><span class="ocr_line" baseline= 1689 title="bbox 185 1657 1463 1697"><span class="ocrx_word" title="bbox 185 1659 346 1697">Holzwart</span> <span class="ocrx_word" title="bbox 369 1667 430 1690">von</span> <span class="ocrx_word" title="bbox 455 1658 684 1697">Memmingen.</span> <span class="ocrx_word" title="bbox 718 1660 802 1691">Dato</span> <span class="ocrx_word" title="bbox 825 1658 928 1697">haben</span> <span class="ocrx_word" title="bbox 952 1659 1010 1689">das</span> <span class="ocrx_word" title="bbox 1033 1658 1217 1696">Handwerk,</span> <span class="ocrx_word" title="bbox 1241 1658 1376 1696">Maifter</span> <span class="ocrx_word" title="bbox 1401 1657 1463 1688">und</span></span> <br><span class="ocr_line" baseline= 1734 title="bbox 187 1702 1464 1742"><span class="ocrx_word" title="bbox 187 1704 328 1742">Gesellen</span> <span class="ocrx_word" title="bbox 353 1703 492 1741">verzehrt</span> <span class="ocrx_word" title="bbox 518 1704 573 1741">bey</span> <span class="ocrx_word" title="bbox 597 1704 664 1735">dem</span> <span class="ocrx_word" title="bbox 688 1704 821 1742">Weißen</span> <span class="ocrx_word" title="bbox 846 1703 934 1741">Oxen</span> <span class="ocrx_word" title="bbox 957 1704 994 1733">34</span> <span class="ocrx_word" title="bbox 1020 1703 1064 1740">fl.,</span> <span class="ocrx_word" title="bbox 1089 1703 1147 1734">den</span> <span class="ocrx_word" title="bbox 1173 1703 1291 1740">Thaler</span> <span class="ocrx_word" title="bbox 1317 1710 1352 
1740">zu</span> <span class="ocrx_word" title="bbox 1376 1703 1394 1733">9</span> <span class="ocrx_word" title="bbox 1419 1702 1464 1740">fl.,</span></span> <br><span class="ocr_line" baseline= 1778 title="bbox 185 1747 1463 1787"><span class="ocrx_word" title="bbox 185 1749 288 1779">daran</span> <span class="ocrx_word" title="bbox 309 1748 344 1787">ist</span> <span class="ocrx_word" title="bbox 366 1749 428 1779">mir</span> <span class="ocrx_word" title="bbox 452 1747 608 1786">auferlegt</span> <span class="ocrx_word" title="bbox 631 1749 758 1779">worden</span> <span class="ocrx_word" title="bbox 781 1756 815 1787">zu</span> <span class="ocrx_word" title="bbox 839 1747 987 1786">bezahlen</span> <span class="ocrx_word" title="bbox 1010 1748 1047 1777">24</span> <span class="ocrx_word" title="bbox 1070 1747 1115 1786">fl.,</span> <span class="ocrx_word" title="bbox 1137 1748 1196 1778">das</span> <span class="ocrx_word" title="bbox 1219 1747 1337 1785">Uebrig</span> <span class="ocrx_word" title="bbox 1360 1747 1463 1785">haben</span></span> <br><span class="ocr_line" baseline= 1824 title="bbox 187 1793 621 1831"><span class="ocrx_word" title="bbox 187 1794 235 1825">die</span> <span class="ocrx_word" title="bbox 255 1793 375 1824">andern</span> <span class="ocrx_word" title="bbox 395 1793 492 1830">Zwen</span> <span class="ocrx_word" title="bbox 511 1793 621 1831">bezalt.</span></span> <br></p>
|
23
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1868 title="bbox 265 1836 1465 1876"><span class="ocrx_word" title="bbox 265 1838 467 1876">Morndrigs,</span> <span class="ocrx_word" title="bbox 490 1838 549 1869">den</span> <span class="ocrx_word" title="bbox 603 1855 609 1861">•</span> <span class="ocrx_word" title="bbox 631 1838 723 1876">Juny</span> <span class="ocrx_word" title="bbox 748 1837 823 1868">1622</span> <span class="ocrx_word" title="bbox 847 1837 949 1876">haben</span> <span class="ocrx_word" title="bbox 980 1837 1027 1868">die</span> <span class="ocrx_word" title="bbox 1059 1836 1194 1875">Maister</span> <span class="ocrx_word" title="bbox 1226 1836 1290 1868">und</span> <span class="ocrx_word" title="bbox 1322 1836 1465 1874">Gesellen</span></span> <br><span class="ocr_line" baseline= 1913 title="bbox 189 1882 1464 1921"><span class="ocrx_word" title="bbox 189 1883 323 1914">widrum</span> <span class="ocrx_word" title="bbox 353 1883 403 1913">ein</span> <span class="ocrx_word" title="bbox 432 1883 531 1914">trunck</span> <span class="ocrx_word" title="bbox 560 1884 683 1921">gethon.</span> <span class="ocrx_word" title="bbox 723 1883 793 1921">Hab</span> <span class="ocrx_word" title="bbox 824 1883 862 1920">ich</span> <span class="ocrx_word" title="bbox 892 1882 1001 1920">zahlen</span> <span class="ocrx_word" title="bbox 1031 1882 1163 1921">müessen</span> <span class="ocrx_word" title="bbox 1192 1883 1210 1913">5</span> <span class="ocrx_word" title="bbox 1240 1882 1273 1920">fl.</span> <span class="ocrx_word" title="bbox 1303 1882 1366 1912">und</span> <span class="ocrx_word" title="bbox 1397 1882 1464 1912">dem</span></span> <br><span class="ocr_line" baseline= 1958 title="bbox 188 1927 869 1966"><span class="ocrx_word" title="bbox 188 1927 387 1966">Zunftknecht</span> <span class="ocrx_word" 
title="bbox 407 1928 422 1957">1</span> <span class="ocrx_word" title="bbox 441 1928 514 1966">maß</span> <span class="ocrx_word" title="bbox 532 1927 627 1959">Wein</span> <span class="ocrx_word" title="bbox 646 1936 701 1965">pro</span> <span class="ocrx_word" title="bbox 720 1929 738 1958">9</span> <span class="ocrx_word" title="bbox 758 1927 869 1965">batzen.</span></span> <br></p>
|
24
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1104 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2011 title="bbox 268 1979 1465 2019"><span class="ocrx_word" title="bbox 268 1981 402 2019">Maister</span> <span class="ocrx_word" title="bbox 428 1980 549 2018">Caspar</span> <span class="ocrx_word" title="bbox 575 1980 708 2019">Müller,</span> <span class="ocrx_word" title="bbox 734 1980 886 2018">Schleifer</span> <span class="ocrx_word" title="bbox 912 1981 974 2011">und</span> <span class="ocrx_word" title="bbox 1001 1979 1127 2017">Bürger</span> <span class="ocrx_word" title="bbox 1155 1988 1207 2018">zue</span> <span class="ocrx_word" title="bbox 1234 1979 1465 2017">Memmingen,</span></span> <br><span class="ocr_line" baseline= 2057 title="bbox 187 2025 528 2064"><span class="ocrx_word" title="bbox 187 2027 246 2064">gab</span> <span class="ocrx_word" title="bbox 268 2027 329 2058">mir</span> <span class="ocrx_word" title="bbox 351 2025 411 2064">dise</span> <span class="ocrx_word" title="bbox 433 2025 528 2063">Lehr:</span></span> <br></p>
|
25
|
+
<p class="ocr_par" leftIndent=2900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2110 title="bbox 305 2078 1346 2118"><span class="ocrx_word" title="bbox 305 2079 415 2118">Fürcht</span> <span class="ocrx_word" title="bbox 435 2080 498 2110">und</span> <span class="ocrx_word" title="bbox 520 2080 582 2111">lieb</span> <span class="ocrx_word" title="bbox 603 2080 690 2118">Gott,</span> <span class="ocrx_word" title="bbox 712 2079 777 2110">und</span> <span class="ocrx_word" title="bbox 795 2079 919 2111">Deinen</span> <span class="ocrx_word" title="bbox 943 2078 1065 2116">negsten</span> <span class="ocrx_word" title="bbox 1089 2079 1140 2110">als</span> <span class="ocrx_word" title="bbox 1162 2079 1233 2116">Dich</span> <span class="ocrx_word" title="bbox 1255 2078 1346 2116">selbs,</span></span> <br><span class="ocr_line" baseline= 2154 title="bbox 305 2123 1107 2162"><span class="ocrx_word" title="bbox 305 2125 334 2162">so</span> <span class="ocrx_word" title="bbox 359 2124 474 2162">würstu</span> <span class="ocrx_word" title="bbox 497 2132 559 2155">von</span> <span class="ocrx_word" title="bbox 582 2124 678 2157">Allen</span> <span class="ocrx_word" title="bbox 702 2124 824 2161">bayden</span> <span class="ocrx_word" title="bbox 847 2123 952 2161">geehrt</span> <span class="ocrx_word" title="bbox 975 2123 1107 2155">werden.</span></span> <br></p>
|
26
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1128 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2208 title="bbox 266 2176 1462 2217"><span class="ocrx_word" title="bbox 266 2178 400 2216">Maister</span> <span class="ocrx_word" title="bbox 419 2177 545 2209">Andres</span> <span class="ocrx_word" title="bbox 563 2178 695 2217">Hursich,</span> <span class="ocrx_word" title="bbox 713 2177 923 2216">Tuochscherer</span> <span class="ocrx_word" title="bbox 941 2185 976 2214">zu</span> <span class="ocrx_word" title="bbox 994 2176 1224 2215">Memmingen,</span> <span class="ocrx_word" title="bbox 1244 2176 1304 2214">gab</span> <span class="ocrx_word" title="bbox 1323 2177 1384 2208">mir</span> <span class="ocrx_word" title="bbox 1402 2176 1462 2214">dise</span></span> <br><span class="ocr_line" baseline= 2254 title="bbox 186 2223 280 2263"><span class="ocrx_word" title="bbox 186 2223 280 2263">Lehr:</span></span> <br></p></div></div>
|
27
|
+
</body>
|
28
|
+
</html>
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'sinatra'
|
3
|
+
|
4
|
+
require_relative '../lib/ocr_page'
|
5
|
+
|
6
|
+
# Landing page: returns a single link to the static OCR marking demo page.
get '/' do
  "<a href='OCRTest.html'>OCRTest</a>"
end
|
9
|
+
|
10
|
+
# Returns the OCR words of a page that lie completely inside a rectangle.
#
# Query parameters:
#   x1, y1 - one corner of the rectangle
#   x2, y2 - the opposite corner of the rectangle
#   page   - name of the hOCR file to search
#
# Responds with the matching words joined by "<br/>", or with the text
# "Not enough parameters" when any of the five parameters is missing.
get '/mark' do
  x1 = params[:x1]
  y1 = params[:y1]
  x2 = params[:x2]
  y2 = params[:y2]
  page = params[:page]

  if x1 && y1 && x2 && y2 && page
    get_enclosed_words(x1, y1, x2, y2, page).join("<br/>")
  else
    "Not enough parameters"
  end
end
|
25
|
+
|
26
|
+
# Loads an hOCR page and returns the words fully enclosed by a rectangle.
#
# x1, y1, x2, y2 - rectangle coordinates; converted with #to_i, so numeric
#                  strings taken straight from the request are accepted.
# page           - file name of the hOCR page inside the data directory.
#
# Returns whatever OCRPage#enclosed_words returns for the rectangle.
def get_enclosed_words(x1, y1, x2, y2, page)
  # `page` comes from the request. Keeping only its base name prevents a
  # value such as "../../etc/passwd" from escaping the data directory.
  page_name = File.basename(page.to_s)

  # Resolve the data directory relative to this file instead of the current
  # working directory, so the server can be started from anywhere.
  data_dir = File.expand_path('../data', File.dirname(__FILE__))

  selection = OCRBox.new(x1.to_i, y1.to_i, x2.to_i, y2.to_i)
  OCRPage.new(File.join(data_dir, page_name)).enclosed_words(selection)
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
body {
|
2
|
+
font: 16px Helvetica, Arial;
|
3
|
+
margin:0px;
|
4
|
+
padding:0px;
|
5
|
+
}
|
6
|
+
|
7
|
+
.marked {
|
8
|
+
color:#FFF;
|
9
|
+
position: absolute;
|
10
|
+
background-color:purple;
|
11
|
+
opacity: 0.4;
|
12
|
+
}
|
13
|
+
|
14
|
+
#ocr_image {
|
15
|
+
background-image: url(img/Seite_Tagebuch_H_C_Lang_08.jpg);
|
16
|
+
width:1600px;
|
17
|
+
height:2495px;
|
18
|
+
}
|
19
|
+
|
20
|
+
#marked_words {
|
21
|
+
font-family:monospace;
|
22
|
+
border:0.2em solid #333;
|
23
|
+
}
|
24
|
+
|
25
|
+
.selected_words {
|
26
|
+
background-color:#EDEDED;
|
27
|
+
margin:0.2em;
|
28
|
+
padding:0.2em;
|
29
|
+
}
|
30
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html lang="de">
|
3
|
+
<!--
|
4
|
+
|
5
|
+
Created using http://jsbin.com/
|
6
|
+
Source can be edited via http://jsbin.com/azare/edit
|
7
|
+
|
8
|
+
-->
|
9
|
+
<head>
|
10
|
+
<meta charset="utf-8" />
|
11
|
+
<title>OCRTest</title>
|
12
|
+
<link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/themes/base/jquery-ui.css" type="text/css" />
|
13
|
+
<link rel="stylesheet" href="OCRTest.css" type="text/css" />
|
14
|
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js"></script>
|
15
|
+
<script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/jquery-ui.min.js"></script>
|
16
|
+
<script language="javascript" type="text/javascript" src="OCRTest_marker.js"></script>
|
17
|
+
</head>
|
18
|
+
<body>
|
19
|
+
<header>
|
20
|
+
<h1>Mark words test</h1>
|
21
|
+
</header>
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
<div id="ocr_image">
|
26
|
+
</div>
|
27
|
+
|
28
|
+
<div id="marked_words">
|
29
|
+
</div>
|
30
|
+
|
31
|
+
<script type="text/javascript">
|
32
|
+
$(document).ready(function () {
    var image = $("#ocr_image");

    // Store the page offset of the image in the shared globals;
    // draw_rectangle() reads them when it positions a marker.
    function remember_image_offset() {
        var offset = image.offset();
        ocr_image_left = offset.left;
        ocr_image_top = offset.top;
    }

    // Initialise the position once the document is ready.
    remember_image_offset();

    // Start of a selection: pointer position relative to the image.
    image.mousedown(function (e) {
        // Re-read the offset so a layout change since page load
        // cannot shift the selection.
        remember_image_offset();
        startX = e.pageX - ocr_image_left;
        startY = e.pageY - ocr_image_top;
    });

    // End of a selection: record the second corner and draw the marker.
    image.mouseup(function (e) {
        endX = e.pageX - ocr_image_left;
        endY = e.pageY - ocr_image_top;
        draw_rectangle();
    });
});
|
51
|
+
|
52
|
+
</script>
|
53
|
+
</body>
|
54
|
+
</html>
|
@@ -0,0 +1,83 @@
|
|
1
|
+
// Corners of the current selection, relative to the #ocr_image element.
// Read by draw_rectangle() and get_marked_words().
var startX;
var startY;
var endX;
var endY;

// Offset that draw_rectangle() adds to the selection coordinates before
// positioning a marker.
var ocr_image_left;
var ocr_image_top;
|
8
|
+
|
9
|
+
|
10
|
+
// Draws a marker over the current selection (startX/startY to endX/endY)
// and then requests the words inside it from the server.
function draw_rectangle() {
    // Normalise the two corners so the selection may be dragged in any
    // direction: left/top is the smaller coordinate, width/height the
    // distance between the corners.
    var left = Math.min(startX, endX);
    var width = Math.abs(endX - startX);
    var top = Math.min(startY, endY);
    var height = Math.abs(endY - startY);

    // The selection is stored relative to the image; add the stored image
    // offset before using the values as the marker's top/left.
    top += ocr_image_top;
    left += ocr_image_left;

    var marker = $("<span></span>")
        .addClass('marked')
        .css('top', top)
        .css('left', left)
        .height(height)
        .width(width);

    // Show the raw selection coordinates inside the marker.
    marker.append(startX + " " + startY + " " + endX + " " + endY);
    $("#ocr_image").append(marker);

    get_marked_words();
}
|
46
|
+
|
47
|
+
|
48
|
+
// Asks the server for the words enclosed by the current selection and
// passes the response to add_marked_words().
function get_marked_words() {
    // Normalise the corners so that (x1, y1) is the top-left and (x2, y2)
    // the bottom-right corner, whatever the drag direction. The previous
    // code assigned to a misspelled variable on upward drags, which left
    // y2 undefined in the request.
    var x1 = Math.min(startX, endX);
    var x2 = Math.max(startX, endX);
    var y1 = Math.min(startY, endY);
    var y2 = Math.max(startY, endY);

    $.ajax({
        url: 'http://localhost:4567/mark',
        // Passed as an object so jQuery builds and encodes the query string.
        data: {
            x1: x1,
            y1: y1,
            x2: x2,
            y2: y2,
            page: 'Seite_Tagebuch_H_C_Lang_08.html'
        },
        success: add_marked_words
    });
}
|
78
|
+
|
79
|
+
// Success callback for the /mark request: wraps the response in a
// 'selected_words' box and appends it to the #marked_words container.
// `data` is inserted as HTML, because the server separates words with <br/>.
function add_marked_words(data) {
    var box = $("<div></div>").addClass('selected_words').append(data);
    $("#marked_words").append(box);
}
|
Binary file
|
Binary file
|
data/lib/hocr.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
#coding: utf-8
|
data/lib/ocr_box.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
# An axis-aligned rectangle described by its top-left corner (x1, y1) and
# its bottom-right corner (x2, y2).
class OCRBox
  attr_reader :x1, :y1, :x2, :y2

  # x1, y1 - coordinates of the top-left corner
  # x2, y2 - coordinates of the bottom-right corner
  #
  # The values are stored as given; no conversion is applied here.
  def initialize(x1, y1, x2, y2)
    @x1 = x1
    @y1 = y1
    @x2 = x2
    @y2 = y2
  end

  # Returns true when `element` lies completely inside this box. Touching
  # edges count as enclosed, so a box encloses itself.
  #
  # element - any object responding to x1, y1, x2 and y2
  def encloses?(element)
    @x1 <= element.x1 &&
      @x2 >= element.x2 &&
      @y1 <= element.y1 &&
      @y2 >= element.y2
  end

  # Returns true when this box lies completely inside `element`.
  #
  # element - any object responding to encloses?
  def enclosed_by?(element)
    element.encloses?(self)
  end

  # Human-readable form: top-left corner followed by bottom-right corner.
  def to_s
    "tl->(x:#{@x1} y:#{@y1})/br->:(x:#{@x2} y:#{@y2})"
  end

  # CSS declarations that absolutely position an element over this box.
  def to_css_style
    top = @y1
    left = @x1
    height = @y2 - @y1
    width = @x2 - @x1

    "position:absolute; top:#{top}px; left:#{left}px; height:#{height}px; width:#{width}px;"
  end

  # JSON object holding the four coordinates, e.g.
  # {"x1":1,"y1":2,"x2":20,"y2":8}. Unset coordinates are written as null.
  # Coordinates are expected to be numeric; they are written unquoted.
  #
  # Extra arguments are accepted and ignored so that JSON generators, which
  # pass a state object, can call this method.
  def to_json(*_args)
    pairs = [[:x1, @x1], [:y1, @y1], [:x2, @x2], [:y2, @y2]].map do |key, value|
      %("#{key}":#{value.nil? ? 'null' : value})
    end
    "{#{pairs.join(',')}}"
  end
end
|
43
|
+
|
data/lib/ocr_page.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require_relative "ocrx_word"
|
3
|
+
|
4
|
+
# A page of hOCR output, parsed into lines of OCRXWord objects.
#
# NOTE(review): the class inherits from OCRBox but never calls super, so
# the page's own x1/y1/x2/y2 stay nil. Confirm whether a page bounding box
# was intended.
class OCRPage < OCRBox
  # Marks the start of a text line in the hOCR markup.
  LINE_START = /<span class="ocr_line"/
  # A complete word span, including its text.
  WORD_SPAN = /<span class="ocrx_word"[^>]+>[^<]+<\/span>/
  # Bounding-box coordinates and text of a word span.
  WORD_PARTS = /title="bbox (\d+) (\d+) (\d+) (\d+)">([^<]+)</
  # Bounding-box coordinates of any element with a bbox title.
  BBOX_TITLE = /title="bbox (\d+) (\d+) (\d+) (\d+)">/

  # Array of lines; each line is a non-empty Array of OCRXWord.
  attr_reader :lines

  # filename - path of the hOCR file to read
  def initialize(filename)
    @lines = hocr_lines(file_as_string(filename)).reject { |line| line.empty? }
  end

  # Splits hOCR markup at every line start and extracts the words of each
  # chunk.
  #
  # hocr_contents - the hOCR markup as a String
  #
  # Returns an Array with one Array of OCRXWord per chunk. Chunks without
  # words (such as the markup before the first line) yield an empty Array.
  def hocr_lines(hocr_contents)
    hocr_contents.split(LINE_START).map do |line_markup|
      line_markup.scan(WORD_SPAN).map { |span| word_from(span) }.compact
    end
  end

  # All words of the page in reading order (memoized).
  def words
    @words ||= @lines.flatten
  end

  # Returns the words that lie completely inside `box`.
  #
  # box - an object responding to encloses?, e.g. an OCRBox
  def enclosed_words(box)
    words.select { |word| word.enclosed_by?(box) }
  end

  # Extracts the bounding box from an hOCR element.
  #
  # element - markup String containing title="bbox x1 y1 x2 y2">
  #
  # Returns the four coordinates as Strings, or four nils when the element
  # has no such title.
  def get_position(element)
    match = BBOX_TITLE.match(element)
    match ? match.captures : [nil, nil, nil, nil]
  end

  # Reads the whole file. The contents are tagged as UTF-8 so the result
  # does not depend on the process locale.
  def file_as_string(filename)
    File.open(filename, 'r:UTF-8') { |file| file.read }
  end

  private

  # Builds an OCRXWord from one word span. Returns nil when the span's
  # title does not have the expected form (for example when it carries
  # additional properties), so no word with a made-up 0,0,0,0 box and nil
  # text is created.
  def word_from(span)
    match = WORD_PARTS.match(span)
    OCRXWord.new(*match.captures) if match
  end
end
|
data/lib/ocrx_word.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require_relative 'ocr_box'
|
5
|
+
|
6
|
+
# A single recognised word: a bounding box plus the recognised text.
class OCRXWord < OCRBox
  attr_reader :text

  # x1, y1, x2, y2 - bounding box coordinates; converted with #to_i, so the
  #                  numeric strings captured from hOCR markup are accepted
  # word           - the recognised text
  def initialize(x1, y1, x2, y2, word)
    super(x1.to_i, y1.to_i, x2.to_i, y2.to_i)
    @text = word
  end

  # Renders the word as an absolutely positioned <span> that can be laid
  # over the page image.
  #
  # css_class - class attribute of the outer span (default: 'ocrx_word')
  #
  # The text and the class name are HTML-escaped; a nil text renders as an
  # empty word.
  def to_html(css_class = 'ocrx_word')
    "<span style='#{to_css_style}' class='#{CGI.escapeHTML(css_class.to_s)}'>" \
      "<span class='word'>#{CGI.escapeHTML(@text.to_s)}</span></span>"
  end

  # The text, a tab, then the box description from OCRBox#to_s.
  def to_s
    "#{@text}\t#{super}"
  end
end
|
data/rhocr.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for rhocr 0.0.1.
Gem::Specification.new do |s|
  s.name = 'rhocr'
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Andreas Neumann"]
  s.date = '2011-07-01'
  s.description = 'Manipulate and use OCR data encode in HOCR'
  s.email = 'info @nospam@ an-it.com'
  s.extra_rdoc_files = ["README", "lib/hocr.rb", "lib/ocr_box.rb", "lib/ocr_page.rb", "lib/ocrx_word.rb"]
  s.files = [
    "README",
    "Rakefile",
    "data/Seite_Tagebuch_H_C_Lang_08.html",
    "example/example_server.rb",
    "example/public/OCRTest.css",
    "example/public/OCRTest.html",
    "example/public/OCRTest_marker.js",
    "example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg",
    "example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg",
    "lib/hocr.rb",
    "lib/ocr_box.rb",
    "lib/ocr_page.rb",
    "lib/ocrx_word.rb",
    "rspec/ocr_box_spec.rb",
    "rspec/ocr_page_spec.rb",
    "rspec/ocrx_word_spec.rb",
    "test.rb",
    "Manifest",
    "rhocr.gemspec"
  ]
  s.homepage = 'http://github.com/daandi/rhocr'
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rhocr", "--main", "README"]
  s.require_paths = ["lib"]
  # Only set where the installed RubyGems still provides the writer.
  s.rubyforge_project = 'rhocr' if s.respond_to? :rubyforge_project=
  s.rubygems_version = '1.6.2'
  s.summary = 'Manipulate and use OCR data encode in HOCR'

  # The gem declares no dependencies, so only the specification version is
  # set here.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require_relative '../lib/ocr_box'
|
4
|
+
|
5
|
+
# Behaviour of OCRBox: string form, enclosure tests and CSS output.
describe OCRBox do

  # A fresh box is built for every example.
  before(:each) do
    @box = OCRBox.new(1, 2, 20, 8)
  end

  describe "#to_s" do
    it "prints a human readable Box-Version with coordinates upper_left(x,y) bottom_right(x,y)" do
      @box.to_s.should == "tl->(x:1 y:2)/br->:(x:20 y:8)"
    end
  end

  describe '#encloses?(element)' do
    it "tests whether given OCRBox is enclosed by the current OCRBox" do
      @box.encloses?( OCRBox.new(0,3,19,7) ).should be_false
      @box.encloses?( OCRBox.new(2,3,19,7) ).should be_true
    end

    it "encloses also itself" do
      @box.encloses?( @box ).should be_true
    end
  end

  describe '#to_css_style' do
    it 'should create css-style attributes' do
      @box.to_css_style.should == 'position:absolute; top:2px; left:1px; height:6px; width:19px;'
    end
  end

  describe '#enclosed_by?(element)' do
    it 'should be enclosed by Boxes bigger than itself' do
      @box.enclosed_by?( OCRBox.new(0,1,21,9) ).should be_true
    end

    it 'should not be enclosed by Boxes smaller than itself' do
      @box.enclosed_by?( OCRBox.new(2,3,19,7) ).should be_false
    end

    it 'should be enclosed by Boxes of the same size' do
      @box.enclosed_by?( @box ).should be_true
    end
  end

end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require_relative '../lib/ocr_page'
|
4
|
+
|
5
|
+
# Behaviour of OCRPage against the sample hOCR page shipped with the gem.
describe OCRPage do

  before(:each) do
    # Resolve the sample page relative to this spec file so the spec can be
    # run from any working directory.
    sample = File.expand_path('../../data/Seite_Tagebuch_H_C_Lang_08.html', __FILE__)
    @ocr_page = OCRPage.new(sample)
  end

  describe '#enclosed_words(box)' do
    it 'should return only words that lie inside the given box' do
      box = OCRBox.new(500,1703,1200,1800)

      @ocr_page.enclosed_words( box ).each do |word|
        word.should be_enclosed_by(box)
      end
    end
  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require_relative '../lib/ocrx_word'
|
4
|
+
|
5
|
+
# Behaviour of OCRXWord: string form and HTML overlay output.
describe OCRXWord do

  before(:each) do
    @ocrx_word = OCRXWord.new(10,15,20,20,'WORT')
  end

  describe '#to_s' do
    it 'should print the coordinates of the box and the textual information' do
      @ocrx_word.to_s.should == "WORT\ttl->(x:10 y:15)/br->:(x:20 y:20)"
    end
  end

  describe '#to_html(css_class)' do
    it 'should create a span element to overlay an image on an html-page' do
      @ocrx_word.to_html.should == "<span style='position:absolute; top:15px; left:10px; height:5px; width:10px;' class='ocrx_word'><span class='word'>WORT</span></span>"
    end

    it 'no css_class given should default to ocrx_word' do
      @ocrx_word.to_html.should =~ /class='ocrx_word'/
    end

    it 'css_class given should be part of generated html' do
      @ocrx_word.to_html('rosebud').should =~ /class='rosebud'/
    end
  end

end
|
data/test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rhocr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andreas Neumann
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-07-01 00:00:00 +02:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Manipulate and use OCR data encode in HOCR
|
18
|
+
email: info @nospam@ an-it.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files:
|
24
|
+
- README
|
25
|
+
- lib/hocr.rb
|
26
|
+
- lib/ocr_box.rb
|
27
|
+
- lib/ocr_page.rb
|
28
|
+
- lib/ocrx_word.rb
|
29
|
+
files:
|
30
|
+
- README
|
31
|
+
- Rakefile
|
32
|
+
- data/Seite_Tagebuch_H_C_Lang_08.html
|
33
|
+
- example/example_server.rb
|
34
|
+
- example/public/OCRTest.css
|
35
|
+
- example/public/OCRTest.html
|
36
|
+
- example/public/OCRTest_marker.js
|
37
|
+
- example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
|
38
|
+
- example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
|
39
|
+
- lib/hocr.rb
|
40
|
+
- lib/ocr_box.rb
|
41
|
+
- lib/ocr_page.rb
|
42
|
+
- lib/ocrx_word.rb
|
43
|
+
- rspec/ocr_box_spec.rb
|
44
|
+
- rspec/ocr_page_spec.rb
|
45
|
+
- rspec/ocrx_word_spec.rb
|
46
|
+
- test.rb
|
47
|
+
- Manifest
|
48
|
+
- rhocr.gemspec
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: http://github.com/daandi/rhocr
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options:
|
55
|
+
- --line-numbers
|
56
|
+
- --inline-source
|
57
|
+
- --title
|
58
|
+
- Rhocr
|
59
|
+
- --main
|
60
|
+
- README
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: "0"
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: "1.2"
|
75
|
+
requirements: []
|
76
|
+
|
77
|
+
rubyforge_project: rhocr
|
78
|
+
rubygems_version: 1.6.2
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Manipulate and use OCR data encode in HOCR
|
82
|
+
test_files: []
|
83
|
+
|