rhocr 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest ADDED
@@ -0,0 +1,18 @@
1
+ README
2
+ Rakefile
3
+ data/Seite_Tagebuch_H_C_Lang_08.html
4
+ example/example_server.rb
5
+ example/public/OCRTest.css
6
+ example/public/OCRTest.html
7
+ example/public/OCRTest_marker.js
8
+ example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
9
+ example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
10
+ lib/hocr.rb
11
+ lib/ocr_box.rb
12
+ lib/ocr_page.rb
13
+ lib/ocrx_word.rb
14
+ rspec/ocr_box_spec.rb
15
+ rspec/ocr_page_spec.rb
16
+ rspec/ocrx_word_spec.rb
17
+ test.rb
18
+ Manifest
data/README ADDED
@@ -0,0 +1 @@
1
+ Ruby library for working with OCR data in the hOCR format.
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'echoe'
4
+
5
# Gem packaging: declares the rhocr gem (version 0.0.1) and its metadata
# through Echoe. The block parameter is deliberately not called `p`, so it
# does not shadow Kernel#p inside the block.
Echoe.new('rhocr', '0.0.1') do |gem_config|
  gem_config.description = 'Manipulate and use OCR data encode in HOCR'
  gem_config.url = 'http://github.com/daandi/rhocr'
  gem_config.author = 'Andreas Neumann'
  gem_config.email = 'info @nospam@ an-it.com'
  # Glob patterns handed to Echoe's ignore_pattern setting.
  gem_config.ignore_pattern = ['tmp/*', 'script/*']
  # No development dependencies are declared.
  gem_config.development_dependencies = []
end
@@ -0,0 +1,28 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>OCR Output</title>
5
+ <meta http-equiv="content-type" content="text/html; charset=utf-8" />
6
+ <meta http-equiv="content-style-type" content="text/css" />
7
+ <meta name="ocr-capabilities" content="ocr_page ocr_par ocrx_word ocr_line" />
8
+ <meta name="ocr-system" content="ABBYY fre-8.0.1.1024" />
9
+ <meta name="ocr-number-of-pages" content="1" />
10
+ </head>
11
+ <body bgcolor="#ffffff">
12
+ <div class="ocr_page" title="bbox 0 0 1709 1709;ppageno 20">
13
+
14
+ <div class="ocrx_block" title="bboxnull 111 1472 2270" style="font-size:9pt;font-family:"Arial";font-style:normal"><br>
15
+ <p class="ocr_par" style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 153 title="bbox 184 115 205 153"><span class="ocrx_word" title="bbox 184 115 205 153">8</span></span> <br></p>
16
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 249 title="bbox 264 216 1462 256"><span class="ocrx_word" title="bbox 264 216 333 248">Den</span> <span class="ocrx_word" title="bbox 356 216 402 248">20.</span> <span class="ocrx_word" title="bbox 426 216 620 249">Novembris</span> <span class="ocrx_word" title="bbox 643 218 698 249">bin</span> <span class="ocrx_word" title="bbox 720 218 758 256">ich</span> <span class="ocrx_word" title="bbox 781 217 917 248">widrum</span> <span class="ocrx_word" title="bbox 940 217 973 256">uf</span> <span class="ocrx_word" title="bbox 997 217 1220 256">Schaffhausen</span> <span class="ocrx_word" title="bbox 1245 226 1376 256">gezogen</span> <span class="ocrx_word" title="bbox 1399 218 1462 249">und</span></span> <br><span class="ocr_line" baseline= 293 title="bbox 187 261 1464 301"><span class="ocrx_word" title="bbox 187 262 242 299">bey</span> <span class="ocrx_word" title="bbox 262 261 405 292">Meinem</span> <span class="ocrx_word" title="bbox 426 261 546 299">Herren</span> <span class="ocrx_word" title="bbox 567 263 748 294">verblieben</span> <span class="ocrx_word" title="bbox 769 263 820 293">bis</span> <span class="ocrx_word" title="bbox 843 263 876 300">uf</span> <span class="ocrx_word" title="bbox 898 263 943 294">10.</span> <span class="ocrx_word" title="bbox 963 262 1131 300">Dezember</span> <span class="ocrx_word" title="bbox 1153 264 1239 301">1620,</span> <span class="ocrx_word" title="bbox 1260 264 1318 294">wie</span> <span class="ocrx_word" title="bbox 1337 263 1464 300">hiervor</span></span> <br><span class="ocr_line" baseline= 338 title="bbox 187 306 1464 346"><span class="ocrx_word" title="bbox 187 306 337 344">gemeldet</span> <span class="ocrx_word" title="bbox 366 306 410 344">ist,</span> <span class="ocrx_word" title="bbox 440 307 502 338">und</span> <span 
class="ocrx_word" title="bbox 533 306 588 338">bin</span> <span class="ocrx_word" title="bbox 616 307 650 346">uf</span> <span class="ocrx_word" title="bbox 679 308 737 338">den</span> <span class="ocrx_word" title="bbox 768 308 813 339">17.</span> <span class="ocrx_word" title="bbox 843 307 1011 345">Dezember</span> <span class="ocrx_word" title="bbox 1042 307 1118 338">1620</span> <span class="ocrx_word" title="bbox 1147 308 1283 339">widrum</span> <span class="ocrx_word" title="bbox 1313 308 1400 344">haim</span> <span class="ocrx_word" title="bbox 1430 307 1464 344">uf</span></span> <br><span class="ocr_line" baseline= 383 title="bbox 186 351 1462 391"><span class="ocrx_word" title="bbox 186 351 273 389">Mne</span> <span class="ocrx_word" title="bbox 303 351 441 382">kommen</span> <span class="ocrx_word" title="bbox 471 352 534 382">und</span> <span class="ocrx_word" title="bbox 565 352 652 383">allda</span> <span class="ocrx_word" title="bbox 681 352 863 383">verblieben</span> <span class="ocrx_word" title="bbox 893 352 943 382">bis</span> <span class="ocrx_word" title="bbox 973 352 1028 389">auf</span> <span class="ocrx_word" title="bbox 1058 352 1116 383">den</span> <span class="ocrx_word" title="bbox 1145 354 1172 384">9.</span> <span class="ocrx_word" title="bbox 1201 353 1373 391">Februarii</span> <span class="ocrx_word" title="bbox 1404 352 1462 383">Ao.</span></span> <br><span class="ocr_line" baseline= 427 title="bbox 187 396 1463 435"><span class="ocrx_word" title="bbox 187 396 272 435">1621,</span> <span class="ocrx_word" title="bbox 294 397 331 428">do</span> <span class="ocrx_word" title="bbox 355 396 410 427">bin</span> <span class="ocrx_word" title="bbox 433 404 491 433">gen</span> <span class="ocrx_word" title="bbox 513 396 734 435">Memmingen</span> <span class="ocrx_word" title="bbox 756 405 859 435">zogen,</span> <span class="ocrx_word" title="bbox 883 397 926 427">im</span> <span class="ocrx_word" title="bbox 950 396 1075 428">Namen</span> 
<span class="ocrx_word" title="bbox 1099 398 1223 435">Gottes,</span> <span class="ocrx_word" title="bbox 1246 398 1304 428">mit</span> <span class="ocrx_word" title="bbox 1328 396 1463 435">Maifter</span></span> <br><span class="ocr_line" baseline= 472 title="bbox 188 441 1463 480"><span class="ocrx_word" title="bbox 188 441 379 473">Jeronimus</span> <span class="ocrx_word" title="bbox 414 441 571 480">Andreae,</span> <span class="ocrx_word" title="bbox 605 442 677 473">umb</span> <span class="ocrx_word" title="bbox 714 442 806 473">einen</span> <span class="ocrx_word" title="bbox 841 441 990 479">ehrlichen</span> <span class="ocrx_word" title="bbox 1024 441 1159 480">Maifter</span> <span class="ocrx_word" title="bbox 1193 450 1229 480">zu</span> <span class="ocrx_word" title="bbox 1264 442 1369 480">fechen,</span> <span class="ocrx_word" title="bbox 1404 442 1463 472">das</span></span> <br><span class="ocr_line" baseline= 517 title="bbox 185 485 1463 525"><span class="ocrx_word" title="bbox 185 485 584 524">Tuochfchererhandtwerck</span> <span class="ocrx_word" title="bbox 607 495 642 525">zu</span> <span class="ocrx_word" title="bbox 665 487 785 524">lernen,</span> <span class="ocrx_word" title="bbox 810 486 911 517">damit</span> <span class="ocrx_word" title="bbox 936 486 974 523">ich</span> <span class="ocrx_word" title="bbox 1000 487 1072 516">weit</span> <span class="ocrx_word" title="bbox 1095 487 1150 524">hin</span> <span class="ocrx_word" title="bbox 1176 487 1237 517">und</span> <span class="ocrx_word" title="bbox 1263 487 1360 517">wider</span> <span class="ocrx_word" title="bbox 1384 486 1463 523">ohne</span></span> <br><span class="ocr_line" baseline= 561 title="bbox 185 530 1463 569"><span class="ocrx_word" title="bbox 185 530 259 562">Gelt</span> <span class="ocrx_word" title="bbox 281 530 398 568">Raifen</span> <span class="ocrx_word" title="bbox 421 530 483 560">und</span> <span class="ocrx_word" title="bbox 505 530 724 569">Fortkommen</span> 
<span class="ocrx_word" title="bbox 747 531 810 561">und</span> <span class="ocrx_word" title="bbox 835 531 898 568">also</span> <span class="ocrx_word" title="bbox 920 531 968 561">die</span> <span class="ocrx_word" title="bbox 992 530 1115 562">Länder</span> <span class="ocrx_word" title="bbox 1139 531 1217 569">ohne</span> <span class="ocrx_word" title="bbox 1240 530 1391 568">Unkosten</span> <span class="ocrx_word" title="bbox 1414 530 1463 561">be-</span></span> <br><span class="ocr_line" baseline= 606 title="bbox 184 575 1465 614"><span class="ocrx_word" title="bbox 184 575 271 613">sehen</span> <span class="ocrx_word" title="bbox 295 575 397 606">könde.</span> <span class="ocrx_word" title="bbox 430 575 500 613">Hab</span> <span class="ocrx_word" title="bbox 524 575 596 613">mich</span> <span class="ocrx_word" title="bbox 621 576 694 608">alda</span> <span class="ocrx_word" title="bbox 717 576 911 614">versprochen</span> <span class="ocrx_word" title="bbox 935 576 1006 612">nach</span> <span class="ocrx_word" title="bbox 1031 576 1239 614">gewohnheit,</span> <span class="ocrx_word" title="bbox 1261 576 1279 606">2</span> <span class="ocrx_word" title="bbox 1303 575 1388 613">Jahr</span> <span class="ocrx_word" title="bbox 1412 583 1465 613">zue</span></span> <br><span class="ocr_line" baseline= 652 title="bbox 187 619 1464 659"><span class="ocrx_word" title="bbox 187 619 321 658">Maifter</span> <span class="ocrx_word" title="bbox 346 620 449 658">Georg</span> <span class="ocrx_word" title="bbox 474 619 641 658">Schillern.</span> <span class="ocrx_word" title="bbox 686 621 756 659">Hab</span> <span class="ocrx_word" title="bbox 783 620 879 659">Jhme</span> <span class="ocrx_word" title="bbox 905 620 1097 659">versprochen</span> <span class="ocrx_word" title="bbox 1123 622 1161 652">40</span> <span class="ocrx_word" title="bbox 1185 621 1219 659">fl.</span> <span class="ocrx_word" title="bbox 1245 620 1404 659">Lehrlohn</span> <span class="ocrx_word" 
title="bbox 1429 628 1464 659">zu</span></span> <br><span class="ocr_line" baseline= 695 title="bbox 184 665 289 702"><span class="ocrx_word" title="bbox 184 665 289 702">geben.</span></span> <br></p>
17
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 750 title="bbox 264 718 1463 759"><span class="ocrx_word" title="bbox 264 719 349 750">Dato</span> <span class="ocrx_word" title="bbox 369 719 427 750">den</span> <span class="ocrx_word" title="bbox 448 718 493 749">10.</span> <span class="ocrx_word" title="bbox 512 719 677 758">February</span> <span class="ocrx_word" title="bbox 698 720 800 758">haben</span> <span class="ocrx_word" title="bbox 820 720 879 751">wir</span> <span class="ocrx_word" title="bbox 899 720 954 757">bey</span> <span class="ocrx_word" title="bbox 975 720 1041 750">dem</span> <span class="ocrx_word" title="bbox 1062 719 1194 759">Weißen</span> <span class="ocrx_word" title="bbox 1213 720 1302 757">Oxen</span> <span class="ocrx_word" title="bbox 1323 719 1463 758">verzehrt</span></span> <br><span class="ocr_line" baseline= 795 title="bbox 186 763 1463 803"><span class="ocrx_word" title="bbox 186 764 204 794">6</span> <span class="ocrx_word" title="bbox 228 764 262 802">fl.</span> <span class="ocrx_word" title="bbox 286 765 304 794">7</span> <span class="ocrx_word" title="bbox 330 763 400 802">batz.</span> <span class="ocrx_word" title="bbox 436 763 550 795">Daran</span> <span class="ocrx_word" title="bbox 575 764 629 802">hat</span> <span class="ocrx_word" title="bbox 655 765 708 796">der</span> <span class="ocrx_word" title="bbox 734 764 869 803">Maifter</span> <span class="ocrx_word" title="bbox 894 765 911 795">3</span> <span class="ocrx_word" title="bbox 936 764 970 802">fl.</span> <span class="ocrx_word" title="bbox 995 764 1058 802">zalt</span> <span class="ocrx_word" title="bbox 1084 765 1147 796">und</span> <span class="ocrx_word" title="bbox 1173 765 1226 802">Ich</span> <span class="ocrx_word" title="bbox 1251 765 1310 795">das</span> <span class="ocrx_word" title="bbox 1336 764 1463 
801">Uebrig.</span></span> <br><span class="ocr_line" baseline= 840 title="bbox 185 808 1463 848"><span class="ocrx_word" title="bbox 185 808 256 840">Und</span> <span class="ocrx_word" title="bbox 282 808 337 840">bin</span> <span class="ocrx_word" title="bbox 364 808 403 846">ich</span> <span class="ocrx_word" title="bbox 431 808 465 846">uf</span> <span class="ocrx_word" title="bbox 491 808 549 840">den</span> <span class="ocrx_word" title="bbox 576 809 623 841">20.</span> <span class="ocrx_word" title="bbox 649 809 816 848">February</span> <span class="ocrx_word" title="bbox 843 810 919 839">1621</span> <span class="ocrx_word" title="bbox 947 809 1002 846">bey</span> <span class="ocrx_word" title="bbox 1031 809 1127 846">Jhme</span> <span class="ocrx_word" title="bbox 1154 809 1384 848">eingestanden.</span> <span class="ocrx_word" title="bbox 1422 808 1463 846">Uf</span></span> <br><span class="ocr_line" baseline= 884 title="bbox 185 853 1462 892"><span class="ocrx_word" title="bbox 185 854 222 884">2?.</span> <span class="ocrx_word" title="bbox 246 854 306 885">dto.</span> <span class="ocrx_word" title="bbox 330 854 364 884">in</span> <span class="ocrx_word" title="bbox 388 854 440 885">der</span> <span class="ocrx_word" title="bbox 464 853 564 892">Zunft</span> <span class="ocrx_word" title="bbox 588 854 828 892">eingeschrieben</span> <span class="ocrx_word" title="bbox 851 854 987 885">worden.</span> <span class="ocrx_word" title="bbox 1020 854 1104 885">Dato</span> <span class="ocrx_word" title="bbox 1129 862 1197 892">zum</span> <span class="ocrx_word" title="bbox 1219 854 1352 892">Weihen</span> <span class="ocrx_word" title="bbox 1374 854 1462 891">Oxen</span></span> <br><span class="ocr_line" baseline= 929 title="bbox 185 897 951 937"><span class="ocrx_word" title="bbox 185 898 324 936">verzehrt</span> <span class="ocrx_word" title="bbox 345 899 363 928">6</span> <span class="ocrx_word" title="bbox 384 897 429 937">fl.,</span> <span class="ocrx_word" 
title="bbox 451 898 512 936">hab</span> <span class="ocrx_word" title="bbox 533 899 572 937">ich</span> <span class="ocrx_word" title="bbox 593 899 708 937">halben</span> <span class="ocrx_word" title="bbox 730 899 822 937">Theil</span> <span class="ocrx_word" title="bbox 846 898 951 936">geben.</span></span> <br></p>
18
+ <p class="ocr_par" leftIndent=10600 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1001 title="bbox 625 970 1016 1003"><span class="ocrx_word" title="bbox 625 971 733 1003">Gott</span> <span class="ocrx_word" title="bbox 751 970 865 1002">Gebe</span> <span class="ocrx_word" title="bbox 883 970 1016 1002">Gnad.</span></span> <br></p>
19
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1054 title="bbox 265 1023 1464 1062"><span class="ocrx_word" title="bbox 265 1023 454 1061">Sambstag,</span> <span class="ocrx_word" title="bbox 474 1023 532 1055">den</span> <span class="ocrx_word" title="bbox 553 1024 598 1054">18.</span> <span class="ocrx_word" title="bbox 620 1024 751 1062">Augusti</span> <span class="ocrx_word" title="bbox 773 1024 850 1054">1621</span> <span class="ocrx_word" title="bbox 871 1024 906 1062">ist</span> <span class="ocrx_word" title="bbox 927 1024 1021 1055">Mein</span> <span class="ocrx_word" title="bbox 1040 1023 1243 1061">Lehrmeister</span> <span class="ocrx_word" title="bbox 1265 1024 1344 1061">Jerg</span> <span class="ocrx_word" title="bbox 1364 1023 1464 1061">Schik-</span></span> <br><span class="ocr_line" baseline= 1099 title="bbox 187 1067 1464 1106"><span class="ocrx_word" title="bbox 187 1068 232 1099">ler</span> <span class="ocrx_word" title="bbox 260 1069 294 1099">in</span> <span class="ocrx_word" title="bbox 321 1068 398 1099">Gott</span> <span class="ocrx_word" title="bbox 425 1067 572 1106">seeliglich</span> <span class="ocrx_word" title="bbox 600 1068 792 1106">entschlafen.</span> <span class="ocrx_word" title="bbox 829 1069 906 1100">Gott</span> <span class="ocrx_word" title="bbox 933 1069 985 1100">der</span> <span class="ocrx_word" title="bbox 1013 1067 1194 1106">Allmechtig</span> <span class="ocrx_word" title="bbox 1221 1068 1341 1105">verleih</span> <span class="ocrx_word" title="bbox 1368 1068 1464 1105">Jhme</span></span> <br><span class="ocr_line" baseline= 1144 title="bbox 188 1112 1464 1152"><span class="ocrx_word" title="bbox 188 1113 255 1144">eine</span> <span class="ocrx_word" title="bbox 274 1112 417 1150">fröhliche</span> <span class="ocrx_word" title="bbox 437 1112 666 1152">Auferstehung</span> <span 
class="ocrx_word" title="bbox 687 1114 750 1144">und</span> <span class="ocrx_word" title="bbox 769 1121 832 1144">uns</span> <span class="ocrx_word" title="bbox 853 1112 949 1151">allen,</span> <span class="ocrx_word" title="bbox 968 1113 1069 1144">einem</span> <span class="ocrx_word" title="bbox 1089 1112 1192 1145">Jeden</span> <span class="ocrx_word" title="bbox 1211 1121 1246 1151">zu</span> <span class="ocrx_word" title="bbox 1265 1113 1363 1151">seiner</span> <span class="ocrx_word" title="bbox 1384 1112 1464 1151">Zeit,</span></span> <br><span class="ocr_line" baseline= 1188 title="bbox 186 1157 1463 1197"><span class="ocrx_word" title="bbox 186 1158 236 1189">ein</span> <span class="ocrx_word" title="bbox 261 1158 390 1195">Seeligs</span> <span class="ocrx_word" title="bbox 415 1158 494 1189">End.</span> <span class="ocrx_word" title="bbox 531 1158 640 1190">Amen.</span> <span class="ocrx_word" title="bbox 676 1158 725 1197">Ist</span> <span class="ocrx_word" title="bbox 750 1159 783 1189">in</span> <span class="ocrx_word" title="bbox 809 1158 913 1189">Allem</span> <span class="ocrx_word" title="bbox 938 1158 956 1187">8</span> <span class="ocrx_word" title="bbox 981 1157 1109 1195">Wochen</span> <span class="ocrx_word" title="bbox 1132 1158 1150 1188">2</span> <span class="ocrx_word" title="bbox 1174 1158 1243 1195">Tag</span> <span class="ocrx_word" title="bbox 1268 1158 1358 1188">krank</span> <span class="ocrx_word" title="bbox 1383 1157 1463 1195">gele¬</span></span> <br><span class="ocr_line" baseline= 1233 title="bbox 185 1202 1463 1241"><span class="ocrx_word" title="bbox 185 1211 253 1241">gen,</span> <span class="ocrx_word" title="bbox 278 1202 471 1240">unterdessen</span> <span class="ocrx_word" title="bbox 496 1203 556 1240">hab</span> <span class="ocrx_word" title="bbox 582 1203 621 1240">ich</span> <span class="ocrx_word" title="bbox 645 1204 694 1234">die</span> <span class="ocrx_word" title="bbox 718 1203 902 1240">Werckstatt,</span> 
<span class="ocrx_word" title="bbox 928 1202 1069 1240">Gottlob,</span> <span class="ocrx_word" title="bbox 1096 1203 1147 1233">als</span> <span class="ocrx_word" title="bbox 1173 1203 1222 1233">ein</span> <span class="ocrx_word" title="bbox 1249 1202 1351 1240">Gesell</span> <span class="ocrx_word" title="bbox 1377 1202 1463 1240">füeh-</span></span> <br><span class="ocr_line" baseline= 1278 title="bbox 185 1247 1314 1285"><span class="ocrx_word" title="bbox 185 1255 240 1278">ren</span> <span class="ocrx_word" title="bbox 264 1247 396 1285">müeffen</span> <span class="ocrx_word" title="bbox 421 1247 484 1277">und</span> <span class="ocrx_word" title="bbox 508 1248 562 1285">hat</span> <span class="ocrx_word" title="bbox 585 1249 646 1280">mir</span> <span class="ocrx_word" title="bbox 670 1247 803 1279">Gottlob</span> <span class="ocrx_word" title="bbox 826 1247 902 1277">eben</span> <span class="ocrx_word" title="bbox 927 1247 1009 1284">wohl</span> <span class="ocrx_word" title="bbox 1033 1247 1134 1278">damit</span> <span class="ocrx_word" title="bbox 1159 1247 1314 1285">gelungen</span></span> <br></p>
20
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1323 title="bbox 266 1291 1464 1331"><span class="ocrx_word" title="bbox 266 1291 331 1330">Auf</span> <span class="ocrx_word" title="bbox 354 1292 413 1323">den</span> <span class="ocrx_word" title="bbox 438 1291 534 1330">ersten</span> <span class="ocrx_word" title="bbox 558 1292 690 1331">Augusti</span> <span class="ocrx_word" title="bbox 715 1293 791 1323">1621</span> <span class="ocrx_word" title="bbox 814 1292 875 1329">hab</span> <span class="ocrx_word" title="bbox 899 1292 937 1330">ich</span> <span class="ocrx_word" title="bbox 960 1291 1032 1330">mich</span> <span class="ocrx_word" title="bbox 1056 1299 1090 1330">zu</span> <span class="ocrx_word" title="bbox 1114 1292 1236 1330">Herren</span> <span class="ocrx_word" title="bbox 1259 1292 1335 1324">Veit</span> <span class="ocrx_word" title="bbox 1357 1291 1464 1329">Schal-</span></span> <br><span class="ocr_line" baseline= 1367 title="bbox 187 1336 1174 1375"><span class="ocrx_word" title="bbox 187 1337 239 1368">ken</span> <span class="ocrx_word" title="bbox 265 1336 468 1375">versprochen,</span> <span class="ocrx_word" title="bbox 496 1336 554 1368">das</span> <span class="ocrx_word" title="bbox 580 1337 754 1374">Handwerk</span> <span class="ocrx_word" title="bbox 780 1337 927 1368">vollends</span> <span class="ocrx_word" title="bbox 953 1336 1174 1374">auszulernen.</span></span> <br></p>
21
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1412 title="bbox 265 1381 1466 1420"><span class="ocrx_word" title="bbox 265 1382 336 1413">Und</span> <span class="ocrx_word" title="bbox 356 1382 416 1419">hab</span> <span class="ocrx_word" title="bbox 435 1381 530 1419">Jhme</span> <span class="ocrx_word" title="bbox 551 1382 586 1412">18</span> <span class="ocrx_word" title="bbox 605 1383 638 1420">fl.</span> <span class="ocrx_word" title="bbox 658 1382 764 1420">geben.</span> <span class="ocrx_word" title="bbox 795 1381 879 1413">Aber</span> <span class="ocrx_word" title="bbox 907 1382 975 1412">dem</span> <span class="ocrx_word" title="bbox 1003 1382 1134 1419">vorigen</span> <span class="ocrx_word" title="bbox 1162 1381 1297 1419">Maifter</span> <span class="ocrx_word" title="bbox 1326 1381 1466 1419">(dieweil</span></span> <br><span class="ocr_line" baseline= 1457 title="bbox 187 1425 1464 1465"><span class="ocrx_word" title="bbox 187 1426 235 1457">bei</span> <span class="ocrx_word" title="bbox 258 1426 311 1457">der</span> <span class="ocrx_word" title="bbox 333 1425 533 1463">Wittfrauen</span> <span class="ocrx_word" title="bbox 556 1427 635 1464">nicht</span> <span class="ocrx_word" title="bbox 659 1427 832 1457">auslernen</span> <span class="ocrx_word" title="bbox 857 1426 989 1465">könden)</span> <span class="ocrx_word" title="bbox 1015 1434 1076 1456">nur</span> <span class="ocrx_word" title="bbox 1100 1427 1138 1456">20</span> <span class="ocrx_word" title="bbox 1161 1426 1195 1464">fl.</span> <span class="ocrx_word" title="bbox 1230 1426 1316 1458">Dato</span> <span class="ocrx_word" title="bbox 1339 1426 1400 1464">hab</span> <span class="ocrx_word" title="bbox 1424 1426 1464 1462">ich</span></span> <br><span class="ocr_line" baseline= 1502 title="bbox 185 1471 585 1510"><span class="ocrx_word" title="bbox 
185 1479 255 1509">zum</span> <span class="ocrx_word" title="bbox 276 1471 376 1509">besten</span> <span class="ocrx_word" title="bbox 395 1471 492 1509">geben</span> <span class="ocrx_word" title="bbox 512 1472 531 1502">2</span> <span class="ocrx_word" title="bbox 551 1471 585 1510">fl.</span></span> <br></p>
22
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1555 title="bbox 266 1524 1464 1563"><span class="ocrx_word" title="bbox 266 1525 338 1556">Adi.</span> <span class="ocrx_word" title="bbox 355 1525 447 1562">Juny</span> <span class="ocrx_word" title="bbox 465 1524 541 1555">1622</span> <span class="ocrx_word" title="bbox 560 1526 648 1563">(weil</span> <span class="ocrx_word" title="bbox 665 1534 697 1556">es</span> <span class="ocrx_word" title="bbox 714 1526 763 1556">die</span> <span class="ocrx_word" title="bbox 780 1525 978 1563">gelegenhait</span> <span class="ocrx_word" title="bbox 995 1524 1112 1562">sonsten</span> <span class="ocrx_word" title="bbox 1127 1525 1239 1563">geben)</span> <span class="ocrx_word" title="bbox 1259 1525 1315 1555">bin</span> <span class="ocrx_word" title="bbox 1331 1524 1384 1561">Ich</span> <span class="ocrx_word" title="bbox 1402 1532 1464 1554">von</span></span> <br><span class="ocr_line" baseline= 1600 title="bbox 188 1569 1464 1608"><span class="ocrx_word" title="bbox 188 1569 322 1607">Maifter</span> <span class="ocrx_word" title="bbox 346 1570 467 1607">Caspar</span> <span class="ocrx_word" title="bbox 491 1569 626 1608">Müller,</span> <span class="ocrx_word" title="bbox 652 1570 827 1608">Schleifern</span> <span class="ocrx_word" title="bbox 854 1578 889 1607">zu</span> <span class="ocrx_word" title="bbox 913 1569 1144 1607">Memmingen,</span> <span class="ocrx_word" title="bbox 1170 1577 1239 1607">zum</span> <span class="ocrx_word" title="bbox 1265 1569 1390 1607">Gfellen</span> <span class="ocrx_word" title="bbox 1416 1576 1464 1606">ge-</span></span> <br><span class="ocr_line" baseline= 1644 title="bbox 187 1613 1464 1652"><span class="ocrx_word" title="bbox 187 1614 282 1652">macht</span> <span class="ocrx_word" title="bbox 311 1614 373 1645">und</span> <span class="ocrx_word" 
title="bbox 402 1613 644 1651">aufgenommen</span> <span class="ocrx_word" title="bbox 672 1615 808 1645">worden.</span> <span class="ocrx_word" title="bbox 846 1614 964 1651">Sampt</span> <span class="ocrx_word" title="bbox 995 1613 1098 1645">Lucas</span> <span class="ocrx_word" title="bbox 1126 1613 1248 1651">Hursich</span> <span class="ocrx_word" title="bbox 1277 1613 1340 1644">und</span> <span class="ocrx_word" title="bbox 1368 1613 1464 1651">Peter</span></span> <br><span class="ocr_line" baseline= 1689 title="bbox 185 1657 1463 1697"><span class="ocrx_word" title="bbox 185 1659 346 1697">Holzwart</span> <span class="ocrx_word" title="bbox 369 1667 430 1690">von</span> <span class="ocrx_word" title="bbox 455 1658 684 1697">Memmingen.</span> <span class="ocrx_word" title="bbox 718 1660 802 1691">Dato</span> <span class="ocrx_word" title="bbox 825 1658 928 1697">haben</span> <span class="ocrx_word" title="bbox 952 1659 1010 1689">das</span> <span class="ocrx_word" title="bbox 1033 1658 1217 1696">Handwerk,</span> <span class="ocrx_word" title="bbox 1241 1658 1376 1696">Maifter</span> <span class="ocrx_word" title="bbox 1401 1657 1463 1688">und</span></span> <br><span class="ocr_line" baseline= 1734 title="bbox 187 1702 1464 1742"><span class="ocrx_word" title="bbox 187 1704 328 1742">Gesellen</span> <span class="ocrx_word" title="bbox 353 1703 492 1741">verzehrt</span> <span class="ocrx_word" title="bbox 518 1704 573 1741">bey</span> <span class="ocrx_word" title="bbox 597 1704 664 1735">dem</span> <span class="ocrx_word" title="bbox 688 1704 821 1742">Weißen</span> <span class="ocrx_word" title="bbox 846 1703 934 1741">Oxen</span> <span class="ocrx_word" title="bbox 957 1704 994 1733">34</span> <span class="ocrx_word" title="bbox 1020 1703 1064 1740">fl.,</span> <span class="ocrx_word" title="bbox 1089 1703 1147 1734">den</span> <span class="ocrx_word" title="bbox 1173 1703 1291 1740">Thaler</span> <span class="ocrx_word" title="bbox 1317 1710 1352 
1740">zu</span> <span class="ocrx_word" title="bbox 1376 1703 1394 1733">9</span> <span class="ocrx_word" title="bbox 1419 1702 1464 1740">fl.,</span></span> <br><span class="ocr_line" baseline= 1778 title="bbox 185 1747 1463 1787"><span class="ocrx_word" title="bbox 185 1749 288 1779">daran</span> <span class="ocrx_word" title="bbox 309 1748 344 1787">ist</span> <span class="ocrx_word" title="bbox 366 1749 428 1779">mir</span> <span class="ocrx_word" title="bbox 452 1747 608 1786">auferlegt</span> <span class="ocrx_word" title="bbox 631 1749 758 1779">worden</span> <span class="ocrx_word" title="bbox 781 1756 815 1787">zu</span> <span class="ocrx_word" title="bbox 839 1747 987 1786">bezahlen</span> <span class="ocrx_word" title="bbox 1010 1748 1047 1777">24</span> <span class="ocrx_word" title="bbox 1070 1747 1115 1786">fl.,</span> <span class="ocrx_word" title="bbox 1137 1748 1196 1778">das</span> <span class="ocrx_word" title="bbox 1219 1747 1337 1785">Uebrig</span> <span class="ocrx_word" title="bbox 1360 1747 1463 1785">haben</span></span> <br><span class="ocr_line" baseline= 1824 title="bbox 187 1793 621 1831"><span class="ocrx_word" title="bbox 187 1794 235 1825">die</span> <span class="ocrx_word" title="bbox 255 1793 375 1824">andern</span> <span class="ocrx_word" title="bbox 395 1793 492 1830">Zwen</span> <span class="ocrx_word" title="bbox 511 1793 621 1831">bezalt.</span></span> <br></p>
23
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1868 title="bbox 265 1836 1465 1876"><span class="ocrx_word" title="bbox 265 1838 467 1876">Morndrigs,</span> <span class="ocrx_word" title="bbox 490 1838 549 1869">den</span> <span class="ocrx_word" title="bbox 603 1855 609 1861">•</span> <span class="ocrx_word" title="bbox 631 1838 723 1876">Juny</span> <span class="ocrx_word" title="bbox 748 1837 823 1868">1622</span> <span class="ocrx_word" title="bbox 847 1837 949 1876">haben</span> <span class="ocrx_word" title="bbox 980 1837 1027 1868">die</span> <span class="ocrx_word" title="bbox 1059 1836 1194 1875">Maister</span> <span class="ocrx_word" title="bbox 1226 1836 1290 1868">und</span> <span class="ocrx_word" title="bbox 1322 1836 1465 1874">Gesellen</span></span> <br><span class="ocr_line" baseline= 1913 title="bbox 189 1882 1464 1921"><span class="ocrx_word" title="bbox 189 1883 323 1914">widrum</span> <span class="ocrx_word" title="bbox 353 1883 403 1913">ein</span> <span class="ocrx_word" title="bbox 432 1883 531 1914">trunck</span> <span class="ocrx_word" title="bbox 560 1884 683 1921">gethon.</span> <span class="ocrx_word" title="bbox 723 1883 793 1921">Hab</span> <span class="ocrx_word" title="bbox 824 1883 862 1920">ich</span> <span class="ocrx_word" title="bbox 892 1882 1001 1920">zahlen</span> <span class="ocrx_word" title="bbox 1031 1882 1163 1921">müessen</span> <span class="ocrx_word" title="bbox 1192 1883 1210 1913">5</span> <span class="ocrx_word" title="bbox 1240 1882 1273 1920">fl.</span> <span class="ocrx_word" title="bbox 1303 1882 1366 1912">und</span> <span class="ocrx_word" title="bbox 1397 1882 1464 1912">dem</span></span> <br><span class="ocr_line" baseline= 1958 title="bbox 188 1927 869 1966"><span class="ocrx_word" title="bbox 188 1927 387 1966">Zunftknecht</span> <span class="ocrx_word" 
title="bbox 407 1928 422 1957">1</span> <span class="ocrx_word" title="bbox 441 1928 514 1966">maß</span> <span class="ocrx_word" title="bbox 532 1927 627 1959">Wein</span> <span class="ocrx_word" title="bbox 646 1936 701 1965">pro</span> <span class="ocrx_word" title="bbox 720 1929 738 1958">9</span> <span class="ocrx_word" title="bbox 758 1927 869 1965">batzen.</span></span> <br></p>
24
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1104 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2011 title="bbox 268 1979 1465 2019"><span class="ocrx_word" title="bbox 268 1981 402 2019">Maister</span> <span class="ocrx_word" title="bbox 428 1980 549 2018">Caspar</span> <span class="ocrx_word" title="bbox 575 1980 708 2019">Müller,</span> <span class="ocrx_word" title="bbox 734 1980 886 2018">Schleifer</span> <span class="ocrx_word" title="bbox 912 1981 974 2011">und</span> <span class="ocrx_word" title="bbox 1001 1979 1127 2017">Bürger</span> <span class="ocrx_word" title="bbox 1155 1988 1207 2018">zue</span> <span class="ocrx_word" title="bbox 1234 1979 1465 2017">Memmingen,</span></span> <br><span class="ocr_line" baseline= 2057 title="bbox 187 2025 528 2064"><span class="ocrx_word" title="bbox 187 2027 246 2064">gab</span> <span class="ocrx_word" title="bbox 268 2027 329 2058">mir</span> <span class="ocrx_word" title="bbox 351 2025 411 2064">dise</span> <span class="ocrx_word" title="bbox 433 2025 528 2063">Lehr:</span></span> <br></p>
25
+ <p class="ocr_par" leftIndent=2900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2110 title="bbox 305 2078 1346 2118"><span class="ocrx_word" title="bbox 305 2079 415 2118">Fürcht</span> <span class="ocrx_word" title="bbox 435 2080 498 2110">und</span> <span class="ocrx_word" title="bbox 520 2080 582 2111">lieb</span> <span class="ocrx_word" title="bbox 603 2080 690 2118">Gott,</span> <span class="ocrx_word" title="bbox 712 2079 777 2110">und</span> <span class="ocrx_word" title="bbox 795 2079 919 2111">Deinen</span> <span class="ocrx_word" title="bbox 943 2078 1065 2116">negsten</span> <span class="ocrx_word" title="bbox 1089 2079 1140 2110">als</span> <span class="ocrx_word" title="bbox 1162 2079 1233 2116">Dich</span> <span class="ocrx_word" title="bbox 1255 2078 1346 2116">selbs,</span></span> <br><span class="ocr_line" baseline= 2154 title="bbox 305 2123 1107 2162"><span class="ocrx_word" title="bbox 305 2125 334 2162">so</span> <span class="ocrx_word" title="bbox 359 2124 474 2162">würstu</span> <span class="ocrx_word" title="bbox 497 2132 559 2155">von</span> <span class="ocrx_word" title="bbox 582 2124 678 2157">Allen</span> <span class="ocrx_word" title="bbox 702 2124 824 2161">bayden</span> <span class="ocrx_word" title="bbox 847 2123 952 2161">geehrt</span> <span class="ocrx_word" title="bbox 975 2123 1107 2155">werden.</span></span> <br></p>
26
+ <p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1128 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2208 title="bbox 266 2176 1462 2217"><span class="ocrx_word" title="bbox 266 2178 400 2216">Maister</span> <span class="ocrx_word" title="bbox 419 2177 545 2209">Andres</span> <span class="ocrx_word" title="bbox 563 2178 695 2217">Hursich,</span> <span class="ocrx_word" title="bbox 713 2177 923 2216">Tuochscherer</span> <span class="ocrx_word" title="bbox 941 2185 976 2214">zu</span> <span class="ocrx_word" title="bbox 994 2176 1224 2215">Memmingen,</span> <span class="ocrx_word" title="bbox 1244 2176 1304 2214">gab</span> <span class="ocrx_word" title="bbox 1323 2177 1384 2208">mir</span> <span class="ocrx_word" title="bbox 1402 2176 1462 2214">dise</span></span> <br><span class="ocr_line" baseline= 2254 title="bbox 186 2223 280 2263"><span class="ocrx_word" title="bbox 186 2223 280 2263">Lehr:</span></span> <br></p></div></div>
27
+ </body>
28
+ </html>
@@ -0,0 +1,29 @@
1
+ #coding: utf-8
2
+ require 'sinatra'
3
+
4
+ require_relative '../lib/ocr_page'
5
+
6
# Root route: answers with a link to OCRTest.html.
get('/') { "<a href='OCRTest.html'>OCRTest</a>" }
9
+
10
# Responds with the words enclosed by a rectangle on an hOCR page, joined
# by "<br/>".
#
# Query parameters: x1, y1, x2, y2 (rectangle corners) and page (passed on to
# get_enclosed_words). When any of them is missing the response body is the
# text "Not enough parameters".
get '/mark' do
  corners = [params[:x1], params[:y1], params[:x2], params[:y2]]
  page = params[:page]

  if corners.all? && page
    get_enclosed_words(*corners, page).join("<br/>")
  else
    "Not enough parameters"
  end
end
25
+
26
# Looks up the words of an hOCR page that lie inside a rectangle.
#
# x1, y1, x2, y2 - rectangle corners; each is converted with to_i
# page           - file name of an hOCR page in the data directory
#                  (../data relative to this file)
#
# Returns the result of OCRPage#enclosed_words for that rectangle.
# Errors raised by OCRPage.new for an unreadable file propagate to the caller.
def get_enclosed_words(x1, y1, x2 ,y2, page)
  # `page` arrives straight from the query string. Keeping only its base name
  # stops values such as "../../etc/passwd" from escaping the data directory.
  # Anchoring the directory on this file makes the lookup independent of the
  # working directory the server was started from.
  data_dir = File.expand_path('../data', File.dirname(__FILE__))
  @page = OCRPage.new(File.join(data_dir, File.basename(page)))
  @page.enclosed_words(OCRBox.new(x1.to_i, y1.to_i, x2.to_i, y2.to_i))
end
@@ -0,0 +1,30 @@
1
+ body {
2
+ font: 16px Helvetica, Arial;
3
+ margin:0px;
4
+ padding:0px;
5
+ }
6
+
7
+ .marked {
8
+ color:#FFF;
9
+ position: absolute;
10
+ background-color:purple;
11
+ opacity: 0.4;
12
+ }
13
+
14
+ #ocr_image {
15
+ background-image: url(img/Seite_Tagebuch_H_C_Lang_08.jpg);
16
+ width:1600px;
17
+ height:2495px;
18
+ }
19
+
20
+ #marked_words {
21
+ font-family:monospace;
22
+ border:0.2em solid #333;
23
+ }
24
+
25
+ .selected_words {
26
+ background-color:#EDEDED;
27
+ margin:0.2em;
28
+ padding:0.2em;
29
+ }
30
+
@@ -0,0 +1,54 @@
1
+ <!DOCTYPE html>
2
+ <html lang="de">
3
+ <!--
4
+
5
+ Created using http://jsbin.com/
6
+ Source can be edited via http://jsbin.com/azare/edit
7
+
8
+ -->
9
+ <head>
10
+ <meta charset="utf-8" />
11
+ <title>OCRTest</title>
12
+ <link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/themes/base/jquery-ui.css" type="text/css" />
13
+ <link rel="stylesheet" href="OCRTest.css" type="text/css" />
14
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js"></script>
15
+ <script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/jquery-ui.min.js"></script>
16
+ <script language="javascript" type="text/javascript" src="OCRTest_marker.js"></script>
17
+ </head>
18
+ <body>
19
+ <header>
20
+ <h1>Mark words test</h1>
21
+ </header>
22
+
23
+
24
+
25
+ <div id="ocr_image">
26
+ </div>
27
+
28
+ <div id="marked_words">
29
+ </div>
30
+
31
+ <script type="text/javascript">
32
$(document).ready(function () {
  /* Initialise the image position used by the marker script. */
  var image = $("#ocr_image");
  var offset = image.offset();
  ocr_image_left = offset.left;
  ocr_image_top = offset.top;

  /* Mouse down records the first corner, relative to the image. */
  image.mousedown(function (event) {
    startX = event.pageX - ocr_image_left;
    startY = event.pageY - ocr_image_top;
  });

  /* Mouse up records the second corner and draws the selection. */
  image.mouseup(function (event) {
    endX = event.pageX - ocr_image_left;
    endY = event.pageY - ocr_image_top;
    draw_rectangle();
  });
  /* Initialisation finished. */
})
51
+
52
+ </script>
53
+ </body>
54
+ </html>
@@ -0,0 +1,83 @@
1
// Corners of the current selection; read by draw_rectangle and
// get_marked_words.
var startX, startY, endX, endY;

// Offset of the image element, added to the selection in draw_rectangle.
var ocr_image_left, ocr_image_top;
8
+
9
+
10
// Draws the current selection (startX/startY to endX/endY) as a '.marked'
// overlay inside #ocr_image, labels it with the raw coordinates and then
// requests the enclosed words via get_marked_words().
function draw_rectangle() {
  // Left offset and horizontal side length, whichever way the drag went.
  var draggedRight = startX < endX;
  var left = draggedRight ? startX : endX;
  var width = draggedRight ? endX - startX : startX - endX;

  // Top offset and vertical side length.
  var draggedDown = startY < endY;
  var top = draggedDown ? startY : endY;
  var height = draggedDown ? endY - startY : startY - endY;

  // The image offset has to be added so that the position is relative to
  // the parent element.
  top += ocr_image_top;
  left += ocr_image_left;

  var marker = $("<span></span>")
    .addClass('marked')
    .css('top', top)
    .css('left', left)
    .height(height)
    .width(width);
  marker.append(startX + " " + startY + " " + endX + " " + endY);
  $("#ocr_image").append(marker);

  get_marked_words();
}
46
+
47
+
48
// Asks the server for the words inside the current selection and hands the
// response to add_marked_words.
//
// The corners are read from the globals startX/startY/endX/endY and ordered
// so that x1 <= x2 and y1 <= y2, whichever direction the mouse was dragged.
function get_marked_words() {
  var x1 = Math.min(startX, endX);
  var x2 = Math.max(startX, endX);
  var y1 = Math.min(startY, endY);
  // y2 has to be set for upward drags as well; assigning to a misspelled
  // name here would send "y2=undefined" to the server.
  var y2 = Math.max(startY, endY);

  $.ajax({
    url: 'http://localhost:4567/mark',
    // An object lets jQuery build and encode the query string.
    data: {
      x1: x1,
      y1: y1,
      x2: x2,
      y2: y2,
      page: 'Seite_Tagebuch_H_C_Lang_08.html'
    },
    success: add_marked_words
  });
}
78
+
79
// Appends a server response to #marked_words, wrapped in a new
// '.selected_words' div.
function add_marked_words(data) {
  var entry = $("<div></div>");
  entry.addClass('selected_words');
  entry.append(data);
  $("#marked_words").append(entry);
}
data/lib/hocr.rb ADDED
@@ -0,0 +1 @@
1
+ #coding: utf-8
data/lib/ocr_box.rb ADDED
@@ -0,0 +1,43 @@
1
+ #coding: utf-8
2
+
3
# Axis-aligned rectangle given by its top-left corner (x1, y1) and its
# bottom-right corner (x2, y2).
class OCRBox

  attr_reader :x1, :y1, :x2, :y2

  # x1, y1 - top-left corner
  # x2, y2 - bottom-right corner
  # The values are stored as given; no conversion or ordering is applied.
  def initialize(x1, y1, x2, y2)
    @x1 = x1
    @y1 = y1
    @x2 = x2
    @y2 = y2
  end

  # True when +element+ (anything responding to x1, y1, x2, y2) lies inside
  # this box. Edges may coincide, so a box encloses itself.
  def encloses?(element)
    @x1 <= element.x1 &&
      @x2 >= element.x2 &&
      @y1 <= element.y1 &&
      @y2 >= element.y2
  end

  # Mirror image of #encloses?: true when +element+ encloses this box.
  def enclosed_by?(element)
    element.encloses?(self)
  end

  # Human-readable corners, "tl" being top-left and "br" bottom-right.
  def to_s
    "tl->(x:#{@x1} y:#{@y1})/br->:(x:#{@x2} y:#{@y2})"
  end

  # CSS declarations that place an absolutely positioned element over the box.
  def to_css_style
    top    = @y1
    left   = @x1
    height = @y2 - @y1
    width  = @x2 - @x1

    "position:absolute; top:#{top}px; left:#{left}px; height:#{height}px; width:#{width}px;"
  end

  # JSON object holding the four coordinates, e.g.
  # {"x1":1,"y1":2,"x2":20,"y2":8}. Unset (nil) coordinates become null.
  #
  # Extra arguments are accepted and ignored so that JSON generators, which
  # pass a state object to #to_json, can serialise boxes nested in arrays or
  # hashes.
  # NOTE(review): coordinates are written out with to_s, which yields valid
  # JSON for numeric values only - confirm no caller stores other types.
  def to_json(*_args)
    members = [['x1', @x1], ['y1', @y1], ['x2', @x2], ['y2', @y2]].map do |key, value|
      %("#{key}":#{value.nil? ? 'null' : value})
    end
    "{#{members.join(',')}}"
  end

end
43
+
data/lib/ocr_page.rb ADDED
@@ -0,0 +1,43 @@
1
+ #coding: utf-8
2
+ require_relative "ocrx_word"
3
+
4
# One hOCR page read from a file: the words of the page grouped by text line,
# plus the page's own bounding box when the file declares one.
class OCRPage < OCRBox
  # Opening tag of the page element, e.g.
  # <div class="ocr_page" title="bbox 0 0 1709 1709;ppageno 20">
  PAGE_TAG = /<[^>]*class="ocr_page"[^>]*>/
  # The four coordinates of a bbox property.
  BBOX = /bbox (\d+) (\d+) (\d+) (\d+)/
  # Every text line starts with this tag; the page is split on it.
  LINE_START = /<span class="ocr_line"/
  # A word span with its bbox and text. Further properties in the title
  # (before or after the bbox) and further attributes after it are tolerated.
  WORD = /<span class="ocrx_word"[^>]*title="[^"]*bbox (\d+) (\d+) (\d+) (\d+)[^"]*"[^>]*>([^<]+)<\/span>/

  # Array of lines, each an Array of OCRXWord; lines without words are left out.
  attr_reader :lines

  # filename - path of the hOCR file to load
  #
  # The box coordinates are taken from the bbox of the ocr_page element and
  # stay nil when the file has none. Errors raised while opening or reading
  # the file propagate to the caller.
  def initialize(filename)
    contents = file_as_string(filename)
    super(*page_bbox(contents))
    @lines = hocr_lines(contents).reject { |line| line.empty? }
  end

  # Splits hOCR markup into lines and each line into OCRXWord objects.
  #
  # Returns an Array with one Array of words per chunk of the split; the text
  # in front of the first line therefore yields a (usually empty) first entry.
  # Spans that do not carry a parsable bbox are skipped.
  def hocr_lines( hocr_contents)
    hocr_contents.split(LINE_START).map do |line|
      line.scan(WORD).map do |x1, y1, x2, y2, text|
        OCRXWord.new(x1, y1, x2, y2, text)
      end
    end
  end

  # All words of the page in reading order of the file (memoised).
  def words
    @words ||= @lines.flatten
  end

  # Words of the page that lie inside +box+ (see OCRBox#enclosed_by?).
  def enclosed_words(box)
    words.select { |word| word.enclosed_by? box }
  end

  # Extracts the bbox coordinates from an element whose title holds nothing
  # but the bbox.
  #
  # Returns the four coordinates as Strings, or four nils when the element
  # does not match.
  def get_position(element)
    match = /title="bbox (\d+) (\d+) (\d+) (\d+)">/.match(element)
    match ? match.captures : [nil, nil, nil, nil]
  end

  # Reads the whole file as UTF-8 text, so that parsing does not depend on
  # the process locale.
  # NOTE(review): assumes hOCR input is UTF-8, as the bundled sample page
  # declares - confirm for other sources.
  def file_as_string(filename)
    File.open(filename, "r:UTF-8") { |f| f.read }
  end

  private

  # Integer bbox of the ocr_page element, or four nils when it is missing.
  def page_bbox(hocr_contents)
    match = BBOX.match(hocr_contents[PAGE_TAG].to_s)
    match ? match.captures.map { |value| value.to_i } : [nil, nil, nil, nil]
  end

end
data/lib/ocrx_word.rb ADDED
@@ -0,0 +1,23 @@
1
+ #coding: utf-8
2
+
3
+ require 'cgi'
4
+ require_relative 'ocr_box'
5
+
6
# A recognised word: an OCRBox that also carries the word's text.
class OCRXWord < OCRBox

  attr_reader :text

  # x1, y1, x2, y2 - box corners; each is converted with to_i
  # word           - text of the word, stored unchanged
  def initialize(x1,y1,x2,y2,word)
    corners = [x1, y1, x2, y2].map { |value| value.to_i }
    super(*corners)
    @text = word
  end

  # Span positioned over the word's box, with the HTML-escaped text in an
  # inner span of class 'word'. +css_class+ is the class of the outer span.
  def to_html(css_class = 'ocrx_word')
    opening = "<span style='#{ to_css_style }' class='#{css_class}'><span class='word'>"
    opening + CGI::escapeHTML(@text) + "</span></span>"
  end

  # Text and box description (OCRBox#to_s), separated by a tab.
  def to_s
    box_description = super
    "#{@text}\t#{box_description}"
  end

end
data/rhocr.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for rhocr 0.0.1.
#
# Setters that are not available in every RubyGems release are guarded with
# respond_to?, so the specification loads on releases that lack them.
Gem::Specification.new do |s|
  s.name = %q{rhocr}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Andreas Neumann"]
  s.date = %q{2011-07-01}
  s.description = %q{Manipulate and use OCR data encode in HOCR}
  s.email = %q{info @nospam@ an-it.com}
  s.extra_rdoc_files = ["README", "lib/hocr.rb", "lib/ocr_box.rb", "lib/ocr_page.rb", "lib/ocrx_word.rb"]
  s.files = ["README", "Rakefile", "data/Seite_Tagebuch_H_C_Lang_08.html", "example/example_server.rb", "example/public/OCRTest.css", "example/public/OCRTest.html", "example/public/OCRTest_marker.js", "example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg", "example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg", "lib/hocr.rb", "lib/ocr_box.rb", "lib/ocr_page.rb", "lib/ocrx_word.rb", "rspec/ocr_box_spec.rb", "rspec/ocr_page_spec.rb", "rspec/ocrx_word_spec.rb", "test.rb", "Manifest", "rhocr.gemspec"]
  s.homepage = %q{http://github.com/daandi/rhocr}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rhocr", "--main", "README"]
  s.require_paths = ["lib"]
  # Guarded like the other optional setters in this specification.
  s.rubyforge_project = %q{rhocr} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.6.2}
  s.summary = %q{Manipulate and use OCR data encode in HOCR}

  # The gem declares no dependencies, so only the version marker is set here.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,48 @@
1
+ #coding: utf-8
2
+
3
+ require_relative '../lib/ocr_box'
4
+
5
# Behaviour of OCRBox: string and CSS rendering and the two enclosure tests.
# The boolean expectations compare with == so that they do not depend on the
# be_true / be_false matchers, which newer RSpec releases no longer provide.
describe OCRBox do

  before(:each) do
    # Instance variables start out empty for every example, so the box is
    # simply rebuilt each time.
    @box = OCRBox.new(1,2,20,8)
  end

  describe "#to_s" do
    it "prints a human readable Box-Version with coordinates upper_left(x,y) bottom_right(x,y)" do
      @box.to_s.should == "tl->(x:1 y:2)/br->:(x:20 y:8)"
    end
  end

  describe '#encloses?(element)' do
    it "tests wather given OCRBox is enclosed by the current OCRBox" do
      @box.encloses?( OCRBox.new(0,3,19,7) ).should == false
      @box.encloses?( OCRBox.new(2,3,19,7) ).should == true
    end
    it "encloses also itself" do
      @box.encloses?( @box ).should == true
    end
  end

  describe '#to_css_style' do
    it 'should create css-style attributes' do
      @box.to_css_style.should == 'position:absolute; top:2px; left:1px; height:6px; width:19px;'
    end
  end

  describe '#enclosed_by?(element)' do
    it 'should be enclosed by Boxes bigger than itself' do
      @box.enclosed_by?( OCRBox.new(0,1,21,9) ).should == true
    end
    it 'should not be enclosed by Boxes smaller than itself' do
      @box.enclosed_by?( OCRBox.new(2,3,19,7) ).should == false
    end
    it 'should be enclosed by Boxes of the same size' do
      @box.enclosed_by?( @box ).should == true
    end
  end

end
@@ -0,0 +1,17 @@
1
+ #coding: utf-8
2
+
3
+ require_relative '../lib/ocr_page'
4
+
5
# Behaviour of OCRPage against the sample page shipped in data/.
describe OCRPage do

  before(:each) do
    # Resolve the sample page relative to this spec file so the spec can be
    # run from any working directory.
    sample = File.expand_path('../data/Seite_Tagebuch_H_C_Lang_08.html', File.dirname(__FILE__))
    @ocr_page = OCRPage.new(sample)
  end

  describe '#enclosed_words(box)' do
    it 'returns only words that lie inside the given box' do
      box = OCRBox.new(500,1703,1200,1800)
      @ocr_page.enclosed_words(box).each do |word|
        box.encloses?(word).should == true
      end
    end
  end

end
@@ -0,0 +1,32 @@
1
+ #coding: utf-8
2
+
3
+ require_relative '../lib/ocrx_word'
4
+
5
# Behaviour of OCRXWord: string rendering and the HTML overlay span.
describe OCRXWord do

  before(:each) do
    @ocrx_word = OCRXWord.new(10,15,20,20,'WORT')
  end

  describe '#to_s' do
    it 'should print the coordinates of the box and the textual information' do
      @ocrx_word.to_s.should == "WORT\ttl->(x:10 y:15)/br->:(x:20 y:20)"
    end
  end

  describe '#to_html(css_class)' do
    it 'should create an span elment to overlay an image on an html-page' do
      @ocrx_word.to_html.should == "<span style='position:absolute; top:15px; left:10px; height:5px; width:10px;' class='ocrx_word'><span class='word'>WORT</span></span>"
    end

    it 'no css_class_class given should default to ocrx_word' do
      @ocrx_word.to_html.should =~ /class='ocrx_word'/
    end

    it 'css_class given should be part of genearted html' do
      @ocrx_word.to_html('rosebud').should =~ /class='rosebud'/
    end
  end

end
data/test.rb ADDED
@@ -0,0 +1,8 @@
1
+ # coding: utf-8
2
+ require_relative "lib/ocr_page.rb"
3
+
4
# Manual smoke test: load the sample page, then dump the entry at index 1 of
# its lines and all of its words.
sample_page = OCRPage.new("data/Seite_Tagebuch_H_C_Lang_08.html")

p sample_page.lines[1]
p sample_page.words
8
+
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rhocr
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Andreas Neumann
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-07-01 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Manipulate and use OCR data encode in HOCR
18
+ email: info @nospam@ an-it.com
19
+ executables: []
20
+
21
+ extensions: []
22
+
23
+ extra_rdoc_files:
24
+ - README
25
+ - lib/hocr.rb
26
+ - lib/ocr_box.rb
27
+ - lib/ocr_page.rb
28
+ - lib/ocrx_word.rb
29
+ files:
30
+ - README
31
+ - Rakefile
32
+ - data/Seite_Tagebuch_H_C_Lang_08.html
33
+ - example/example_server.rb
34
+ - example/public/OCRTest.css
35
+ - example/public/OCRTest.html
36
+ - example/public/OCRTest_marker.js
37
+ - example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
38
+ - example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
39
+ - lib/hocr.rb
40
+ - lib/ocr_box.rb
41
+ - lib/ocr_page.rb
42
+ - lib/ocrx_word.rb
43
+ - rspec/ocr_box_spec.rb
44
+ - rspec/ocr_page_spec.rb
45
+ - rspec/ocrx_word_spec.rb
46
+ - test.rb
47
+ - Manifest
48
+ - rhocr.gemspec
49
+ has_rdoc: true
50
+ homepage: http://github.com/daandi/rhocr
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options:
55
+ - --line-numbers
56
+ - --inline-source
57
+ - --title
58
+ - Rhocr
59
+ - --main
60
+ - README
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: "0"
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: "1.2"
75
+ requirements: []
76
+
77
+ rubyforge_project: rhocr
78
+ rubygems_version: 1.6.2
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Manipulate and use OCR data encode in HOCR
82
+ test_files: []
83
+