rhocr 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +18 -0
- data/README +1 -0
- data/Rakefile +12 -0
- data/data/Seite_Tagebuch_H_C_Lang_08.html +28 -0
- data/example/example_server.rb +29 -0
- data/example/public/OCRTest.css +30 -0
- data/example/public/OCRTest.html +54 -0
- data/example/public/OCRTest_marker.js +83 -0
- data/example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg +0 -0
- data/example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/lib/hocr.rb +1 -0
- data/lib/ocr_box.rb +43 -0
- data/lib/ocr_page.rb +43 -0
- data/lib/ocrx_word.rb +23 -0
- data/rhocr.gemspec +29 -0
- data/rspec/ocr_box_spec.rb +48 -0
- data/rspec/ocr_page_spec.rb +17 -0
- data/rspec/ocrx_word_spec.rb +32 -0
- data/test.rb +8 -0
- metadata +83 -0
data/Manifest
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
README
|
|
2
|
+
Rakefile
|
|
3
|
+
data/Seite_Tagebuch_H_C_Lang_08.html
|
|
4
|
+
example/example_server.rb
|
|
5
|
+
example/public/OCRTest.css
|
|
6
|
+
example/public/OCRTest.html
|
|
7
|
+
example/public/OCRTest_marker.js
|
|
8
|
+
example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
|
|
9
|
+
example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
|
|
10
|
+
lib/hocr.rb
|
|
11
|
+
lib/ocr_box.rb
|
|
12
|
+
lib/ocr_page.rb
|
|
13
|
+
lib/ocrx_word.rb
|
|
14
|
+
rspec/ocr_box_spec.rb
|
|
15
|
+
rspec/ocr_page_spec.rb
|
|
16
|
+
rspec/ocrx_word_spec.rb
|
|
17
|
+
test.rb
|
|
18
|
+
Manifest
|
data/README
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Ruby library for working with OCR data in the hOCR format.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'rake'
|
|
3
|
+
require 'echoe'
|
|
4
|
+
|
|
5
|
+
Echoe.new('rhocr', '0.0.1') do |p|
|
|
6
|
+
p.description = "Manipulate and use OCR data encoded in HOCR"
|
|
7
|
+
p.url = "http://github.com/daandi/rhocr"
|
|
8
|
+
p.author = "Andreas Neumann"
|
|
9
|
+
p.email = "info @nospam@ an-it.com"
|
|
10
|
+
p.ignore_pattern = ["tmp/*", "script/*"]
|
|
11
|
+
p.development_dependencies = []
|
|
12
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
|
2
|
+
<html>
|
|
3
|
+
<head>
|
|
4
|
+
<title>OCR Output</title>
|
|
5
|
+
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
|
|
6
|
+
<meta http-equiv="content-style-type" content="text/css" />
|
|
7
|
+
<meta name="ocr-capabilities" content="ocr_page ocr_par ocrx_word ocr_line" />
|
|
8
|
+
<meta name="ocr-system" content="ABBYY fre-8.0.1.1024" />
|
|
9
|
+
<meta name="ocr-number-of-pages" content="1" />
|
|
10
|
+
</head>
|
|
11
|
+
<body bgcolor="#ffffff">
|
|
12
|
+
<div class="ocr_page" title="bbox 0 0 1709 1709;ppageno 20">
|
|
13
|
+
|
|
14
|
+
<div class="ocrx_block" title="bboxnull 111 1472 2270" style="font-size:9pt;font-family:"Arial";font-style:normal"><br>
|
|
15
|
+
<p class="ocr_par" style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 153 title="bbox 184 115 205 153"><span class="ocrx_word" title="bbox 184 115 205 153">8</span></span> <br></p>
|
|
16
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 249 title="bbox 264 216 1462 256"><span class="ocrx_word" title="bbox 264 216 333 248">Den</span> <span class="ocrx_word" title="bbox 356 216 402 248">20.</span> <span class="ocrx_word" title="bbox 426 216 620 249">Novembris</span> <span class="ocrx_word" title="bbox 643 218 698 249">bin</span> <span class="ocrx_word" title="bbox 720 218 758 256">ich</span> <span class="ocrx_word" title="bbox 781 217 917 248">widrum</span> <span class="ocrx_word" title="bbox 940 217 973 256">uf</span> <span class="ocrx_word" title="bbox 997 217 1220 256">Schaffhausen</span> <span class="ocrx_word" title="bbox 1245 226 1376 256">gezogen</span> <span class="ocrx_word" title="bbox 1399 218 1462 249">und</span></span> <br><span class="ocr_line" baseline= 293 title="bbox 187 261 1464 301"><span class="ocrx_word" title="bbox 187 262 242 299">bey</span> <span class="ocrx_word" title="bbox 262 261 405 292">Meinem</span> <span class="ocrx_word" title="bbox 426 261 546 299">Herren</span> <span class="ocrx_word" title="bbox 567 263 748 294">verblieben</span> <span class="ocrx_word" title="bbox 769 263 820 293">bis</span> <span class="ocrx_word" title="bbox 843 263 876 300">uf</span> <span class="ocrx_word" title="bbox 898 263 943 294">10.</span> <span class="ocrx_word" title="bbox 963 262 1131 300">Dezember</span> <span class="ocrx_word" title="bbox 1153 264 1239 301">1620,</span> <span class="ocrx_word" title="bbox 1260 264 1318 294">wie</span> <span class="ocrx_word" title="bbox 1337 263 1464 300">hiervor</span></span> <br><span class="ocr_line" baseline= 338 title="bbox 187 306 1464 346"><span class="ocrx_word" title="bbox 187 306 337 344">gemeldet</span> <span class="ocrx_word" title="bbox 366 306 410 344">ist,</span> <span class="ocrx_word" title="bbox 440 307 502 338">und</span> <span 
class="ocrx_word" title="bbox 533 306 588 338">bin</span> <span class="ocrx_word" title="bbox 616 307 650 346">uf</span> <span class="ocrx_word" title="bbox 679 308 737 338">den</span> <span class="ocrx_word" title="bbox 768 308 813 339">17.</span> <span class="ocrx_word" title="bbox 843 307 1011 345">Dezember</span> <span class="ocrx_word" title="bbox 1042 307 1118 338">1620</span> <span class="ocrx_word" title="bbox 1147 308 1283 339">widrum</span> <span class="ocrx_word" title="bbox 1313 308 1400 344">haim</span> <span class="ocrx_word" title="bbox 1430 307 1464 344">uf</span></span> <br><span class="ocr_line" baseline= 383 title="bbox 186 351 1462 391"><span class="ocrx_word" title="bbox 186 351 273 389">Mne</span> <span class="ocrx_word" title="bbox 303 351 441 382">kommen</span> <span class="ocrx_word" title="bbox 471 352 534 382">und</span> <span class="ocrx_word" title="bbox 565 352 652 383">allda</span> <span class="ocrx_word" title="bbox 681 352 863 383">verblieben</span> <span class="ocrx_word" title="bbox 893 352 943 382">bis</span> <span class="ocrx_word" title="bbox 973 352 1028 389">auf</span> <span class="ocrx_word" title="bbox 1058 352 1116 383">den</span> <span class="ocrx_word" title="bbox 1145 354 1172 384">9.</span> <span class="ocrx_word" title="bbox 1201 353 1373 391">Februarii</span> <span class="ocrx_word" title="bbox 1404 352 1462 383">Ao.</span></span> <br><span class="ocr_line" baseline= 427 title="bbox 187 396 1463 435"><span class="ocrx_word" title="bbox 187 396 272 435">1621,</span> <span class="ocrx_word" title="bbox 294 397 331 428">do</span> <span class="ocrx_word" title="bbox 355 396 410 427">bin</span> <span class="ocrx_word" title="bbox 433 404 491 433">gen</span> <span class="ocrx_word" title="bbox 513 396 734 435">Memmingen</span> <span class="ocrx_word" title="bbox 756 405 859 435">zogen,</span> <span class="ocrx_word" title="bbox 883 397 926 427">im</span> <span class="ocrx_word" title="bbox 950 396 1075 428">Namen</span> 
<span class="ocrx_word" title="bbox 1099 398 1223 435">Gottes,</span> <span class="ocrx_word" title="bbox 1246 398 1304 428">mit</span> <span class="ocrx_word" title="bbox 1328 396 1463 435">Maifter</span></span> <br><span class="ocr_line" baseline= 472 title="bbox 188 441 1463 480"><span class="ocrx_word" title="bbox 188 441 379 473">Jeronimus</span> <span class="ocrx_word" title="bbox 414 441 571 480">Andreae,</span> <span class="ocrx_word" title="bbox 605 442 677 473">umb</span> <span class="ocrx_word" title="bbox 714 442 806 473">einen</span> <span class="ocrx_word" title="bbox 841 441 990 479">ehrlichen</span> <span class="ocrx_word" title="bbox 1024 441 1159 480">Maifter</span> <span class="ocrx_word" title="bbox 1193 450 1229 480">zu</span> <span class="ocrx_word" title="bbox 1264 442 1369 480">fechen,</span> <span class="ocrx_word" title="bbox 1404 442 1463 472">das</span></span> <br><span class="ocr_line" baseline= 517 title="bbox 185 485 1463 525"><span class="ocrx_word" title="bbox 185 485 584 524">Tuochfchererhandtwerck</span> <span class="ocrx_word" title="bbox 607 495 642 525">zu</span> <span class="ocrx_word" title="bbox 665 487 785 524">lernen,</span> <span class="ocrx_word" title="bbox 810 486 911 517">damit</span> <span class="ocrx_word" title="bbox 936 486 974 523">ich</span> <span class="ocrx_word" title="bbox 1000 487 1072 516">weit</span> <span class="ocrx_word" title="bbox 1095 487 1150 524">hin</span> <span class="ocrx_word" title="bbox 1176 487 1237 517">und</span> <span class="ocrx_word" title="bbox 1263 487 1360 517">wider</span> <span class="ocrx_word" title="bbox 1384 486 1463 523">ohne</span></span> <br><span class="ocr_line" baseline= 561 title="bbox 185 530 1463 569"><span class="ocrx_word" title="bbox 185 530 259 562">Gelt</span> <span class="ocrx_word" title="bbox 281 530 398 568">Raifen</span> <span class="ocrx_word" title="bbox 421 530 483 560">und</span> <span class="ocrx_word" title="bbox 505 530 724 569">Fortkommen</span> 
<span class="ocrx_word" title="bbox 747 531 810 561">und</span> <span class="ocrx_word" title="bbox 835 531 898 568">also</span> <span class="ocrx_word" title="bbox 920 531 968 561">die</span> <span class="ocrx_word" title="bbox 992 530 1115 562">Länder</span> <span class="ocrx_word" title="bbox 1139 531 1217 569">ohne</span> <span class="ocrx_word" title="bbox 1240 530 1391 568">Unkosten</span> <span class="ocrx_word" title="bbox 1414 530 1463 561">be-</span></span> <br><span class="ocr_line" baseline= 606 title="bbox 184 575 1465 614"><span class="ocrx_word" title="bbox 184 575 271 613">sehen</span> <span class="ocrx_word" title="bbox 295 575 397 606">könde.</span> <span class="ocrx_word" title="bbox 430 575 500 613">Hab</span> <span class="ocrx_word" title="bbox 524 575 596 613">mich</span> <span class="ocrx_word" title="bbox 621 576 694 608">alda</span> <span class="ocrx_word" title="bbox 717 576 911 614">versprochen</span> <span class="ocrx_word" title="bbox 935 576 1006 612">nach</span> <span class="ocrx_word" title="bbox 1031 576 1239 614">gewohnheit,</span> <span class="ocrx_word" title="bbox 1261 576 1279 606">2</span> <span class="ocrx_word" title="bbox 1303 575 1388 613">Jahr</span> <span class="ocrx_word" title="bbox 1412 583 1465 613">zue</span></span> <br><span class="ocr_line" baseline= 652 title="bbox 187 619 1464 659"><span class="ocrx_word" title="bbox 187 619 321 658">Maifter</span> <span class="ocrx_word" title="bbox 346 620 449 658">Georg</span> <span class="ocrx_word" title="bbox 474 619 641 658">Schillern.</span> <span class="ocrx_word" title="bbox 686 621 756 659">Hab</span> <span class="ocrx_word" title="bbox 783 620 879 659">Jhme</span> <span class="ocrx_word" title="bbox 905 620 1097 659">versprochen</span> <span class="ocrx_word" title="bbox 1123 622 1161 652">40</span> <span class="ocrx_word" title="bbox 1185 621 1219 659">fl.</span> <span class="ocrx_word" title="bbox 1245 620 1404 659">Lehrlohn</span> <span class="ocrx_word" 
title="bbox 1429 628 1464 659">zu</span></span> <br><span class="ocr_line" baseline= 695 title="bbox 184 665 289 702"><span class="ocrx_word" title="bbox 184 665 289 702">geben.</span></span> <br></p>
|
|
17
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 750 title="bbox 264 718 1463 759"><span class="ocrx_word" title="bbox 264 719 349 750">Dato</span> <span class="ocrx_word" title="bbox 369 719 427 750">den</span> <span class="ocrx_word" title="bbox 448 718 493 749">10.</span> <span class="ocrx_word" title="bbox 512 719 677 758">February</span> <span class="ocrx_word" title="bbox 698 720 800 758">haben</span> <span class="ocrx_word" title="bbox 820 720 879 751">wir</span> <span class="ocrx_word" title="bbox 899 720 954 757">bey</span> <span class="ocrx_word" title="bbox 975 720 1041 750">dem</span> <span class="ocrx_word" title="bbox 1062 719 1194 759">Weißen</span> <span class="ocrx_word" title="bbox 1213 720 1302 757">Oxen</span> <span class="ocrx_word" title="bbox 1323 719 1463 758">verzehrt</span></span> <br><span class="ocr_line" baseline= 795 title="bbox 186 763 1463 803"><span class="ocrx_word" title="bbox 186 764 204 794">6</span> <span class="ocrx_word" title="bbox 228 764 262 802">fl.</span> <span class="ocrx_word" title="bbox 286 765 304 794">7</span> <span class="ocrx_word" title="bbox 330 763 400 802">batz.</span> <span class="ocrx_word" title="bbox 436 763 550 795">Daran</span> <span class="ocrx_word" title="bbox 575 764 629 802">hat</span> <span class="ocrx_word" title="bbox 655 765 708 796">der</span> <span class="ocrx_word" title="bbox 734 764 869 803">Maifter</span> <span class="ocrx_word" title="bbox 894 765 911 795">3</span> <span class="ocrx_word" title="bbox 936 764 970 802">fl.</span> <span class="ocrx_word" title="bbox 995 764 1058 802">zalt</span> <span class="ocrx_word" title="bbox 1084 765 1147 796">und</span> <span class="ocrx_word" title="bbox 1173 765 1226 802">Ich</span> <span class="ocrx_word" title="bbox 1251 765 1310 795">das</span> <span class="ocrx_word" title="bbox 1336 764 1463 
801">Uebrig.</span></span> <br><span class="ocr_line" baseline= 840 title="bbox 185 808 1463 848"><span class="ocrx_word" title="bbox 185 808 256 840">Und</span> <span class="ocrx_word" title="bbox 282 808 337 840">bin</span> <span class="ocrx_word" title="bbox 364 808 403 846">ich</span> <span class="ocrx_word" title="bbox 431 808 465 846">uf</span> <span class="ocrx_word" title="bbox 491 808 549 840">den</span> <span class="ocrx_word" title="bbox 576 809 623 841">20.</span> <span class="ocrx_word" title="bbox 649 809 816 848">February</span> <span class="ocrx_word" title="bbox 843 810 919 839">1621</span> <span class="ocrx_word" title="bbox 947 809 1002 846">bey</span> <span class="ocrx_word" title="bbox 1031 809 1127 846">Jhme</span> <span class="ocrx_word" title="bbox 1154 809 1384 848">eingestanden.</span> <span class="ocrx_word" title="bbox 1422 808 1463 846">Uf</span></span> <br><span class="ocr_line" baseline= 884 title="bbox 185 853 1462 892"><span class="ocrx_word" title="bbox 185 854 222 884">2?.</span> <span class="ocrx_word" title="bbox 246 854 306 885">dto.</span> <span class="ocrx_word" title="bbox 330 854 364 884">in</span> <span class="ocrx_word" title="bbox 388 854 440 885">der</span> <span class="ocrx_word" title="bbox 464 853 564 892">Zunft</span> <span class="ocrx_word" title="bbox 588 854 828 892">eingeschrieben</span> <span class="ocrx_word" title="bbox 851 854 987 885">worden.</span> <span class="ocrx_word" title="bbox 1020 854 1104 885">Dato</span> <span class="ocrx_word" title="bbox 1129 862 1197 892">zum</span> <span class="ocrx_word" title="bbox 1219 854 1352 892">Weihen</span> <span class="ocrx_word" title="bbox 1374 854 1462 891">Oxen</span></span> <br><span class="ocr_line" baseline= 929 title="bbox 185 897 951 937"><span class="ocrx_word" title="bbox 185 898 324 936">verzehrt</span> <span class="ocrx_word" title="bbox 345 899 363 928">6</span> <span class="ocrx_word" title="bbox 384 897 429 937">fl.,</span> <span class="ocrx_word" 
title="bbox 451 898 512 936">hab</span> <span class="ocrx_word" title="bbox 533 899 572 937">ich</span> <span class="ocrx_word" title="bbox 593 899 708 937">halben</span> <span class="ocrx_word" title="bbox 730 899 822 937">Theil</span> <span class="ocrx_word" title="bbox 846 898 951 936">geben.</span></span> <br></p>
|
|
18
|
+
<p class="ocr_par" leftIndent=10600 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1001 title="bbox 625 970 1016 1003"><span class="ocrx_word" title="bbox 625 971 733 1003">Gott</span> <span class="ocrx_word" title="bbox 751 970 865 1002">Gebe</span> <span class="ocrx_word" title="bbox 883 970 1016 1002">Gnad.</span></span> <br></p>
|
|
19
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1054 title="bbox 265 1023 1464 1062"><span class="ocrx_word" title="bbox 265 1023 454 1061">Sambstag,</span> <span class="ocrx_word" title="bbox 474 1023 532 1055">den</span> <span class="ocrx_word" title="bbox 553 1024 598 1054">18.</span> <span class="ocrx_word" title="bbox 620 1024 751 1062">Augusti</span> <span class="ocrx_word" title="bbox 773 1024 850 1054">1621</span> <span class="ocrx_word" title="bbox 871 1024 906 1062">ist</span> <span class="ocrx_word" title="bbox 927 1024 1021 1055">Mein</span> <span class="ocrx_word" title="bbox 1040 1023 1243 1061">Lehrmeister</span> <span class="ocrx_word" title="bbox 1265 1024 1344 1061">Jerg</span> <span class="ocrx_word" title="bbox 1364 1023 1464 1061">Schik-</span></span> <br><span class="ocr_line" baseline= 1099 title="bbox 187 1067 1464 1106"><span class="ocrx_word" title="bbox 187 1068 232 1099">ler</span> <span class="ocrx_word" title="bbox 260 1069 294 1099">in</span> <span class="ocrx_word" title="bbox 321 1068 398 1099">Gott</span> <span class="ocrx_word" title="bbox 425 1067 572 1106">seeliglich</span> <span class="ocrx_word" title="bbox 600 1068 792 1106">entschlafen.</span> <span class="ocrx_word" title="bbox 829 1069 906 1100">Gott</span> <span class="ocrx_word" title="bbox 933 1069 985 1100">der</span> <span class="ocrx_word" title="bbox 1013 1067 1194 1106">Allmechtig</span> <span class="ocrx_word" title="bbox 1221 1068 1341 1105">verleih</span> <span class="ocrx_word" title="bbox 1368 1068 1464 1105">Jhme</span></span> <br><span class="ocr_line" baseline= 1144 title="bbox 188 1112 1464 1152"><span class="ocrx_word" title="bbox 188 1113 255 1144">eine</span> <span class="ocrx_word" title="bbox 274 1112 417 1150">fröhliche</span> <span class="ocrx_word" title="bbox 437 1112 666 1152">Auferstehung</span> <span 
class="ocrx_word" title="bbox 687 1114 750 1144">und</span> <span class="ocrx_word" title="bbox 769 1121 832 1144">uns</span> <span class="ocrx_word" title="bbox 853 1112 949 1151">allen,</span> <span class="ocrx_word" title="bbox 968 1113 1069 1144">einem</span> <span class="ocrx_word" title="bbox 1089 1112 1192 1145">Jeden</span> <span class="ocrx_word" title="bbox 1211 1121 1246 1151">zu</span> <span class="ocrx_word" title="bbox 1265 1113 1363 1151">seiner</span> <span class="ocrx_word" title="bbox 1384 1112 1464 1151">Zeit,</span></span> <br><span class="ocr_line" baseline= 1188 title="bbox 186 1157 1463 1197"><span class="ocrx_word" title="bbox 186 1158 236 1189">ein</span> <span class="ocrx_word" title="bbox 261 1158 390 1195">Seeligs</span> <span class="ocrx_word" title="bbox 415 1158 494 1189">End.</span> <span class="ocrx_word" title="bbox 531 1158 640 1190">Amen.</span> <span class="ocrx_word" title="bbox 676 1158 725 1197">Ist</span> <span class="ocrx_word" title="bbox 750 1159 783 1189">in</span> <span class="ocrx_word" title="bbox 809 1158 913 1189">Allem</span> <span class="ocrx_word" title="bbox 938 1158 956 1187">8</span> <span class="ocrx_word" title="bbox 981 1157 1109 1195">Wochen</span> <span class="ocrx_word" title="bbox 1132 1158 1150 1188">2</span> <span class="ocrx_word" title="bbox 1174 1158 1243 1195">Tag</span> <span class="ocrx_word" title="bbox 1268 1158 1358 1188">krank</span> <span class="ocrx_word" title="bbox 1383 1157 1463 1195">gele¬</span></span> <br><span class="ocr_line" baseline= 1233 title="bbox 185 1202 1463 1241"><span class="ocrx_word" title="bbox 185 1211 253 1241">gen,</span> <span class="ocrx_word" title="bbox 278 1202 471 1240">unterdessen</span> <span class="ocrx_word" title="bbox 496 1203 556 1240">hab</span> <span class="ocrx_word" title="bbox 582 1203 621 1240">ich</span> <span class="ocrx_word" title="bbox 645 1204 694 1234">die</span> <span class="ocrx_word" title="bbox 718 1203 902 1240">Werckstatt,</span> 
<span class="ocrx_word" title="bbox 928 1202 1069 1240">Gottlob,</span> <span class="ocrx_word" title="bbox 1096 1203 1147 1233">als</span> <span class="ocrx_word" title="bbox 1173 1203 1222 1233">ein</span> <span class="ocrx_word" title="bbox 1249 1202 1351 1240">Gesell</span> <span class="ocrx_word" title="bbox 1377 1202 1463 1240">füeh-</span></span> <br><span class="ocr_line" baseline= 1278 title="bbox 185 1247 1314 1285"><span class="ocrx_word" title="bbox 185 1255 240 1278">ren</span> <span class="ocrx_word" title="bbox 264 1247 396 1285">müeffen</span> <span class="ocrx_word" title="bbox 421 1247 484 1277">und</span> <span class="ocrx_word" title="bbox 508 1248 562 1285">hat</span> <span class="ocrx_word" title="bbox 585 1249 646 1280">mir</span> <span class="ocrx_word" title="bbox 670 1247 803 1279">Gottlob</span> <span class="ocrx_word" title="bbox 826 1247 902 1277">eben</span> <span class="ocrx_word" title="bbox 927 1247 1009 1284">wohl</span> <span class="ocrx_word" title="bbox 1033 1247 1134 1278">damit</span> <span class="ocrx_word" title="bbox 1159 1247 1314 1285">gelungen</span></span> <br></p>
|
|
20
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1323 title="bbox 266 1291 1464 1331"><span class="ocrx_word" title="bbox 266 1291 331 1330">Auf</span> <span class="ocrx_word" title="bbox 354 1292 413 1323">den</span> <span class="ocrx_word" title="bbox 438 1291 534 1330">ersten</span> <span class="ocrx_word" title="bbox 558 1292 690 1331">Augusti</span> <span class="ocrx_word" title="bbox 715 1293 791 1323">1621</span> <span class="ocrx_word" title="bbox 814 1292 875 1329">hab</span> <span class="ocrx_word" title="bbox 899 1292 937 1330">ich</span> <span class="ocrx_word" title="bbox 960 1291 1032 1330">mich</span> <span class="ocrx_word" title="bbox 1056 1299 1090 1330">zu</span> <span class="ocrx_word" title="bbox 1114 1292 1236 1330">Herren</span> <span class="ocrx_word" title="bbox 1259 1292 1335 1324">Veit</span> <span class="ocrx_word" title="bbox 1357 1291 1464 1329">Schal-</span></span> <br><span class="ocr_line" baseline= 1367 title="bbox 187 1336 1174 1375"><span class="ocrx_word" title="bbox 187 1337 239 1368">ken</span> <span class="ocrx_word" title="bbox 265 1336 468 1375">versprochen,</span> <span class="ocrx_word" title="bbox 496 1336 554 1368">das</span> <span class="ocrx_word" title="bbox 580 1337 754 1374">Handwerk</span> <span class="ocrx_word" title="bbox 780 1337 927 1368">vollends</span> <span class="ocrx_word" title="bbox 953 1336 1174 1374">auszulernen.</span></span> <br></p>
|
|
21
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1412 title="bbox 265 1381 1466 1420"><span class="ocrx_word" title="bbox 265 1382 336 1413">Und</span> <span class="ocrx_word" title="bbox 356 1382 416 1419">hab</span> <span class="ocrx_word" title="bbox 435 1381 530 1419">Jhme</span> <span class="ocrx_word" title="bbox 551 1382 586 1412">18</span> <span class="ocrx_word" title="bbox 605 1383 638 1420">fl.</span> <span class="ocrx_word" title="bbox 658 1382 764 1420">geben.</span> <span class="ocrx_word" title="bbox 795 1381 879 1413">Aber</span> <span class="ocrx_word" title="bbox 907 1382 975 1412">dem</span> <span class="ocrx_word" title="bbox 1003 1382 1134 1419">vorigen</span> <span class="ocrx_word" title="bbox 1162 1381 1297 1419">Maifter</span> <span class="ocrx_word" title="bbox 1326 1381 1466 1419">(dieweil</span></span> <br><span class="ocr_line" baseline= 1457 title="bbox 187 1425 1464 1465"><span class="ocrx_word" title="bbox 187 1426 235 1457">bei</span> <span class="ocrx_word" title="bbox 258 1426 311 1457">der</span> <span class="ocrx_word" title="bbox 333 1425 533 1463">Wittfrauen</span> <span class="ocrx_word" title="bbox 556 1427 635 1464">nicht</span> <span class="ocrx_word" title="bbox 659 1427 832 1457">auslernen</span> <span class="ocrx_word" title="bbox 857 1426 989 1465">könden)</span> <span class="ocrx_word" title="bbox 1015 1434 1076 1456">nur</span> <span class="ocrx_word" title="bbox 1100 1427 1138 1456">20</span> <span class="ocrx_word" title="bbox 1161 1426 1195 1464">fl.</span> <span class="ocrx_word" title="bbox 1230 1426 1316 1458">Dato</span> <span class="ocrx_word" title="bbox 1339 1426 1400 1464">hab</span> <span class="ocrx_word" title="bbox 1424 1426 1464 1462">ich</span></span> <br><span class="ocr_line" baseline= 1502 title="bbox 185 1471 585 1510"><span class="ocrx_word" title="bbox 
185 1479 255 1509">zum</span> <span class="ocrx_word" title="bbox 276 1471 376 1509">besten</span> <span class="ocrx_word" title="bbox 395 1471 492 1509">geben</span> <span class="ocrx_word" title="bbox 512 1472 531 1502">2</span> <span class="ocrx_word" title="bbox 551 1471 585 1510">fl.</span></span> <br></p>
|
|
22
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1555 title="bbox 266 1524 1464 1563"><span class="ocrx_word" title="bbox 266 1525 338 1556">Adi.</span> <span class="ocrx_word" title="bbox 355 1525 447 1562">Juny</span> <span class="ocrx_word" title="bbox 465 1524 541 1555">1622</span> <span class="ocrx_word" title="bbox 560 1526 648 1563">(weil</span> <span class="ocrx_word" title="bbox 665 1534 697 1556">es</span> <span class="ocrx_word" title="bbox 714 1526 763 1556">die</span> <span class="ocrx_word" title="bbox 780 1525 978 1563">gelegenhait</span> <span class="ocrx_word" title="bbox 995 1524 1112 1562">sonsten</span> <span class="ocrx_word" title="bbox 1127 1525 1239 1563">geben)</span> <span class="ocrx_word" title="bbox 1259 1525 1315 1555">bin</span> <span class="ocrx_word" title="bbox 1331 1524 1384 1561">Ich</span> <span class="ocrx_word" title="bbox 1402 1532 1464 1554">von</span></span> <br><span class="ocr_line" baseline= 1600 title="bbox 188 1569 1464 1608"><span class="ocrx_word" title="bbox 188 1569 322 1607">Maifter</span> <span class="ocrx_word" title="bbox 346 1570 467 1607">Caspar</span> <span class="ocrx_word" title="bbox 491 1569 626 1608">Müller,</span> <span class="ocrx_word" title="bbox 652 1570 827 1608">Schleifern</span> <span class="ocrx_word" title="bbox 854 1578 889 1607">zu</span> <span class="ocrx_word" title="bbox 913 1569 1144 1607">Memmingen,</span> <span class="ocrx_word" title="bbox 1170 1577 1239 1607">zum</span> <span class="ocrx_word" title="bbox 1265 1569 1390 1607">Gfellen</span> <span class="ocrx_word" title="bbox 1416 1576 1464 1606">ge-</span></span> <br><span class="ocr_line" baseline= 1644 title="bbox 187 1613 1464 1652"><span class="ocrx_word" title="bbox 187 1614 282 1652">macht</span> <span class="ocrx_word" title="bbox 311 1614 373 1645">und</span> <span class="ocrx_word" 
title="bbox 402 1613 644 1651">aufgenommen</span> <span class="ocrx_word" title="bbox 672 1615 808 1645">worden.</span> <span class="ocrx_word" title="bbox 846 1614 964 1651">Sampt</span> <span class="ocrx_word" title="bbox 995 1613 1098 1645">Lucas</span> <span class="ocrx_word" title="bbox 1126 1613 1248 1651">Hursich</span> <span class="ocrx_word" title="bbox 1277 1613 1340 1644">und</span> <span class="ocrx_word" title="bbox 1368 1613 1464 1651">Peter</span></span> <br><span class="ocr_line" baseline= 1689 title="bbox 185 1657 1463 1697"><span class="ocrx_word" title="bbox 185 1659 346 1697">Holzwart</span> <span class="ocrx_word" title="bbox 369 1667 430 1690">von</span> <span class="ocrx_word" title="bbox 455 1658 684 1697">Memmingen.</span> <span class="ocrx_word" title="bbox 718 1660 802 1691">Dato</span> <span class="ocrx_word" title="bbox 825 1658 928 1697">haben</span> <span class="ocrx_word" title="bbox 952 1659 1010 1689">das</span> <span class="ocrx_word" title="bbox 1033 1658 1217 1696">Handwerk,</span> <span class="ocrx_word" title="bbox 1241 1658 1376 1696">Maifter</span> <span class="ocrx_word" title="bbox 1401 1657 1463 1688">und</span></span> <br><span class="ocr_line" baseline= 1734 title="bbox 187 1702 1464 1742"><span class="ocrx_word" title="bbox 187 1704 328 1742">Gesellen</span> <span class="ocrx_word" title="bbox 353 1703 492 1741">verzehrt</span> <span class="ocrx_word" title="bbox 518 1704 573 1741">bey</span> <span class="ocrx_word" title="bbox 597 1704 664 1735">dem</span> <span class="ocrx_word" title="bbox 688 1704 821 1742">Weißen</span> <span class="ocrx_word" title="bbox 846 1703 934 1741">Oxen</span> <span class="ocrx_word" title="bbox 957 1704 994 1733">34</span> <span class="ocrx_word" title="bbox 1020 1703 1064 1740">fl.,</span> <span class="ocrx_word" title="bbox 1089 1703 1147 1734">den</span> <span class="ocrx_word" title="bbox 1173 1703 1291 1740">Thaler</span> <span class="ocrx_word" title="bbox 1317 1710 1352 
1740">zu</span> <span class="ocrx_word" title="bbox 1376 1703 1394 1733">9</span> <span class="ocrx_word" title="bbox 1419 1702 1464 1740">fl.,</span></span> <br><span class="ocr_line" baseline= 1778 title="bbox 185 1747 1463 1787"><span class="ocrx_word" title="bbox 185 1749 288 1779">daran</span> <span class="ocrx_word" title="bbox 309 1748 344 1787">ist</span> <span class="ocrx_word" title="bbox 366 1749 428 1779">mir</span> <span class="ocrx_word" title="bbox 452 1747 608 1786">auferlegt</span> <span class="ocrx_word" title="bbox 631 1749 758 1779">worden</span> <span class="ocrx_word" title="bbox 781 1756 815 1787">zu</span> <span class="ocrx_word" title="bbox 839 1747 987 1786">bezahlen</span> <span class="ocrx_word" title="bbox 1010 1748 1047 1777">24</span> <span class="ocrx_word" title="bbox 1070 1747 1115 1786">fl.,</span> <span class="ocrx_word" title="bbox 1137 1748 1196 1778">das</span> <span class="ocrx_word" title="bbox 1219 1747 1337 1785">Uebrig</span> <span class="ocrx_word" title="bbox 1360 1747 1463 1785">haben</span></span> <br><span class="ocr_line" baseline= 1824 title="bbox 187 1793 621 1831"><span class="ocrx_word" title="bbox 187 1794 235 1825">die</span> <span class="ocrx_word" title="bbox 255 1793 375 1824">andern</span> <span class="ocrx_word" title="bbox 395 1793 492 1830">Zwen</span> <span class="ocrx_word" title="bbox 511 1793 621 1831">bezalt.</span></span> <br></p>
|
|
23
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 1868 title="bbox 265 1836 1465 1876"><span class="ocrx_word" title="bbox 265 1838 467 1876">Morndrigs,</span> <span class="ocrx_word" title="bbox 490 1838 549 1869">den</span> <span class="ocrx_word" title="bbox 603 1855 609 1861">•</span> <span class="ocrx_word" title="bbox 631 1838 723 1876">Juny</span> <span class="ocrx_word" title="bbox 748 1837 823 1868">1622</span> <span class="ocrx_word" title="bbox 847 1837 949 1876">haben</span> <span class="ocrx_word" title="bbox 980 1837 1027 1868">die</span> <span class="ocrx_word" title="bbox 1059 1836 1194 1875">Maister</span> <span class="ocrx_word" title="bbox 1226 1836 1290 1868">und</span> <span class="ocrx_word" title="bbox 1322 1836 1465 1874">Gesellen</span></span> <br><span class="ocr_line" baseline= 1913 title="bbox 189 1882 1464 1921"><span class="ocrx_word" title="bbox 189 1883 323 1914">widrum</span> <span class="ocrx_word" title="bbox 353 1883 403 1913">ein</span> <span class="ocrx_word" title="bbox 432 1883 531 1914">trunck</span> <span class="ocrx_word" title="bbox 560 1884 683 1921">gethon.</span> <span class="ocrx_word" title="bbox 723 1883 793 1921">Hab</span> <span class="ocrx_word" title="bbox 824 1883 862 1920">ich</span> <span class="ocrx_word" title="bbox 892 1882 1001 1920">zahlen</span> <span class="ocrx_word" title="bbox 1031 1882 1163 1921">müessen</span> <span class="ocrx_word" title="bbox 1192 1883 1210 1913">5</span> <span class="ocrx_word" title="bbox 1240 1882 1273 1920">fl.</span> <span class="ocrx_word" title="bbox 1303 1882 1366 1912">und</span> <span class="ocrx_word" title="bbox 1397 1882 1464 1912">dem</span></span> <br><span class="ocr_line" baseline= 1958 title="bbox 188 1927 869 1966"><span class="ocrx_word" title="bbox 188 1927 387 1966">Zunftknecht</span> <span class="ocrx_word" 
title="bbox 407 1928 422 1957">1</span> <span class="ocrx_word" title="bbox 441 1928 514 1966">maß</span> <span class="ocrx_word" title="bbox 532 1927 627 1959">Wein</span> <span class="ocrx_word" title="bbox 646 1936 701 1965">pro</span> <span class="ocrx_word" title="bbox 720 1929 738 1958">9</span> <span class="ocrx_word" title="bbox 758 1927 869 1965">batzen.</span></span> <br></p>
|
|
24
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1104 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2011 title="bbox 268 1979 1465 2019"><span class="ocrx_word" title="bbox 268 1981 402 2019">Maister</span> <span class="ocrx_word" title="bbox 428 1980 549 2018">Caspar</span> <span class="ocrx_word" title="bbox 575 1980 708 2019">Müller,</span> <span class="ocrx_word" title="bbox 734 1980 886 2018">Schleifer</span> <span class="ocrx_word" title="bbox 912 1981 974 2011">und</span> <span class="ocrx_word" title="bbox 1001 1979 1127 2017">Bürger</span> <span class="ocrx_word" title="bbox 1155 1988 1207 2018">zue</span> <span class="ocrx_word" title="bbox 1234 1979 1465 2017">Memmingen,</span></span> <br><span class="ocr_line" baseline= 2057 title="bbox 187 2025 528 2064"><span class="ocrx_word" title="bbox 187 2027 246 2064">gab</span> <span class="ocrx_word" title="bbox 268 2027 329 2058">mir</span> <span class="ocrx_word" title="bbox 351 2025 411 2064">dise</span> <span class="ocrx_word" title="bbox 433 2025 528 2063">Lehr:</span></span> <br></p>
|
|
25
|
+
<p class="ocr_par" leftIndent=2900 lineSpacing=1056 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2110 title="bbox 305 2078 1346 2118"><span class="ocrx_word" title="bbox 305 2079 415 2118">Fürcht</span> <span class="ocrx_word" title="bbox 435 2080 498 2110">und</span> <span class="ocrx_word" title="bbox 520 2080 582 2111">lieb</span> <span class="ocrx_word" title="bbox 603 2080 690 2118">Gott,</span> <span class="ocrx_word" title="bbox 712 2079 777 2110">und</span> <span class="ocrx_word" title="bbox 795 2079 919 2111">Deinen</span> <span class="ocrx_word" title="bbox 943 2078 1065 2116">negsten</span> <span class="ocrx_word" title="bbox 1089 2079 1140 2110">als</span> <span class="ocrx_word" title="bbox 1162 2079 1233 2116">Dich</span> <span class="ocrx_word" title="bbox 1255 2078 1346 2116">selbs,</span></span> <br><span class="ocr_line" baseline= 2154 title="bbox 305 2123 1107 2162"><span class="ocrx_word" title="bbox 305 2125 334 2162">so</span> <span class="ocrx_word" title="bbox 359 2124 474 2162">würstu</span> <span class="ocrx_word" title="bbox 497 2132 559 2155">von</span> <span class="ocrx_word" title="bbox 582 2124 678 2157">Allen</span> <span class="ocrx_word" title="bbox 702 2124 824 2161">bayden</span> <span class="ocrx_word" title="bbox 847 2123 952 2161">geehrt</span> <span class="ocrx_word" title="bbox 975 2123 1107 2155">werden.</span></span> <br></p>
|
|
26
|
+
<p class="ocr_par" align=Justified leftIndent=100 startIndent=1900 lineSpacing=1128 style="font-size:10pt;font-family:"Times New Roman";font-style:normal"><br><span class="ocr_line" baseline= 2208 title="bbox 266 2176 1462 2217"><span class="ocrx_word" title="bbox 266 2178 400 2216">Maister</span> <span class="ocrx_word" title="bbox 419 2177 545 2209">Andres</span> <span class="ocrx_word" title="bbox 563 2178 695 2217">Hursich,</span> <span class="ocrx_word" title="bbox 713 2177 923 2216">Tuochscherer</span> <span class="ocrx_word" title="bbox 941 2185 976 2214">zu</span> <span class="ocrx_word" title="bbox 994 2176 1224 2215">Memmingen,</span> <span class="ocrx_word" title="bbox 1244 2176 1304 2214">gab</span> <span class="ocrx_word" title="bbox 1323 2177 1384 2208">mir</span> <span class="ocrx_word" title="bbox 1402 2176 1462 2214">dise</span></span> <br><span class="ocr_line" baseline= 2254 title="bbox 186 2223 280 2263"><span class="ocrx_word" title="bbox 186 2223 280 2263">Lehr:</span></span> <br></p></div></div>
|
|
27
|
+
</body>
|
|
28
|
+
</html>
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
require 'sinatra'
|
|
3
|
+
|
|
4
|
+
require_relative '../lib/ocr_page'
|
|
5
|
+
|
|
6
|
+
# Landing page: a single link to the static OCR marking demo.
get '/' do
  %(<a href='OCRTest.html'>OCRTest</a>)
end

# Returns the words of an hOCR page that lie inside a rectangle.
#
# Query parameters: x1, y1, x2, y2 (rectangle corners) and page (hOCR file
# name). The matching words are joined with "<br/>"; when any parameter is
# missing a plain-text notice is returned instead.
get '/mark' do
  corners = [:x1, :y1, :x2, :y2].map { |key| params[key] }
  page = params[:page]

  if corners.all? && page
    get_enclosed_words(*corners, page).join("<br/>")
  else
    "Not enough parameters"
  end
end
|
|
25
|
+
|
|
26
|
+
# Looks up the words of an hOCR page that lie completely inside a rectangle.
#
# x1, y1, x2, y2 - rectangle corners; converted with #to_i.
# page           - file name of an hOCR page inside the gem's data directory.
#                  It comes straight from the HTTP request, so only its
#                  basename is used: directory components such as "../" are
#                  dropped and cannot reach files outside the data directory.
#
# Returns the Array produced by OCRPage#enclosed_words, or an empty Array
# when no such page file exists.
def get_enclosed_words(x1, y1, x2, y2, page)
  # Anchor the data directory to this file instead of the working directory
  # so the server behaves the same wherever it is started from.
  data_dir = File.expand_path('../../data', __FILE__)
  page_path = File.join(data_dir, File.basename(page.to_s))
  return [] unless File.file?(page_path)

  ocr_page = OCRPage.new(page_path)
  ocr_page.enclosed_words(OCRBox.new(x1.to_i, y1.to_i, x2.to_i, y2.to_i))
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
body {
|
|
2
|
+
font: 16px Helvetica, Arial;
|
|
3
|
+
margin:0px;
|
|
4
|
+
padding:0px;
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
.marked {
|
|
8
|
+
color:#FFF;
|
|
9
|
+
position: absolute;
|
|
10
|
+
background-color:purple;
|
|
11
|
+
opacity: 0.4;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
#ocr_image {
|
|
15
|
+
background-image: url(img/Seite_Tagebuch_H_C_Lang_08.jpg);
|
|
16
|
+
width:1600px;
|
|
17
|
+
height:2495px;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
#marked_words {
|
|
21
|
+
font-family:monospace;
|
|
22
|
+
border:0.2em solid #333;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
.selected_words {
|
|
26
|
+
background-color:#EDEDED;
|
|
27
|
+
margin:0.2em;
|
|
28
|
+
padding:0.2em;
|
|
29
|
+
}
|
|
30
|
+
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="de">
|
|
3
|
+
<!--
|
|
4
|
+
|
|
5
|
+
Created using http://jsbin.com/
|
|
6
|
+
Source can be edited via http://jsbin.com/azare/edit
|
|
7
|
+
|
|
8
|
+
-->
|
|
9
|
+
<head>
|
|
10
|
+
<meta charset="utf-8" />
|
|
11
|
+
<title>OCRTest</title>
|
|
12
|
+
<link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/themes/base/jquery-ui.css" type="text/css" />
|
|
13
|
+
<link rel="stylesheet" href="OCRTest.css" type="text/css" />
|
|
14
|
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js"></script>
|
|
15
|
+
<script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.7.2/jquery-ui.min.js"></script>
|
|
16
|
+
<script language="javascript" type="text/javascript" src="OCRTest_marker.js"></script>
|
|
17
|
+
</head>
|
|
18
|
+
<body>
|
|
19
|
+
<header>
|
|
20
|
+
<h1>Mark words test</h1>
|
|
21
|
+
</header>
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
<div id="ocr_image">
|
|
26
|
+
</div>
|
|
27
|
+
|
|
28
|
+
<div id="marked_words">
|
|
29
|
+
</div>
|
|
30
|
+
|
|
31
|
+
<script type="text/javascript">
|
|
32
|
+
$(document).ready(function () {
    /* Remember where the image sits on the page; the marker script uses
       these globals to convert between page and image coordinates. */
    var image = $("#ocr_image");
    var origin = image.offset();
    ocr_image_left = origin.left;
    ocr_image_top = origin.top;

    /* Pressing the button records the start corner of the selection. */
    image.mousedown(function (e) {
        startX = e.pageX - ocr_image_left;
        startY = e.pageY - ocr_image_top;
    });

    /* Releasing the button records the end corner and draws the selection. */
    image.mouseup(function (e) {
        endX = e.pageX - ocr_image_left;
        endY = e.pageY - ocr_image_top;
        draw_rectangle();
    });
});
|
|
51
|
+
|
|
52
|
+
</script>
|
|
53
|
+
</body>
|
|
54
|
+
</html>
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
// Corners of the current mouse selection, relative to the #ocr_image element.
// They are assigned by the mousedown/mouseup handlers of the page.
var startX, startY, endX, endY;

// Page offset of the #ocr_image element, assigned once the document is ready.
var ocr_image_left, ocr_image_top;
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
/**
 * Draws a translucent marker over the area selected with the mouse and then
 * asks the server for the words inside it.
 *
 * Reads the globals startX/startY/endX/endY (selection corners relative to
 * the image) and ocr_image_left/ocr_image_top (page offset of the image).
 */
function draw_rectangle() {
    // Left edge and width, whichever direction the mouse was dragged.
    var left = (startX < endX) ? startX : endX;
    var width = (startX < endX) ? endX - startX : startX - endX;

    // Top edge and height, whichever direction the mouse was dragged.
    var top = (startY < endY) ? startY : endY;
    var height = (startY < endY) ? endY - startY : startY - endY;

    // The marker is positioned absolutely, so the image offset has to be
    // added back onto the image-relative coordinates.
    var marker = $("<span></span>")
        .addClass('marked')
        .css('top', top + ocr_image_top)
        .css('left', left + ocr_image_left)
        .height(height)
        .width(width);

    // Show the raw selection coordinates inside the marker.
    marker.append(startX + " "+ startY + " "+ endX +" " + endY);
    $("#ocr_image").append(marker);

    get_marked_words();
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
/**
 * Requests the words inside the current mouse selection from the server and
 * passes the response to add_marked_words.
 *
 * Reads the globals startX/startY/endX/endY. The corners are normalised so
 * that (x1, y1) is the top-left and (x2, y2) the bottom-right corner, no
 * matter in which direction the mouse was dragged.
 */
function get_marked_words() {
    var x1, y1, x2, y2;

    if (startX < endX) {
        x1 = startX;
        x2 = endX;
    } else {
        x1 = endX;
        x2 = startX;
    }

    if (startY < endY) {
        y1 = startY;
        y2 = endY;
    } else {
        y1 = endY;
        // This used to assign to a misspelled name ("y2t"), which left y2
        // undefined whenever the selection was dragged upwards.
        y2 = startY;
    }

    $.ajax({
        url: 'http://localhost:4567/mark',
        // Passing an object lets jQuery build and encode the query string.
        data: {
            x1: x1,
            y1: y1,
            x2: x2,
            y2: y2,
            page: 'Seite_Tagebuch_H_C_Lang_08.html'
        },
        success: add_marked_words
    });
}
|
|
78
|
+
|
|
79
|
+
/**
 * Appends the server response for one selection to the #marked_words list,
 * wrapped in its own .selected_words box.
 */
function add_marked_words(data) {
    $("<div></div>")
        .addClass('selected_words')
        .append(data)
        .appendTo("#marked_words");
}
|
|
Binary file
|
|
Binary file
|
data/lib/hocr.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#coding: utf-8
|
data/lib/ocr_box.rb
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'json'

# Rectangle described by a top-left corner (x1, y1) and a bottom-right
# corner (x2, y2).
class OCRBox

  attr_reader :x1, :y1, :x2, :y2

  # Stores the four coordinates exactly as given; no conversion or
  # reordering is applied.
  def initialize(x1, y1, x2, y2)
    @x1 = x1
    @y1 = y1
    @x2 = x2
    @y2 = y2
  end

  # Returns true when +element+ (anything responding to x1, y1, x2, y2) lies
  # completely inside this box. Shared edges count as enclosed, so a box
  # encloses itself.
  def encloses?(element)
    @x1 <= element.x1 &&
      @x2 >= element.x2 &&
      @y1 <= element.y1 &&
      @y2 >= element.y2
  end

  # Returns true when this box lies completely inside +element+.
  def enclosed_by?(element)
    element.encloses?(self)
  end

  # Human readable form: top-left ("tl") and bottom-right ("br") corner.
  def to_s
    "tl->(x:#{@x1} y:#{@y1})/br->:(x:#{@x2} y:#{@y2})"
  end

  # CSS declarations that absolutely position an element over this box.
  def to_css_style
    top = @y1
    left = @x1
    height = @y2 - @y1
    width = @x2 - @x1

    "position:absolute; top:#{top}px; left:#{left}px; height:#{height}px; width:#{width}px;"
  end

  # Serializes the four coordinates as a JSON object with the keys
  # "x1", "y1", "x2" and "y2".
  #
  # The optional arguments are handed on to Hash#to_json; the JSON generator
  # passes its state this way when a box is nested inside an Array or Hash.
  def to_json(*args)
    { 'x1' => @x1, 'y1' => @y1, 'x2' => @x2, 'y2' => @y2 }.to_json(*args)
  end

end
|
|
43
|
+
|
data/lib/ocr_page.rb
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
require_relative "ocrx_word"
|
|
3
|
+
|
|
4
|
+
# One page of hOCR output, read from a file and broken down into lines of
# OCRXWord objects.
#
# NOTE(review): the class inherits from OCRBox but never calls super, so the
# page's own x1/y1/x2/y2 stay nil - confirm whether the page bounding box is
# meant to be parsed as well.
class OCRPage < OCRBox
  # Marks the beginning of every text line in the hOCR markup.
  LINE_START = /<span class="ocr_line"/
  # One complete word span (opening tag, text, closing tag).
  WORD_SPAN = /<span class="ocrx_word"[^>]+>[^<]+<\/span>/
  # Bounding box of a word span followed by the word's text.
  WORD_BBOX_AND_TEXT = /title="bbox (\d+) (\d+) (\d+) (\d+)">([^<]+)</
  # Bounding box of any hOCR element.
  BBOX = /title="bbox (\d+) (\d+) (\d+) (\d+)">/

  # Array of lines; every line is a non-empty Array of OCRXWord.
  attr_reader :lines

  # Reads the hOCR file +filename+ and keeps every line that contains at
  # least one word.
  def initialize(filename)
    @lines = hocr_lines(file_as_string(filename)).reject(&:empty?)
  end

  # Splits hOCR markup into lines and each line into its words.
  #
  # Returns an Array with one entry per chunk of markup (the text before the
  # first line included), each entry an Array of OCRXWord; entries may be
  # empty. Word spans whose bounding box cannot be parsed are skipped rather
  # than turned into a word at 0/0/0/0 without text.
  def hocr_lines(hocr_contents)
    hocr_contents.split(LINE_START).map do |line|
      parsed = line.scan(WORD_SPAN).map do |word_markup|
        match = WORD_BBOX_AND_TEXT.match(word_markup)
        OCRXWord.new(*match.captures) if match
      end
      parsed.compact
    end
  end

  # All words of the page in reading order; computed once and cached.
  def words
    @words ||= @lines.flatten
  end

  # Words that lie completely inside +box+.
  def enclosed_words(box)
    words.select { |word| word.enclosed_by?(box) }
  end

  # Extracts the bounding box from a piece of hOCR markup.
  #
  # Returns the four coordinates as Strings in the order x1, y1, x2, y2, or
  # four nils when the markup carries no bounding box.
  def get_position(element)
    match = BBOX.match(element)
    match ? match.captures : [nil, nil, nil, nil]
  end

  # Returns the whole content of the file +filename+ as a String.
  def file_as_string(filename)
    File.read(filename)
  end

end
|
data/lib/ocrx_word.rb
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
require_relative 'ocr_box'
|
|
5
|
+
|
|
6
|
+
# A single recognized word: an OCRBox that also carries the word's text.
class OCRXWord < OCRBox

  attr_reader :text

  # The coordinates may be given as Integers or numeric Strings; they are
  # converted with #to_i before being stored. +word+ is kept as the text.
  def initialize(x1, y1, x2, y2, word)
    coordinates = [x1, y1, x2, y2].map { |value| value.to_i }
    super(*coordinates)
    @text = word
  end

  # Renders the word as an absolutely positioned span carrying +css_class+,
  # with the HTML-escaped text inside a nested span of class "word".
  def to_html(css_class = 'ocrx_word')
    escaped_text = CGI::escapeHTML(@text)
    "<span style='#{to_css_style}' class='#{css_class}'><span class='word'>#{escaped_text}</span></span>"
  end

  # The text, a tab and the box description of the superclass.
  def to_s
    box_description = super
    "#{@text}\t#{box_description}"
  end

end
|
data/rhocr.gemspec
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
# Gem specification for rhocr 0.0.1.
#
# Setters that are not available in every RubyGems release are guarded with
# respond_to? so the file loads on old and new versions alike.
Gem::Specification.new do |s|
  s.name = 'rhocr'
  s.version = '0.0.1'

  s.required_rubygems_version = Gem::Requirement.new('>= 1.2') if s.respond_to? :required_rubygems_version=
  s.authors = ['Andreas Neumann']
  s.date = '2011-07-01'
  s.description = 'Manipulate and use OCR data encode in HOCR'
  s.email = 'info @nospam@ an-it.com'
  s.extra_rdoc_files = ['README', 'lib/hocr.rb', 'lib/ocr_box.rb', 'lib/ocr_page.rb', 'lib/ocrx_word.rb']
  s.files = [
    'README',
    'Rakefile',
    'data/Seite_Tagebuch_H_C_Lang_08.html',
    'example/example_server.rb',
    'example/public/OCRTest.css',
    'example/public/OCRTest.html',
    'example/public/OCRTest_marker.js',
    'example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg',
    'example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg',
    'lib/hocr.rb',
    'lib/ocr_box.rb',
    'lib/ocr_page.rb',
    'lib/ocrx_word.rb',
    'rspec/ocr_box_spec.rb',
    'rspec/ocr_page_spec.rb',
    'rspec/ocrx_word_spec.rb',
    'test.rb',
    'Manifest',
    'rhocr.gemspec'
  ]
  s.homepage = 'http://github.com/daandi/rhocr'
  s.rdoc_options = ['--line-numbers', '--inline-source', '--title', 'Rhocr', '--main', 'README']
  s.require_paths = ['lib']
  # rubyforge_project= is deprecated in newer RubyGems releases; only call it
  # where it still exists.
  s.rubyforge_project = 'rhocr' if s.respond_to? :rubyforge_project=
  s.rubygems_version = '1.6.2'
  s.summary = 'Manipulate and use OCR data encode in HOCR'

  # The gem declares no dependencies, so the version-specific dependency
  # branches that used to follow here were empty and have been dropped.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
|
|
3
|
+
require_relative '../lib/ocr_box'
|
|
4
|
+
|
|
5
|
+
# Behaviour of OCRBox, checked against the box (1,2)-(20,8).
describe OCRBox do

  # A fresh box for every example; memoizing with ||= gains nothing because
  # instance variables do not survive from one example to the next.
  before(:each) do
    @box = OCRBox.new(1, 2, 20, 8)
  end

  describe "#to_s" do
    it "prints a human readable Box-Version with coordinates upper_left(x,y) bottom_right(x,y)" do
      @box.to_s.should == "tl->(x:1 y:2)/br->:(x:20 y:8)"
    end
  end

  # The boolean results are compared with == instead of the be_true/be_false
  # matchers, which are no longer available in RSpec 3.
  describe '#encloses?(element)' do
    it "tests whether given OCRBox is enclosed by the current OCRBox" do
      @box.encloses?( OCRBox.new(0, 3, 19, 7) ).should == false
      @box.encloses?( OCRBox.new(2, 3, 19, 7) ).should == true
    end

    it "encloses also itself" do
      @box.encloses?( @box ).should == true
    end
  end

  describe '#to_css_style' do
    it 'should create css-style attributes' do
      @box.to_css_style.should == 'position:absolute; top:2px; left:1px; height:6px; width:19px;'
    end
  end

  describe '#enclosed_by?(element)' do
    it 'should be enclosed by Boxes bigger than itself' do
      @box.enclosed_by?( OCRBox.new(0, 1, 21, 9) ).should == true
    end

    it 'should not be enclosed by Boxes smaller than itself' do
      @box.enclosed_by?( OCRBox.new(2, 3, 19, 7) ).should == false
    end

    it 'should be enclosed by Boxes of the same size' do
      @box.enclosed_by?( @box ).should == true
    end
  end

end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
|
|
3
|
+
require_relative '../lib/ocr_page'
|
|
4
|
+
|
|
5
|
+
# Behaviour of OCRPage, checked against the sample page shipped in data/.
describe OCRPage do

  before(:each) do
    # Resolve the sample page relative to this spec file so the examples run
    # from any working directory.
    sample_page = File.expand_path('../../data/Seite_Tagebuch_H_C_Lang_08.html', __FILE__)
    @ocr_page = OCRPage.new(sample_page)
  end

  describe '#enclosed_words(box)' do
    it 'should only return words that lie completely inside the box' do
      box = OCRBox.new(500, 1703, 1200, 1800)

      @ocr_page.enclosed_words(box).each do |word|
        box.encloses?(word).should == true
      end
    end
  end

end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
#coding: utf-8
|
|
2
|
+
|
|
3
|
+
require_relative '../lib/ocrx_word'
|
|
4
|
+
|
|
5
|
+
# Behaviour of OCRXWord, checked against the word "WORT" at (10,15)-(20,20).
describe OCRXWord do

  before(:each) do
    @ocrx_word = OCRXWord.new(10, 15, 20, 20, 'WORT')
  end

  describe '#to_s' do
    it 'should print the coordinates of the box and the textual information' do
      @ocrx_word.to_s.should == "WORT\ttl->(x:10 y:15)/br->:(x:20 y:20)"
    end
  end

  describe '#to_html(css_class)' do
    it 'should create a span element to overlay an image on an html-page' do
      @ocrx_word.to_html.should == "<span style='position:absolute; top:15px; left:10px; height:5px; width:10px;' class='ocrx_word'><span class='word'>WORT</span></span>"
    end

    it 'no css_class given should default to ocrx_word' do
      @ocrx_word.to_html.should =~ /class='ocrx_word'/
    end

    it 'css_class given should be part of generated html' do
      @ocrx_word.to_html('rosebud').should =~ /class='rosebud'/
    end
  end

end
|
data/test.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: rhocr
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 0.0.1
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- Andreas Neumann
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
|
|
13
|
+
date: 2011-07-01 00:00:00 +02:00
|
|
14
|
+
default_executable:
|
|
15
|
+
dependencies: []
|
|
16
|
+
|
|
17
|
+
description: Manipulate and use OCR data encode in HOCR
|
|
18
|
+
email: info @nospam@ an-it.com
|
|
19
|
+
executables: []
|
|
20
|
+
|
|
21
|
+
extensions: []
|
|
22
|
+
|
|
23
|
+
extra_rdoc_files:
|
|
24
|
+
- README
|
|
25
|
+
- lib/hocr.rb
|
|
26
|
+
- lib/ocr_box.rb
|
|
27
|
+
- lib/ocr_page.rb
|
|
28
|
+
- lib/ocrx_word.rb
|
|
29
|
+
files:
|
|
30
|
+
- README
|
|
31
|
+
- Rakefile
|
|
32
|
+
- data/Seite_Tagebuch_H_C_Lang_08.html
|
|
33
|
+
- example/example_server.rb
|
|
34
|
+
- example/public/OCRTest.css
|
|
35
|
+
- example/public/OCRTest.html
|
|
36
|
+
- example/public/OCRTest_marker.js
|
|
37
|
+
- example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg
|
|
38
|
+
- example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
|
|
39
|
+
- lib/hocr.rb
|
|
40
|
+
- lib/ocr_box.rb
|
|
41
|
+
- lib/ocr_page.rb
|
|
42
|
+
- lib/ocrx_word.rb
|
|
43
|
+
- rspec/ocr_box_spec.rb
|
|
44
|
+
- rspec/ocr_page_spec.rb
|
|
45
|
+
- rspec/ocrx_word_spec.rb
|
|
46
|
+
- test.rb
|
|
47
|
+
- Manifest
|
|
48
|
+
- rhocr.gemspec
|
|
49
|
+
has_rdoc: true
|
|
50
|
+
homepage: http://github.com/daandi/rhocr
|
|
51
|
+
licenses: []
|
|
52
|
+
|
|
53
|
+
post_install_message:
|
|
54
|
+
rdoc_options:
|
|
55
|
+
- --line-numbers
|
|
56
|
+
- --inline-source
|
|
57
|
+
- --title
|
|
58
|
+
- Rhocr
|
|
59
|
+
- --main
|
|
60
|
+
- README
|
|
61
|
+
require_paths:
|
|
62
|
+
- lib
|
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
64
|
+
none: false
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: "0"
|
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
70
|
+
none: false
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: "1.2"
|
|
75
|
+
requirements: []
|
|
76
|
+
|
|
77
|
+
rubyforge_project: rhocr
|
|
78
|
+
rubygems_version: 1.6.2
|
|
79
|
+
signing_key:
|
|
80
|
+
specification_version: 3
|
|
81
|
+
summary: Manipulate and use OCR data encode in HOCR
|
|
82
|
+
test_files: []
|
|
83
|
+
|