rhocr 0.0.3 → 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/data/test.html ADDED
@@ -0,0 +1,71 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>OCR Output</title>
5
+ <meta http-equiv='content-type' content='text/html; charset=utf-8' />
6
+ <meta http-equiv='content-style-type' content='text/css' />
7
+ <meta name='ocr-capabilities' content='ocr_page ocr_par ocrx_word ocr_line' />
8
+ <meta name='ocr-system' content='ABBYY fre-8.0.1.1024' />
9
+ <meta name='ocr-number-of-pages' content='1' />
10
+ </head><body bgcolor='#ffffff'>
11
+ <div class='ocr_page' title='bbox 0 0 1326 1326;ppageno 33'>
12
+ <div class='ocrx_block' title='bbox 55 32 1135 1887'>
13
+ <p class='ocr_par' title='bbox 432 32 1117 71' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 432 32 1117 71'><span class='ocrx_word' title='bbox 432 32 588 67'>Athenobius</span> <span class='ocrx_word' title='bbox 606 48 640 54'>—</span> <span class='ocrx_word' title='bbox 657 34 749 62'>Aulon.</span> <span class='ocrx_word' title='bbox 1074 37 1117 71'>29</span></span></p>
14
+
15
+ <p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
16
+
17
+ <p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'><span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
18
+
19
+ <p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
20
+
21
+ <p class='ocr_par' title='bbox 71 412 1112 488' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 104 412 187 440'><span class='ocrx_word' title='bbox 104 412 144 440'>27.</span> <span class='ocrx_word' title='bbox 163 413 187 438'>7.</span><br></span><span class='ocr_line' title='bbox 71 451 1112 488'><span class='ocrx_word' title='bbox 71 451 217 485'>Athniel.</span> <span class='ocrx_word' title='bbox 246 452 364 479'>Gottes</span> <span class='ocrx_word' title='bbox 384 451 531 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 550 451 572 479'>d.</span> <span class='ocrx_word' title='bbox 591 451 608 479'>i.</span> <span class='ocrx_word' title='bbox 627 451 681 479'>eine</span> <span class='ocrx_word' title='bbox 699 451 819 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 839 459 888 479'>von</span> <span class='ocrx_word' title='bbox 908 452 970 480'>Gott</span> <span class='ocrx_word' title='bbox 990 452 1112 488'>zugesügt.</span></span></p>
22
+
23
+ <p class='ocr_par' title='bbox 102 494 1110 528' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 102 494 1110 528'><span class='ocrx_word' title='bbox 102 496 152 523'>Ein</span> <span class='ocrx_word' title='bbox 172 495 248 526'>Sohn</span> <span class='ocrx_word' title='bbox 268 495 362 525'>Kenas,</span> <span class='ocrx_word' title='bbox 380 495 424 521'>des</span> <span class='ocrx_word' title='bbox 445 494 557 521'>Bruders</span> <span class='ocrx_word' title='bbox 576 494 665 526'>Kaleb;</span> <span class='ocrx_word' title='bbox 693 500 798 528'>gewann</span> <span class='ocrx_word' title='bbox 818 495 916 528'>Kiriath</span> <span class='ocrx_word' title='bbox 936 495 1042 528'>Sepher,</span> <span class='ocrx_word' title='bbox 1061 495 1110 523'>und</span></span></p>
24
+
25
+ <p class='ocr_par' title='bbox 68 535 1119 614' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 535 1037 570'><span class='ocrx_word' title='bbox 100 538 180 565'>damit</span> <span class='ocrx_word' title='bbox 199 537 281 570'>Achsa.</span> <span class='ocrx_word' title='bbox 300 538 336 564'>die</span> <span class='ocrx_word' title='bbox 356 537 454 569'>Tochter</span> <span class='ocrx_word' title='bbox 472 535 553 567'>seines</span> <span class='ocrx_word' title='bbox 574 537 674 564'>Betters</span> <span class='ocrx_word' title='bbox 694 537 780 564'>Kaleb.</span> <span class='ocrx_word' title='bbox 800 536 877 570'>Nicht,</span> <span class='ocrx_word' title='bbox 899 540 919 569'>1.</span> <span class='ocrx_word' title='bbox 940 539 978 565'>12.</span> <span class='ocrx_word' title='bbox 1000 539 1037 564'>13.</span><br></span><span class='ocr_line' title='bbox 68 576 1119 614'><span class='ocrx_word' title='bbox 68 578 376 614'>Atroth-Sophan,</span> <span class='ocrx_word' title='bbox 396 580 433 606'>die</span> <span class='ocrx_word' title='bbox 454 580 555 607'>Krone</span> <span class='ocrx_word' title='bbox 580 580 633 606'>oder</span> <span class='ocrx_word' title='bbox 658 579 760 609'>Decke,</span> <span class='ocrx_word' title='bbox 785 580 841 606'>oder</span> <span class='ocrx_word' title='bbox 860 580 1037 614'>Bedeckung</span> <span class='ocrx_word' title='bbox 1057 576 1119 608'>des'</span></span></p>
26
+
27
+ <p class='ocr_par' title='bbox 100 621 1111 657' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 621 1111 657'><span class='ocrx_word' title='bbox 100 624 235 657'>Hügels.</span> <span class='ocrx_word' title='bbox 273 623 335 650'>Eine</span> <span class='ocrx_word' title='bbox 355 623 436 649'>Stadt</span> <span class='ocrx_word' title='bbox 456 623 496 649'>der</span> <span class='ocrx_word' title='bbox 515 621 656 649'>Rubeniten</span> <span class='ocrx_word' title='bbox 680 621 715 648'>im</span> <span class='ocrx_word' title='bbox 734 622 875 656'>Königreich</span> <span class='ocrx_word' title='bbox 895 622 992 654'>Sthon.</span> <span class='ocrx_word' title='bbox 1008 624 1024 650'>4</span> <span class='ocrx_word' title='bbox 1043 623 1111 657'>Mos.</span></span></p>
28
+
29
+ <p class='ocr_par' title='bbox 67 668 1112 742' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 668 200 698'><span class='ocrx_word' title='bbox 98 669 139 698'>32,</span> <span class='ocrx_word' title='bbox 158 668 200 693'>35.</span><br></span><span class='ocr_line' title='bbox 67 706 1112 742'><span class='ocrx_word' title='bbox 67 707 341 742'>AtrothAddar:</span> <span class='ocrx_word' title='bbox 356 706 418 735'>Die</span> <span class='ocrx_word' title='bbox 432 707 537 734'>Krone</span> <span class='ocrx_word' title='bbox 551 706 661 733'>Addar</span> <span class='ocrx_word' title='bbox 675 706 729 737'>(des</span> <span class='ocrx_word' title='bbox 744 706 849 739'>Sohnes</span> <span class='ocrx_word' title='bbox 861 706 1016 740'>Benjamin).</span> <span class='ocrx_word' title='bbox 1037 708 1112 741'>Diese</span></span></p>
30
+
31
+ <p class='ocr_par' title='bbox 98 748 1111 785' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 748 1111 785'><span class='ocrx_word' title='bbox 98 752 184 780'>Stadt</span> <span class='ocrx_word' title='bbox 202 751 300 785'>gehörte</span> <span class='ocrx_word' title='bbox 322 750 369 777'>den</span> <span class='ocrx_word' title='bbox 395 748 611 781'>Benjaminitern,</span> <span class='ocrx_word' title='bbox 635 749 678 782'>lag</span> <span class='ocrx_word' title='bbox 702 748 728 775'>in</span> <span class='ocrx_word' title='bbox 753 749 798 776'>den</span> <span class='ocrx_word' title='bbox 826 749 938 784'>Grenzen</span> <span class='ocrx_word' title='bbox 962 750 1033 783'>Iuda</span> <span class='ocrx_word' title='bbox 1057 750 1111 778'>tmd</span></span></p>
32
+
33
+ <p class='ocr_par' title='bbox 64 794 1112 870' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 794 232 825'><span class='ocrx_word' title='bbox 98 794 232 825'>Ephraim.</span><br></span><span class='ocr_line' title='bbox 64 833 1112 870'><span class='ocrx_word' title='bbox 64 834 419 870'>Atroth.Beth-Ioab,</span> <span class='ocrx_word' title='bbox 438 836 460 862'>d.</span> <span class='ocrx_word' title='bbox 480 835 497 862'>i.</span> <span class='ocrx_word' title='bbox 517 834 565 861'>die</span> <span class='ocrx_word' title='bbox 584 834 688 861'>Krone</span> <span class='ocrx_word' title='bbox 712 833 764 861'>des</span> <span class='ocrx_word' title='bbox 784 833 909 866'>Hauses</span> <span class='ocrx_word' title='bbox 926 834 1020 866'>Ioab.</span> <span class='ocrx_word' title='bbox 1050 835 1112 863'>Eine</span></span></p>
34
+
35
+ <p class='ocr_par' title='bbox 101 876 1113 911' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 101 876 1113 911'><span class='ocrx_word' title='bbox 101 879 187 907'>Stadt</span> <span class='ocrx_word' title='bbox 201 879 227 906'>in</span> <span class='ocrx_word' title='bbox 249 879 329 911'>Iuda,</span> <span class='ocrx_word' title='bbox 350 883 388 905'>wo</span> <span class='ocrx_word' title='bbox 408 878 445 905'>die</span> <span class='ocrx_word' title='bbox 456 877 634 909'>Nachlommen</span> <span class='ocrx_word' title='bbox 645 876 742 904'>Salma</span> <span class='ocrx_word' title='bbox 762 876 876 910'>gewohnt</span> <span class='ocrx_word' title='bbox 887 877 972 910'>haben.</span> <span class='ocrx_word' title='bbox 992 880 1002 904'>1</span> <span class='ocrx_word' title='bbox 1021 877 1113 911'>Chron.</span></span></p>
36
+
37
+ <p class='ocr_par' title='bbox 64 923 1112 996' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 96 923 185 953'><span class='ocrx_word' title='bbox 96 923 120 953'>2,</span> <span class='ocrx_word' title='bbox 141 924 185 949'>54.</span><br></span><span class='ocr_line' title='bbox 64 961 1112 996'><span class='ocrx_word' title='bbox 64 962 212 993'>Attalia.</span> <span class='ocrx_word' title='bbox 249 963 312 990'>Eine</span> <span class='ocrx_word' title='bbox 330 963 411 990'>Stadt</span> <span class='ocrx_word' title='bbox 428 962 455 988'>in</span> <span class='ocrx_word' title='bbox 474 961 637 994'>Pamphilien</span> <span class='ocrx_word' title='bbox 665 962 701 990'>od.</span> <span class='ocrx_word' title='bbox 720 961 824 994'>Libyen,</span> <span class='ocrx_word' title='bbox 848 967 897 989'>von</span> <span class='ocrx_word' title='bbox 916 962 1004 989'>Attala</span> <span class='ocrx_word' title='bbox 1023 962 1112 996'>Phila.</span></span></p>
38
+
39
+ <p class='ocr_par' title='bbox 62 1005 1135 1082' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1005 594 1039'><span class='ocrx_word' title='bbox 95 1006 185 1039'>delpho</span> <span class='ocrx_word' title='bbox 206 1006 302 1033'>erbaut.</span> <span class='ocrx_word' title='bbox 321 1006 368 1037'>Ap.</span> <span class='ocrx_word' title='bbox 388 1005 471 1038'>Gesch.</span> <span class='ocrx_word' title='bbox 494 1006 533 1038'>14,</span> <span class='ocrx_word' title='bbox 553 1006 594 1031'>25.</span><br></span><span class='ocr_line' title='bbox 62 1046 1135 1082'><span class='ocrx_word' title='bbox 62 1048 208 1077'>Attalus</span> <span class='ocrx_word' title='bbox 255 1048 306 1075'>Ein</span> <span class='ocrx_word' title='bbox 330 1046 411 1080'>König</span> <span class='ocrx_word' title='bbox 436 1047 462 1073'>in</span> <span class='ocrx_word' title='bbox 482 1046 594 1078'>Mysien,</span> <span class='ocrx_word' title='bbox 623 1046 722 1078'>welches</span> <span class='ocrx_word' title='bbox 747 1047 819 1073'>unter</span> <span class='ocrx_word' title='bbox 843 1046 972 1081'>Phrygien</span> <span class='ocrx_word' title='bbox 997 1047 1112 1082'>gehörte;</span> <span class='ocrx_word' title='bbox 1128 1058 1135 1067'>,</span></span></p>
40
+
41
+ <p class='ocr_par' title='bbox 95 1089 1111 1125' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1089 1111 1125'><span class='ocrx_word' title='bbox 95 1093 207 1125'>genannt</span> <span class='ocrx_word' title='bbox 227 1095 274 1117'>von</span> <span class='ocrx_word' title='bbox 294 1090 416 1119'>Attale,</span> <span class='ocrx_word' title='bbox 435 1090 534 1120'>welches</span> <span class='ocrx_word' title='bbox 553 1089 591 1116'>bei</span> <span class='ocrx_word' title='bbox 610 1089 657 1115'>den</span> <span class='ocrx_word' title='bbox 676 1089 819 1122'>Phrygiern</span> <span class='ocrx_word' title='bbox 838 1089 917 1121'>Kropf</span> <span class='ocrx_word' title='bbox 935 1090 991 1116'>oder</span> <span class='ocrx_word' title='bbox 1012 1090 1111 1124'>Gurgel</span></span></p>
42
+
43
+ <p class='ocr_par' title='bbox 94 1131 1110 1168' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 94 1131 1110 1168'><span class='ocrx_word' title='bbox 94 1133 213 1168'>geheißen</span> <span class='ocrx_word' title='bbox 233 1133 310 1165'>haben</span> <span class='ocrx_word' title='bbox 329 1131 380 1161'>soll.</span> <span class='ocrx_word' title='bbox 393 1157 397 1161'>,</span> <span class='ocrx_word' title='bbox 417 1131 479 1159'>War</span> <span class='ocrx_word' title='bbox 497 1132 536 1158'>ein</span> <span class='ocrx_word' title='bbox 555 1132 637 1165'>König</span> <span class='ocrx_word' title='bbox 660 1132 700 1158'>der</span> <span class='ocrx_word' title='bbox 719 1132 889 1165'>Pergamener</span> <span class='ocrx_word' title='bbox 907 1133 956 1158'>und</span> <span class='ocrx_word' title='bbox 976 1131 1110 1165'>Phrvgier.</span></span></p>
44
+
45
+ <p class='ocr_par' title='bbox 59 1175 1110 1250' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1175 332 1207'><span class='ocrx_word' title='bbox 95 1177 106 1201'>l</span> <span class='ocrx_word' title='bbox 130 1175 211 1202'>Mack.</span> <span class='ocrx_word' title='bbox 233 1176 272 1207'>15,</span> <span class='ocrx_word' title='bbox 292 1176 332 1201'>22.</span><br></span><span class='ocr_line' title='bbox 59 1216 1110 1250'><span class='ocrx_word' title='bbox 59 1217 146 1246'>Ava.</span> <span class='ocrx_word' title='bbox 184 1217 224 1250'>Ist</span> <span class='ocrx_word' title='bbox 242 1217 280 1243'>bei</span> <span class='ocrx_word' title='bbox 299 1218 344 1243'>den</span> <span class='ocrx_word' title='bbox 364 1217 432 1243'>alten</span> <span class='ocrx_word' title='bbox 453 1216 568 1248'>Griechen</span> <span class='ocrx_word' title='bbox 587 1216 636 1242'>Aia</span> <span class='ocrx_word' title='bbox 655 1216 691 1242'>od.</span> <span class='ocrx_word' title='bbox 711 1216 771 1247'>Aea,</span> <span class='ocrx_word' title='bbox 790 1216 828 1242'>die</span> <span class='ocrx_word' title='bbox 842 1216 993 1249'>Hauptstadt</span> <span class='ocrx_word' title='bbox 1010 1217 1036 1243'>in</span> <span class='ocrx_word' title='bbox 1051 1216 1110 1245'>Col»</span></span></p>
46
+
47
+ <p class='ocr_par' title='bbox 92 1258 1108 1293' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1258 1108 1293'><span class='ocrx_word' title='bbox 92 1261 165 1293'>chide,</span> <span class='ocrx_word' title='bbox 186 1265 224 1286'>wo</span> <span class='ocrx_word' title='bbox 244 1260 323 1285'>Aetas</span> <span class='ocrx_word' title='bbox 341 1258 451 1292'>regierte.</span> <span class='ocrx_word' title='bbox 494 1258 593 1291'>Colchis</span> <span class='ocrx_word' title='bbox 612 1258 678 1291'>heißt</span> <span class='ocrx_word' title='bbox 697 1258 812 1292'>heutiges</span> <span class='ocrx_word' title='bbox 833 1258 916 1292'>Tages</span> <span class='ocrx_word' title='bbox 938 1258 1108 1293'>Mengrelicn,</span></span></p>
48
+
49
+ <p class='ocr_par' title='bbox 92 1300 1109 1335' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1300 1109 1335'><span class='ocrx_word' title='bbox 92 1303 131 1330'>die</span> <span class='ocrx_word' title='bbox 160 1303 258 1335'>meisten</span> <span class='ocrx_word' title='bbox 277 1302 430 1332'>Einwohner</span> <span class='ocrx_word' title='bbox 455 1300 505 1333'>sind</span> <span class='ocrx_word' title='bbox 531 1300 653 1334'>Christen.</span> <span class='ocrx_word' title='bbox 698 1300 755 1328'>Von</span> <span class='ocrx_word' title='bbox 780 1300 831 1332'>hier</span> <span class='ocrx_word' title='bbox 855 1302 956 1328'>wurden</span> <span class='ocrx_word' title='bbox 980 1302 1018 1328'>die</span> <span class='ocrx_word' title='bbox 1037 1301 1109 1329'>Leute</span></span></p>
50
+
51
+ <p class='ocr_par' title='bbox 92 1342 1110 1378' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1342 1110 1378'><span class='ocrx_word' title='bbox 92 1349 144 1371'>von</span> <span class='ocrx_word' title='bbox 165 1344 347 1376'>Salmanasscr</span> <span class='ocrx_word' title='bbox 371 1344 428 1375'>nach</span> <span class='ocrx_word' title='bbox 454 1343 582 1370'>Samaria</span> <span class='ocrx_word' title='bbox 606 1343 716 1378'>gesührt,</span> <span class='ocrx_word' title='bbox 741 1347 779 1369'>wo</span> <span class='ocrx_word' title='bbox 804 1342 835 1375'>sie</span> <span class='ocrx_word' title='bbox 859 1343 917 1375'>noch</span> <span class='ocrx_word' title='bbox 942 1343 993 1376'>ihre</span> <span class='ocrx_word' title='bbox 1019 1344 1110 1371'>Götter</span></span></p>
52
+
53
+ <p class='ocr_par' title='bbox 58 1386 1109 1462' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1386 875 1419'><span class='ocrx_word' title='bbox 93 1387 211 1419'>Nibehas</span> <span class='ocrx_word' title='bbox 231 1387 279 1413'>und</span> <span class='ocrx_word' title='bbox 298 1386 421 1418'>Tharthac</span> <span class='ocrx_word' title='bbox 440 1386 578 1413'>anbeteten.</span> <span class='ocrx_word' title='bbox 597 1388 612 1412'>2</span> <span class='ocrx_word' title='bbox 630 1387 694 1413'>Kön.</span> <span class='ocrx_word' title='bbox 716 1388 754 1416'>l7,</span> <span class='ocrx_word' title='bbox 775 1386 816 1413'>24.</span> <span class='ocrx_word' title='bbox 835 1387 875 1413'>31.</span><br></span><span class='ocr_line' title='bbox 58 1428 1109 1462'><span class='ocrx_word' title='bbox 58 1428 175 1458'>Aven.</span> <span class='ocrx_word' title='bbox 213 1429 314 1462'>Götze,</span> <span class='ocrx_word' title='bbox 339 1428 502 1455'>Eitelleit.</span> <span class='ocrx_word' title='bbox 538 1428 580 1455'>So</span> <span class='ocrx_word' title='bbox 605 1429 665 1455'>wird</span> <span class='ocrx_word' title='bbox 690 1428 778 1461'>Bethel</span> <span class='ocrx_word' title='bbox 802 1429 920 1462'>genannt.</span> <span class='ocrx_word' title='bbox 939 1428 995 1462'>Hos.</span> <span class='ocrx_word' title='bbox 1023 1431 1061 1460'>10,</span> <span class='ocrx_word' title='bbox 1086 1431 1109 1456'>8.</span></span></p>
54
+
55
+ <p class='ocr_par' title='bbox 92 1471 1110 1505' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1471 1110 1505'><span class='ocrx_word' title='bbox 92 1478 180 1505'>wegen</span> <span class='ocrx_word' title='bbox 199 1472 241 1498'>der</span> <span class='ocrx_word' title='bbox 261 1471 365 1504'>Götzen,</span> <span class='ocrx_word' title='bbox 390 1471 429 1497'>die</span> <span class='ocrx_word' title='bbox 456 1471 556 1503'>daselbst</span> <span class='ocrx_word' title='bbox 579 1476 628 1497'>von</span> <span class='ocrx_word' title='bbox 651 1472 694 1497'>den</span> <span class='ocrx_word' title='bbox 715 1471 852 1503'>Israeliten</span> <span class='ocrx_word' title='bbox 877 1471 973 1503'>verehrt</span> <span class='ocrx_word' title='bbox 1000 1472 1110 1498'>wurden.</span></span></p>
56
+
57
+ <p class='ocr_par' title='bbox 91 1513 1110 1548' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1513 1110 1548'><span class='ocrx_word' title='bbox 91 1515 149 1541'>Mit</span> <span class='ocrx_word' title='bbox 167 1515 220 1541'>dem</span> <span class='ocrx_word' title='bbox 240 1520 332 1548'>ganzen</span> <span class='ocrx_word' title='bbox 352 1513 461 1540'>Namen:</span> <span class='ocrx_word' title='bbox 482 1513 640 1544'>Beth»Aven,</span> <span class='ocrx_word' title='bbox 658 1513 708 1540'>das</span> <span class='ocrx_word' title='bbox 726 1513 893 1548'>Götzenhaus,</span> <span class='ocrx_word' title='bbox 914 1513 979 1546'>oder,</span> <span class='ocrx_word' title='bbox 997 1514 1029 1540'>da</span> <span class='ocrx_word' title='bbox 1048 1520 1110 1541'>man</span></span></p>
58
+
59
+ <p class='ocr_par' title='bbox 56 1555 1110 1633' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1555 631 1588'><span class='ocrx_word' title='bbox 91 1558 149 1584'>dem</span> <span class='ocrx_word' title='bbox 169 1557 254 1583'>Eiteln</span> <span class='ocrx_word' title='bbox 275 1556 453 1587'>nachwandelt.</span> <span class='ocrx_word' title='bbox 473 1555 532 1588'>Hos.</span> <span class='ocrx_word' title='bbox 549 1558 571 1584'>4,</span> <span class='ocrx_word' title='bbox 593 1558 631 1581'>15.</span><br></span><span class='ocr_line' title='bbox 56 1597 1110 1633'><span class='ocrx_word' title='bbox 56 1597 242 1633'>Augustus.</span> <span class='ocrx_word' title='bbox 287 1597 419 1631'>Würdig</span> <span class='ocrx_word' title='bbox 448 1598 577 1630'>verehrt</span> <span class='ocrx_word' title='bbox 608 1598 668 1624'>und</span> <span class='ocrx_word' title='bbox 704 1599 876 1632'>angebetet</span> <span class='ocrx_word' title='bbox 908 1605 944 1632'>zu</span> <span class='ocrx_word' title='bbox 978 1599 1110 1627'>werden.</span></span></p>
60
+
61
+ <p class='ocr_par' title='bbox 92 1640 1108 1674' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1640 1108 1674'><span class='ocrx_word' title='bbox 92 1640 189 1674'>Diesen</span> <span class='ocrx_word' title='bbox 211 1640 311 1667'>Namen</span> <span class='ocrx_word' title='bbox 340 1641 386 1674'>gab</span> <span class='ocrx_word' title='bbox 413 1641 459 1667'>das</span> <span class='ocrx_word' title='bbox 487 1640 596 1672'>romische</span> <span class='ocrx_word' title='bbox 616 1640 678 1667'>Voll</span> <span class='ocrx_word' title='bbox 706 1641 758 1667'>dem</span> <span class='ocrx_word' title='bbox 781 1641 869 1672'>Kaiser</span> <span class='ocrx_word' title='bbox 887 1641 1030 1672'>Octavian,</span> <span class='ocrx_word' title='bbox 1058 1642 1108 1668'>und</span></span></p>
62
+
63
+ <p class='ocr_par' title='bbox 93 1682 1108 1716' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1682 1108 1716'><span class='ocrx_word' title='bbox 93 1683 142 1710'>alle</span> <span class='ocrx_word' title='bbox 161 1683 291 1715'>romischen</span> <span class='ocrx_word' title='bbox 310 1683 396 1715'>Kaiser</span> <span class='ocrx_word' title='bbox 416 1683 495 1713'>haben</span> <span class='ocrx_word' title='bbox 514 1682 594 1714'>diesen</span> <span class='ocrx_word' title='bbox 614 1682 709 1709'>Namen</span> <span class='ocrx_word' title='bbox 728 1683 898 1716'>beibehalten,</span> <span class='ocrx_word' title='bbox 917 1682 964 1715'>daß</span> <span class='ocrx_word' title='bbox 983 1682 1014 1715'>sie</span> <span class='ocrx_word' title='bbox 1033 1692 1108 1710'>«au,-</span></span></p>
64
+
65
+ <p class='ocr_par' title='bbox 55 1724 1107 1843' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 90 1724 997 1759'><span class='ocrx_word' title='bbox 90 1733 142 1758'>per</span> <span class='ocrx_word' title='bbox 160 1727 284 1758'>2ußr>«ti,</span> <span class='ocrx_word' title='bbox 304 1726 326 1751'>d.</span> <span class='ocrx_word' title='bbox 345 1724 360 1750'>i,</span> <span class='ocrx_word' title='bbox 382 1725 462 1757'>allzeit</span> <span class='ocrx_word' title='bbox 482 1725 581 1756'>Mehrer</span> <span class='ocrx_word' title='bbox 600 1725 643 1751'>des</span> <span class='ocrx_word' title='bbox 664 1724 750 1756'>Reichs</span> <span class='ocrx_word' title='bbox 770 1725 885 1759'>geheißen</span> <span class='ocrx_word' title='bbox 904 1725 997 1758'>haben.</span><br></span><span class='ocr_line' title='bbox 55 1766 952 1800'><span class='ocrx_word' title='bbox 55 1766 176 1800'>Avith.</span> <span class='ocrx_word' title='bbox 212 1767 324 1799'>Haufe.</span> <span class='ocrx_word' title='bbox 361 1766 424 1793'>Eine</span> <span class='ocrx_word' title='bbox 443 1766 524 1794'>Stadt</span> <span class='ocrx_word' title='bbox 542 1768 569 1793'>in</span> <span class='ocrx_word' title='bbox 587 1767 709 1799'>Idumäa.</span> <span class='ocrx_word' title='bbox 732 1769 742 1793'>1</span> <span class='ocrx_word' title='bbox 763 1767 831 1799'>Mos.</span> <span class='ocrx_word' title='bbox 849 1769 892 1798'>36,</span> <span class='ocrx_word' title='bbox 910 1769 952 1795'>35.</span><br></span><span class='ocr_line' title='bbox 57 1809 1107 1843'><span class='ocrx_word' title='bbox 57 1809 182 1839'>Aulon.</span> <span class='ocrx_word' title='bbox 236 1809 445 1843'>Ausgehöhlt.</span> <span class='ocrx_word' title='bbox 491 1809 553 1836'>Das</span> <span class='ocrx_word' title='bbox 581 1809 653 1843'>große</span> <span class='ocrx_word' title='bbox 681 1809 757 1841'>Thal,</span> <span 
class='ocrx_word' title='bbox 791 1810 871 1837'>worin</span> <span class='ocrx_word' title='bbox 897 1810 935 1837'>die</span> <span class='ocrx_word' title='bbox 962 1809 1107 1843'>berühmten</span></span></p>
66
+
67
+ <p class='ocr_par' title='bbox 89 1851 1106 1887' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 89 1851 1106 1887'><span class='ocrx_word' title='bbox 89 1853 186 1882'>Städte</span> <span class='ocrx_word' title='bbox 204 1852 315 1886'>Vethsan</span> <span class='ocrx_word' title='bbox 334 1852 390 1879'>oder</span> <span class='ocrx_word' title='bbox 409 1852 588 1886'>Scythopolis,</span> <span class='ocrx_word' title='bbox 605 1851 732 1885'>Tlberias,</span> <span class='ocrx_word' title='bbox 751 1852 862 1887'>Iericho,</span> <span class='ocrx_word' title='bbox 881 1851 929 1880'>das</span> <span class='ocrx_word' title='bbox 949 1853 1013 1880'>todte</span> <span class='ocrx_word' title='bbox 1033 1852 1106 1880'>Meer</span></span></p>
68
+
69
+ </div>
70
+ </div></body>
71
+ </html>
data/data/test.png ADDED
Binary file
@@ -1,7 +1,7 @@
1
1
  #coding: utf-8
2
2
  require 'sinatra'
3
3
 
4
- require_relative '../lib/ocr_page'
4
+ require_relative '../lib/rhocr'
5
5
 
6
6
  get '/' do
7
7
  "<a href='OCRTest.html'>OCRTest</a>"
@@ -25,5 +25,5 @@ end
25
25
 
26
26
  def get_enclosed_words(x1, y1, x2 ,y2, page)
27
27
  @page = OCRPage.new("../data/#{page}")
28
- @page.enclosed_words( OCRBox.new(x1.to_i, y1.to_i, x2.to_i, y2.to_i) )
28
+ @page.enclosed_words( HOCRBox.new(x1, y1, x2 ,y2) )
29
29
  end
data/lib/hocr_box.rb ADDED
@@ -0,0 +1,67 @@
1
+ #coding: utf-8
2
+
3
# Axis-aligned rectangle built from four integer coordinates given in the
# order left, top, right, bottom (the order used by an hOCR "bbox" title).
class HOCRBox

  attr_reader :left, :top, :right, :bottom, :upper_left, :lower_right, :coordinates

  # Builds a box from four coordinates. They may be passed one by one or
  # wrapped in (nested) arrays; every value is converted with #to_i, so
  # numeric strings are accepted. Values after the fourth are ignored.
  #
  # Raises ArgumentError when fewer than four values are given and
  # RuntimeError when left > right or top > bottom.
  def initialize(*coordinates)
    values = coordinates.flatten.collect { |x| x.to_i }

    # Without this guard a missing value surfaces as NoMethodError on nil
    # further down, which hides the real cause.
    if values.length < 4
      raise ArgumentError,
            "HOCRBox needs 4 coordinates (left, top, right, bottom), got #{values.length}"
    end

    @left, @top, @right, @bottom = values

    if @left > @right || @top > @bottom
      raise " Negative dimensions of OCRBox ar not allowed. left #{@left} / right #{@right} - top #{@top} / bottom #{@bottom}"
    end

    @height      = @bottom - @top
    @width       = @right - @left
    @upper_left  = [@left, @top]
    # This ivar must match the :lower_right reader; a misspelled name leaves
    # the reader returning nil.
    @lower_right = [@right, @bottom]
    @coordinates = [@left, @top, @right, @bottom]
  end

  # True when +other+ lies completely inside this box; shared edges count
  # as inside.
  def encloses?(other)
    @left   <= other.left  &&
      @right  >= other.right &&
      @top    <= other.top   &&
      @bottom >= other.bottom
  end

  # True when this box lies completely inside +other+.
  def enclosed_by?(other)
    other.encloses?(self)
  end

  # True when this box ends strictly before +other+ starts horizontally.
  def left_of?(other)
    @right < other.left
  end

  # True when this box starts strictly after +other+ ends horizontally.
  def right_of?(other)
    @left > other.right
  end

  # Horizontal gap from the right edge of +other+ to the left edge of this
  # box (negative when the two overlap horizontally).
  def left_distance_to(other)
    @left - other.right
  end

  # Horizontal gap from the right edge of this box to the left edge of
  # +other+.
  def right_distance_to(other)
    other.left_distance_to(self)
  end

  # Same text as #coordinates_to_s.
  def to_s
    coordinates_to_s
  end

  # Formats the box as "(left,top)/(right,bottom)".
  def coordinates_to_s
    "(#{@left},#{@top})/(#{@right},#{@bottom})"
  end

  # Inline CSS that positions an element absolutely over this box, in px.
  def to_css_style
    "position:absolute; top:#{@top}px; left:#{@left}px; height:#{@height}px; width:#{@width}px;"
  end

  # An empty <span> carrying #to_css_style and the given CSS class.
  def to_image_html(css_class = 'hocr_box')
    "<span style='#{ to_css_style }' class='#{css_class}'></span>"
  end

end
67
+
@@ -0,0 +1,50 @@
1
+ #coding: utf-8
2
+
3
+ require_relative 'ocr_page'
4
+
5
# Collection of OCRPage objects, stored in a hash keyed by page number.
class OCRDocument
  attr_reader :pages, :page_count

  def initialize
    @pages      = {}
    @page_count = 0
  end

  # Adds every file of +list_o_pages+ via #add_page.
  def add_pages( list_o_pages )
    list_o_pages.each { |file| add_page(file) }
  end

  # Parses +file+ into an OCRPage, stores it under its page number and
  # increments the page counter.
  def add_page( file )
    new_page = OCRPage.new( file )
    @pages[new_page.page_number] = new_page
    @page_count += 1
  end

  # Returns the page stored under +number+, or nil if there is none.
  def page( number )
    @pages[number]
  end

  # Yields each line of each stored page.
  def each_line
    @pages.values.each do |stored_page|
      stored_page.each_line { |line| yield line }
    end
  end

  # Yields each word of each line of each stored page.
  def each_word
    @pages.values.each do |stored_page|
      stored_page.each_line do |line|
        line.each { |word| yield word }
      end
    end
  end

  alias :add_files :add_pages
  alias :add_file :add_page
end
@@ -0,0 +1,149 @@
1
+ #coding:utf-8
2
+
3
+ require_relative 'hocr_box'
4
# Node of an hOCR element tree. It is a HOCRBox (built from the element's
# bbox) that also carries the element's class attribute and its children.
class OCRElement < HOCRBox

  include Enumerable

  attr_reader :ocr_class, :children
  attr_accessor :features

  class << self
    # Same as ::create.
    def create_from_html(ocr_element_html)
      create(ocr_element_html)
    end

    # Builds the element (and, recursively, its children) for one HTML
    # node. The node's class attribute selects the Ruby class; unknown or
    # missing classes yield a plain OCRElement.
    def create(ocr_element_html)
      ocr_class   = extract_ocr_class(ocr_element_html)
      coordinates = extract_coordinates(ocr_element_html)

      # Words are leaves: their only "child" is the node's text.
      children =
        if ocr_class == 'ocrx_word'
          extract_word_children(ocr_element_html)
        else
          extract_children(ocr_element_html)
        end

      node_class = {
        'ocrx_block' => OCRBlock,
        'ocr_par'    => OCRParagraph,
        'ocr_line'   => OCRLine,
        'ocrx_word'  => OCRWord
      }.fetch(ocr_class, OCRElement)

      node_class.new(ocr_class, children, coordinates)
    end

    # Returns the node's text wrapped in a one-element array.
    def extract_word_children(ocr_element_html)
      [ocr_element_html.text]
    end

    # Creates an element for every child node and drops those without a
    # class attribute.
    def extract_children(ocr_element_html)
      created = ocr_element_html.elements.map do |child_fragment_html|
        OCRElement.create(child_fragment_html)
      end
      # filter out <br> elements
      created.reject { |child| child.ocr_class.nil? }
    end

    # Reads the bbox coordinates from the node's title attribute.
    def extract_coordinates(ocr_element_html)
      extract_coordinates_from_string(ocr_element_html['title'])
    end

    # Returns the four numbers following "bbox" in +s+ as strings, or four
    # nils when +s+ contains no bbox.
    def extract_coordinates_from_string(s)
      found = /bbox (\d+) (\d+) (\d+) (\d+)/.match(s)
      found ? found.captures : [nil, nil, nil, nil]
    end

    # Returns the node's class attribute.
    def extract_ocr_class(ocr_element_html)
      ocr_element_html['class']
    end
  end

  # ocr_class   - value of the node's class attribute (may be nil).
  # children    - child elements, or [text] for a word.
  # coordinates - left, top, right, bottom as accepted by HOCRBox.
  def initialize(ocr_class, children, coordinates)
    @children  = children
    @ocr_class = ocr_class
    @features  = []
    super coordinates
  end

  # Yields each child; this is what Enumerable builds on.
  def each
    children.each { |child| yield child }
  end

  # Class name, features and coordinates, followed by one tab-indented
  # line per child.
  def to_s
    child_rows = children.map { |c| "\t#{c.to_s}" }
    "#{self.class}:#{@features}#{ coordinates_to_s }->\n" + child_rows.join("\n")
  end

  # Wraps #to_s in a <span> coloured with +color+.
  def mark_in_rspec(color)
    "<span style='color: #{color}'>#{to_s}</span>"
  end

  # Empty, absolutely positioned <span> for this element, followed by the
  # image HTML of all children.
  def to_image_html(display_class = @ocr_class)
    children_html = @children.map { |c| c.to_image_html }.join("")
    "<span class='#{ display_class }' style='#{ to_css_style }' ></span>#{ children_html }"
  end

  # <span> containing the HTML of all children. +style+ is accepted but
  # not used.
  def to_html( display_class = @ocr_class, style = nil )
    children_html = @children.map { |c| c.to_html }.join("")
    "<span class='#{ display_class }'> #{ children_html } </span>"
  end

end
100
+
101
# Leaf element holding a single recognised word.
class OCRWord < OCRElement

  # The word's text: the first entry of the flattened children array.
  def text
    children.flatten.first
  end

  # Text followed by the features in square brackets.
  def to_s
    "#{text}[#{@features}]"
  end

  # Absolutely positioned <span> containing the word's text.
  def to_image_html
    "<span class='#{ @ocr_class }' style='#{ to_css_style }'>#{ text }</span>"
  end

  # <span> containing the word's text, without positioning.
  def to_html
    "<span class='#{ @ocr_class }'>#{ text }</span>"
  end

end
120
+
121
# Element whose children are exposed as #words.
class OCRLine < OCRElement

  alias :words :children

  # Header with class name and coordinates, then one row per word showing
  # the word's coordinates, a tab and the word's #to_s.
  def to_s
    rows = words.map { |w| "#{w.coordinates_to_s}\t#{w.to_s}" }
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # Like #to_s but without the per-word coordinates.
  def simple_line
    rows = words.map { |w| w.to_s }
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # The text of all words joined by single spaces.
  def to_text
    words.map { |w| w.text }.join(" ")
  end

end
141
+
142
# Element whose children are exposed as #lines.
class OCRParagraph < OCRElement
  alias_method :lines, :children
end
145
+
146
# Element whose children are exposed as #paragraphs.
class OCRBlock < OCRElement
  alias_method :paragraphs, :children
end
149
+
data/lib/ocr_page.rb CHANGED
@@ -1,43 +1,98 @@
1
1
  #coding: utf-8
2
- require_relative "ocrx_word"
2
+ require_relative "ocr_element"
3
+ require 'nokogiri'
4
+ require 'pp'
3
5
 
4
- class OCRPage < OCRBox
5
- attr_reader :lines, :words
6
+ class OCRPage < OCRElement
6
7
 
7
- def initialize(filename)
8
- @lines = hocr_lines( file_as_string(filename) ).select {|line| line.length > 0}
8
+ attr_reader :meta_data, :page_number, :dimensions, :lines, :image
9
+ alias :each_block :each
10
+ alias :blocks :children
11
+
12
# Parses the hOCR file at +file_path+ and builds the element tree below its
# first div.ocr_page.
#
# file_path  - path of the hOCR HTML file.
# image_path - optional value stored in @image and used as the background
#              image by #to_image_html.
#
# Raises ArgumentError if the file contains no div.ocr_page element.
def initialize(file_path, image_path = nil)
  doc = process_hocr_html_file(file_path)

  # Look the page element up once and reuse it for title and children.
  @page_content = doc.at_css("div.ocr_page")
  if @page_content.nil?
    raise ArgumentError, "no div.ocr_page element found in #{file_path}"
  end

  coordinates, @page_number = extract_bbox_ppageno(@page_content['title'])
  children = OCRElement.extract_children(@page_content)
  super('ocr_page', children, coordinates)
  @image = image_path
end
10
23
 
11
- def hocr_lines( hocr_contents)
12
- hocr_array = []
13
- for line in hocr_contents.split(/<span class=['"]ocr_line['"]/) do
14
- line_array = []
15
- for ocrx_word in line.scan(/<span class=['"]ocrx_word['"][^>]+>[^<]+<\/span>/) do
16
- ocrx_word =~ /title=['"]bbox (\d+) (\d+) (\d+) (\d+)['"]>([^<]+)</
17
- current_word = OCRXWord.new($1,$2,$3,$4,$5)
18
- line_array << current_word
24
+
25
# Yields each paragraph of each block of the page.
def each_paragraph
  blocks.each do |block|
    block.each { |paragraph| yield paragraph }
  end
end
24
32
 
25
- def words
26
- @words ||= @lines.flatten
33
# Yields each line of each paragraph of each block of the page.
def each_line
  blocks.each do |block|
    block.each do |paragraph|
      paragraph.each { |line| yield line }
    end
  end
end
28
42
 
29
- def enclosed_words(box)
30
- words.select { |word| word.enclosed_by? box }
43
# Yields each word of each line of each paragraph of each block of the page.
def each_word
  blocks.each do |block|
    block.each do |paragraph|
      paragraph.each do |line|
        line.each { |word| yield word }
      end
    end
  end
end
54
+
55
# Deprecated. Returns all lines of the page as one array. The array is
# built from #each_line on the first call and cached in @lines.
def lines
  return @lines if @lines

  @lines = []
  each_line { |line| @lines << line }
  @lines
end
67
+
68
# Extracts bounding box and page number from the title attribute of an
# ocr_page element, e.g. "bbox 0 0 1326 1326;ppageno 33".
#
# Both properties are searched for in the whole string, so their position
# and any additional properties (such as a leading image entry) do not
# matter.
#
# Returns [coordinates, page_number]: coordinates as returned by
# OCRElement.extract_coordinates_from_string, page_number as an Integer
# (0 when the title has no ppageno property).
def extract_bbox_ppageno( ocr_html_text_fragment )
  page_number = ocr_html_text_fragment[/ppageno\s+(\d+)/, 1].to_i
  [ OCRElement.extract_coordinates_from_string(ocr_html_text_fragment), page_number ]
end
73
+
74
# Reads the file +filename+ and returns the elements of the parsed HTML
# document.
def process_hocr_html_file(filename)
  # File.read opens, reads and closes the file in one call; the previous
  # File.open(...).read never closed the handle.
  html_string = File.read(filename)
  Nokogiri::HTML(html_string).elements
end
32
78
 
33
- def get_position(element)
34
- element =~ /title="bbox (\d+) (\d+) (\d+) (\d+)">/
35
- [$1,$2,$3,$4]
79
# The text of all lines of the page, one line per row.
def to_text
  lines.map(&:to_text).join("\n")
end
37
82
 
83
# Renders the page as a <div> that uses @image as background image and
# contains the image HTML of all children.
#
# display_class - value of the div's class attribute; defaults to the
#                 element's own ocr class.
def to_image_html(display_class = @ocr_class)
  children_html = @children.map { |c| c.to_image_html }.join("")
  # The height declaration used to read "#{@height}>px"; the stray ">"
  # closed the opening tag early.
  "<div class='#{ display_class }' style='#{ to_css_style };background-image: url(#{@image}); width:#{@width}px; height:#{@height}px ;'>#{children_html}</div>"
end
38
87
 
39
- def file_as_string(filename)
40
- hocr_page_contents = File.open(filename,"r") { |f| f.read }
88
# Returns, in #each_word order, every word for which
# enclosed_by?(ocr_box) is true.
def enclosed_words(ocr_box)
  [].tap do |hits|
    each_word do |word|
      hits << word if word.enclosed_by?(ocr_box)
    end
  end
end
42
97
 
43
98
  end
data/lib/rhocr.rb CHANGED
@@ -1,2 +1,31 @@
1
1
  #coding: utf-8
2
- require_relative "ocr_page"
2
+
3
+ require_relative "ocr_document"
4
# OCRDocument that can load a whole set of hOCR files at once and keeps
# flat lists of all words and lines.
class RHOCR < OCRDocument

  # Both are nil until #compute_words / #compute_lines (or #add_folder)
  # have run.
  attr_reader :words, :lines

  # Adds all matching hOCR files and recomputes #lines and #words.
  #
  # path - either a glob pattern (e.g. "data/*.html"), used as given, or
  #        the path of an existing directory, in which case its *.html,
  #        *.htm and *.hocr files are loaded. A plain directory used to be
  #        handed to the page parser as if it were a file.
  #
  # Files are added in sorted order so the result does not depend on the
  # order in which the file system lists them.
  #
  # Returns self.
  def add_folder(path)
    pattern = File.directory?(path) ? File.join(path, '*.{html,htm,hocr}') : path
    add_files Dir[pattern].sort
    compute_lines
    compute_words
    self
  end

  # Rebuilds #words from all pages; should be called if new pages are added.
  def compute_words
    @words = []
    each_word { |w| @words << w }
  end

  # Rebuilds #lines from all pages; should be called if new pages are added.
  def compute_lines
    @lines = []
    each_line { |l| @lines << l }
  end

end
data/rhocr.gemspec CHANGED
@@ -2,28 +2,31 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{rhocr}
5
- s.version = "0.0.3"
5
+ s.version = "0.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
- s.authors = ["Andreas Neumann"]
9
- s.date = %q{2011-07-03}
8
+ s.authors = [%q{Andreas Neumann}]
9
+ s.date = %q{2011-09-08}
10
10
  s.description = %q{Manipulate and use OCR data encode in HOCR}
11
- s.email = %q{info @nospam@ an-it.com}
12
- s.extra_rdoc_files = ["README", "lib/ocr_box.rb", "lib/ocr_page.rb", "lib/ocrx_word.rb", "lib/rhocr.rb"]
13
- s.files = ["Manifest", "README", "Rakefile", "data/Seite_Tagebuch_H_C_Lang_08.html", "example/example_server.rb", "example/public/OCRTest.css", "example/public/OCRTest.html", "example/public/OCRTest_marker.js", "example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg", "example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg", "lib/ocr_box.rb", "lib/ocr_page.rb", "lib/ocrx_word.rb", "lib/rhocr.rb", "rhocr.gemspec", "rspec/ocr_box_spec.rb", "rspec/ocr_page_spec.rb", "rspec/ocrx_word_spec.rb"]
11
+ s.email = %q{andreas@neumann.biz}
12
+ s.extra_rdoc_files = [%q{README}, %q{TODO.txt}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
13
+ s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{TODO.txt}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
14
14
  s.homepage = %q{http://github.com/daandi/rhocr}
15
- s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rhocr", "--main", "README"]
16
- s.require_paths = ["lib"]
15
+ s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
16
+ s.require_paths = [%q{lib}]
17
17
  s.rubyforge_project = %q{rhocr}
18
- s.rubygems_version = %q{1.6.2}
18
+ s.rubygems_version = %q{1.8.6}
19
19
  s.summary = %q{Manipulate and use OCR data encode in HOCR}
20
20
 
21
21
  if s.respond_to? :specification_version then
22
22
  s.specification_version = 3
23
23
 
24
24
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
25
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
25
26
  else
27
+ s.add_dependency(%q<nokogiri>, [">= 0"])
26
28
  end
27
29
  else
30
+ s.add_dependency(%q<nokogiri>, [">= 0"])
28
31
  end
29
32
  end