rhocr 0.0.3 → 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +15 -6
- data/Rakefile +3 -3
- data/TODO.txt +42 -0
- data/data/Seite_Die_Gartenlaube_242.html +42 -0
- data/data/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/data/test.html +71 -0
- data/data/test.png +0 -0
- data/example/example_server.rb +2 -2
- data/example/public/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/lib/hocr_box.rb +67 -0
- data/lib/ocr_document.rb +50 -0
- data/lib/ocr_element.rb +149 -0
- data/lib/ocr_page.rb +80 -25
- data/lib/rhocr.rb +30 -1
- data/rhocr.gemspec +12 -9
- data/spec/hocr_box_spec.rb +94 -0
- data/spec/ocr_document_spec.rb +80 -0
- data/spec/ocr_element_spec.rb +86 -0
- data/spec/ocr_page_spec.rb +116 -0
- data/spec/rhocr_spec.rb +34 -0
- data/test.html +1 -0
- metadata +52 -39
- data/example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg +0 -0
- data/lib/ocr_box.rb +0 -43
- data/lib/ocrx_word.rb +0 -23
- data/rspec/ocr_box_spec.rb +0 -48
- data/rspec/ocr_page_spec.rb +0 -17
- data/rspec/ocrx_word_spec.rb +0 -32
data/data/test.html
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>OCR Output</title>
|
5
|
+
<meta http-equiv='content-type' content='text/html; charset=utf-8' />
|
6
|
+
<meta http-equiv='content-style-type' content='text/css' />
|
7
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_par ocrx_word ocr_line' />
|
8
|
+
<meta name='ocr-system' content='ABBYY fre-8.0.1.1024' />
|
9
|
+
<meta name='ocr-number-of-pages' content='1' />
|
10
|
+
</head><body bgcolor='#ffffff'>
|
11
|
+
<div class='ocr_page' title='bbox 0 0 1326 1326;ppageno 33'>
|
12
|
+
<div class='ocrx_block' title='bbox 55 32 1135 1887'>
|
13
|
+
<p class='ocr_par' title='bbox 432 32 1117 71' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 432 32 1117 71'><span class='ocrx_word' title='bbox 432 32 588 67'>Athenobius</span> <span class='ocrx_word' title='bbox 606 48 640 54'>—</span> <span class='ocrx_word' title='bbox 657 34 749 62'>Aulon.</span> <span class='ocrx_word' title='bbox 1074 37 1117 71'>29</span></span></p>
|
14
|
+
|
15
|
+
<p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
|
16
|
+
|
17
|
+
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'><span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
|
18
|
+
|
19
|
+
<p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
|
20
|
+
|
21
|
+
<p class='ocr_par' title='bbox 71 412 1112 488' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 104 412 187 440'><span class='ocrx_word' title='bbox 104 412 144 440'>27.</span> <span class='ocrx_word' title='bbox 163 413 187 438'>7.</span><br></span><span class='ocr_line' title='bbox 71 451 1112 488'><span class='ocrx_word' title='bbox 71 451 217 485'>Athniel.</span> <span class='ocrx_word' title='bbox 246 452 364 479'>Gottes</span> <span class='ocrx_word' title='bbox 384 451 531 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 550 451 572 479'>d.</span> <span class='ocrx_word' title='bbox 591 451 608 479'>i.</span> <span class='ocrx_word' title='bbox 627 451 681 479'>eine</span> <span class='ocrx_word' title='bbox 699 451 819 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 839 459 888 479'>von</span> <span class='ocrx_word' title='bbox 908 452 970 480'>Gott</span> <span class='ocrx_word' title='bbox 990 452 1112 488'>zugesügt.</span></span></p>
|
22
|
+
|
23
|
+
<p class='ocr_par' title='bbox 102 494 1110 528' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 102 494 1110 528'><span class='ocrx_word' title='bbox 102 496 152 523'>Ein</span> <span class='ocrx_word' title='bbox 172 495 248 526'>Sohn</span> <span class='ocrx_word' title='bbox 268 495 362 525'>Kenas,</span> <span class='ocrx_word' title='bbox 380 495 424 521'>des</span> <span class='ocrx_word' title='bbox 445 494 557 521'>Bruders</span> <span class='ocrx_word' title='bbox 576 494 665 526'>Kaleb;</span> <span class='ocrx_word' title='bbox 693 500 798 528'>gewann</span> <span class='ocrx_word' title='bbox 818 495 916 528'>Kiriath</span> <span class='ocrx_word' title='bbox 936 495 1042 528'>Sepher,</span> <span class='ocrx_word' title='bbox 1061 495 1110 523'>und</span></span></p>
|
24
|
+
|
25
|
+
<p class='ocr_par' title='bbox 68 535 1119 614' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 535 1037 570'><span class='ocrx_word' title='bbox 100 538 180 565'>damit</span> <span class='ocrx_word' title='bbox 199 537 281 570'>Achsa.</span> <span class='ocrx_word' title='bbox 300 538 336 564'>die</span> <span class='ocrx_word' title='bbox 356 537 454 569'>Tochter</span> <span class='ocrx_word' title='bbox 472 535 553 567'>seines</span> <span class='ocrx_word' title='bbox 574 537 674 564'>Betters</span> <span class='ocrx_word' title='bbox 694 537 780 564'>Kaleb.</span> <span class='ocrx_word' title='bbox 800 536 877 570'>Nicht,</span> <span class='ocrx_word' title='bbox 899 540 919 569'>1.</span> <span class='ocrx_word' title='bbox 940 539 978 565'>12.</span> <span class='ocrx_word' title='bbox 1000 539 1037 564'>13.</span><br></span><span class='ocr_line' title='bbox 68 576 1119 614'><span class='ocrx_word' title='bbox 68 578 376 614'>Atroth-Sophan,</span> <span class='ocrx_word' title='bbox 396 580 433 606'>die</span> <span class='ocrx_word' title='bbox 454 580 555 607'>Krone</span> <span class='ocrx_word' title='bbox 580 580 633 606'>oder</span> <span class='ocrx_word' title='bbox 658 579 760 609'>Decke,</span> <span class='ocrx_word' title='bbox 785 580 841 606'>oder</span> <span class='ocrx_word' title='bbox 860 580 1037 614'>Bedeckung</span> <span class='ocrx_word' title='bbox 1057 576 1119 608'>des'</span></span></p>
|
26
|
+
|
27
|
+
<p class='ocr_par' title='bbox 100 621 1111 657' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 621 1111 657'><span class='ocrx_word' title='bbox 100 624 235 657'>Hügels.</span> <span class='ocrx_word' title='bbox 273 623 335 650'>Eine</span> <span class='ocrx_word' title='bbox 355 623 436 649'>Stadt</span> <span class='ocrx_word' title='bbox 456 623 496 649'>der</span> <span class='ocrx_word' title='bbox 515 621 656 649'>Rubeniten</span> <span class='ocrx_word' title='bbox 680 621 715 648'>im</span> <span class='ocrx_word' title='bbox 734 622 875 656'>Königreich</span> <span class='ocrx_word' title='bbox 895 622 992 654'>Sthon.</span> <span class='ocrx_word' title='bbox 1008 624 1024 650'>4</span> <span class='ocrx_word' title='bbox 1043 623 1111 657'>Mos.</span></span></p>
|
28
|
+
|
29
|
+
<p class='ocr_par' title='bbox 67 668 1112 742' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 668 200 698'><span class='ocrx_word' title='bbox 98 669 139 698'>32,</span> <span class='ocrx_word' title='bbox 158 668 200 693'>35.</span><br></span><span class='ocr_line' title='bbox 67 706 1112 742'><span class='ocrx_word' title='bbox 67 707 341 742'>AtrothAddar:</span> <span class='ocrx_word' title='bbox 356 706 418 735'>Die</span> <span class='ocrx_word' title='bbox 432 707 537 734'>Krone</span> <span class='ocrx_word' title='bbox 551 706 661 733'>Addar</span> <span class='ocrx_word' title='bbox 675 706 729 737'>(des</span> <span class='ocrx_word' title='bbox 744 706 849 739'>Sohnes</span> <span class='ocrx_word' title='bbox 861 706 1016 740'>Benjamin).</span> <span class='ocrx_word' title='bbox 1037 708 1112 741'>Diese</span></span></p>
|
30
|
+
|
31
|
+
<p class='ocr_par' title='bbox 98 748 1111 785' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 748 1111 785'><span class='ocrx_word' title='bbox 98 752 184 780'>Stadt</span> <span class='ocrx_word' title='bbox 202 751 300 785'>gehörte</span> <span class='ocrx_word' title='bbox 322 750 369 777'>den</span> <span class='ocrx_word' title='bbox 395 748 611 781'>Benjaminitern,</span> <span class='ocrx_word' title='bbox 635 749 678 782'>lag</span> <span class='ocrx_word' title='bbox 702 748 728 775'>in</span> <span class='ocrx_word' title='bbox 753 749 798 776'>den</span> <span class='ocrx_word' title='bbox 826 749 938 784'>Grenzen</span> <span class='ocrx_word' title='bbox 962 750 1033 783'>Iuda</span> <span class='ocrx_word' title='bbox 1057 750 1111 778'>tmd</span></span></p>
|
32
|
+
|
33
|
+
<p class='ocr_par' title='bbox 64 794 1112 870' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 794 232 825'><span class='ocrx_word' title='bbox 98 794 232 825'>Ephraim.</span><br></span><span class='ocr_line' title='bbox 64 833 1112 870'><span class='ocrx_word' title='bbox 64 834 419 870'>Atroth.Beth-Ioab,</span> <span class='ocrx_word' title='bbox 438 836 460 862'>d.</span> <span class='ocrx_word' title='bbox 480 835 497 862'>i.</span> <span class='ocrx_word' title='bbox 517 834 565 861'>die</span> <span class='ocrx_word' title='bbox 584 834 688 861'>Krone</span> <span class='ocrx_word' title='bbox 712 833 764 861'>des</span> <span class='ocrx_word' title='bbox 784 833 909 866'>Hauses</span> <span class='ocrx_word' title='bbox 926 834 1020 866'>Ioab.</span> <span class='ocrx_word' title='bbox 1050 835 1112 863'>Eine</span></span></p>
|
34
|
+
|
35
|
+
<p class='ocr_par' title='bbox 101 876 1113 911' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 101 876 1113 911'><span class='ocrx_word' title='bbox 101 879 187 907'>Stadt</span> <span class='ocrx_word' title='bbox 201 879 227 906'>in</span> <span class='ocrx_word' title='bbox 249 879 329 911'>Iuda,</span> <span class='ocrx_word' title='bbox 350 883 388 905'>wo</span> <span class='ocrx_word' title='bbox 408 878 445 905'>die</span> <span class='ocrx_word' title='bbox 456 877 634 909'>Nachlommen</span> <span class='ocrx_word' title='bbox 645 876 742 904'>Salma</span> <span class='ocrx_word' title='bbox 762 876 876 910'>gewohnt</span> <span class='ocrx_word' title='bbox 887 877 972 910'>haben.</span> <span class='ocrx_word' title='bbox 992 880 1002 904'>1</span> <span class='ocrx_word' title='bbox 1021 877 1113 911'>Chron.</span></span></p>
|
36
|
+
|
37
|
+
<p class='ocr_par' title='bbox 64 923 1112 996' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 96 923 185 953'><span class='ocrx_word' title='bbox 96 923 120 953'>2,</span> <span class='ocrx_word' title='bbox 141 924 185 949'>54.</span><br></span><span class='ocr_line' title='bbox 64 961 1112 996'><span class='ocrx_word' title='bbox 64 962 212 993'>Attalia.</span> <span class='ocrx_word' title='bbox 249 963 312 990'>Eine</span> <span class='ocrx_word' title='bbox 330 963 411 990'>Stadt</span> <span class='ocrx_word' title='bbox 428 962 455 988'>in</span> <span class='ocrx_word' title='bbox 474 961 637 994'>Pamphilien</span> <span class='ocrx_word' title='bbox 665 962 701 990'>od.</span> <span class='ocrx_word' title='bbox 720 961 824 994'>Libyen,</span> <span class='ocrx_word' title='bbox 848 967 897 989'>von</span> <span class='ocrx_word' title='bbox 916 962 1004 989'>Attala</span> <span class='ocrx_word' title='bbox 1023 962 1112 996'>Phila.</span></span></p>
|
38
|
+
|
39
|
+
<p class='ocr_par' title='bbox 62 1005 1135 1082' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1005 594 1039'><span class='ocrx_word' title='bbox 95 1006 185 1039'>delpho</span> <span class='ocrx_word' title='bbox 206 1006 302 1033'>erbaut.</span> <span class='ocrx_word' title='bbox 321 1006 368 1037'>Ap.</span> <span class='ocrx_word' title='bbox 388 1005 471 1038'>Gesch.</span> <span class='ocrx_word' title='bbox 494 1006 533 1038'>14,</span> <span class='ocrx_word' title='bbox 553 1006 594 1031'>25.</span><br></span><span class='ocr_line' title='bbox 62 1046 1135 1082'><span class='ocrx_word' title='bbox 62 1048 208 1077'>Attalus</span> <span class='ocrx_word' title='bbox 255 1048 306 1075'>Ein</span> <span class='ocrx_word' title='bbox 330 1046 411 1080'>König</span> <span class='ocrx_word' title='bbox 436 1047 462 1073'>in</span> <span class='ocrx_word' title='bbox 482 1046 594 1078'>Mysien,</span> <span class='ocrx_word' title='bbox 623 1046 722 1078'>welches</span> <span class='ocrx_word' title='bbox 747 1047 819 1073'>unter</span> <span class='ocrx_word' title='bbox 843 1046 972 1081'>Phrygien</span> <span class='ocrx_word' title='bbox 997 1047 1112 1082'>gehörte;</span> <span class='ocrx_word' title='bbox 1128 1058 1135 1067'>,</span></span></p>
|
40
|
+
|
41
|
+
<p class='ocr_par' title='bbox 95 1089 1111 1125' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1089 1111 1125'><span class='ocrx_word' title='bbox 95 1093 207 1125'>genannt</span> <span class='ocrx_word' title='bbox 227 1095 274 1117'>von</span> <span class='ocrx_word' title='bbox 294 1090 416 1119'>Attale,</span> <span class='ocrx_word' title='bbox 435 1090 534 1120'>welches</span> <span class='ocrx_word' title='bbox 553 1089 591 1116'>bei</span> <span class='ocrx_word' title='bbox 610 1089 657 1115'>den</span> <span class='ocrx_word' title='bbox 676 1089 819 1122'>Phrygiern</span> <span class='ocrx_word' title='bbox 838 1089 917 1121'>Kropf</span> <span class='ocrx_word' title='bbox 935 1090 991 1116'>oder</span> <span class='ocrx_word' title='bbox 1012 1090 1111 1124'>Gurgel</span></span></p>
|
42
|
+
|
43
|
+
<p class='ocr_par' title='bbox 94 1131 1110 1168' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 94 1131 1110 1168'><span class='ocrx_word' title='bbox 94 1133 213 1168'>geheißen</span> <span class='ocrx_word' title='bbox 233 1133 310 1165'>haben</span> <span class='ocrx_word' title='bbox 329 1131 380 1161'>soll.</span> <span class='ocrx_word' title='bbox 393 1157 397 1161'>,</span> <span class='ocrx_word' title='bbox 417 1131 479 1159'>War</span> <span class='ocrx_word' title='bbox 497 1132 536 1158'>ein</span> <span class='ocrx_word' title='bbox 555 1132 637 1165'>König</span> <span class='ocrx_word' title='bbox 660 1132 700 1158'>der</span> <span class='ocrx_word' title='bbox 719 1132 889 1165'>Pergamener</span> <span class='ocrx_word' title='bbox 907 1133 956 1158'>und</span> <span class='ocrx_word' title='bbox 976 1131 1110 1165'>Phrvgier.</span></span></p>
|
44
|
+
|
45
|
+
<p class='ocr_par' title='bbox 59 1175 1110 1250' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1175 332 1207'><span class='ocrx_word' title='bbox 95 1177 106 1201'>l</span> <span class='ocrx_word' title='bbox 130 1175 211 1202'>Mack.</span> <span class='ocrx_word' title='bbox 233 1176 272 1207'>15,</span> <span class='ocrx_word' title='bbox 292 1176 332 1201'>22.</span><br></span><span class='ocr_line' title='bbox 59 1216 1110 1250'><span class='ocrx_word' title='bbox 59 1217 146 1246'>Ava.</span> <span class='ocrx_word' title='bbox 184 1217 224 1250'>Ist</span> <span class='ocrx_word' title='bbox 242 1217 280 1243'>bei</span> <span class='ocrx_word' title='bbox 299 1218 344 1243'>den</span> <span class='ocrx_word' title='bbox 364 1217 432 1243'>alten</span> <span class='ocrx_word' title='bbox 453 1216 568 1248'>Griechen</span> <span class='ocrx_word' title='bbox 587 1216 636 1242'>Aia</span> <span class='ocrx_word' title='bbox 655 1216 691 1242'>od.</span> <span class='ocrx_word' title='bbox 711 1216 771 1247'>Aea,</span> <span class='ocrx_word' title='bbox 790 1216 828 1242'>die</span> <span class='ocrx_word' title='bbox 842 1216 993 1249'>Hauptstadt</span> <span class='ocrx_word' title='bbox 1010 1217 1036 1243'>in</span> <span class='ocrx_word' title='bbox 1051 1216 1110 1245'>Col»</span></span></p>
|
46
|
+
|
47
|
+
<p class='ocr_par' title='bbox 92 1258 1108 1293' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1258 1108 1293'><span class='ocrx_word' title='bbox 92 1261 165 1293'>chide,</span> <span class='ocrx_word' title='bbox 186 1265 224 1286'>wo</span> <span class='ocrx_word' title='bbox 244 1260 323 1285'>Aetas</span> <span class='ocrx_word' title='bbox 341 1258 451 1292'>regierte.</span> <span class='ocrx_word' title='bbox 494 1258 593 1291'>Colchis</span> <span class='ocrx_word' title='bbox 612 1258 678 1291'>heißt</span> <span class='ocrx_word' title='bbox 697 1258 812 1292'>heutiges</span> <span class='ocrx_word' title='bbox 833 1258 916 1292'>Tages</span> <span class='ocrx_word' title='bbox 938 1258 1108 1293'>Mengrelicn,</span></span></p>
|
48
|
+
|
49
|
+
<p class='ocr_par' title='bbox 92 1300 1109 1335' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1300 1109 1335'><span class='ocrx_word' title='bbox 92 1303 131 1330'>die</span> <span class='ocrx_word' title='bbox 160 1303 258 1335'>meisten</span> <span class='ocrx_word' title='bbox 277 1302 430 1332'>Einwohner</span> <span class='ocrx_word' title='bbox 455 1300 505 1333'>sind</span> <span class='ocrx_word' title='bbox 531 1300 653 1334'>Christen.</span> <span class='ocrx_word' title='bbox 698 1300 755 1328'>Von</span> <span class='ocrx_word' title='bbox 780 1300 831 1332'>hier</span> <span class='ocrx_word' title='bbox 855 1302 956 1328'>wurden</span> <span class='ocrx_word' title='bbox 980 1302 1018 1328'>die</span> <span class='ocrx_word' title='bbox 1037 1301 1109 1329'>Leute</span></span></p>
|
50
|
+
|
51
|
+
<p class='ocr_par' title='bbox 92 1342 1110 1378' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1342 1110 1378'><span class='ocrx_word' title='bbox 92 1349 144 1371'>von</span> <span class='ocrx_word' title='bbox 165 1344 347 1376'>Salmanasscr</span> <span class='ocrx_word' title='bbox 371 1344 428 1375'>nach</span> <span class='ocrx_word' title='bbox 454 1343 582 1370'>Samaria</span> <span class='ocrx_word' title='bbox 606 1343 716 1378'>gesührt,</span> <span class='ocrx_word' title='bbox 741 1347 779 1369'>wo</span> <span class='ocrx_word' title='bbox 804 1342 835 1375'>sie</span> <span class='ocrx_word' title='bbox 859 1343 917 1375'>noch</span> <span class='ocrx_word' title='bbox 942 1343 993 1376'>ihre</span> <span class='ocrx_word' title='bbox 1019 1344 1110 1371'>Götter</span></span></p>
|
52
|
+
|
53
|
+
<p class='ocr_par' title='bbox 58 1386 1109 1462' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1386 875 1419'><span class='ocrx_word' title='bbox 93 1387 211 1419'>Nibehas</span> <span class='ocrx_word' title='bbox 231 1387 279 1413'>und</span> <span class='ocrx_word' title='bbox 298 1386 421 1418'>Tharthac</span> <span class='ocrx_word' title='bbox 440 1386 578 1413'>anbeteten.</span> <span class='ocrx_word' title='bbox 597 1388 612 1412'>2</span> <span class='ocrx_word' title='bbox 630 1387 694 1413'>Kön.</span> <span class='ocrx_word' title='bbox 716 1388 754 1416'>l7,</span> <span class='ocrx_word' title='bbox 775 1386 816 1413'>24.</span> <span class='ocrx_word' title='bbox 835 1387 875 1413'>31.</span><br></span><span class='ocr_line' title='bbox 58 1428 1109 1462'><span class='ocrx_word' title='bbox 58 1428 175 1458'>Aven.</span> <span class='ocrx_word' title='bbox 213 1429 314 1462'>Götze,</span> <span class='ocrx_word' title='bbox 339 1428 502 1455'>Eitelleit.</span> <span class='ocrx_word' title='bbox 538 1428 580 1455'>So</span> <span class='ocrx_word' title='bbox 605 1429 665 1455'>wird</span> <span class='ocrx_word' title='bbox 690 1428 778 1461'>Bethel</span> <span class='ocrx_word' title='bbox 802 1429 920 1462'>genannt.</span> <span class='ocrx_word' title='bbox 939 1428 995 1462'>Hos.</span> <span class='ocrx_word' title='bbox 1023 1431 1061 1460'>10,</span> <span class='ocrx_word' title='bbox 1086 1431 1109 1456'>8.</span></span></p>
|
54
|
+
|
55
|
+
<p class='ocr_par' title='bbox 92 1471 1110 1505' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1471 1110 1505'><span class='ocrx_word' title='bbox 92 1478 180 1505'>wegen</span> <span class='ocrx_word' title='bbox 199 1472 241 1498'>der</span> <span class='ocrx_word' title='bbox 261 1471 365 1504'>Götzen,</span> <span class='ocrx_word' title='bbox 390 1471 429 1497'>die</span> <span class='ocrx_word' title='bbox 456 1471 556 1503'>daselbst</span> <span class='ocrx_word' title='bbox 579 1476 628 1497'>von</span> <span class='ocrx_word' title='bbox 651 1472 694 1497'>den</span> <span class='ocrx_word' title='bbox 715 1471 852 1503'>Israeliten</span> <span class='ocrx_word' title='bbox 877 1471 973 1503'>verehrt</span> <span class='ocrx_word' title='bbox 1000 1472 1110 1498'>wurden.</span></span></p>
|
56
|
+
|
57
|
+
<p class='ocr_par' title='bbox 91 1513 1110 1548' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1513 1110 1548'><span class='ocrx_word' title='bbox 91 1515 149 1541'>Mit</span> <span class='ocrx_word' title='bbox 167 1515 220 1541'>dem</span> <span class='ocrx_word' title='bbox 240 1520 332 1548'>ganzen</span> <span class='ocrx_word' title='bbox 352 1513 461 1540'>Namen:</span> <span class='ocrx_word' title='bbox 482 1513 640 1544'>Beth»Aven,</span> <span class='ocrx_word' title='bbox 658 1513 708 1540'>das</span> <span class='ocrx_word' title='bbox 726 1513 893 1548'>Götzenhaus,</span> <span class='ocrx_word' title='bbox 914 1513 979 1546'>oder,</span> <span class='ocrx_word' title='bbox 997 1514 1029 1540'>da</span> <span class='ocrx_word' title='bbox 1048 1520 1110 1541'>man</span></span></p>
|
58
|
+
|
59
|
+
<p class='ocr_par' title='bbox 56 1555 1110 1633' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1555 631 1588'><span class='ocrx_word' title='bbox 91 1558 149 1584'>dem</span> <span class='ocrx_word' title='bbox 169 1557 254 1583'>Eiteln</span> <span class='ocrx_word' title='bbox 275 1556 453 1587'>nachwandelt.</span> <span class='ocrx_word' title='bbox 473 1555 532 1588'>Hos.</span> <span class='ocrx_word' title='bbox 549 1558 571 1584'>4,</span> <span class='ocrx_word' title='bbox 593 1558 631 1581'>15.</span><br></span><span class='ocr_line' title='bbox 56 1597 1110 1633'><span class='ocrx_word' title='bbox 56 1597 242 1633'>Augustus.</span> <span class='ocrx_word' title='bbox 287 1597 419 1631'>Würdig</span> <span class='ocrx_word' title='bbox 448 1598 577 1630'>verehrt</span> <span class='ocrx_word' title='bbox 608 1598 668 1624'>und</span> <span class='ocrx_word' title='bbox 704 1599 876 1632'>angebetet</span> <span class='ocrx_word' title='bbox 908 1605 944 1632'>zu</span> <span class='ocrx_word' title='bbox 978 1599 1110 1627'>werden.</span></span></p>
|
60
|
+
|
61
|
+
<p class='ocr_par' title='bbox 92 1640 1108 1674' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1640 1108 1674'><span class='ocrx_word' title='bbox 92 1640 189 1674'>Diesen</span> <span class='ocrx_word' title='bbox 211 1640 311 1667'>Namen</span> <span class='ocrx_word' title='bbox 340 1641 386 1674'>gab</span> <span class='ocrx_word' title='bbox 413 1641 459 1667'>das</span> <span class='ocrx_word' title='bbox 487 1640 596 1672'>romische</span> <span class='ocrx_word' title='bbox 616 1640 678 1667'>Voll</span> <span class='ocrx_word' title='bbox 706 1641 758 1667'>dem</span> <span class='ocrx_word' title='bbox 781 1641 869 1672'>Kaiser</span> <span class='ocrx_word' title='bbox 887 1641 1030 1672'>Octavian,</span> <span class='ocrx_word' title='bbox 1058 1642 1108 1668'>und</span></span></p>
|
62
|
+
|
63
|
+
<p class='ocr_par' title='bbox 93 1682 1108 1716' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1682 1108 1716'><span class='ocrx_word' title='bbox 93 1683 142 1710'>alle</span> <span class='ocrx_word' title='bbox 161 1683 291 1715'>romischen</span> <span class='ocrx_word' title='bbox 310 1683 396 1715'>Kaiser</span> <span class='ocrx_word' title='bbox 416 1683 495 1713'>haben</span> <span class='ocrx_word' title='bbox 514 1682 594 1714'>diesen</span> <span class='ocrx_word' title='bbox 614 1682 709 1709'>Namen</span> <span class='ocrx_word' title='bbox 728 1683 898 1716'>beibehalten,</span> <span class='ocrx_word' title='bbox 917 1682 964 1715'>daß</span> <span class='ocrx_word' title='bbox 983 1682 1014 1715'>sie</span> <span class='ocrx_word' title='bbox 1033 1692 1108 1710'>«au,-</span></span></p>
|
64
|
+
|
65
|
+
<p class='ocr_par' title='bbox 55 1724 1107 1843' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 90 1724 997 1759'><span class='ocrx_word' title='bbox 90 1733 142 1758'>per</span> <span class='ocrx_word' title='bbox 160 1727 284 1758'>2ußr>«ti,</span> <span class='ocrx_word' title='bbox 304 1726 326 1751'>d.</span> <span class='ocrx_word' title='bbox 345 1724 360 1750'>i,</span> <span class='ocrx_word' title='bbox 382 1725 462 1757'>allzeit</span> <span class='ocrx_word' title='bbox 482 1725 581 1756'>Mehrer</span> <span class='ocrx_word' title='bbox 600 1725 643 1751'>des</span> <span class='ocrx_word' title='bbox 664 1724 750 1756'>Reichs</span> <span class='ocrx_word' title='bbox 770 1725 885 1759'>geheißen</span> <span class='ocrx_word' title='bbox 904 1725 997 1758'>haben.</span><br></span><span class='ocr_line' title='bbox 55 1766 952 1800'><span class='ocrx_word' title='bbox 55 1766 176 1800'>Avith.</span> <span class='ocrx_word' title='bbox 212 1767 324 1799'>Haufe.</span> <span class='ocrx_word' title='bbox 361 1766 424 1793'>Eine</span> <span class='ocrx_word' title='bbox 443 1766 524 1794'>Stadt</span> <span class='ocrx_word' title='bbox 542 1768 569 1793'>in</span> <span class='ocrx_word' title='bbox 587 1767 709 1799'>Idumäa.</span> <span class='ocrx_word' title='bbox 732 1769 742 1793'>1</span> <span class='ocrx_word' title='bbox 763 1767 831 1799'>Mos.</span> <span class='ocrx_word' title='bbox 849 1769 892 1798'>36,</span> <span class='ocrx_word' title='bbox 910 1769 952 1795'>35.</span><br></span><span class='ocr_line' title='bbox 57 1809 1107 1843'><span class='ocrx_word' title='bbox 57 1809 182 1839'>Aulon.</span> <span class='ocrx_word' title='bbox 236 1809 445 1843'>Ausgehöhlt.</span> <span class='ocrx_word' title='bbox 491 1809 553 1836'>Das</span> <span class='ocrx_word' title='bbox 581 1809 653 1843'>große</span> <span class='ocrx_word' title='bbox 681 1809 757 1841'>Thal,</span> <span 
class='ocrx_word' title='bbox 791 1810 871 1837'>worin</span> <span class='ocrx_word' title='bbox 897 1810 935 1837'>die</span> <span class='ocrx_word' title='bbox 962 1809 1107 1843'>berühmten</span></span></p>
|
66
|
+
|
67
|
+
<p class='ocr_par' title='bbox 89 1851 1106 1887' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 89 1851 1106 1887'><span class='ocrx_word' title='bbox 89 1853 186 1882'>Städte</span> <span class='ocrx_word' title='bbox 204 1852 315 1886'>Vethsan</span> <span class='ocrx_word' title='bbox 334 1852 390 1879'>oder</span> <span class='ocrx_word' title='bbox 409 1852 588 1886'>Scythopolis,</span> <span class='ocrx_word' title='bbox 605 1851 732 1885'>Tlberias,</span> <span class='ocrx_word' title='bbox 751 1852 862 1887'>Iericho,</span> <span class='ocrx_word' title='bbox 881 1851 929 1880'>das</span> <span class='ocrx_word' title='bbox 949 1853 1013 1880'>todte</span> <span class='ocrx_word' title='bbox 1033 1852 1106 1880'>Meer</span></span></p>
|
68
|
+
|
69
|
+
</div>
|
70
|
+
</div></body>
|
71
|
+
</html>
|
data/data/test.png
ADDED
Binary file
|
data/example/example_server.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
require 'sinatra'
|
3
3
|
|
4
|
-
require_relative '../lib/
|
4
|
+
require_relative '../lib/rhocr'
|
5
5
|
|
6
6
|
get '/' do
|
7
7
|
"<a href='OCRTest.html'>OCRTest</a>"
|
@@ -25,5 +25,5 @@ end
|
|
25
25
|
|
26
26
|
# Loads the page file "../data/<page>" into @page as an OCRPage and returns
# whatever OCRPage#enclosed_words yields for the box spanned by the corners
# (x1, y1) and (x2, y2), which are handed to HOCRBox.new in that order.
# NOTE(review): `page` is interpolated into the file path unchecked — confirm
# that callers only pass trusted file names.
def get_enclosed_words(x1, y1, x2 ,y2, page)
|
27
27
|
@page = OCRPage.new("../data/#{page}")
|
28
|
-
@page.enclosed_words(
|
28
|
+
@page.enclosed_words( HOCRBox.new(x1, y1, x2 ,y2) )
|
29
29
|
end
|
Binary file
|
data/lib/hocr_box.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
# Axis-aligned rectangle described by its left/top/right/bottom edges, as
# found in the "bbox" property of hOCR elements.
class HOCRBox
  attr_reader :left, :top, :right, :bottom, :upper_left, :lower_right, :coordinates

  # Builds a box from left, top, right, bottom.
  #
  # The values may be given as four separate arguments or as one array; every
  # value is converted with #to_i, so numeric strings are accepted and nil
  # becomes 0.
  #
  # @raise [RuntimeError] if left > right or top > bottom
  def initialize(*coordinates)
    @left, @top, @right, @bottom = coordinates.flatten.map(&:to_i)

    @height      = @bottom - @top
    @width       = @right - @left
    @upper_left  = [@left, @top]
    # Previously assigned to the misspelled @lower_rigth, which left the
    # lower_right reader returning nil.
    @lower_right = [@right, @bottom]
    @coordinates = [@left, @top, @right, @bottom]

    if left > right || top > bottom
      raise " Negative dimensions of OCRBox ar not allowed. left #{@left} / right #{@right} - top #{@top} / bottom #{@bottom}"
    end
  end

  # True when +other+ lies completely inside this box (shared edges count).
  def encloses?(other)
    @left <= other.left &&
      @right >= other.right &&
      @top <= other.top &&
      @bottom >= other.bottom
  end

  # True when this box lies completely inside +other+.
  def enclosed_by?(other)
    other.encloses?(self)
  end

  # True when this box ends strictly before +other+ starts horizontally.
  def left_of?(other)
    @right < other.left
  end

  # True when this box starts strictly after +other+ ends horizontally.
  def right_of?(other)
    @left > other.right
  end

  # Horizontal gap between the right edge of +other+ and the left edge of this
  # box; negative when this box starts before +other+ ends.
  def left_distance_to(other)
    @left - other.right
  end

  # Horizontal gap between the right edge of this box and the left edge of
  # +other+.
  def right_distance_to(other)
    other.left_distance_to(self)
  end

  def to_s
    coordinates_to_s
  end

  # Renders the box as "(left,top)/(right,bottom)".
  def coordinates_to_s
    "(#{@left},#{@top})/(#{@right},#{@bottom})"
  end

  # CSS declarations that position an element absolutely over this box.
  def to_css_style
    "position:absolute; top:#{@top}px; left:#{@left}px; height:#{@height}px; width:#{@width}px;"
  end

  # Empty <span> positioned over this box, tagged with +css_class+.
  def to_image_html(css_class = 'hocr_box')
    "<span style='#{to_css_style}' class='#{css_class}'></span>"
  end
end
|
67
|
+
|
data/lib/ocr_document.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require_relative 'ocr_page'
|
4
|
+
|
5
|
+
# A set of OCR pages, stored in a hash keyed by each page's page number.
class OCRDocument
  attr_reader :pages, :page_count

  def initialize
    @page_count = 0
    @pages = {}
  end

  # Adds every file in +list_o_pages+ via #add_page.
  def add_pages(list_o_pages)
    list_o_pages.each { |file| add_page(file) }
  end

  # Parses +file+ into an OCRPage and stores it under its page number.
  # page_count is incremented on every call, including calls that replace a
  # page already stored under the same number.
  def add_page(file)
    new_page = OCRPage.new(file)
    @pages[new_page.page_number] = new_page
    @page_count += 1
  end

  # Returns the page stored under +number+, or nil.
  def page(number)
    @pages[number]
  end

  # Yields every line of every stored page.
  def each_line
    @pages.values.each do |current_page|
      current_page.each_line { |line| yield line }
    end
  end

  # Yields every word of every line of every stored page.
  def each_word
    @pages.values.each do |current_page|
      current_page.each_line do |line|
        line.each { |word| yield word }
      end
    end
  end

  alias add_files add_pages
  alias add_file add_page
end
|
data/lib/ocr_element.rb
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
#coding:utf-8
|
2
|
+
|
3
|
+
require_relative 'hocr_box'
|
4
|
+
# Node of the hOCR element tree: an HOCRBox that additionally carries the
# element's hOCR class name, its child elements and a feature list.
# Enumerable iterates over the direct children.
class OCRElement < HOCRBox
  include Enumerable

  attr_reader :ocr_class, :children
  attr_accessor :features

  class << self
    # Same as .create.
    def create_from_html(ocr_element_html)
      create(ocr_element_html)
    end

    # Builds the element for +ocr_element_html+, choosing the subclass from
    # the node's class attribute. Words get their text as the only child; all
    # other elements get their child elements converted recursively.
    def create(ocr_element_html)
      ocr_class   = extract_ocr_class(ocr_element_html)
      coordinates = extract_coordinates(ocr_element_html)

      children =
        if ocr_class == 'ocrx_word'
          extract_word_children(ocr_element_html)
        else
          extract_children(ocr_element_html)
        end

      element_class =
        case ocr_class
        when 'ocrx_block' then OCRBlock
        when 'ocr_par'    then OCRParagraph
        when 'ocr_line'   then OCRLine
        when 'ocrx_word'  then OCRWord
        else OCRElement
        end

      element_class.new(ocr_class, children, coordinates)
    end

    # The children of a word are just its text, wrapped in an array.
    def extract_word_children(ocr_element_html)
      [ocr_element_html.text]
    end

    # Converts every child element of the node and drops those without a
    # class attribute.
    def extract_children(ocr_element_html)
      converted = ocr_element_html.elements.map do |child_fragment_html|
        OCRElement.create(child_fragment_html)
      end
      # filter out <br> elements
      converted.reject { |child| child.ocr_class.nil? }
    end

    # Reads the bounding box from the node's title attribute.
    def extract_coordinates(ocr_element_html)
      extract_coordinates_from_string(ocr_element_html['title'])
    end

    # Returns the four numbers following "bbox" in +s+ as strings, or four
    # nils when +s+ contains no bbox.
    def extract_coordinates_from_string(s)
      found = /bbox (\d+) (\d+) (\d+) (\d+)/.match(s)
      found ? found.captures : Array.new(4)
    end

    def extract_ocr_class(ocr_element_html)
      ocr_element_html['class']
    end
  end

  def initialize(ocr_class, children, coordinates)
    @children  = children
    @ocr_class = ocr_class
    @features  = []
    super(coordinates)
  end

  # Yields each direct child.
  def each
    children.each { |child| yield child }
  end

  def to_s
    header = "#{self.class}:#{@features}#{coordinates_to_s}->\n"
    header + children.map { |child| "\t#{child}" }.join("\n")
  end

  # Wraps #to_s in a <span> coloured with +color+.
  def mark_in_rspec(color)
    "<span style='color: #{color}'>#{self}</span>"
  end

  # Positioned, empty <span> for this element followed by the image HTML of
  # all children.
  def to_image_html(display_class = @ocr_class)
    children_html = @children.map(&:to_image_html).join('')
    "<span class='#{display_class}' style='#{to_css_style}' ></span>#{children_html}"
  end

  # <span> wrapping the HTML of all children. +style+ is accepted but unused.
  def to_html(display_class = @ocr_class, style = nil)
    children_html = @children.map(&:to_html).join('')
    "<span class='#{display_class}'> #{children_html} </span>"
  end
end
|
100
|
+
|
101
|
+
# Leaf element for a single recognised word; its children hold the word text.
class OCRWord < OCRElement
  # The word's text: the first entry of the flattened children.
  def text
    children.flatten.first
  end

  # Word text followed by the feature list in brackets.
  def to_s
    "#{text}[#{@features}]"
  end

  # <span> positioned over the word's box, containing the word text.
  def to_image_html
    css = to_css_style
    "<span class='#{@ocr_class}' style='#{css}'>#{text}</span>"
  end

  # Unpositioned <span> containing the word text.
  def to_html
    "<span class='#{@ocr_class}'>#{text}</span>"
  end
end
|
120
|
+
|
121
|
+
# One text line; its children are the words of the line.
class OCRLine < OCRElement
  alias words children

  # Class name and box, then one row per word with the word's coordinates.
  def to_s
    rows = words.map { |word| "#{word.coordinates_to_s}\t#{word}" }
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # Like #to_s but without the per-word coordinates.
  def simple_line
    rows = words.map(&:to_s)
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # The words' texts joined by single spaces.
  def to_text
    words.map(&:text).join(' ')
  end
end
|
141
|
+
|
142
|
+
# Paragraph element; its children are exposed as #lines.
class OCRParagraph < OCRElement
  alias_method :lines, :children
end
|
145
|
+
|
146
|
+
# Block element; its children are exposed as #paragraphs.
class OCRBlock < OCRElement
  alias_method :paragraphs, :children
end
|
149
|
+
|
data/lib/ocr_page.rb
CHANGED
@@ -1,43 +1,98 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require_relative "
|
2
|
+
require_relative "ocr_element"
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'pp'
|
3
5
|
|
4
|
-
class OCRPage <
|
5
|
-
attr_reader :lines, :words
|
6
|
+
# One hOCR page, parsed from the first div.ocr_page of an HTML file.
# The page's children are its blocks.
class OCRPage < OCRElement
  # meta_data and dimensions are exposed but never assigned in this class.
  attr_reader :meta_data, :page_number, :dimensions, :image

  alias each_block each
  alias blocks children

  # @param file_path  path of the hOCR HTML file to parse
  # @param image_path optional image location, used as background in
  #   #to_image_html
  def initialize(file_path, image_path = nil)
    doc = process_hocr_html_file(file_path)
    @page_content = doc.at_css('div.ocr_page')
    coordinates, @page_number = extract_bbox_ppageno(@page_content['title'])

    children = OCRElement.extract_children(@page_content)
    super('ocr_page', children, coordinates)
    @image = image_path
  end

  # Yields every paragraph of every block.
  def each_paragraph
    blocks.each do |block|
      block.each { |paragraph| yield paragraph }
    end
  end

  # Yields every line of every paragraph.
  def each_line
    each_paragraph do |paragraph|
      paragraph.each { |line| yield line }
    end
  end

  # Yields every word of every line.
  def each_word
    each_line do |line|
      line.each { |word| yield word }
    end
  end

  # All lines of the page, collected on first use and cached.
  # @deprecated
  def lines
    @lines ||= enum_for(:each_line).to_a
  end

  # Extracts the bounding box and the page number from an hOCR title string
  # such as "bbox 0 0 1326 1326;ppageno 33".
  #
  # Both properties are located by name anywhere in the string, so additional
  # properties or a different property order do not silently produce an empty
  # box and page 0.
  #
  # @return [Array] [coordinates, page_number]; page_number is 0 when the
  #   string has no ppageno property
  def extract_bbox_ppageno(ocr_html_text_fragment)
    coordinates = OCRElement.extract_coordinates_from_string(ocr_html_text_fragment)
    page_number = ocr_html_text_fragment[/ppageno\s+(\d+)/, 1].to_i
    [coordinates, page_number]
  end

  # Reads +filename+ and returns the element children of the parsed document.
  def process_hocr_html_file(filename)
    # File.read closes the file; File.open(...).read left the handle open.
    Nokogiri::HTML(File.read(filename)).elements
  end

  # Text of all lines, one line per row.
  def to_text
    lines.map(&:to_text).join("\n")
  end

  # <div> sized like the page with @image as background, containing the image
  # HTML of all children.
  def to_image_html(display_class = @ocr_class)
    children_html = @children.map(&:to_image_html).join('')
    # The height declaration used to read "height:N>px", which is invalid CSS.
    "<div class='#{display_class}' style='#{to_css_style};background-image: url(#{@image}); width:#{@width}px; height:#{@height}px ;'>#{children_html}</div>"
  end

  # All words of the page that lie inside +ocr_box+.
  def enclosed_words(ocr_box)
    enum_for(:each_word).select { |word| word.enclosed_by?(ocr_box) }
  end
end
|
data/lib/rhocr.rb
CHANGED
@@ -1,2 +1,31 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
|
2
|
+
|
3
|
+
require_relative "ocr_document"
|
4
|
+
# OCRDocument that additionally keeps flat lists of all lines and words.
class RHOCR < OCRDocument
  attr_reader :words, :lines

  # Adds every file matched by +path+ (passed to Dir.glob as the pattern),
  # rebuilds the line and word lists and returns self.
  # NOTE(review): despite the name, a plain directory path matches only the
  # directory itself — presumably callers pass a glob such as "dir/*.html".
  def add_folder(path)
    add_files(Dir.glob(path))
    compute_lines
    compute_words
    self
  end

  # Rebuilds @words from scratch; should be called if new pages are added.
  def compute_words
    @words = []
    each_word { |word| @words << word }
  end

  # Rebuilds @lines from scratch; should be called if new pages are added.
  def compute_lines
    @lines = []
    each_line { |line| @lines << line }
  end
end
|
data/rhocr.gemspec
CHANGED
@@ -2,28 +2,31 @@
|
|
2
2
|
|
3
3
|
# Gem specification for rhocr 0.1.
Gem::Specification.new do |s|
  s.name = %q{rhocr}
  s.version = "0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = [%q{Andreas Neumann}]
  s.date = %q{2011-09-08}
  s.description = %q{Manipulate and use OCR data encode in HOCR}
  s.email = %q{andreas@neumann.biz}
  s.extra_rdoc_files = [%q{README}, %q{TODO.txt}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}]
  s.files = [%q{Manifest}, %q{README}, %q{Rakefile}, %q{TODO.txt}, %q{data/Seite_Die_Gartenlaube_242.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.html}, %q{data/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{data/test.html}, %q{data/test.png}, %q{example/example_server.rb}, %q{example/public/OCRTest.css}, %q{example/public/OCRTest.html}, %q{example/public/OCRTest_marker.js}, %q{example/public/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg}, %q{lib/hocr_box.rb}, %q{lib/ocr_document.rb}, %q{lib/ocr_element.rb}, %q{lib/ocr_page.rb}, %q{lib/rhocr.rb}, %q{rhocr.gemspec}, %q{spec/hocr_box_spec.rb}, %q{spec/ocr_document_spec.rb}, %q{spec/ocr_element_spec.rb}, %q{spec/ocr_page_spec.rb}, %q{spec/rhocr_spec.rb}, %q{test.html}]
  s.homepage = %q{http://github.com/daandi/rhocr}
  s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Rhocr}, %q{--main}, %q{README}]
  s.require_paths = [%q{lib}]
  # Newer RubyGems deprecate this attribute, so only set it where the setter
  # still exists (same guard style as required_rubygems_version above).
  s.rubyforge_project = %q{rhocr} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.8.6}
  s.summary = %q{Manipulate and use OCR data encode in HOCR}

  # The branches below keep the generated compatibility layout for old
  # RubyGems versions; each of them declares the nokogiri dependency.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
    else
      s.add_dependency(%q<nokogiri>, [">= 0"])
    end
  else
    s.add_dependency(%q<nokogiri>, [">= 0"])
  end
end
|