rhocr 0.0.3 → 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +15 -6
- data/Rakefile +3 -3
- data/TODO.txt +42 -0
- data/data/Seite_Die_Gartenlaube_242.html +42 -0
- data/data/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/data/test.html +71 -0
- data/data/test.png +0 -0
- data/example/example_server.rb +2 -2
- data/example/public/Seite_Tagebuch_H_C_Lang_08.jpg +0 -0
- data/lib/hocr_box.rb +67 -0
- data/lib/ocr_document.rb +50 -0
- data/lib/ocr_element.rb +149 -0
- data/lib/ocr_page.rb +80 -25
- data/lib/rhocr.rb +30 -1
- data/rhocr.gemspec +12 -9
- data/spec/hocr_box_spec.rb +94 -0
- data/spec/ocr_document_spec.rb +80 -0
- data/spec/ocr_element_spec.rb +86 -0
- data/spec/ocr_page_spec.rb +116 -0
- data/spec/rhocr_spec.rb +34 -0
- data/test.html +1 -0
- metadata +52 -39
- data/example/public/img/Seite_Tagebuch_H_C_Lang_05.jpg +0 -0
- data/lib/ocr_box.rb +0 -43
- data/lib/ocrx_word.rb +0 -23
- data/rspec/ocr_box_spec.rb +0 -48
- data/rspec/ocr_page_spec.rb +0 -17
- data/rspec/ocrx_word_spec.rb +0 -32
data/data/test.html
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>OCR Output</title>
|
5
|
+
<meta http-equiv='content-type' content='text/html; charset=utf-8' />
|
6
|
+
<meta http-equiv='content-style-type' content='text/css' />
|
7
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_par ocrx_word ocr_line' />
|
8
|
+
<meta name='ocr-system' content='ABBYY fre-8.0.1.1024' />
|
9
|
+
<meta name='ocr-number-of-pages' content='1' />
|
10
|
+
</head><body bgcolor='#ffffff'>
|
11
|
+
<div class='ocr_page' title='bbox 0 0 1326 1326;ppageno 33'>
|
12
|
+
<div class='ocrx_block' title='bbox 55 32 1135 1887'>
|
13
|
+
<p class='ocr_par' title='bbox 432 32 1117 71' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 432 32 1117 71'><span class='ocrx_word' title='bbox 432 32 588 67'>Athenobius</span> <span class='ocrx_word' title='bbox 606 48 640 54'>—</span> <span class='ocrx_word' title='bbox 657 34 749 62'>Aulon.</span> <span class='ocrx_word' title='bbox 1074 37 1117 71'>29</span></span></p>
|
14
|
+
|
15
|
+
<p class='ocr_par' title='bbox 79 109 1119 189' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 79 109 1119 145'><span class='ocrx_word' title='bbox 79 109 294 144'>Athenobius,</span> <span class='ocrx_word' title='bbox 334 112 398 139'>Der</span> <span class='ocrx_word' title='bbox 417 115 476 139'>von</span> <span class='ocrx_word' title='bbox 494 112 545 139'>der</span> <span class='ocrx_word' title='bbox 565 112 687 140'>Göttin</span> <span class='ocrx_word' title='bbox 707 112 857 140'>Minerva</span> <span class='ocrx_word' title='bbox 876 112 954 145'>lebt,</span> <span class='ocrx_word' title='bbox 974 112 1043 140'>oder:</span> <span class='ocrx_word' title='bbox 1062 112 1119 140'>Mi»</span><br></span><span class='ocr_line' title='bbox 108 155 300 189'><span class='ocrx_word' title='bbox 108 159 183 182'>nerva</span> <span class='ocrx_word' title='bbox 201 155 300 189'>Bogen.</span></span></p>
|
16
|
+
|
17
|
+
<p class='ocr_par' title='bbox 74 196 1117 316' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 160 196 1117 232'><span class='ocrx_word' title='bbox 160 198 214 225'>Des</span> <span class='ocrx_word' title='bbox 242 197 340 230'>Königs</span> <span class='ocrx_word' title='bbox 367 196 503 230'>Antiochus</span> <span class='ocrx_word' title='bbox 531 197 626 230'>Freund</span> <span class='ocrx_word' title='bbox 655 197 713 225'>oder</span> <span class='ocrx_word' title='bbox 739 196 858 232'>geheimer</span> <span class='ocrx_word' title='bbox 885 196 963 230'>Nath.</span> <span class='ocrx_word' title='bbox 994 199 1005 224'>l</span> <span class='ocrx_word' title='bbox 1033 197 1117 226'>Mack.</span><br></span><span class='ocr_line' title='bbox 109 241 206 274'><span class='ocrx_word' title='bbox 109 241 147 274'>15,</span> <span class='ocrx_word' title='bbox 166 242 206 267'>28.</span><br></span><span class='ocr_line' title='bbox 74 281 1116 316'><span class='ocrx_word' title='bbox 74 281 205 315'>Athlai.</span> <span class='ocrx_word' title='bbox 242 284 310 310'>Dee</span> <span class='ocrx_word' title='bbox 337 282 417 315'>Herr</span> <span class='ocrx_word' title='bbox 440 281 598 315'>zerreißet</span> <span class='ocrx_word' title='bbox 625 282 681 310'>oder</span> <span class='ocrx_word' title='bbox 706 282 864 316'>zerbricht.</span> <span class='ocrx_word' title='bbox 898 282 975 310'>Einer</span> <span class='ocrx_word' title='bbox 999 286 1050 310'>von</span> <span class='ocrx_word' title='bbox 1069 282 1116 310'>den</span></span></p>
|
18
|
+
|
19
|
+
<p class='ocr_par' title='bbox 74 324 1114 401' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 107 324 594 357'><span class='ocrx_word' title='bbox 107 325 281 357'>Nachlommen</span> <span class='ocrx_word' title='bbox 300 324 392 352'>Bebai.</span> <span class='ocrx_word' title='bbox 410 324 472 356'>Esra</span> <span class='ocrx_word' title='bbox 496 327 533 355'>10,</span> <span class='ocrx_word' title='bbox 553 326 594 351'>28.</span><br></span><span class='ocr_line' title='bbox 74 366 1114 401'><span class='ocrx_word' title='bbox 74 366 189 400'>Athni.</span> <span class='ocrx_word' title='bbox 217 368 296 395'>Eine</span> <span class='ocrx_word' title='bbox 315 367 450 401'>Trübsal</span> <span class='ocrx_word' title='bbox 469 372 528 394'>von</span> <span class='ocrx_word' title='bbox 548 366 638 394'>Gott.</span> <span class='ocrx_word' title='bbox 673 366 722 394'>Ein</span> <span class='ocrx_word' title='bbox 742 366 819 400'>Sohn</span> <span class='ocrx_word' title='bbox 838 366 954 400'>Semaja.</span> <span class='ocrx_word' title='bbox 986 369 998 394'>1</span> <span class='ocrx_word' title='bbox 1018 368 1114 400'>Chron.</span></span></p>
|
20
|
+
|
21
|
+
<p class='ocr_par' title='bbox 71 412 1112 488' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 104 412 187 440'><span class='ocrx_word' title='bbox 104 412 144 440'>27.</span> <span class='ocrx_word' title='bbox 163 413 187 438'>7.</span><br></span><span class='ocr_line' title='bbox 71 451 1112 488'><span class='ocrx_word' title='bbox 71 451 217 485'>Athniel.</span> <span class='ocrx_word' title='bbox 246 452 364 479'>Gottes</span> <span class='ocrx_word' title='bbox 384 451 531 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 550 451 572 479'>d.</span> <span class='ocrx_word' title='bbox 591 451 608 479'>i.</span> <span class='ocrx_word' title='bbox 627 451 681 479'>eine</span> <span class='ocrx_word' title='bbox 699 451 819 484'>Trübsal,</span> <span class='ocrx_word' title='bbox 839 459 888 479'>von</span> <span class='ocrx_word' title='bbox 908 452 970 480'>Gott</span> <span class='ocrx_word' title='bbox 990 452 1112 488'>zugesügt.</span></span></p>
|
22
|
+
|
23
|
+
<p class='ocr_par' title='bbox 102 494 1110 528' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 102 494 1110 528'><span class='ocrx_word' title='bbox 102 496 152 523'>Ein</span> <span class='ocrx_word' title='bbox 172 495 248 526'>Sohn</span> <span class='ocrx_word' title='bbox 268 495 362 525'>Kenas,</span> <span class='ocrx_word' title='bbox 380 495 424 521'>des</span> <span class='ocrx_word' title='bbox 445 494 557 521'>Bruders</span> <span class='ocrx_word' title='bbox 576 494 665 526'>Kaleb;</span> <span class='ocrx_word' title='bbox 693 500 798 528'>gewann</span> <span class='ocrx_word' title='bbox 818 495 916 528'>Kiriath</span> <span class='ocrx_word' title='bbox 936 495 1042 528'>Sepher,</span> <span class='ocrx_word' title='bbox 1061 495 1110 523'>und</span></span></p>
|
24
|
+
|
25
|
+
<p class='ocr_par' title='bbox 68 535 1119 614' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 535 1037 570'><span class='ocrx_word' title='bbox 100 538 180 565'>damit</span> <span class='ocrx_word' title='bbox 199 537 281 570'>Achsa.</span> <span class='ocrx_word' title='bbox 300 538 336 564'>die</span> <span class='ocrx_word' title='bbox 356 537 454 569'>Tochter</span> <span class='ocrx_word' title='bbox 472 535 553 567'>seines</span> <span class='ocrx_word' title='bbox 574 537 674 564'>Betters</span> <span class='ocrx_word' title='bbox 694 537 780 564'>Kaleb.</span> <span class='ocrx_word' title='bbox 800 536 877 570'>Nicht,</span> <span class='ocrx_word' title='bbox 899 540 919 569'>1.</span> <span class='ocrx_word' title='bbox 940 539 978 565'>12.</span> <span class='ocrx_word' title='bbox 1000 539 1037 564'>13.</span><br></span><span class='ocr_line' title='bbox 68 576 1119 614'><span class='ocrx_word' title='bbox 68 578 376 614'>Atroth-Sophan,</span> <span class='ocrx_word' title='bbox 396 580 433 606'>die</span> <span class='ocrx_word' title='bbox 454 580 555 607'>Krone</span> <span class='ocrx_word' title='bbox 580 580 633 606'>oder</span> <span class='ocrx_word' title='bbox 658 579 760 609'>Decke,</span> <span class='ocrx_word' title='bbox 785 580 841 606'>oder</span> <span class='ocrx_word' title='bbox 860 580 1037 614'>Bedeckung</span> <span class='ocrx_word' title='bbox 1057 576 1119 608'>des'</span></span></p>
|
26
|
+
|
27
|
+
<p class='ocr_par' title='bbox 100 621 1111 657' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 100 621 1111 657'><span class='ocrx_word' title='bbox 100 624 235 657'>Hügels.</span> <span class='ocrx_word' title='bbox 273 623 335 650'>Eine</span> <span class='ocrx_word' title='bbox 355 623 436 649'>Stadt</span> <span class='ocrx_word' title='bbox 456 623 496 649'>der</span> <span class='ocrx_word' title='bbox 515 621 656 649'>Rubeniten</span> <span class='ocrx_word' title='bbox 680 621 715 648'>im</span> <span class='ocrx_word' title='bbox 734 622 875 656'>Königreich</span> <span class='ocrx_word' title='bbox 895 622 992 654'>Sthon.</span> <span class='ocrx_word' title='bbox 1008 624 1024 650'>4</span> <span class='ocrx_word' title='bbox 1043 623 1111 657'>Mos.</span></span></p>
|
28
|
+
|
29
|
+
<p class='ocr_par' title='bbox 67 668 1112 742' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 668 200 698'><span class='ocrx_word' title='bbox 98 669 139 698'>32,</span> <span class='ocrx_word' title='bbox 158 668 200 693'>35.</span><br></span><span class='ocr_line' title='bbox 67 706 1112 742'><span class='ocrx_word' title='bbox 67 707 341 742'>AtrothAddar:</span> <span class='ocrx_word' title='bbox 356 706 418 735'>Die</span> <span class='ocrx_word' title='bbox 432 707 537 734'>Krone</span> <span class='ocrx_word' title='bbox 551 706 661 733'>Addar</span> <span class='ocrx_word' title='bbox 675 706 729 737'>(des</span> <span class='ocrx_word' title='bbox 744 706 849 739'>Sohnes</span> <span class='ocrx_word' title='bbox 861 706 1016 740'>Benjamin).</span> <span class='ocrx_word' title='bbox 1037 708 1112 741'>Diese</span></span></p>
|
30
|
+
|
31
|
+
<p class='ocr_par' title='bbox 98 748 1111 785' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 748 1111 785'><span class='ocrx_word' title='bbox 98 752 184 780'>Stadt</span> <span class='ocrx_word' title='bbox 202 751 300 785'>gehörte</span> <span class='ocrx_word' title='bbox 322 750 369 777'>den</span> <span class='ocrx_word' title='bbox 395 748 611 781'>Benjaminitern,</span> <span class='ocrx_word' title='bbox 635 749 678 782'>lag</span> <span class='ocrx_word' title='bbox 702 748 728 775'>in</span> <span class='ocrx_word' title='bbox 753 749 798 776'>den</span> <span class='ocrx_word' title='bbox 826 749 938 784'>Grenzen</span> <span class='ocrx_word' title='bbox 962 750 1033 783'>Iuda</span> <span class='ocrx_word' title='bbox 1057 750 1111 778'>tmd</span></span></p>
|
32
|
+
|
33
|
+
<p class='ocr_par' title='bbox 64 794 1112 870' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 98 794 232 825'><span class='ocrx_word' title='bbox 98 794 232 825'>Ephraim.</span><br></span><span class='ocr_line' title='bbox 64 833 1112 870'><span class='ocrx_word' title='bbox 64 834 419 870'>Atroth.Beth-Ioab,</span> <span class='ocrx_word' title='bbox 438 836 460 862'>d.</span> <span class='ocrx_word' title='bbox 480 835 497 862'>i.</span> <span class='ocrx_word' title='bbox 517 834 565 861'>die</span> <span class='ocrx_word' title='bbox 584 834 688 861'>Krone</span> <span class='ocrx_word' title='bbox 712 833 764 861'>des</span> <span class='ocrx_word' title='bbox 784 833 909 866'>Hauses</span> <span class='ocrx_word' title='bbox 926 834 1020 866'>Ioab.</span> <span class='ocrx_word' title='bbox 1050 835 1112 863'>Eine</span></span></p>
|
34
|
+
|
35
|
+
<p class='ocr_par' title='bbox 101 876 1113 911' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 101 876 1113 911'><span class='ocrx_word' title='bbox 101 879 187 907'>Stadt</span> <span class='ocrx_word' title='bbox 201 879 227 906'>in</span> <span class='ocrx_word' title='bbox 249 879 329 911'>Iuda,</span> <span class='ocrx_word' title='bbox 350 883 388 905'>wo</span> <span class='ocrx_word' title='bbox 408 878 445 905'>die</span> <span class='ocrx_word' title='bbox 456 877 634 909'>Nachlommen</span> <span class='ocrx_word' title='bbox 645 876 742 904'>Salma</span> <span class='ocrx_word' title='bbox 762 876 876 910'>gewohnt</span> <span class='ocrx_word' title='bbox 887 877 972 910'>haben.</span> <span class='ocrx_word' title='bbox 992 880 1002 904'>1</span> <span class='ocrx_word' title='bbox 1021 877 1113 911'>Chron.</span></span></p>
|
36
|
+
|
37
|
+
<p class='ocr_par' title='bbox 64 923 1112 996' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 96 923 185 953'><span class='ocrx_word' title='bbox 96 923 120 953'>2,</span> <span class='ocrx_word' title='bbox 141 924 185 949'>54.</span><br></span><span class='ocr_line' title='bbox 64 961 1112 996'><span class='ocrx_word' title='bbox 64 962 212 993'>Attalia.</span> <span class='ocrx_word' title='bbox 249 963 312 990'>Eine</span> <span class='ocrx_word' title='bbox 330 963 411 990'>Stadt</span> <span class='ocrx_word' title='bbox 428 962 455 988'>in</span> <span class='ocrx_word' title='bbox 474 961 637 994'>Pamphilien</span> <span class='ocrx_word' title='bbox 665 962 701 990'>od.</span> <span class='ocrx_word' title='bbox 720 961 824 994'>Libyen,</span> <span class='ocrx_word' title='bbox 848 967 897 989'>von</span> <span class='ocrx_word' title='bbox 916 962 1004 989'>Attala</span> <span class='ocrx_word' title='bbox 1023 962 1112 996'>Phila.</span></span></p>
|
38
|
+
|
39
|
+
<p class='ocr_par' title='bbox 62 1005 1135 1082' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1005 594 1039'><span class='ocrx_word' title='bbox 95 1006 185 1039'>delpho</span> <span class='ocrx_word' title='bbox 206 1006 302 1033'>erbaut.</span> <span class='ocrx_word' title='bbox 321 1006 368 1037'>Ap.</span> <span class='ocrx_word' title='bbox 388 1005 471 1038'>Gesch.</span> <span class='ocrx_word' title='bbox 494 1006 533 1038'>14,</span> <span class='ocrx_word' title='bbox 553 1006 594 1031'>25.</span><br></span><span class='ocr_line' title='bbox 62 1046 1135 1082'><span class='ocrx_word' title='bbox 62 1048 208 1077'>Attalus</span> <span class='ocrx_word' title='bbox 255 1048 306 1075'>Ein</span> <span class='ocrx_word' title='bbox 330 1046 411 1080'>König</span> <span class='ocrx_word' title='bbox 436 1047 462 1073'>in</span> <span class='ocrx_word' title='bbox 482 1046 594 1078'>Mysien,</span> <span class='ocrx_word' title='bbox 623 1046 722 1078'>welches</span> <span class='ocrx_word' title='bbox 747 1047 819 1073'>unter</span> <span class='ocrx_word' title='bbox 843 1046 972 1081'>Phrygien</span> <span class='ocrx_word' title='bbox 997 1047 1112 1082'>gehörte;</span> <span class='ocrx_word' title='bbox 1128 1058 1135 1067'>,</span></span></p>
|
40
|
+
|
41
|
+
<p class='ocr_par' title='bbox 95 1089 1111 1125' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1089 1111 1125'><span class='ocrx_word' title='bbox 95 1093 207 1125'>genannt</span> <span class='ocrx_word' title='bbox 227 1095 274 1117'>von</span> <span class='ocrx_word' title='bbox 294 1090 416 1119'>Attale,</span> <span class='ocrx_word' title='bbox 435 1090 534 1120'>welches</span> <span class='ocrx_word' title='bbox 553 1089 591 1116'>bei</span> <span class='ocrx_word' title='bbox 610 1089 657 1115'>den</span> <span class='ocrx_word' title='bbox 676 1089 819 1122'>Phrygiern</span> <span class='ocrx_word' title='bbox 838 1089 917 1121'>Kropf</span> <span class='ocrx_word' title='bbox 935 1090 991 1116'>oder</span> <span class='ocrx_word' title='bbox 1012 1090 1111 1124'>Gurgel</span></span></p>
|
42
|
+
|
43
|
+
<p class='ocr_par' title='bbox 94 1131 1110 1168' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 94 1131 1110 1168'><span class='ocrx_word' title='bbox 94 1133 213 1168'>geheißen</span> <span class='ocrx_word' title='bbox 233 1133 310 1165'>haben</span> <span class='ocrx_word' title='bbox 329 1131 380 1161'>soll.</span> <span class='ocrx_word' title='bbox 393 1157 397 1161'>,</span> <span class='ocrx_word' title='bbox 417 1131 479 1159'>War</span> <span class='ocrx_word' title='bbox 497 1132 536 1158'>ein</span> <span class='ocrx_word' title='bbox 555 1132 637 1165'>König</span> <span class='ocrx_word' title='bbox 660 1132 700 1158'>der</span> <span class='ocrx_word' title='bbox 719 1132 889 1165'>Pergamener</span> <span class='ocrx_word' title='bbox 907 1133 956 1158'>und</span> <span class='ocrx_word' title='bbox 976 1131 1110 1165'>Phrvgier.</span></span></p>
|
44
|
+
|
45
|
+
<p class='ocr_par' title='bbox 59 1175 1110 1250' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 95 1175 332 1207'><span class='ocrx_word' title='bbox 95 1177 106 1201'>l</span> <span class='ocrx_word' title='bbox 130 1175 211 1202'>Mack.</span> <span class='ocrx_word' title='bbox 233 1176 272 1207'>15,</span> <span class='ocrx_word' title='bbox 292 1176 332 1201'>22.</span><br></span><span class='ocr_line' title='bbox 59 1216 1110 1250'><span class='ocrx_word' title='bbox 59 1217 146 1246'>Ava.</span> <span class='ocrx_word' title='bbox 184 1217 224 1250'>Ist</span> <span class='ocrx_word' title='bbox 242 1217 280 1243'>bei</span> <span class='ocrx_word' title='bbox 299 1218 344 1243'>den</span> <span class='ocrx_word' title='bbox 364 1217 432 1243'>alten</span> <span class='ocrx_word' title='bbox 453 1216 568 1248'>Griechen</span> <span class='ocrx_word' title='bbox 587 1216 636 1242'>Aia</span> <span class='ocrx_word' title='bbox 655 1216 691 1242'>od.</span> <span class='ocrx_word' title='bbox 711 1216 771 1247'>Aea,</span> <span class='ocrx_word' title='bbox 790 1216 828 1242'>die</span> <span class='ocrx_word' title='bbox 842 1216 993 1249'>Hauptstadt</span> <span class='ocrx_word' title='bbox 1010 1217 1036 1243'>in</span> <span class='ocrx_word' title='bbox 1051 1216 1110 1245'>Col»</span></span></p>
|
46
|
+
|
47
|
+
<p class='ocr_par' title='bbox 92 1258 1108 1293' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1258 1108 1293'><span class='ocrx_word' title='bbox 92 1261 165 1293'>chide,</span> <span class='ocrx_word' title='bbox 186 1265 224 1286'>wo</span> <span class='ocrx_word' title='bbox 244 1260 323 1285'>Aetas</span> <span class='ocrx_word' title='bbox 341 1258 451 1292'>regierte.</span> <span class='ocrx_word' title='bbox 494 1258 593 1291'>Colchis</span> <span class='ocrx_word' title='bbox 612 1258 678 1291'>heißt</span> <span class='ocrx_word' title='bbox 697 1258 812 1292'>heutiges</span> <span class='ocrx_word' title='bbox 833 1258 916 1292'>Tages</span> <span class='ocrx_word' title='bbox 938 1258 1108 1293'>Mengrelicn,</span></span></p>
|
48
|
+
|
49
|
+
<p class='ocr_par' title='bbox 92 1300 1109 1335' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1300 1109 1335'><span class='ocrx_word' title='bbox 92 1303 131 1330'>die</span> <span class='ocrx_word' title='bbox 160 1303 258 1335'>meisten</span> <span class='ocrx_word' title='bbox 277 1302 430 1332'>Einwohner</span> <span class='ocrx_word' title='bbox 455 1300 505 1333'>sind</span> <span class='ocrx_word' title='bbox 531 1300 653 1334'>Christen.</span> <span class='ocrx_word' title='bbox 698 1300 755 1328'>Von</span> <span class='ocrx_word' title='bbox 780 1300 831 1332'>hier</span> <span class='ocrx_word' title='bbox 855 1302 956 1328'>wurden</span> <span class='ocrx_word' title='bbox 980 1302 1018 1328'>die</span> <span class='ocrx_word' title='bbox 1037 1301 1109 1329'>Leute</span></span></p>
|
50
|
+
|
51
|
+
<p class='ocr_par' title='bbox 92 1342 1110 1378' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1342 1110 1378'><span class='ocrx_word' title='bbox 92 1349 144 1371'>von</span> <span class='ocrx_word' title='bbox 165 1344 347 1376'>Salmanasscr</span> <span class='ocrx_word' title='bbox 371 1344 428 1375'>nach</span> <span class='ocrx_word' title='bbox 454 1343 582 1370'>Samaria</span> <span class='ocrx_word' title='bbox 606 1343 716 1378'>gesührt,</span> <span class='ocrx_word' title='bbox 741 1347 779 1369'>wo</span> <span class='ocrx_word' title='bbox 804 1342 835 1375'>sie</span> <span class='ocrx_word' title='bbox 859 1343 917 1375'>noch</span> <span class='ocrx_word' title='bbox 942 1343 993 1376'>ihre</span> <span class='ocrx_word' title='bbox 1019 1344 1110 1371'>Götter</span></span></p>
|
52
|
+
|
53
|
+
<p class='ocr_par' title='bbox 58 1386 1109 1462' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1386 875 1419'><span class='ocrx_word' title='bbox 93 1387 211 1419'>Nibehas</span> <span class='ocrx_word' title='bbox 231 1387 279 1413'>und</span> <span class='ocrx_word' title='bbox 298 1386 421 1418'>Tharthac</span> <span class='ocrx_word' title='bbox 440 1386 578 1413'>anbeteten.</span> <span class='ocrx_word' title='bbox 597 1388 612 1412'>2</span> <span class='ocrx_word' title='bbox 630 1387 694 1413'>Kön.</span> <span class='ocrx_word' title='bbox 716 1388 754 1416'>l7,</span> <span class='ocrx_word' title='bbox 775 1386 816 1413'>24.</span> <span class='ocrx_word' title='bbox 835 1387 875 1413'>31.</span><br></span><span class='ocr_line' title='bbox 58 1428 1109 1462'><span class='ocrx_word' title='bbox 58 1428 175 1458'>Aven.</span> <span class='ocrx_word' title='bbox 213 1429 314 1462'>Götze,</span> <span class='ocrx_word' title='bbox 339 1428 502 1455'>Eitelleit.</span> <span class='ocrx_word' title='bbox 538 1428 580 1455'>So</span> <span class='ocrx_word' title='bbox 605 1429 665 1455'>wird</span> <span class='ocrx_word' title='bbox 690 1428 778 1461'>Bethel</span> <span class='ocrx_word' title='bbox 802 1429 920 1462'>genannt.</span> <span class='ocrx_word' title='bbox 939 1428 995 1462'>Hos.</span> <span class='ocrx_word' title='bbox 1023 1431 1061 1460'>10,</span> <span class='ocrx_word' title='bbox 1086 1431 1109 1456'>8.</span></span></p>
|
54
|
+
|
55
|
+
<p class='ocr_par' title='bbox 92 1471 1110 1505' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1471 1110 1505'><span class='ocrx_word' title='bbox 92 1478 180 1505'>wegen</span> <span class='ocrx_word' title='bbox 199 1472 241 1498'>der</span> <span class='ocrx_word' title='bbox 261 1471 365 1504'>Götzen,</span> <span class='ocrx_word' title='bbox 390 1471 429 1497'>die</span> <span class='ocrx_word' title='bbox 456 1471 556 1503'>daselbst</span> <span class='ocrx_word' title='bbox 579 1476 628 1497'>von</span> <span class='ocrx_word' title='bbox 651 1472 694 1497'>den</span> <span class='ocrx_word' title='bbox 715 1471 852 1503'>Israeliten</span> <span class='ocrx_word' title='bbox 877 1471 973 1503'>verehrt</span> <span class='ocrx_word' title='bbox 1000 1472 1110 1498'>wurden.</span></span></p>
|
56
|
+
|
57
|
+
<p class='ocr_par' title='bbox 91 1513 1110 1548' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1513 1110 1548'><span class='ocrx_word' title='bbox 91 1515 149 1541'>Mit</span> <span class='ocrx_word' title='bbox 167 1515 220 1541'>dem</span> <span class='ocrx_word' title='bbox 240 1520 332 1548'>ganzen</span> <span class='ocrx_word' title='bbox 352 1513 461 1540'>Namen:</span> <span class='ocrx_word' title='bbox 482 1513 640 1544'>Beth»Aven,</span> <span class='ocrx_word' title='bbox 658 1513 708 1540'>das</span> <span class='ocrx_word' title='bbox 726 1513 893 1548'>Götzenhaus,</span> <span class='ocrx_word' title='bbox 914 1513 979 1546'>oder,</span> <span class='ocrx_word' title='bbox 997 1514 1029 1540'>da</span> <span class='ocrx_word' title='bbox 1048 1520 1110 1541'>man</span></span></p>
|
58
|
+
|
59
|
+
<p class='ocr_par' title='bbox 56 1555 1110 1633' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 91 1555 631 1588'><span class='ocrx_word' title='bbox 91 1558 149 1584'>dem</span> <span class='ocrx_word' title='bbox 169 1557 254 1583'>Eiteln</span> <span class='ocrx_word' title='bbox 275 1556 453 1587'>nachwandelt.</span> <span class='ocrx_word' title='bbox 473 1555 532 1588'>Hos.</span> <span class='ocrx_word' title='bbox 549 1558 571 1584'>4,</span> <span class='ocrx_word' title='bbox 593 1558 631 1581'>15.</span><br></span><span class='ocr_line' title='bbox 56 1597 1110 1633'><span class='ocrx_word' title='bbox 56 1597 242 1633'>Augustus.</span> <span class='ocrx_word' title='bbox 287 1597 419 1631'>Würdig</span> <span class='ocrx_word' title='bbox 448 1598 577 1630'>verehrt</span> <span class='ocrx_word' title='bbox 608 1598 668 1624'>und</span> <span class='ocrx_word' title='bbox 704 1599 876 1632'>angebetet</span> <span class='ocrx_word' title='bbox 908 1605 944 1632'>zu</span> <span class='ocrx_word' title='bbox 978 1599 1110 1627'>werden.</span></span></p>
|
60
|
+
|
61
|
+
<p class='ocr_par' title='bbox 92 1640 1108 1674' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 92 1640 1108 1674'><span class='ocrx_word' title='bbox 92 1640 189 1674'>Diesen</span> <span class='ocrx_word' title='bbox 211 1640 311 1667'>Namen</span> <span class='ocrx_word' title='bbox 340 1641 386 1674'>gab</span> <span class='ocrx_word' title='bbox 413 1641 459 1667'>das</span> <span class='ocrx_word' title='bbox 487 1640 596 1672'>romische</span> <span class='ocrx_word' title='bbox 616 1640 678 1667'>Voll</span> <span class='ocrx_word' title='bbox 706 1641 758 1667'>dem</span> <span class='ocrx_word' title='bbox 781 1641 869 1672'>Kaiser</span> <span class='ocrx_word' title='bbox 887 1641 1030 1672'>Octavian,</span> <span class='ocrx_word' title='bbox 1058 1642 1108 1668'>und</span></span></p>
|
62
|
+
|
63
|
+
<p class='ocr_par' title='bbox 93 1682 1108 1716' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 93 1682 1108 1716'><span class='ocrx_word' title='bbox 93 1683 142 1710'>alle</span> <span class='ocrx_word' title='bbox 161 1683 291 1715'>romischen</span> <span class='ocrx_word' title='bbox 310 1683 396 1715'>Kaiser</span> <span class='ocrx_word' title='bbox 416 1683 495 1713'>haben</span> <span class='ocrx_word' title='bbox 514 1682 594 1714'>diesen</span> <span class='ocrx_word' title='bbox 614 1682 709 1709'>Namen</span> <span class='ocrx_word' title='bbox 728 1683 898 1716'>beibehalten,</span> <span class='ocrx_word' title='bbox 917 1682 964 1715'>daß</span> <span class='ocrx_word' title='bbox 983 1682 1014 1715'>sie</span> <span class='ocrx_word' title='bbox 1033 1692 1108 1710'>«au,-</span></span></p>
|
64
|
+
|
65
|
+
<p class='ocr_par' title='bbox 55 1724 1107 1843' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 90 1724 997 1759'><span class='ocrx_word' title='bbox 90 1733 142 1758'>per</span> <span class='ocrx_word' title='bbox 160 1727 284 1758'>2ußr>«ti,</span> <span class='ocrx_word' title='bbox 304 1726 326 1751'>d.</span> <span class='ocrx_word' title='bbox 345 1724 360 1750'>i,</span> <span class='ocrx_word' title='bbox 382 1725 462 1757'>allzeit</span> <span class='ocrx_word' title='bbox 482 1725 581 1756'>Mehrer</span> <span class='ocrx_word' title='bbox 600 1725 643 1751'>des</span> <span class='ocrx_word' title='bbox 664 1724 750 1756'>Reichs</span> <span class='ocrx_word' title='bbox 770 1725 885 1759'>geheißen</span> <span class='ocrx_word' title='bbox 904 1725 997 1758'>haben.</span><br></span><span class='ocr_line' title='bbox 55 1766 952 1800'><span class='ocrx_word' title='bbox 55 1766 176 1800'>Avith.</span> <span class='ocrx_word' title='bbox 212 1767 324 1799'>Haufe.</span> <span class='ocrx_word' title='bbox 361 1766 424 1793'>Eine</span> <span class='ocrx_word' title='bbox 443 1766 524 1794'>Stadt</span> <span class='ocrx_word' title='bbox 542 1768 569 1793'>in</span> <span class='ocrx_word' title='bbox 587 1767 709 1799'>Idumäa.</span> <span class='ocrx_word' title='bbox 732 1769 742 1793'>1</span> <span class='ocrx_word' title='bbox 763 1767 831 1799'>Mos.</span> <span class='ocrx_word' title='bbox 849 1769 892 1798'>36,</span> <span class='ocrx_word' title='bbox 910 1769 952 1795'>35.</span><br></span><span class='ocr_line' title='bbox 57 1809 1107 1843'><span class='ocrx_word' title='bbox 57 1809 182 1839'>Aulon.</span> <span class='ocrx_word' title='bbox 236 1809 445 1843'>Ausgehöhlt.</span> <span class='ocrx_word' title='bbox 491 1809 553 1836'>Das</span> <span class='ocrx_word' title='bbox 581 1809 653 1843'>große</span> <span class='ocrx_word' title='bbox 681 1809 757 1841'>Thal,</span> <span 
class='ocrx_word' title='bbox 791 1810 871 1837'>worin</span> <span class='ocrx_word' title='bbox 897 1810 935 1837'>die</span> <span class='ocrx_word' title='bbox 962 1809 1107 1843'>berühmten</span></span></p>
|
66
|
+
|
67
|
+
<p class='ocr_par' title='bbox 89 1851 1106 1887' style='font-size:8pt;font-family:"Arial";font-style:normal'><span class='ocr_line' title='bbox 89 1851 1106 1887'><span class='ocrx_word' title='bbox 89 1853 186 1882'>Städte</span> <span class='ocrx_word' title='bbox 204 1852 315 1886'>Vethsan</span> <span class='ocrx_word' title='bbox 334 1852 390 1879'>oder</span> <span class='ocrx_word' title='bbox 409 1852 588 1886'>Scythopolis,</span> <span class='ocrx_word' title='bbox 605 1851 732 1885'>Tlberias,</span> <span class='ocrx_word' title='bbox 751 1852 862 1887'>Iericho,</span> <span class='ocrx_word' title='bbox 881 1851 929 1880'>das</span> <span class='ocrx_word' title='bbox 949 1853 1013 1880'>todte</span> <span class='ocrx_word' title='bbox 1033 1852 1106 1880'>Meer</span></span></p>
|
68
|
+
|
69
|
+
</div>
|
70
|
+
</div></body>
|
71
|
+
</html>
|
data/data/test.png
ADDED
Binary file
|
data/example/example_server.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
require 'sinatra'
|
3
3
|
|
4
|
-
require_relative '../lib/
|
4
|
+
require_relative '../lib/rhocr'
|
5
5
|
|
6
6
|
get '/' do
|
7
7
|
"<a href='OCRTest.html'>OCRTest</a>"
|
@@ -25,5 +25,5 @@ end
|
|
25
25
|
|
26
26
|
# Loads the hOCR page stored under ../data/<page> and returns the words of
# that page which lie completely inside the rectangle (x1, y1)-(x2, y2).
# The loaded page is kept in @page.
def get_enclosed_words(x1, y1, x2, y2, page)
  page_path = "../data/#{page}"
  @page = OCRPage.new(page_path)
  selection = HOCRBox.new(x1, y1, x2, y2)
  @page.enclosed_words(selection)
end
|
Binary file
|
data/lib/hocr_box.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
# Axis-aligned rectangle described by its left, top, right and bottom edges,
# as found in the `bbox` property of hOCR elements.
class HOCRBox
  attr_reader :left, :top, :right, :bottom, :upper_left, :lower_right, :coordinates

  # Builds a box from left, top, right, bottom.
  #
  # The values may be passed as four arguments or as a single array; each one
  # is converted with #to_i, so numeric strings are accepted.
  #
  # Raises RuntimeError when left > right or top > bottom.
  def initialize(*coordinates)
    @left, @top, @right, @bottom = coordinates.flatten.map(&:to_i)

    @height = @bottom - @top
    @width = @right - @left
    @upper_left = [@left, @top]
    # Was assigned to the misspelled @lower_rigth, which left the
    # lower_right reader permanently nil.
    @lower_right = [@right, @bottom]
    @coordinates = [@left, @top, @right, @bottom]

    if left > right || top > bottom
      raise " Negative dimensions of OCRBox ar not allowed. left #{@left} / right #{@right} - top #{@top} / bottom #{@bottom}"
    end
  end

  # True when +other+ lies completely inside this box (shared edges count).
  def encloses?(other)
    @left <= other.left &&
      @right >= other.right &&
      @top <= other.top &&
      @bottom >= other.bottom
  end

  # True when this box lies completely inside +other+.
  def enclosed_by?(other)
    other.encloses?(self)
  end

  # True when this box ends strictly before +other+ starts on the x axis.
  def left_of?(other)
    @right < other.left
  end

  # True when this box starts strictly after +other+ ends on the x axis.
  def right_of?(other)
    @left > other.right
  end

  # Horizontal gap between the right edge of +other+ and the left edge of
  # this box; negative when the two overlap horizontally.
  def left_distance_to(other)
    @left - other.right
  end

  # Horizontal gap between the right edge of this box and the left edge of
  # +other+.
  def right_distance_to(other)
    other.left_distance_to(self)
  end

  def to_s
    coordinates_to_s
  end

  # "(left,top)/(right,bottom)"
  def coordinates_to_s
    "(#{@left},#{@top})/(#{@right},#{@bottom})"
  end

  # Inline CSS that positions an element absolutely over this box.
  def to_css_style
    "position:absolute; top:#{@top}px; left:#{@left}px; height:#{@height}px; width:#{@width}px;"
  end

  # An empty <span> positioned over this box, carrying +css_class+.
  def to_image_html(css_class = 'hocr_box')
    "<span style='#{to_css_style}' class='#{css_class}'></span>"
  end
end
|
67
|
+
|
data/lib/ocr_document.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
require_relative 'ocr_page'
|
4
|
+
|
5
|
+
# A collection of OCRPage objects, indexed by the page number each page
# reports.
class OCRDocument
  attr_reader :pages, :page_count

  def initialize
    @pages = {}
    @page_count = 0
  end

  # Adds every file in +list_o_pages+ via #add_page. Returns the list.
  def add_pages(list_o_pages)
    list_o_pages.each { |file| add_page(file) }
  end

  # Parses +file+ into an OCRPage and stores it under its page number.
  # A page with the same number replaces the earlier one, while page_count
  # is incremented on every call. Returns the new page_count.
  def add_page(file)
    page = OCRPage.new(file)
    @pages[page.page_number] = page
    @page_count += 1
  end

  # The page stored under +number+, or nil.
  def page(number)
    @pages[number]
  end

  # Yields every line of every page, in the order the pages were inserted.
  # Without a block an Enumerator is returned.
  def each_line(&block)
    return enum_for(:each_line) unless block_given?

    @pages.values.each { |page| page.each_line(&block) }
  end

  # Yields every child of every line (the words) of every page.
  # Without a block an Enumerator is returned.
  def each_word(&block)
    return enum_for(:each_word) unless block_given?

    @pages.values.each do |page|
      page.each_line { |line| line.each(&block) }
    end
  end

  alias :add_files :add_pages
  alias :add_file :add_page
end
|
data/lib/ocr_element.rb
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
#coding:utf-8
|
2
|
+
|
3
|
+
require_relative 'hocr_box'
|
4
|
+
# A node of a parsed hOCR tree: a bounding box (see HOCRBox) that also knows
# its hOCR class name and its child nodes. Iterating an element yields its
# children.
class OCRElement < HOCRBox
  include Enumerable

  attr_reader :ocr_class, :children
  attr_accessor :features

  class << self
    # Same as .create.
    def create_from_html(ocr_element_html)
      create ocr_element_html
    end

    # Builds the element (and, recursively, its children) for one parsed
    # HTML node. The node's class attribute selects the Ruby class; unknown
    # or missing classes produce a plain OCRElement.
    def create(ocr_element_html)
      ocr_class = extract_ocr_class(ocr_element_html)
      coordinates = extract_coordinates(ocr_element_html)

      children =
        if ocr_class == 'ocrx_word'
          extract_word_children(ocr_element_html)
        else
          extract_children(ocr_element_html)
        end

      element_class =
        case ocr_class
        when 'ocrx_block' then OCRBlock
        when 'ocr_par' then OCRParagraph
        when 'ocr_line' then OCRLine
        when 'ocrx_word' then OCRWord
        else OCRElement
        end

      element_class.new(ocr_class, children, coordinates)
    end

    # A word has exactly one child: the text content of its node.
    def extract_word_children(ocr_element_html)
      [ocr_element_html.text]
    end

    # Creates an element for every child element of the node and drops
    # those without a class attribute (this filters out <br> elements).
    def extract_children(ocr_element_html)
      ocr_element_html.elements
                      .map { |child_fragment_html| OCRElement.create(child_fragment_html) }
                      .reject { |child| child.ocr_class.nil? }
    end

    # Coordinates taken from the node's title attribute.
    def extract_coordinates(ocr_element_html)
      extract_coordinates_from_string ocr_element_html['title']
    end

    # Returns the four numbers following "bbox" in +s+ as strings, or four
    # nils when +s+ is nil or contains no bbox.
    def extract_coordinates_from_string(s)
      match = /bbox (\d+) (\d+) (\d+) (\d+)/.match(s)
      match ? match.captures : [nil, nil, nil, nil]
    end

    # The node's class attribute (nil when absent).
    def extract_ocr_class(ocr_element_html)
      ocr_element_html['class']
    end
  end

  # +coordinates+ is handed to HOCRBox#initialize unchanged.
  def initialize(ocr_class, children, coordinates)
    @children = children
    @ocr_class = ocr_class
    @features = []
    super coordinates
  end

  # Yields each child; returns an Enumerator when no block is given.
  def each
    return to_enum(:each) unless block_given?

    children.each { |child| yield child }
  end

  def to_s
    "#{self.class}:#{@features}#{coordinates_to_s}->\n" + children.map { |c| "\t#{c}" }.join("\n")
  end

  # The textual representation wrapped in a <span> coloured with +color+.
  def mark_in_rspec(color)
    "<span style='color: #{color}'>#{to_s}</span>"
  end

  # An empty, absolutely positioned <span> for this element followed by the
  # image HTML of all children (siblings, not nested).
  def to_image_html(display_class = @ocr_class)
    children_html = @children.map { |c| c.to_image_html }.join("")
    "<span class='#{display_class}' style='#{to_css_style}' ></span>#{children_html}"
  end

  # A <span> carrying +display_class+ that wraps the HTML of all children.
  # +style+ is accepted but currently unused.
  def to_html(display_class = @ocr_class, style = nil)
    children_html = @children.map { |c| c.to_html }.join("")
    "<span class='#{display_class}'> #{children_html} </span>"
  end
end
|
100
|
+
|
101
|
+
# Leaf element of the hOCR tree; its only child is the recognised text.
class OCRWord < OCRElement
  # The recognised text of this word.
  def text
    children.flatten.first
  end

  # "<text>[<features>]"
  def to_s
    "#{text}[#{@features}]"
  end

  # Absolutely positioned <span> containing the word's text.
  # Takes the same optional argument as OCRElement#to_image_html so a word
  # can be used wherever a generic element is expected.
  def to_image_html(display_class = @ocr_class)
    "<span class='#{display_class}' style='#{to_css_style}'>#{text}</span>"
  end

  # Plain <span> containing the word's text. Takes the same optional
  # arguments as OCRElement#to_html; +style+ is accepted but unused.
  def to_html(display_class = @ocr_class, style = nil)
    "<span class='#{display_class}'>#{text}</span>"
  end
end
|
120
|
+
|
121
|
+
# A text line; its children are the words of the line.
class OCRLine < OCRElement
  alias :words :children

  # Class name and box of the line followed by one row per word, each row
  # showing the word's box and the word itself.
  def to_s
    rows = words.map { |word| "#{word.coordinates_to_s}\t#{word}" }
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # Like #to_s, but the rows omit the word boxes.
  def simple_line
    rows = words.map(&:to_s)
    "#{self.class} #{coordinates_to_s} ->[\n#{rows.join("\n")}]"
  end

  # The texts of all words joined by single spaces.
  def to_text
    words.map(&:text).join(" ")
  end
end
|
141
|
+
|
142
|
+
# A paragraph; its children are exposed as #lines.
class OCRParagraph < OCRElement
  alias_method :lines, :children
end
|
145
|
+
|
146
|
+
# A layout block; its children are exposed as #paragraphs.
class OCRBlock < OCRElement
  alias_method :paragraphs, :children
end
|
149
|
+
|
data/lib/ocr_page.rb
CHANGED
@@ -1,43 +1,98 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
require_relative "
|
2
|
+
require_relative "ocr_element"
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'pp'
|
3
5
|
|
4
|
-
class OCRPage <
|
5
|
-
attr_reader :lines, :words
|
6
|
+
# One page of an hOCR file: the root of the element tree built from the
# file's `div.ocr_page` node. Children of a page are its blocks.
class OCRPage < OCRElement
  # meta_data and dimensions are never assigned in this class and therefore
  # read as nil. #lines is defined explicitly further down.
  attr_reader :meta_data, :page_number, :dimensions, :image
  alias :each_block :each
  alias :blocks :children

  # file_path  - path of the hOCR HTML file to parse.
  # image_path - optional location of the page image, used by #to_image_html.
  #
  # Raises ArgumentError when the file contains no `div.ocr_page` element.
  def initialize(file_path, image_path = nil)
    doc = process_hocr_html_file(file_path)
    @page_content = doc.at_css("div.ocr_page")
    raise ArgumentError, "no div.ocr_page element found in #{file_path}" unless @page_content

    coordinates, @page_number = extract_bbox_ppageno(@page_content['title'])
    children = OCRElement.extract_children(@page_content)
    super('ocr_page', children, coordinates)
    @image = image_path
  end

  # Yields every child of every block. Enumerator without a block.
  def each_paragraph(&block)
    return enum_for(:each_paragraph) unless block_given?

    blocks.each { |ocr_block| ocr_block.each(&block) }
  end

  # Yields every child of every paragraph. Enumerator without a block.
  def each_line(&block)
    return enum_for(:each_line) unless block_given?

    each_paragraph { |paragraph| paragraph.each(&block) }
  end

  # Yields every child of every line. Enumerator without a block.
  def each_word(&block)
    return enum_for(:each_word) unless block_given?

    each_line { |line| line.each(&block) }
  end

  # deprecated
  # All lines of the page as an array, computed once and cached.
  def lines
    @lines ||= each_line.to_a
  end

  # Splits the title attribute of the page node into its bounding box and
  # its page number. Both properties are searched for anywhere in the
  # string, so additional properties and a different property order are
  # tolerated.
  #
  # Returns [coordinates, page_number]; page_number is 0 when the title
  # has no ppageno property.
  def extract_bbox_ppageno(ocr_html_text_fragment)
    page_number = ocr_html_text_fragment[/ppageno\s+(\d+)/, 1].to_i
    [OCRElement.extract_coordinates_from_string(ocr_html_text_fragment), page_number]
  end

  # Reads and parses the file; returns the element children of the parsed
  # document.
  def process_hocr_html_file(filename)
    # File.read closes the file handle, unlike File.open(...).read.
    html_string = File.read(filename)
    Nokogiri::HTML(html_string).elements
  end

  # The text of all lines, one line per row.
  def to_text
    lines.map(&:to_text).join("\n")
  end

  # A <div> sized and positioned like the page, with the page image as
  # background, containing the image HTML of all blocks.
  def to_image_html(display_class = @ocr_class)
    children_html = @children.map { |c| c.to_image_html }.join("")
    "<div class='#{display_class}' style='#{to_css_style};background-image: url(#{@image}); width:#{@width}px; height:#{@height}px ;'>#{children_html}</div>"
  end

  # All words of the page that lie completely inside +ocr_box+.
  def enclosed_words(ocr_box)
    each_word.select { |word| word.enclosed_by?(ocr_box) }
  end
end
|
data/lib/rhocr.rb
CHANGED
@@ -1,2 +1,31 @@
|
|
1
1
|
#coding: utf-8
|
2
|
-
|
2
|
+
|
3
|
+
require_relative "ocr_document"
|
4
|
+
# An OCRDocument that additionally keeps flat lists of all its lines and
# words.
class RHOCR < OCRDocument
  attr_reader :words, :lines

  # Adds every file matching the glob pattern +path+, rebuilds the line and
  # word lists and returns the document itself.
  def add_folder(path)
    add_files(Dir.glob(path))
    compute_lines
    compute_words
    self
  end

  # Rebuilds the flat word list; should be called whenever pages are added.
  def compute_words
    @words = []
    each_word { |word| @words << word }
  end

  # Rebuilds the flat line list; should be called whenever pages are added.
  def compute_lines
    @lines = []
    each_line { |line| @lines << line }
  end
end
|
data/rhocr.gemspec
CHANGED
@@ -2,28 +2,31 @@
|
|
2
2
|
|
3
3
|
# Gem specification for rhocr 0.1.
Gem::Specification.new do |s|
  s.name = 'rhocr'
  s.version = '0.1'

  s.required_rubygems_version = Gem::Requirement.new('>= 1.2') if s.respond_to? :required_rubygems_version=
  s.authors = ['Andreas Neumann']
  s.date = '2011-09-08'
  s.description = 'Manipulate and use OCR data encode in HOCR'
  s.email = 'andreas@neumann.biz'
  s.extra_rdoc_files = %w[
    README
    TODO.txt
    lib/hocr_box.rb
    lib/ocr_document.rb
    lib/ocr_element.rb
    lib/ocr_page.rb
    lib/rhocr.rb
  ]
  s.files = %w[
    Manifest
    README
    Rakefile
    TODO.txt
    data/Seite_Die_Gartenlaube_242.html
    data/Seite_Tagebuch_H_C_Lang_08.html
    data/Seite_Tagebuch_H_C_Lang_08.jpg
    data/test.html
    data/test.png
    example/example_server.rb
    example/public/OCRTest.css
    example/public/OCRTest.html
    example/public/OCRTest_marker.js
    example/public/Seite_Tagebuch_H_C_Lang_08.jpg
    example/public/img/Seite_Tagebuch_H_C_Lang_08.jpg
    lib/hocr_box.rb
    lib/ocr_document.rb
    lib/ocr_element.rb
    lib/ocr_page.rb
    lib/rhocr.rb
    rhocr.gemspec
    spec/hocr_box_spec.rb
    spec/ocr_document_spec.rb
    spec/ocr_element_spec.rb
    spec/ocr_page_spec.rb
    spec/rhocr_spec.rb
    test.html
  ]
  s.homepage = 'http://github.com/daandi/rhocr'
  s.rdoc_options = %w[--line-numbers --inline-source --title Rhocr --main README]
  s.require_paths = ['lib']
  s.rubyforge_project = 'rhocr'
  s.rubygems_version = '1.8.6'
  s.summary = 'Manipulate and use OCR data encode in HOCR'

  # Only RubyGems that knows specification versions and is at least 1.2.0
  # gets the runtime-dependency call; everything else uses add_dependency.
  dependency_method = :add_dependency
  if s.respond_to? :specification_version
    s.specification_version = 3
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
      dependency_method = :add_runtime_dependency
    end
  end
  s.public_send(dependency_method, 'nokogiri', ['>= 0'])
end
|