gocr-ruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +21 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +49 -0
  8. data/ext/gocr/Makefile +141 -0
  9. data/ext/gocr/Makefile.in +140 -0
  10. data/ext/gocr/amiga.h +31 -0
  11. data/ext/gocr/barcode.c +2108 -0
  12. data/ext/gocr/barcode.h +11 -0
  13. data/ext/gocr/box.c +496 -0
  14. data/ext/gocr/config.h +37 -0
  15. data/ext/gocr/config.h.in +36 -0
  16. data/ext/gocr/database.c +468 -0
  17. data/ext/gocr/detect.c +1003 -0
  18. data/ext/gocr/extconf.rb +6 -0
  19. data/ext/gocr/gocr.c +436 -0
  20. data/ext/gocr/gocr.h +290 -0
  21. data/ext/gocr/jconv.c +168 -0
  22. data/ext/gocr/job.c +92 -0
  23. data/ext/gocr/lines.c +364 -0
  24. data/ext/gocr/list.c +334 -0
  25. data/ext/gocr/list.h +91 -0
  26. data/ext/gocr/ocr0.c +7312 -0
  27. data/ext/gocr/ocr0.h +63 -0
  28. data/ext/gocr/ocr0n.c +1527 -0
  29. data/ext/gocr/ocr1.c +85 -0
  30. data/ext/gocr/ocr1.h +3 -0
  31. data/ext/gocr/otsu.c +310 -0
  32. data/ext/gocr/otsu.h +23 -0
  33. data/ext/gocr/output.c +291 -0
  34. data/ext/gocr/output.h +37 -0
  35. data/ext/gocr/pcx.c +153 -0
  36. data/ext/gocr/pcx.h +9 -0
  37. data/ext/gocr/pgm2asc.c +3259 -0
  38. data/ext/gocr/pgm2asc.h +105 -0
  39. data/ext/gocr/pixel.c +538 -0
  40. data/ext/gocr/pnm.c +538 -0
  41. data/ext/gocr/pnm.h +35 -0
  42. data/ext/gocr/progress.c +87 -0
  43. data/ext/gocr/progress.h +42 -0
  44. data/ext/gocr/remove.c +715 -0
  45. data/ext/gocr/tga.c +87 -0
  46. data/ext/gocr/tga.h +6 -0
  47. data/ext/gocr/unicode.c +1318 -0
  48. data/ext/gocr/unicode.h +62 -0
  49. data/ext/gocr/unicode_defs.h +1245 -0
  50. data/ext/gocr/version.h +2 -0
  51. data/gocr-ruby.gemspec +28 -0
  52. data/image.png +0 -0
  53. data/lib/gocr.rb +6 -0
  54. data/lib/gocr/image.rb +8 -0
  55. data/lib/gocr/version.rb +3 -0
  56. metadata +156 -0
@@ -0,0 +1,37 @@
1
+ /*
2
+ This is an Optical-Character-Recognition program
3
+ Copyright (C) 2000 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL-address */
20
+
21
+ #ifndef OUTPUT_H
22
+ #define OUTPUT_H
23
+
24
+ #include <stdlib.h>
25
+ #include <stdio.h>
26
+ #include "pnm.h"
27
+ #include "gocr.h"
28
+ #include "list.h"
29
+
30
+ void out_b(struct box *px, pix *b, int x0, int y0, int dx, int dy, int cs );
31
+ void out_x(struct box *px);
32
+ void out_x2(struct box *box1,struct box *box2);
33
+ int output_list(job_t *job);
34
+ int debug_img(char *fname, struct job_s *job, int opt);
35
+
36
+
37
+ #endif
@@ -0,0 +1,153 @@
1
+ /*
2
+ This is an Optical-Character-Recognition program
3
+ Copyright (C) 1999 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL-address
20
+ */
21
+ /* plan: use popen("ppm2pcx -packed ...","w"); for writing pcx */
22
+
23
+ #include <stdio.h>
24
+ #include <stdlib.h>
25
+ /* #include <assert.h> */
26
+
27
+ #include "pcx.h"
28
+
29
+ typedef unsigned char byte;
30
+
31
+ #define ERR(x) { fprintf(stderr,"ERROR "__FILE__" L%d: " x "\n",__LINE__);exit(1);}
32
+
33
/* Set to 1 by read_b() as soon as the stream reports EOF or an I/O error. */
int err;

/*
 * Fetch the next byte from the PCX stream f1.
 *
 * When the stream is at end-of-file or in an error state after the read,
 * the global err flag is raised; the value returned in that case is
 * whatever fgetc() produced, truncated to a byte.
 */
unsigned char read_b(FILE *f1){
  int ch = fgetc(f1);

  if (feof(f1) || ferror(f1)) {
    err = 1;
  }
  return (unsigned char)ch;
}
38
+
39
+ /* something here is wrong! */
40
+ void readpcx(char *name,pix *p,int vvv){ /* see pcx.format.txt */
41
+ int page,pages,nx,ny,i,j,b,x,y,bpl,bits,pal[256][3];
42
+ FILE *f1;
43
+ unsigned char *pic,h[128],bb,b1,b2,b3;
44
+ err=0;
45
+ for(i=0;i<256;i++)for(j=0;j<3;j++)pal[i][j]=i;
46
+ f1=fopen(name,"rb"); if(!f1) ERR("open");
47
+ if(fread(h,1,128,f1)!=128)ERR("read PCX header"); /* 128 Byte lesen -> h[] */
48
+ if(h[0]!=10)ERR("no ZSoft sign"); /* ZSoft sign */
49
+ if(h[2]> 1)ERR("unknown coding"); /* run length encoding */
50
+ bits = h[3]; /* 1 or 8 */
51
+ if(bits!=1 && bits!=8)ERR("only 1 or 8 bits supported");
52
+ nx = h[ 9]*256+h[ 8] - h[ 5]*256-h[ 4] +1; /* Xmax-Xmin */
53
+ ny = h[11]*256+h[10] - h[ 7]*256-h[ 6] +1; /* Ymax-Ymin */
54
+ pages=h[65]; bpl=h[66]+256*h[67]; /* bytes per line */
55
+ if(vvv)
56
+ fprintf(stderr,"# PCX version=%d bits=%d x=%d y=%d HRes=%d VRes=%d\n"
57
+ "# NPlanes=%d BytesPerLine=%d Palette=%s",
58
+ h[1],bits,nx,ny,h[12]+256*h[13],h[14]+256*h[15],
59
+ pages,bpl,((h[68]==1)?"1=color/bw":"2=gray"));
60
+ /* line1(NP=4): RRRRR...,GGGG....,BBBBB...,IIII...., line2: RRRR...,GGGG.... */
61
+ /* C4 EF = (C4&3F)*EF = EF EF EF EF */
62
+ fflush(stdout);
63
+ /* palette: for(i=0;i<16;i++) for(j=0;j<3;j++) h[16+3*i+j] */
64
+ if(pages>1)for(b=0;b<16;b++) for(i=0;i<16;i++)
65
+ for(j=0;j< 3;j++) pal[b*16+i][j]=h[16+3*i+j]>>2;
66
+ if(bits>7){
67
+ fseek(f1,-3*256,2); if(fread(pal,3,256,f1)!=256)ERR("read palette");
68
+ for(i=0;i<256;i++) for(j=0;j<3;j++) pal[i][j]>>=2;
69
+ }
70
+ fseek(f1,128,0);
71
+ pic=(unsigned char *)malloc( nx*ny );
72
+ if(pic==NULL)ERR("no memory"); /* no memory */
73
+ x=y=0;
74
+ do {
75
+ for(page=0;page<pages;page++) /* 192 == 0xc0 => b1=counter */
76
+ do {
77
+ b1=1; bb=read_b(f1); b2=bb; if(b1==192)fprintf(stderr,"?");
78
+ if((b2>=192) && (h[2]==1)){b1=b2&63;bb=read_b(f1);b2=bb;}
79
+ if(err){fprintf(stderr,"\nread error x=%d y=%d\n",x,y);x=nx;y=ny;break;}
80
+ for(b3=0;b3<b1;b3++)for(b=0;b<8;b+=bits,x++)if(x<nx){
81
+ bb=(b2>>(8-bits-b)) & ~((~0)<<bits);
82
+ if(bits==1 && bb==1) bb=240;
83
+ if(page==0) pic[x+nx*y] =(byte)bb;
84
+ else pic[x+nx*y]|=(byte)bb<<(page*bits);
85
+ }
86
+ } while(x<(9-bits)*bpl); x=0; y++;
87
+ } while(y<ny);
88
+ /* */
89
+ fclose(f1);
90
+ p->p=pic; p->x=nx; p->y=ny; p->bpp=1;
91
+ if(vvv)fprintf(stderr,"\n");
92
+ }
93
+
94
+ /* -----------------------------------------------------------------------
95
+ // write bmp 8bit palette no RLE
96
+ // bit 2+3 used for color coding (markers)
97
+ // replaced by writeppm (ppm.gz) and is obsolete now, removed later
98
+ */
99
+ void writebmp(char *name,pix p,int vvv){ /* see pcx.format.txt */
100
+ int nx,ny,i,y,rest[4]={0,0,0,0};
101
+ FILE *f1;
102
+ /*FIXME jb static*/static unsigned char *pic, h[54+4*256];
103
+ long fs,fo,hs,is; /* filesize, offset, headersize, imagesize */
104
+
105
+ nx=p.x; ny=p.y; pic=p.p;
106
+ if (nx&3) nx+=4-(nx&3); /* must be mod4 ? */
107
+ hs=40; /* bmi headersize fix */
108
+ is=nx*ny; /* imagesize */
109
+ fo=14+hs+4*256;
110
+ fs=fo+is;
111
+ for(i=0;i<54;i++){ h[i]=0; }
112
+ /* BITMAPFILEHEADER */
113
+ h[ 0]='B'; h[ 1]='M'; /* type of file BMP */
114
+ h[ 2]= fs &255; h[ 3]=(fs>> 8)&255;
115
+ h[ 4]=(fs>>16)&255; h[ 5]=(fs>>24)&255; /* size of file */
116
+ h[10]= fo &255; h[11]=(fo>> 8)&255;
117
+ h[12]=(fo>>16)&255; h[13]=(fo>>24)&255; /* offset to image data */
118
+ /* BITMAPINFO (BITMAPCOREHEADER not used here) */
119
+ /* 14 - HEADER */
120
+ h[14]= hs &255; h[15]=(hs>> 8)&255;
121
+ h[16]=(hs>>16)&255; h[17]=(hs>>24)&255; /* bmi-header size */
122
+ h[18]= nx &255; h[19]=(nx>> 8)&255;
123
+ h[20]=(0l>>16)&255; h[21]=(0l>>24)&255; /* WIDTH/pixel */
124
+ h[22]= ny &255; h[23]=(ny>> 8)&255;
125
+ h[24]=(0l>>16)&255; h[25]=(0l>>24)&255; /* HIGH/pixel */
126
+ h[26]=1; /* planes */
127
+ h[28]=8; /* bits/pixel 1,4,8,24 */
128
+ h[30]=0; /* compression */
129
+ h[34]= is &255; h[35]=(is>> 8)&255;
130
+ h[36]=(is>>16)&255; h[37]=(is>>24)&255; /* sizeImage (can be 0 if ~RLE) */
131
+ h[38]=0;h[39]=1; /* ca 100dpi, x/meter */
132
+ h[42]=0;h[43]=1; /* y/meter */
133
+ h[46]=0;h[47]=1; /* colorused (0=maximum) */
134
+ h[50]=0;h[51]=1; /* colorimportand (0=all) */
135
+ /* 54 - endofheader */
136
+ for(i=0;i<256;i++){
137
+ h[54+4*i+0]=((~((i & 2)*64)) & (i & (128+64)))|63;
138
+ h[54+4*i+1]=((~((i & 2)*64)) & (~((i & 4)*32)) & (i & (128+64)))|63;
139
+ h[54+4*i+2]=( ((i & 2)* 8) | ((~((i & 4)*32)) & (i & (128+64)))|63);
140
+ } /* blue-green-red */
141
+ f1=fopen(name,"wb"); if(!f1) fprintf(stderr," error opening file\n");
142
+ if(!f1)ERR("open"); /* open-error */
143
+ if(fwrite(h,1,54+4*256,f1)!=54+4*256)ERR("write head");
144
+ if(vvv) fprintf(stderr,"# write BMP x=%d y=%d\n",nx,ny);
145
+ for(y=ny-1;y>=0;y--){
146
+ if(((int)fwrite(pic+p.x*y,1,p.x,f1))!=p.x)ERR("write");
147
+ if(nx>p.x)
148
+ if(((int)fwrite(rest,1,nx-p.x,f1))!=nx-p.x)ERR("write");
149
+ }
150
+ fclose(f1);
151
+ }
152
+
153
+ /* ---------------------------------------------------------------------- */
@@ -0,0 +1,9 @@
1
+
2
+ #include "pnm.h"
3
+
4
+ void readpcx(char *name,pix *p,int vvv);
5
+
6
+ /* write 8bit palette no RLE, ToDo: obsolete? */
7
+ void writebmp(char *name,pix p,int vvv);
8
+
9
+ /* ------------------------------------------------------------------------ */
@@ -0,0 +1,3259 @@
1
+ /*
2
+ This is an Optical-Character-Recognition program
3
+ Copyright (C) 2000-2012 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL-address
20
+
21
+ sometimes I have written comments in german language, sorry for that
22
+
23
+ - look for ??? for preliminary code
24
+ - space: avX=22 11-13 (empirical estimated)
25
+ avX=16 5-7
26
+ avX= 7 5-6
27
+
28
+ ToDo: - add filter (r/s mismatch) g300c1
29
+ - better get_line2 function (problems on high resolution)
30
+ - write parallelizable code!
31
+ - learnmode (optimize filter)
32
+ - use ispell for final control or if unsure
33
+ - better line scanning (if not even)
34
+ - step 5: same chars differ? => expert mode
35
+ - chars dx>dy and above 50% hor-crossing > 4 is char-group ?
36
+ - detect color of chars and background
37
+ - better word space calculation (look at the examples)
38
+ (distance: left-left, middle-middle, left-right, thickness of e *0.75)
39
+
40
+ GLOBAL DATA (mostly structures)
41
+ - pix : image - one byte per pixel bits0-2=working
42
+ - lines : rows of the text (points to pix)
43
+ - box : list of bounding box for character
44
+ - obj : objects (lines, splines, etc. building a character)
45
+ */
46
+
47
+
48
+ #include <stdlib.h>
49
+ #include <stdio.h>
50
+ #include <assert.h>
51
+ #include <string.h>
52
+ #include <ctype.h>
53
+ #include "config.h"
54
+ #ifdef HAVE_WCHAR_H
55
+ #include <wchar.h>
56
+ #endif
57
+
58
+ #include "amiga.h"
59
+ #include "list.h"
60
+ #include "pgm2asc.h"
61
+ // #include "pcx.h" /* needed for writebmp (removed later) */
62
+ /* ocr1 is the test-engine - remember: this is development version */
63
+ #include "ocr1.h"
64
+ /* first engine */
65
+ #include "ocr0.h"
66
+ #include "otsu.h"
67
+ #include "barcode.h"
68
+ #include "progress.h"
69
+ #include "unicode_defs.h" /* UNKNOWN + PICTURES + ... */
70
+
71
+ #include "gocr.h"
72
+
73
+ /* wew: will be exceeded by capitals at 1200dpi */
74
+ #define MaxBox (100*200) // largest possible letter (buffersize)
75
+ #define MAX(a,b) ((a) >= (b) ? (a) : (b))
76
+
77
+ /* if the system does not know about wchar.h, define functions here */
78
+ #ifndef HAVE_WCHAR_H
79
+ /* typedef unsigned wchar_t; */
80
+ /* Find the first occurrence of WC in WCS. */
81
/*
 * Fallback for systems without <wchar.h>: find the first occurrence of
 * wc in the zero-terminated wide string wcs.
 *
 * Returns a pointer to the matching element or NULL when wc does not
 * occur.  As with the standard function, the terminator counts as part
 * of the string, so searching for 0 yields a pointer to the terminator
 * (the previous fallback returned NULL in that case).
 */
wchar_t *wcschr (wchar_t *wcs, wchar_t wc) {
  for (;; wcs++) {
    if (*wcs == wc) return wcs;   /* checked first so that wc==0 matches */
    if (*wcs == 0) return NULL;
  }
}
84
/*
 * Fallback for systems without <wchar.h>: copy the zero-terminated wide
 * string src, terminator included, to dest and return dest.
 */
wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) {
  wchar_t *out = dest;

  while ((*out = *src) != 0) {
    out++;
    src++;
  }
  return dest;
}
87
/*
 * Fallback for systems without <wchar.h>: number of wide characters in s
 * before the terminating zero.
 */
size_t wcslen (const wchar_t *s){
  const wchar_t *end = s;

  while (*end != 0) {
    end++;
  }
  return (size_t)(end - s);
}
90
+ #endif
91
+ #ifndef HAVE_WCSDUP
92
/*
 * Replacement for the GNU extension wcsdup(): return a malloc()ed copy of
 * the wide string WS, or NULL when no memory is available.  The caller
 * releases the copy with free().
 */
wchar_t * wcsdup (const wchar_t *WS) {
  size_t bytes = (wcslen(WS)+1)*sizeof(wchar_t);
  wchar_t *dup = (wchar_t *) malloc(bytes);

  if (dup != NULL) {
    wcscpy(dup, WS);
  }
  return dup;
}
99
+ #endif
100
+
101
+ // ------------------------ feature extraction -----------------
102
+ // -------------------------------------------------------------
103
+ // detect maxima of line overlaps (return in %) and line coordinates
104
+ // this is for future use
105
+ #define HOR 1 // horizontal
106
+ #define VER 2 // vertical
107
+ #define RIS 3 // rising=steigend
108
+ #define FAL 4 // falling=fallend
109
+
110
+ /* exchange two variables */
111
/* Exchange the two integers that a and b point to. */
static void swap(int *a, int *b) {
  int old_b = *b;

  *b = *a;
  *a = old_b;
}
116
+
117
+ // calculate the overlapping of the line (0-1) with black points
118
+ // by recursive bisection
119
+ // line: y=dy/dx*x+b, implicit form: d=F(x,y)=dy*x-dx*y+b*dx=0
120
+ // incremental y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y))
121
+ // ret & 1 => inverse pixel!
122
+ // d=2*F(x,y) integer numbers
123
+ int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
124
+ int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,
125
+ *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
126
+ dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new)
127
+ dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new)
128
+ // rotate coordinate system if dy>dx
129
+ /*bbg: can be faster if instead of pointers we use the variables and swaps? */
130
+ /*js: Do not know, I am happy that the current code is working and is small */
131
+ if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1; }
132
+ else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1; }
133
+ if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
134
+ d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
135
+ x=x0; y=y0; r0=r1=0; /* dd=tolerance (store max drift) */
136
+ while( (*px)<=(*px1) ){
137
+ if( ((getpixel(p,x,y)<cs)?1:0)^(ret&1) ) r0++; else r1++;
138
+ (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
139
+ }
140
+ return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
141
+ }
142
+
143
+ // this function should detect whether a direct connection between points
144
+ // exists or not, not finally implemented
145
+ // ret & 1 => inverse pixel!
146
+ // d=2*F(x,y) integer numbers, ideal line: ,I pixel: I@
147
+ // ..@ @@@ .@. ...,@2@. +1..+3 floodfill around line ???
148
+ // ..@ .@@ .@. ...,.@@@ +2..+4 <= that's not implemented yet
149
+ // ..@ ..@ .@. ...,.@@@ +2..+4
150
+ // @.@ @.. .@. ...,@@@. +1..+3
151
+ // @.@ @@. .@. ...I@@@. 0..+3
152
+ // @@@ @@@ .@. ..@1@@.. 0..+2
153
+ // 90% 0% 100% 90% r1-r2
154
+ // I am not satisfied with it
155
+ int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
156
+ int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,q,ddy,rx,ry,
157
+ *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
158
+ dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new)
159
+ dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new)
160
+ // rotate coordinate system if dy>dx
161
+ if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1;rx=1;ry=0; }
162
+ else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1;rx=0;ry=1; }
163
+ if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
164
+ d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
165
+ x=x0; y=y0; r0=r1=0; ddy=3; // tolerance = bit 1 + bit 0 = left+right
166
+ // int t=(*pdx)/16,tl,tr; // tolerance, left-,right delimiter
167
+ while( (*px)<=(*px1) ){ // not finaly implemented
168
+ q=((getpixel(p,x,y)<cs)?1:0)^(ret&1);
169
+ if ( !q ){ // tolerance one pixel perpenticular to the line
170
+ // what about 2 or more pixels tolerance???
171
+ ddy&=(~1)|(((getpixel(p,x+ry,y+rx)<cs)?1:0)^(ret&1));
172
+ ddy&=(~2)|(((getpixel(p,x-ry,y-rx)<cs)?1:0)^(ret&1))*2;
173
+ } else ddy=3;
174
+ if( ddy ) r0++; else r1++;
175
+ (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
176
+ }
177
+ return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
178
+ }
179
+
180
+ /* Look for dots in the rectangular region x0 <= x <= x1 and y0 <= y
181
+ <= y1 in pixmap p. The two low order bits in mask indicate the color
182
+ of dots to look for: If mask==1 then look for black dots (where a
183
+ pixel value less than cs is considered black). If mask==2 then look
184
+ for white dots. If mask==3 then look for both black and white dots.
185
+ If the dots are found, the corresponding bits are set in the returned
186
+ value. Heavily used by the engine ocr0*.cc */
187
+ char get_bw(int x0, int x1, int y0, int y1, pix * p, int cs, int mask) {
188
+ char rc = 0; // later with error < 2% (1 dot)
189
+ int x, y;
190
+
191
+ if (x0 < 0) x0 = 0;
192
+ if (x1 >= p->x) x1 = p->x - 1;
193
+ if (y0 < 0) y0 = 0;
194
+ if (y1 >= p->y) y1 = p->y - 1;
195
+
196
+ for ( y = y0; y <= y1; y++)
197
+ for ( x = x0; x <= x1; x++) {
198
+ rc |= ((getpixel(p, x, y) < cs) ? 1 : 2); // break if rc==3
199
+ if ((rc & mask) == mask)
200
+ return mask; // break loop
201
+ }
202
+ return (rc & mask);
203
+ }
204
+
205
+ /* more general Mar2000 (x0,x1,y0,y1 instead of x0,y0,x1,y1! (history))
206
+ * look for black crossings throw a line from x0,y0 to x1,y1 and count them
207
+ * follow line and count crossings ([white]-black-transitions)
208
+ * ex: horizontal num_cross of 'm' would return 3
209
+ *
210
+ * fail for: .a... a-to-b counts no transitions, but there is
211
+ * ...#.
212
+ * ..#..
213
+ * .#..b
214
+ */
215
+ int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
216
+ int rc = 0, col = 0, k, x, y, i, d; // rc=crossings col=0=white
217
+ int dx = x1 - x0, dy = y1 - y0;
218
+
219
+ d = MAX(abs(dx), abs(dy));
220
+ for (i = 0, x = x0, y = y0; i <= d; i++) {
221
+ if (d) {
222
+ x = x0 + i * dx / d;
223
+ y = y0 + i * dy / d;
224
+ }
225
+ k = ((getpixel(p, x, y) < cs) ? 1 : 0); // 0=white 1=black
226
+ if (col == 0 && k == 1) // found a white-black transition
227
+ rc++;
228
+ col = k; // last color
229
+ }
230
+ return rc;
231
+ }
232
+
233
+ /* check if test matches pattern
234
+ * possible pattern: "a-zA-Z0-9+--\\" (x-y dont work for c>127)
235
+ * return: 0 means dont fit, 1 means found
236
+ * ToDo: wchar_t cc + matching UTF-8 pattern for nonASCII
237
+ */
238
+ int my_strchr( char *pattern, wchar_t cc ) {
239
+ char *s1;
240
+ if (pattern==(char *)NULL) return 0;
241
+
242
+ /* if (!(cc&0x80)) s1=strchr(pattern,(char)cc); else */
243
+ switch (cc) {
244
+ case '-': /* used as a special character */
245
+ s1=strstr(pattern,"--"); /* search string -- in pattern */
246
+ if (s1) return 1; break;
247
+ default:
248
+ s1=strstr(pattern,decode(cc, UTF8)); /* search string cc in pattern */
249
+ if (s1) return 1; /* cc simply matches */
250
+ /* single char not found, now check the ranges */
251
+ s1=pattern;
252
+ while (s1) {
253
+ s1=strchr(s1+1,'-'); /* look for next '-' */
254
+ if ((!s1) || (!s1[0]) || (!s1[1])) return 0; /* nothing found or end */
255
+ if (*(s1-1)=='-' || *(s1+1)=='-') continue; /* skip -- pattern */
256
+ if (*(s1-1)<=cc && *(s1+1)>=cc) return 1; /* within range */
257
+ }
258
+ }
259
+ return 0;
260
+ }
261
+
262
+ /* set alternate chars and its weight, called from the engine
263
+ if a char is recognized to (weight) percent
264
+ can be used for filtering (only numbers etc)
265
+ often usefull if Il1 are looking very similar
266
+ should this function stay in box.c ???
267
+ weight is between 0 and 100 in percent, 100 means absolutely sure
268
+ - not final, not time critical (js)
269
+ - replace it by a string-function setaobj(*b,"string",weight)
270
+ and let call setac the setas function
271
+ */
272
+
273
+ int setas(struct box *b, char *as, int weight){
274
+ job_t *job=OCR_JOB;
275
+ int i,j;
276
+ if (b->num_ac > NumAlt || b->num_ac<0) {
277
+ fprintf(stderr,"\nDBG: There is something wrong with setas()!");
278
+ b->num_ac=0;
279
+ }
280
+ if (as==NULL) {
281
+ fprintf(stderr,"\nDBG: setas(NULL) makes no sense!"); return 0; }
282
+ if (as[0]==0) {
283
+ fprintf(stderr,"\nDBG: setas(\"\") makes no sense!"
284
+ " x= %d %d", b->x0, b->y0);
285
+ // out_x(b);
286
+ return 0;
287
+ }
288
+
289
+ /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
290
+ if (job->cfg.cfilter) {
291
+ /* do not accept chars which are not in the cfilter string */
292
+ if ( as[0]>0 && as[1]==0 )
293
+ if ( !my_strchr(job->cfg.cfilter,as[0]) ) return 0;
294
+ }
295
+ #if 0 /* obsolete, done in setac */
296
+ /* not sure that this is the right place, but where else? */
297
+ if ( as[0]>0 && as[1]==0 )
298
+ if (b->modifier != SPACE && b->modifier != 0) {
299
+ wchar_t newac;
300
+ newac = compose(as[0], b->modifier);
301
+ as = (char *)decode(newac, UTF8); /* was (const char *) */
302
+ if (newac == as[0]) { /* nothing composed */
303
+ fprintf(stderr, "\nDBG setas compose was useless %d %d",b->x0,b->y0);
304
+ // out_x(b);
305
+ }
306
+ }
307
+ #endif
308
+
309
+ /* only the first run gets the full weight */
310
+ weight=(100-job->tmp.n_run)*weight/100;
311
+
312
+ /* remove same entries from table */
313
+ for (i=0;i<b->num_ac;i++)
314
+ if (b->tas[i])
315
+ if (strcmp(as,b->tas[i])==0) break;
316
+ if (b->num_ac>0 && i<b->num_ac){
317
+ if (weight<=b->wac[i]) return 0; /* if found + less weight ignore it */
318
+ /* to insert the new weigth on the right place, we remove it first */
319
+ if (b->tas[i]) free(b->tas[i]);
320
+ for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
321
+ b->tac[j]=b->tac[j+1]; /* copy the char */
322
+ b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
323
+ b->wac[j]=b->wac[j+1]; /* copy the weight */
324
+ }
325
+ b->num_ac--; /* shrink table */
326
+ }
327
+ /* sorting and add it to the table */
328
+ for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
329
+ if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
330
+ for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
331
+ b->tac[j]=b->tac[j-1]; /* copy the char */
332
+ b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
333
+ b->wac[j]=b->wac[j-1]; /* copy the weight */
334
+ }
335
+ if (i<b->num_ac) { /* insert new entry */
336
+ b->tac[i]=0; /* insert the char=0 ... */
337
+ b->tas[i]=(char *)malloc(strlen(as)+1); /* ... string */
338
+ if (b->tas[i]) memcpy(b->tas[i],as,strlen(as)+1);
339
+ b->wac[i]=weight; /* ... and its weight */
340
+ }
341
+ if (i==0) b->c=b->tac[0]; /* char or 0 for string */
342
+ return 0;
343
+ }
344
+
345
+ /* ToDo: this function will be replaced by a call of setas() later */
346
+ int setac(struct box *b, wchar_t ac, int weight){
347
+ int i,j;
348
+ job_t *job=OCR_JOB;
349
+ if ((!b) || b->num_ac > NumAlt || b->num_ac<0) {
350
+ fprintf(stderr,"\nDBG: This is a bad call to setac()!");
351
+ if(b && (job->cfg.verbose & 6)) out_x(b);
352
+ b->num_ac=0;
353
+ }
354
+ if (ac==0 || ac==UNKNOWN) {
355
+ fprintf(stderr,"\nDBG: setac(0) makes no sense!");
356
+ return 0;
357
+ }
358
+ /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
359
+ if (job->cfg.cfilter) {
360
+ /* do not accept chars which are not in the cfilter string */
361
+ /* if ( ac>255 || !strchr(job->cfg.cfilter,(char)ac) ) return 0; */
362
+ if ( !my_strchr(job->cfg.cfilter,ac) ) return 0;
363
+ }
364
+ /* not sure that this is the right place, but where else? */
365
+ if (b->modifier != SPACE && b->modifier != 0) {
366
+ wchar_t newac;
367
+ newac = compose(ac, b->modifier);
368
+ if (newac == ac) { /* nothing composed */
369
+ if(job->cfg.verbose & 7)
370
+ fprintf(stderr, "\nDBG %s setac (%d,%d): compose was useless, wac=%d",
371
+ decode(ac,ASCII), b->x0, b->y0, weight);
372
+ /* if(job->cfg.verbose & 6) out_x(b); */
373
+ }
374
+ ac = newac;
375
+ }
376
+
377
+ /* only the first run gets the full weight */
378
+ weight=(100-job->tmp.n_run)*weight/100;
379
+
380
+ /* remove same entries from table */
381
+ for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) break;
382
+ if (b->num_ac>0 && i<b->num_ac){
383
+ if (weight<=b->wac[i]) return 0;
384
+ if (b->tas[i]) free(b->tas[i]);
385
+ for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
386
+ b->tac[j]=b->tac[j+1]; /* copy the char */
387
+ b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
388
+ b->wac[j]=b->wac[j+1]; /* copy the weight */
389
+ }
390
+ b->num_ac--; /* shrink table */
391
+ }
392
+ /* sorting it to the table */
393
+ for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
394
+ if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
395
+ for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
396
+ b->tac[j]=b->tac[j-1]; /* copy the char */
397
+ b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
398
+ b->wac[j]=b->wac[j-1]; /* copy the weight */
399
+ }
400
+ if (i<b->num_ac) { /* insert new entry */
401
+ b->tac[i]=ac; /* insert the char ... */
402
+ b->tas[j]=NULL; /* ... no string (?) */
403
+ b->wac[i]=weight; /* ... and its weight */
404
+ }
405
+ if (i==0) b->c=ac; /* store best result to b->c (will be obsolete) */
406
+
407
+ return 0;
408
+ }
409
+
410
+ /* test if ac in wac-table
411
+ usefull for contextcorrection and box-splitting
412
+ return 0 if not found
413
+ return wac if found (wac>0)
414
+ */
415
+ int testac(struct box *b, wchar_t ac){
416
+ int i;
417
+ if (b->num_ac > NumAlt || b->num_ac<0) {
418
+ fprintf(stderr,"\n#DEBUG: There is something wrong with testac()!");
419
+ b->num_ac=0;
420
+ }
421
+ /* search entries in table */
422
+ for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) return b->wac[i];
423
+ return 0;
424
+ }
425
+
426
+
427
+ /* look for edges: follow a line from x0,y0 to x1,y1, record the
428
+ * location of each transition, and return their number.
429
+ * ex: horizontal num_cross of 'm' would return 6
430
+ * remark: this function is not used, obsolete? ToDo: remove?
431
+ */
432
+ int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path) {
433
+ int rc = 0, prev, x, y, i, d, color; // rc=crossings col=0=white
434
+ int dx = x1 - x0, dy = y1 - y0;
435
+
436
+ d = MAX(abs(dx), abs(dy));
437
+ prev = getpixel(p, x0, y0) < cs; // 0=white 1=black
438
+ path->start = prev;
439
+ for (i = 1, x = x0, y = y0; i <= d; i++) {
440
+ if (d) {
441
+ x = x0 + i * dx / d;
442
+ y = y0 + i * dy / d;
443
+ }
444
+ color = getpixel(p, x, y) < cs; // 0=white 1=black
445
+ if (color != prev){
446
+ if (rc>=path->max){
447
+ int n=path->max*2+10;
448
+ path->x = (int *) xrealloc(path->x, n*sizeof(int));
449
+ path->y = (int *) xrealloc(path->y, n*sizeof(int));
450
+ path->max = n;
451
+ }
452
+ path->x[rc]=x;
453
+ path->y[rc]=y;
454
+ rc++;
455
+ }
456
+ prev = color;
457
+ }
458
+ path->num=rc;
459
+ return rc;
460
+ }
461
+
462
+ /* ToDo: only used in follow_path, which is obsolete, remove? */
463
/*
 * realloc() wrapper that never reports failure to the caller: when a
 * non-zero size cannot be satisfied, a message is written to stderr and
 * the process exits with status 1.
 */
void *xrealloc(void *ptr, size_t size){
  void *resized = realloc(ptr, size);

  if (resized != NULL || size == 0) {
    return resized;
  }
  fprintf(stderr, "insufficient memory");
  exit(1);
}
472
+
473
+ /*
474
+ * -------------------------------------------------------------
475
+ * mark edge-points
476
+ * - first move forward until b/w-edge
477
+ * - more than 2 pixel?
478
+ * - loop around
479
+ * - if forward pixel : go up, rotate right
480
+ * - if forward no pixel : rotate left
481
+ * - stop if found first 2 pixel in same order
482
+ * go_along_the_right_wall strategy is very similar and used otherwhere
483
+ * --------------------------------------------------------------
484
+ * turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border
485
+ * out: last-position
486
+ *
487
+ * could be used to extract more features:
488
+ * by counting stepps, dead-end streets ,xmax,ymax,ro-,ru-,lo-,lu-edges
489
+ *
490
+ * use this little animal to find features, I first was happy about it
491
+ * but now I prefer the loop() function
492
+ */
493
+
494
+ void turmite(pix *p, int *x, int *y,
495
+ int x0, int x1, int y0, int y1, int cs, int rw, int rb) {
496
+ int r;
497
+ if (outbounds(p, x0, y0)) // out of pixmap
498
+ return;
499
+ while (*x >= x0 && *y >= y0 && *x <= x1 && *y <= y1) {
500
+ r = ((getpixel(p, *x, *y) < cs) ? rb : rw); // select rule
501
+ switch (r) {
502
+ case UP: (*y)--; break;
503
+ case DO: (*y)++; break;
504
+ case RI: (*x)++; break;
505
+ case LE: (*x)--; break;
506
+ case ST: break;
507
+ default: assert(0);
508
+ }
509
+ if( r==ST ) break; /* leave the while-loop */
510
+ }
511
+ }
512
+
513
+ /* search a way from p0 to p1 without crossing pixels of type t
514
+ * only two directions, useful to test if there is a gap 's'
515
+ * labyrinth algorithm - do you know a faster way? */
516
+ int joined(pix *p, int x0, int y0, int x1, int y1, int cs){
517
+ int t,r,x,y,dx,dy,xa,ya,xb,yb;
518
+ x=x0;y=y0;dx=1;dy=0;
519
+ if(x1>x0){xa=x0;xb=x1;} else {xb=x0;xa=x1;}
520
+ if(y1>y0){ya=y0;yb=y1;} else {yb=y0;ya=y1;}
521
+ t=((getpixel(p,x,y)<cs)?1:0);
522
+ for(;;){
523
+ if( t==((getpixel(p,x+dy,y-dx)<cs)?1:0) // right free?
524
+ && x+dy>=xa && x+dy<=xb && y-dx>=ya && y-dx<=yb) // wall
525
+ { r=dy;dy=-dx;dx=r;x+=dx;y+=dy; } // rotate right and step forward
526
+ else { r=dx;dx=-dy;dy=r; } // rotate left
527
+ // fprintf(stderr," path xy %d-%d %d-%d %d %d %d %d\n",xa,xb,ya,yb,x,y,dx,dy);
528
+ if( x==x1 && y==y1 ) return 1;
529
+ if( x==x0 && y==y0 && dx==1) return 0;
530
+ }
531
+ // return 0; // endless loop ?
532
+ }
533
+
534
+ /* move from x,y to direction r until pixel of color col is found
535
+ * or maximum of l steps
536
+ * return the number of steps done */
537
+ int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
538
+ int i=0;
539
+ if(x>=0 && y>=0 && x<p->x && y<p->y){
540
+ switch (r) {
541
+ case UP:
542
+ for( ;i<l && y>=0;i++,y--)
543
+ if( (getpixel(p,x,y)<cs)^col )
544
+ break;
545
+ break;
546
+ case DO:
547
+ for( ;i<l && y<p->y;i++,y++)
548
+ if( (getpixel(p,x,y)<cs)^col )
549
+ break;
550
+ break;
551
+ case LE:
552
+ for( ;i<l && x>=0;i++,x--)
553
+ if( (getpixel(p,x,y)<cs)^col )
554
+ break;
555
+ break;
556
+ case RI:
557
+ for( ;i<l && x<p->x;i++,x++)
558
+ if( (getpixel(p,x,y)<cs)^col )
559
+ break;
560
+ break;
561
+ default:;
562
+ }
563
+ }
564
+ return i;
565
+ }
566
+
567
+ /* Given a point, frames a rectangle containing all points of the same
568
+ * color surrounding it, and mark these points.
569
+ * ToDo: obsolate and replaced by frame_vector
570
+ *
571
+ * looking for better algo: go horizontally and look for upper/lower non_marked_pixel/nopixel
572
+ * use lowest three bits for mark
573
+ * - recursive version removed! AmigaOS has no Stack-OVL-Event
574
+ * run around the chape using laby-robot
575
+ * bad changes can lead to endless loop!
576
+ * - this is not absolutely sure but mostly works well
577
+ * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
578
+ * mark - 3 bit marker, mark each valid pixel with it
579
+ */
580
+ int frame_nn(pix *p, int x, int y,
581
+ int *x0, int *x1, int *y0, int *y1, // enlarge frame
582
+ int cs, int mark,int diag){
583
+ #if 1 /* flood-fill to detect black objects, simple and faster? */
584
+ int rc = 0, dx, col, maxstack=0; static int overflow=0;
585
+ int bmax=1024, blen=0, *buf; /* buffer as replacement for recursion stack */
586
+
587
+ /* check bounds */
588
+ if (outbounds(p, x, y)) return 0;
589
+ /* check if already marked (with mark since v0.4) */
590
+ if ((marked(p,x,y)&mark)==mark) return 0;
591
+
592
+ col = ((getpixel(p, x, y) < cs) ? 0 : 1);
593
+ buf=(int *)malloc(bmax*sizeof(int)*2);
594
+ if (!buf) { fprintf(stderr,"malloc failed (frame_nn)\n");return 0;}
595
+ buf[0]=x;
596
+ buf[1]=y;
597
+ blen=1;
598
+
599
+ g_debug(fprintf(stderr,"\nframe_nn x=%4d y=%4d",x,y);)
600
+ for ( ; blen ; ) {
601
+ /* max stack depth is complexity of the object */
602
+ if (blen>maxstack) maxstack=blen;
603
+ blen--; /* reduce the stack */
604
+ x=buf[blen*2+0];
605
+ y=buf[blen*2+1];
606
+ if (y < *y0) *y0 = y;
607
+ if (y > *y1) *y1 = y;
608
+ /* first go to leftmost pixel */
609
+ for ( ; x>0 && (col == ((getpixel(p, x-1, y) < cs) ? 0 : 1)) ; x--);
610
+ if ((marked(p,x,y)&mark)==mark) continue; /* already scanned */
611
+ for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, left */
612
+ if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
613
+ && col != ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
614
+ && col == ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
615
+ && !((marked(p,x-1,y+dx)&mark)==mark)
616
+ ) {
617
+ if (blen+1>=bmax) { overflow|=1; continue; }
618
+ buf[blen*2+0]=x-1;
619
+ buf[blen*2+1]=y+dx;
620
+ blen++;
621
+ }
622
+ if (x < *x0) *x0 = x;
623
+ /* second go right, mark and get new starting points */
624
+ for ( ; x<p->x && (col == ((getpixel(p, x , y) < cs) ? 0 : 1)) ; x++) {
625
+ p->p[x + y * p->x] |= (mark & 7); rc++; /* mark pixel */
626
+ /* enlarge frame */
627
+ if (x > *x1) *x1 = x;
628
+ for (dx=-1;dx<2;dx+=2) /* look at upper and lower line */
629
+ if ( col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
630
+ && (
631
+ col != ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
632
+ || col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) )
633
+ && !((marked(p,x,y+dx)&mark)==mark) && y+dx<p->y && y+dx>=0
634
+ ) {
635
+ if (blen+1>=bmax) { overflow|=1; continue; }
636
+ buf[blen*2+0]=x;
637
+ buf[blen*2+1]=y+dx;
638
+ blen++;
639
+ }
640
+ }
641
+ for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, right */
642
+ if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
643
+ && col == ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
644
+ && col != ((getpixel(p, x , y ) < cs) ? 0 : 1)
645
+ && col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
646
+ && col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
647
+ && !((marked(p,x,y+dx)&mark)==mark)
648
+ ) {
649
+ if (blen+1>=bmax) { overflow|=1; continue; }
650
+ buf[blen*2+0]=x;
651
+ buf[blen*2+1]=y+dx;
652
+ blen++;
653
+ }
654
+ }
655
+
656
+ /* debug, ToDo: use info maxstack and pixels for image classification */
657
+ g_debug(fprintf(stderr," maxstack= %4d pixels= %6d",maxstack,rc);)
658
+ if (overflow==1){
659
+ overflow|=2;
660
+ fprintf(stderr,"# Warning: frame_nn stack oerflow\n");
661
+ }
662
+ free(buf);
663
+ #else /* old version, ToDo: improve it for tmp04/005*.pgm.gz */
664
+ int i, j, d, dx, ox, oy, od, nx, ny, rc = 0, rot = 0, x2 = x, y2 = y, ln;
665
+
666
+ static const int d0[8][2] = { { 0, -1} /* up */, {-1, -1},
667
+ {-1, 0} /* left */, {-1, 1},
668
+ { 0, 1} /* down */, { 1, 1},
669
+ { 1, 0} /* right */, { 1, -1}};
670
+
671
+ /* check bounds */
672
+ if (outbounds(p, x, y))
673
+ return 0;
674
+ /* check if already marked */
675
+ if ((marked(p,x,y)&mark)==mark)
676
+ return 0;
677
+
678
+ i = ((getpixel(p, x, y) < cs) ? 0 : 1);
679
+ rc = 0;
680
+
681
+ g_debug(fprintf(stderr," start frame:");)
682
+
683
+ for (ln = 0; ln < 2 && rot >= 0; ln++) { // repeat if right-loop
684
+ g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d - go to border\n",ln,diag,cs,x,y);)
685
+
686
+ od=d=(8+4*ln-diag)&7; // start robot looks up, right is a wall
687
+ // go to right (left) border
688
+ if (ln==1) {
689
+ x=x2; y=y2;
690
+ }
691
+ /* start on leftmost position */
692
+ for (dx = 1 - 2*ln; x + dx < p->x && x + dx >= 0 /* bounds */ &&
693
+ i == ((getpixel(p, x + dx, y) < cs) ? 0 : 1) /* color */;
694
+ x += dx);
695
+
696
+ g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d\n",ln,diag,cs,x,y);)
697
+
698
+ /* robot stores start-position */
699
+ ox = x; oy = y;
700
+ for (rot = 0; abs(rot) <= 64; ) { /* for sure max. 8 spirals */
701
+ /* leftmost position */
702
+ if (ln == 0 && x < x2) {
703
+ x2 = x; y2 = y;
704
+ }
705
+
706
+ g_debug(fprintf(stderr," x=%3d y=%3d d=%d i=%d p=%3d rc=%d\n",x,y,d,i,getpixel(p,x,y),rc);)
707
+
708
+ if ( abs(d0[d][1]) ) { /* mark left (right) pixels */
709
+ for (j = 0, dx = d0[d][1]; x + j >= 0 && x + j < p->x
710
+ && i == ((getpixel(p, x + j, y) < cs) ? 0 : 1); j += dx) {
711
+ if (!((marked(p, x + j, y)&mark)==mark))
712
+ rc++;
713
+ p->p[x + j + y * p->x] |= (mark & 7);
714
+ }
715
+ }
716
+ /* look to the front of robot */
717
+ nx = x + d0[d][0];
718
+ ny = y + d0[d][1];
719
+ /* if right is a wall */
720
+ if ( outbounds(p, nx, ny) || i != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
721
+ /* rotate left */
722
+ d=(d+2-diag) & 7; rot-=2-diag;
723
+ }
724
+ else { /* if no wall, go, turn back and rotate left */
725
+ x=nx; y=ny; d=(d+4+2-diag) & 7; rot+=2-diag+4;
726
+ /* enlarge frame */
727
+ if (x < *x0) *x0 = x;
728
+ if (x > *x1) *x1 = x;
729
+ if (y < *y0) *y0 = y;
730
+ if (y > *y1) *y1 = y;
731
+ }
732
+ if(x==ox && y==oy && d==od) break; // round trip finished
733
+ }
734
+ }
735
+ g_debug(fprintf(stderr," rot=%d\n",rot);)
736
+ #endif
737
+ return rc;
738
+ }
739
+
740
+ /* obsolete! replaced by vectors
741
+ * mark neighbouring pixel of same color, return number
742
+ * better with neighbours of same color (more general) ???
743
+ * parameters: (&~7)-pixmap, start-point, critical_value, mark
744
+ * recursion is removed */
745
+ int mark_nn(pix * p, int x, int y, int cs, int r) {
746
+ /* out of bounds or already marked? */
747
+ if (outbounds(p, x, y) || (marked(p, x, y)&r)==r)
748
+ return 0;
749
+ {
750
+ int x0, x1, y0, y1;
751
+ x0 = x1 = x;
752
+ y0 = y1 = y; // not used
753
+ return frame_nn(p, x, y, &x0, &x1, &y0, &y1, cs, r, OCR_JOB->tmp.n_run & 1);
754
+ // using same scheme
755
+ }
756
+ }
757
+
758
+ /* ToDo: finish to replace old frame by this new one
759
+ *
760
+ * @...........#@@@@@@@. # = marked as already scanned black pixels
761
+ * @........@@@@@@@@@@@# only left and right border
762
+ * .......#@@@@@@@@@@@@@ left side on even y
763
+ * ......@@@@@@@@#.@@@@# right side on odd y
764
+ * .....#@@@@@......#@@@ no border is marked twice
765
+ * ....@@@@@#......@@@#. works also for thin lines
766
+ * ...#@@@@........#@@@. - outer loop is stored as first
767
+ * ..@@@@#........@@@#.. - inner loop is stored as second
768
+ * .#@@@@........#@@@@.. 1st in an extra box (think on white chars)
769
+ * @@@@#.......@@@@#.... 2nd merge in an extra step
770
+ * #@@@@@....#@@@@@.....
771
+ * @@@@@@@@@@@@@@#......
772
+ * .#@@@@@@@@@@@@.......
773
+ *
774
+ * run around the shape using laby-robot
775
+ * - used for scanning boxes, look for horizontal b/w transitions
776
+ * with unmarked black pixels and call this routine
777
+ * - stop if crossing a marked box in same direction (left=up, right=down)
778
+ * box - char box, store frame_vectors and box
779
+ * x,y - starting point
780
+ * mark - 3 bit marker, mark each valid pixel with it
781
+ * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
782
+ * ds - start direction, 6=right of right border, 2=left of left border
783
+ * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
784
+ * -7=no border in direction ds
785
+ */
786
+ #if 0
787
+ #undef g_debug
788
+ #define g_debug(x) x
789
+ #endif
790
+ /* grep keywords: scan_vectors frame_vector */
791
+ int frame_vector(struct box *box1, int x, int y,
792
+ int cs, int mark, int diag, int ds) {
793
+ int i1, i2, i2o,
794
+ new_x=1, /* flag for storing the vector x,y */
795
+ steps=1, /* steps between stored vectors, speedup for big frames */
796
+ d, /* direction */
797
+ ox, oy, /* starting point */
798
+ nx, ny, mx, my, /* used for simplification */
799
+ /* ToDo: add periphery to box (german: Umfang?) */
800
+ rc = 1, /* return code, circumference, sum vector lengths */
801
+ rot = 0, /* memory for rotation, rot=8 means one full rotation */
802
+ vol = 0; /* volume inside frame, negative for white inside black */
803
+ pix *p=box1->p;
804
+
805
+ /* translate the 8 directions to (x,y) pairs,
806
+ * if only four directions are used, only every 2nd vector is accessed,
807
+ * +1 turn left, -1 turn right
808
+ */
809
+ static const int d0[8][2] =
810
+ { { 0, -1}, /* up */ {-1, -1}, /* up-le */
811
+ {-1, 0}, /* left */ {-1, 1}, /* do-le */
812
+ { 0, 1}, /* down */ { 1, 1}, /* do-ri */
813
+ { 1, 0}, /* right */ { 1, -1} }; /* up-ri */
814
+
815
+ /* check bounds */
816
+ if (outbounds(p, x, y))
817
+ return 0;
818
+
819
+ /* pixel color we are looking for, 0=black, 1=white */
820
+ d = ds;
821
+ i1 = ((getpixel(p, x, y ) < cs) ? 0 : 1);
822
+ i2 = ((getpixel(p, x + d0[d][0], y + d0[d][1]) < cs) ? 0 : 1);
823
+
824
+ g_debug(fprintf(stderr,"\nLEV2 frame_vector @ %3d %3d d%d %2d %2d"
825
+ " %d-%d pix=%3d mark=%d cs=%d",\
826
+ x,y,ds,d0[ds][0],d0[ds][1],i1,i2,getpixel(p,x,y),mark,cs);)
827
+
828
+ if (i1==i2){
829
+ fprintf(stderr,"ERROR frame_vector: no border\n");
830
+ return -7; /* no border detected */
831
+ }
832
+
833
+ /* initialize boxframe outside this function
834
+ box1->x0=box1->x1=x;
835
+ box1->y0=box1->y1=y;
836
+ */
837
+
838
+ /* initialize boxvector outside this function
839
+ box1->num_frames=0
840
+ num_frame_vectors[0]=0 ???
841
+ and store start value
842
+ */
843
+ if (box1->num_frames >= MaxNumFrames) return -2;
844
+ /* index to next (x,y) */
845
+ i2o=i2=( (box1->num_frames==0)?0:
846
+ box1->num_frame_vectors[ box1->num_frames ] );
847
+ #if 0 // obsolete v0.43
848
+ box1->frame_vector[i2][0]=x;
849
+ box1->frame_vector[i2][1]=y;
850
+ i2++;
851
+ box1->num_frame_vectors[ box1->num_frames ]=i2;
852
+ #endif
853
+ box1->num_frames++;
854
+
855
+ /* robot stores start-position */
856
+ ox = x; oy = y; /* look forward to white pixel */
857
+
858
+ for (;;) { /* stop if same marked pixel touched */
859
+
860
+ g_debug(fprintf(stderr,"\nLEV3: x= %3d %3d d= %d rot= %2d %3d",x,y,d,rot,i2);)
861
+
862
+ /* ToDo: store max. abs(rot) ??? for better recognition */
863
+ if (new_x) {
864
+ g_debug(fprintf(stderr,"\nLEV2: markB xy= %3d %3d ", x, y);)
865
+ p->p[x + y * p->x] |= (mark & 7); /* mark black pixel */
866
+ }
867
+
868
+ /* store a new vector or enlarge the predecessor */
869
+ if (new_x && (rc%steps)==0) { /* dont store everything on big chars */
870
+ if (i2>=MaxFrameVectors) {
871
+ box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
872
+ reduce_vectors(box1,1); /* simplify loop */
873
+ i2=box1->num_frame_vectors[ box1->num_frames-1 ];
874
+ /* enlarge steps on big chars getting speedup */
875
+ steps=(box1->y1-box1->y0+box1->x1-box1->x0)/32+1;
876
+ }
877
+ /* store frame-vector */
878
+ if (i2<MaxFrameVectors) {
879
+ box1->frame_vector[i2][0]=x;
880
+ box1->frame_vector[i2][1]=y;
881
+ /* test if older vector points to the same direction */
882
+ if (i2>1) {
883
+ /* get predecessor */
884
+ nx=box1->frame_vector[i2-1][0]-box1->frame_vector[i2-2][0];
885
+ ny=box1->frame_vector[i2-1][1]-box1->frame_vector[i2-2][1];
886
+ mx=x -box1->frame_vector[i2-1][0];
887
+ my=y -box1->frame_vector[i2-1][1];
888
+ /* same direction? */
889
+ if (nx*my-ny*mx==0 && nx*mx>=0 && ny*my>=0) {
890
+ /* simplify by removing predecessor */
891
+ i2--;
892
+ box1->frame_vector[i2][0]=x;
893
+ box1->frame_vector[i2][1]=y;
894
+ } /* do not simplify */
895
+ }
896
+ i2++;
897
+ box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
898
+ }
899
+ g_debug(fprintf(stderr," stored @ %3d steps= %d", i2-1, steps);)
900
+ }
901
+ new_x=0; /* work for new pixel (x,y) done */
902
+
903
+ /* check if round trip is finished */
904
+ if (x==ox && y==oy && abs(rot)>=8) break;
905
+
906
+ /* look to the front of robot (turtle or ant) */
907
+ nx = x + d0[d][0];
908
+ ny = y + d0[d][1];
909
+
910
+ /* next step, if right is a wall turn the turtle left */
911
+ if ( outbounds(p, nx, ny) || i1 != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
912
+ if (y==ny && nx>=0 && nx<p->x) { /* if inbound */
913
+ g_debug(fprintf(stderr,"\nLEV2: markW xy= %3d %3d ", nx, ny);)
914
+ p->p[nx + ny * p->x] |= (mark & 7); /* mark white pixel */
915
+ }
916
+ /* rotate left 90 or 45 degrees */
917
+ d=(d+2-diag) & 7; rot+=2-diag;
918
+ /* calculate volume inside frame */
919
+ switch (d+diag) {
920
+ case 2+2: vol-=x-1; break;
921
+ case 6+2: vol+=x; break;
922
+ }
923
+ }
924
+ else { /* if no wall, go forward and turn right (90 or 45 degrees) */
925
+ x=nx; y=ny;
926
+ /* turn back and rotate left */
927
+ d=(d+4+2-diag) & 7; rot+=2-diag-4;
928
+ rc++; /* counting steps, used for speedup */
929
+
930
+ /* enlarge frame */
931
+ if (x < box1->x0) box1->x0 = x;
932
+ if (x > box1->x1) box1->x1 = x;
933
+ if (y < box1->y0) box1->y0 = y;
934
+ if (y > box1->y1) box1->y1 = y;
935
+
936
+ new_x=1;
937
+ }
938
+ }
939
+
940
+ /* to distinguish inner and outer frames, store volume as +v or -v */
941
+ box1->frame_vol[ box1->num_frames-1 ] = vol;
942
+ box1->frame_per[ box1->num_frames-1 ] = rc-1;
943
+
944
+ /* dont count and store the first vector twice */
945
+ if (i2-i2o>1) {
946
+ i2--; rc--; box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
947
+ }
948
+ /* output break conditions */
949
+ g_debug(fprintf(stderr,"\nLEV2 o= %3d %3d x= %3d %3d r=%d v=%d",ox,oy,x,y,rot,vol);)
950
+ /* rc=1 for a single point, rc=2 for a two pixel sized point */
951
+ g_debug(fprintf(stderr," steps= %3d vectors= %3d",rc,i2);)
952
+ /* out_x(box1); ToDo: output only the first thousend */
953
+ return rc; /* return number of bordering pixels = periphery? */
954
+ }
955
+
956
+
957
+
958
+ /* clear lowest 3 (marked) bits (they are used for marking) */
959
+ void clr_bits(pix * p, int x0, int x1, int y0, int y1) {
960
+ int x, y;
961
+ for ( y=y0; y <= y1; y++)
962
+ for ( x=x0; x <= x1; x++)
963
+ p->p[x+y*p->x] &= ~7;
964
+ }
965
+
966
+ /* look for white holes surrounded by black points
967
+ * at the moment look for white point with black in all four directions
968
+ * - store position of hole in coordinates relativ to box!
969
+ * ToDo: count only holes with vol>10% ???
970
+ * ToDo: rewrite for frame vectors (faster, no malloc)
971
+ * holes are frames rotating left hand
972
+ * obsolete, do it with vectors
973
+ */
974
+ int num_hole(int x0, int x1, int y0, int y1, pix * p, int cs, holes_t *holes) {
975
+ int num_holes = 0, x, y, hole_size;
976
+ pix b; // temporary mini-page
977
+ int dx = x1 - x0 + 1, dy = y1 - y0 + 1;
978
+ unsigned char *buf; // 2nd copy of picture, for working
979
+
980
+ if (holes) holes->num=0;
981
+ if(dx<3 || dy<3) return 0;
982
+ b.p = buf = (unsigned char *) malloc( dx * dy );
983
+ if( !buf ){
984
+ fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_hole", dx*dy );
985
+ return 0;
986
+ }
987
+ if (copybox(p, x0, y0, dx, dy, &b, dx * dy))
988
+ { free(b.p); return -1;}
989
+
990
+ // printf(" num_hole(");
991
+ /* --- mark white-points connected with border */
992
+ for (x = 0; x < b.x; x++) {
993
+ if (getpixel(&b, x, 0) >= cs)
994
+ mark_nn(&b, x, 0, cs, AT);
995
+ if (getpixel(&b, x, b.y - 1) >= cs)
996
+ mark_nn(&b, x, b.y - 1, cs, AT);
997
+ }
998
+ for (y = 0; y < b.y; y++) {
999
+ if (getpixel(&b, 0, y) >= cs)
1000
+ mark_nn(&b, 0, y, cs, AT);
1001
+ if (getpixel(&b, b.x - 1, y) >= cs)
1002
+ mark_nn(&b, b.x - 1, y, cs, AT);
1003
+ }
1004
+
1005
+ g_debug(out_b(NULL,&b,0,0,b.x,b.y,cs);)
1006
+ // --- look for unmarked white points => hole
1007
+ for (x = 0; x < b.x; x++)
1008
+ for (y = 0; y < b.y; y++)
1009
+ if (!((marked(&b, x, y)&AT)==AT)) // unmarked
1010
+ if (getpixel(&b, x, y) >= cs) { // hole found
1011
+ #if 0
1012
+ hole_size=mark_nn(&b, x, y, cs, AT); /* old version */
1013
+ if (hole_size > 1 || dx * dy <= 40)
1014
+ num_holes++;
1015
+ #else
1016
+ { /* new version, for future store of hole characteristics */
1017
+ int x0, x1, y0, y1, i, j;
1018
+ x0 = x1 = x;
1019
+ y0 = y1 = y; // not used
1020
+ hole_size=frame_nn(&b, x, y, &x0, &x1, &y0, &y1, cs, AT, OCR_JOB->tmp.n_run & 1);
1021
+ // store hole for future use, num is initialized with 0
1022
+ if (hole_size > 1 || dx * dy <= 40){
1023
+ num_holes++;
1024
+ if (holes) {
1025
+ // sort in table
1026
+ for (i=0;i<holes->num && i<MAX_HOLES;i++)
1027
+ if (holes->hole[i].size < hole_size) break;
1028
+ for (j=MAX_HOLES-2;j>=i;j--)
1029
+ holes->hole[j+1]=holes->hole[j];
1030
+ if (i<MAX_HOLES) {
1031
+ // printf(" i=%d size=%d\n",i,hole_size);
1032
+ holes->hole[i].size=hole_size;
1033
+ holes->hole[i].x=x;
1034
+ holes->hole[i].y=y;
1035
+ holes->hole[i].x0=x0;
1036
+ holes->hole[i].y0=y0;
1037
+ holes->hole[i].x1=x1;
1038
+ holes->hole[i].y1=y1;
1039
+ }
1040
+ holes->num++;
1041
+ }
1042
+ }
1043
+ }
1044
+ #endif
1045
+ }
1046
+ free(b.p);
1047
+ // printf(")=%d",num_holes);
1048
+ return num_holes;
1049
+ }
1050
+
1051
+ /* count for black nonconnected objects --- used for i,auml,ouml,etc. */
1052
+ /* ToDo: obsolete, replaced by vectors and box.num_boxes */
1053
+ int num_obj(int x0, int x1, int y0, int y1, pix * p, int cs) {
1054
+ int x, y, rc = 0; // rc=num_obj
1055
+ unsigned char *buf; // 2nd copy of picture, for working
1056
+ pix b;
1057
+
1058
+ if(x1<x0 || y1<y0) return 0;
1059
+ b.p = buf = (unsigned char *) malloc( (x1-x0+1) * (y1-y0+1) );
1060
+ if( !buf ){
1061
+ fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_obj",(x1-x0+1)*(y1-y0+1) );
1062
+ return 0;
1063
+ }
1064
+ if (copybox(p, x0, y0, x1 - x0 + 1, y1 - y0 + 1, &b, (x1-x0+1) * (y1-y0+1)))
1065
+ { free(b.p); return -1; }
1066
+ // --- mark black-points connected with neighbours
1067
+ for (x = 0; x < b.x; x++)
1068
+ for (y = 0; y < b.y; y++)
1069
+ if (getpixel(&b, x, y) < cs)
1070
+ if (!((marked(&b, x, y)&AT)==AT)) {
1071
+ rc++;
1072
+ mark_nn(&b, x, y, cs, AT);
1073
+ }
1074
+ free(b.p);
1075
+ return rc;
1076
+ }
1077
+
1078
#if 0
// ----------------------------------------------------------------------
// first idea for making recognition based on probability
//  - start with a list of all possible chars
//  - call recognition_of_char(box *)
//  - remove chars from list which could clearly excluded
//  - reduce probability of chars which have wrong features
//  - font types list could also build
// at the moment it is only an idea, I should put it to the todo list
//
// The umlauts are written as hex escapes (\xe4 ...). The former spelling
// "\0xe4" is a NUL byte followed by the text "xe4" and would terminate
// the list right after the '.'.
char *list="0123456789,.\xe4\xf6\xfc"	// a-uml=228 o-uml=246 u-uml=252
           "abcdefghijklmnopqrstuvwxyz"
           "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
int  wert[100];              // per char: how often it was excluded
int  listlen=0,numrest=0;    // list length, number of chars not excluded
// initialize a new character list (for future)
void ini_list(){ int i;
    for(i=0;list[i]!=0 && i<100;i++) wert[i]=0;
    numrest=listlen=i; }
// exclude every char of filt from the list (for future)
void exclude(char *filt){ int i,j;
    for(j=0;filt[j]!=0 && j<100;j++)
    for(i=0;list[i]!=0 && i<100;i++)
    if( filt[j]==list[i] ) { if(!wert[i])numrest--; wert[i]++; } }
// get the result after all the work (for future):
// the single remaining char, or '_' if not exactly one char is left
char getresult(){ int i;
    if( numrest==1 )
    for(i=0;list[i]!=0 && i<100;i++) if(!wert[i]) return list[i];
    return '_';
 }
#endif
1109
+
1110
+ // look at the environment of the pixel too (contrast etc.)
1111
+ // detailed analysis only of diff pixels!
1112
+ //
1113
+ // 100% * "distance", 0 is ideal fit
1114
+ // = similarity of two chars for recognition of garbled (verstuemmelter) chars
1115
+ // weight of pixels with only one same neighbour set to 0
1116
+ // look at contours too! v0.2.4: B==H
1117
+ // changed for v0.41, Mar06
1118
+ int distance( pix *p1, struct box *box1,
1119
+ pix *p2, struct box *box2, int cs){
1120
+ int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2;
1121
+ x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
1122
+ dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);
1123
+ dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);
1124
+ if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) return 100;
1125
+ // compare relations to baseline and upper line
1126
+ if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
1127
+ if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
1128
+ // compare pixels
1129
+ for( y=0;y<dy;y++ )
1130
+ for( x=0;x<dx;x++ ) { // try global shift too ???
1131
+ v1 =((getpixel(p1,x1+x ,y1+y )<cs)?1:0); i1=8; // better gray?
1132
+ v2 =((getpixel(p2,x2+x ,y2+y )<cs)?1:0); i2=8; // better gray?
1133
+ if(v1==v2) { rgood+=8; continue; } // all things are right!
1134
+ // what about different pixel???
1135
+ // test overlap of 8 surounding pixels ??? bad if two nb. are bad
1136
+ v1=-1;
1137
+ for(i1=-1;i1<2;i1++)
1138
+ for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
1139
+ if( ((getpixel(p1,x1+x+i1*(1+dx/32),y1+y+i2*(1+dy/32))<cs)?1:0)
1140
+ !=((getpixel(p2,x2+x+i1*(1+dx/32),y2+y+i2*(1+dy/32))<cs)?1:0) ) v1++;
1141
+ }
1142
+ if (v1>0) rbad+=16*v1;
1143
+ else rbad++;
1144
+ }
1145
+ if(rgood+rbad) rc= (100*rbad+(rgood+rbad-1))/(rgood+rbad); else rc=99;
1146
+ if(rc<10 && OCR_JOB->cfg.verbose & 7){
1147
+ fprintf(stderr,"\n# distance rc=%d good=%d bad=%d",rc,rgood,rbad);
1148
+ // out_x(box1);out_x(box2);
1149
+ }
1150
+ return rc;
1151
+ }
1152
+
1153
+
1154
+
1155
+ // ============================= call OCR engine ================== ;)
1156
+ // nrun=0 from outside, nrun=1 from inside (allows modifications, oobsolete)
1157
+ wchar_t whatletter(struct box *box1, int cs, int nrun){
1158
+ wchar_t bc=UNKNOWN; // best letter
1159
+ wchar_t um=SPACE; // umlaut? '" => modifier
1160
+ pix *p=box1->p; // whole image
1161
+ int x,y,dots,xa,ya,x0,x1,y0,y1,dx,dy,i;
1162
+ pix b; // box
1163
+ struct box bbuf=*box1; // restore after modifikation!
1164
+
1165
+ if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1166
+ bc=box1->tac[0];
1167
+ }
1168
+ // if (bc!=UNKNOWN) return bc;
1169
+ // if whatletter() called again, only unknown chars are processed
1170
+ // bad for splitting!
1171
+
1172
+ // store box data, which can be modified for modified chars in 2nd run
1173
+ bbuf.x0=box1->x0; bbuf.y0=box1->y0;
1174
+ bbuf.x1=box1->x1; bbuf.y1=box1->y1;
1175
+
1176
+ xa=box1->x; ya=box1->y;
1177
+ x0=box1->x0; y0=box1->y0;
1178
+ x1=box1->x1; y1=box1->y1;
1179
+ // int vol=(y1-y0+1)*(x1-x0+1); // volume
1180
+ // crossed l-m , divided chars
1181
+ while( get_bw(x0,x1,y0,y0,p,cs,1)!=1 && y0+1<y1) y0++;
1182
+ while( get_bw(x0,x1,y1,y1,p,cs,1)!=1 && y0+1<y1) y1--;
1183
+ dx=x1-x0+1;
1184
+ dy=y1-y0+1; // size
1185
+
1186
+ // better to proof the white frame too!!! ????
1187
+ // --- test for german umlaut and points above, not robust enough???
1188
+ // if three chars are connected i-dots (ari) sometimes were not detected
1189
+ // - therefore after division a test could be useful
1190
+ // modify y0 only in second run!?
1191
+ // we need it here to have the right copybox
1192
+ if (um==SPACE && dy>5 && box1->num_boxes>1)
1193
+ testumlaut(box1,cs,2,&um); /* set box1->modifier + new y0 */
1194
+
1195
+ dots=box1->dots;
1196
+ y0 =box1->y0; // dots==2 => y0 below double dots
1197
+ dy =y1-y0+1;
1198
+
1199
+ // move upper and lower border (for divided letters)
1200
+ while( get_bw(x0,x1,y0,y0,p,cs,1)==0 && y0+1<y1) y0++;
1201
+ while( get_bw(x0,x1,y1,y1,p,cs,1)==0 && y0+1<y1) y1--;
1202
+ while( get_bw(x0,x0,y0,y1,p,cs,1)==0 && x0+1<x1) x0++;
1203
+ while( get_bw(x1,x1,y0,y1,p,cs,1)==0 && x0+1<x1) x1--;
1204
+ dx=x1-x0+1;
1205
+ dy=y1-y0+1; // size
1206
+ box1->x0=x0; box1->y0=y0; // set reduced frame
1207
+ box1->x1=x1; box1->y1=y1;
1208
+
1209
+ // set good startpoint (probably bad from division)?
1210
+ if( xa<x0 || xa>x1 || ya<y0 || ya>y1
1211
+ || getpixel(p,xa,ya)>=cs /* || 2*ya<y0+y1 */ || dots>0 ){
1212
+ // subfunction? also called after division of two glued chars?
1213
+ for(y=y1;y>=y0;y--) // low to high (not i-dot)
1214
+ for(x=(x0+x1)/2,i=0;x>=x0 && x<=x1;i++,x+=((2*i&2)-1)*i) /* is that ok? */
1215
+ if (getpixel(p,x,y)<cs && (getpixel(p,x+1,y)<cs
1216
+ || getpixel(p,x,y+1)<cs)){ xa=x;ya=y;y=-1;break; }
1217
+ /* should box1->x,y be set? */
1218
+ }
1219
+
1220
+ // ----- create char-only-box -------------------------------------
1221
+ // ToDo: this will be obsolete if vectors are used only
1222
+ if(dx<1 || dy<1) return bc; /* should not happen */
1223
+ b.p = (unsigned char *) malloc( dx * dy );
1224
+ if (!b.p) fprintf(stderr,"Warning: malloc failed L%d\n",__LINE__);
1225
+ if( copybox(p,x0,y0,dx,dy,&b,dx*dy) )
1226
+ { free(b.p); return bc; }
1227
+ // clr_bits(&b,0,b.x-1,0,b.y-1);
1228
+ // ------ use diagonal too (only 2nd run?)
1229
+ /* following code failes on ! and ? obsolete if vectors are used
1230
+ ToDo:
1231
+ - mark pixels neighoured to pixels outside and remove them from &b
1232
+ v0.40
1233
+ will be replaced by list of edge vectors
1234
+ - mark accents, dots and remove them from &b
1235
+ */
1236
+ #if 1 /* becomes obsolate by vector code */
1237
+ if (y0>0) // mark upper overlap
1238
+ for ( x=x0; x<=x1; x++) {
1239
+ if (getpixel(p,x,y0-1)<cs
1240
+ && getpixel(p,x,y0 )<cs && (marked(&b,x-x0,0)&1)!=1)
1241
+ mark_nn(&b,x-x0,0,cs,1);
1242
+ }
1243
+ if (x0>0) // mark left overlap
1244
+ for ( y=y0; y<=y1; y++) {
1245
+ if (getpixel(p,x0-1,y)<cs
1246
+ && getpixel(p,x0 ,y)<cs && (marked(&b,0,y-y0 )&1)!=1)
1247
+ mark_nn(&b,0,y-y0,cs,1);
1248
+ }
1249
+ if (x1<p->x-1) // mark right overlap
1250
+ for ( y=y0; y<=y1; y++) {
1251
+ if (getpixel(p,x1+1,y)<cs
1252
+ && getpixel(p,x1 ,y)<cs && (marked(&b,x1-x0,y-y0)&1)!=1)
1253
+ mark_nn(&b,x1-x0,y-y0,cs,1);
1254
+ }
1255
+ mark_nn(&b,xa-x0,ya-y0,cs,2); // not glued chars
1256
+ for(x=0;x<b.x;x++)
1257
+ for(y=0;y<b.y;y++){
1258
+ if ( (marked(&b,x,y )&3)==1 && getpixel(&b,x,y )<cs )
1259
+ b.p[x+y*b.x] = 255&~7; /* reset pixel */
1260
+ }
1261
+ #endif
1262
+
1263
+ // if (bc == UNKNOWN) // cause split to fail
1264
+ bc=ocr0(box1,&b,cs);
1265
+
1266
+ /* ToDo: try to change pixels near cs?? or melt? */
1267
+ if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1268
+ bc=box1->tac[0];
1269
+ }
1270
+
1271
+ if (um!=0 && um!=SPACE && bc<127) { /* ToDo: is that obsolete now? */
1272
+ wchar_t newbc;
1273
+ newbc = compose(bc, um );
1274
+ if (newbc == bc) { /* nothing composed */
1275
+ if(OCR_JOB->cfg.verbose & 7)
1276
+ fprintf(stderr, "\nDBG whatletter: compose(%s) was useless (%d,%d)",
1277
+ decode(bc,ASCII), box1->x0, box1->y0);
1278
+ // if(OCR_JOB->cfg.verbose & 6) out_x(box1);
1279
+ }
1280
+ bc = newbc;
1281
+ }
1282
+ // restore modified boxes
1283
+ box1->x0=bbuf.x0; box1->y0=bbuf.y0;
1284
+ box1->x1=bbuf.x1; box1->y1=bbuf.y1;
1285
+ // if (box1->c==UNKNOWN) out_b(box1,&b,0,0,dx,dy,cs); // test
1286
+
1287
+ free(b.p);
1288
+ return bc;
1289
+ }
1290
+
1291
+ /*
1292
+ ** creates a list of boxes/frames around objects detected
1293
+ ** on the pixmap p for further work
1294
+ ** returns number of boxes created.
1295
+ ** - by the way: get average X, Y (avX=sumX/numC,..)
1296
+ */
1297
+ int scan_boxes( job_t *job, pix *p ){
1298
+ int x, y, nx, cs, rc, ds;
1299
+ struct box *box3;
1300
+ // job_t *job=OCR_JOB; /* fixme */
1301
+
1302
+ if (job->cfg.verbose)
1303
+ fprintf(stderr,"# scan_boxes");
1304
+
1305
+ cs = job->cfg.cs;
1306
+ job->res.sumX = job->res.sumY = job->res.numC = 0;
1307
+
1308
+ /* clear the lowest bits of each pixel, later used as "scanned"-marker */
1309
+ clr_bits( p, 0, p->x - 1, 0, p->y - 1);
1310
+
1311
+ for (y=0; y < p->y; y++)
1312
+ for (x=0; x < p->x; x++)
1313
+ for (ds=2; ds<7; ds+=4) { // NO - dust of size 1 is not removed !!!
1314
+ nx=x+((ds==2)?-1:+1);
1315
+ if (nx<0 || nx>=p->x) continue; /* out of image, ex: recframe */
1316
+ if ( getpixel(p, x,y)>=cs || getpixel(p,nx,y)< cs) // b/w transition?
1317
+ continue;
1318
+ if ((marked(p, x,y) & 1)&&(marked(p, nx, y) & 1))
1319
+ continue;
1320
+ /* check (and mark) only horizontal b/w transitions */
1321
+ // --- insert new box in list
1322
+ box3 = (struct box *)malloc_box(NULL);
1323
+ box3->x0=box3->x1=box3->x=x;
1324
+ box3->y0=box3->y1=box3->y=y;
1325
+ box3->num_frames=0;
1326
+ box3->dots=0;
1327
+ box3->num_boxes=1;
1328
+ box3->num_subboxes=0;
1329
+ box3->modifier='\0';
1330
+ box3->num=job->res.numC;
1331
+ box3->line=0; // not used here
1332
+ box3->m1=0; box3->m2=0; box3->m3=0; box3->m4=0;
1333
+ box3->p=p;
1334
+ box3->num_ac=0; // for future use
1335
+
1336
+ /* frame, vectorize and mark only odd/even horizontal b/w transitions
1337
+ * args: box, x,y, cs, mark, diag={0,1}, ds={2,6}
1338
+ * ds - start direction, 6=right of right border, 2=left of left border
1339
+ * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
1340
+ * -7=no border in direction ds
1341
+ * ToDo: count errors and print out for debugging
1342
+ */
1343
+ rc=frame_vector(box3, x, y, cs, 1, 1, ds);
1344
+ g_debug(fprintf(stderr,"\n# ... scan xy= %3d %3d rc= %2d", x, y, rc);)
1345
+ if (rc<0) { free_box(box3); continue; }
1346
+ if (box3->num_frames && !box3->num_frame_vectors[0])
1347
+ fprintf(stderr,"\nERROR scan_boxes: no vector in frame (%d,%d)",x,y);
1348
+
1349
+ job->res.numC++;
1350
+ job->res.sumX += box3->x1 - box3->x0 + 1;
1351
+ job->res.sumY += box3->y1 - box3->y0 + 1;
1352
+
1353
+ box3->c=(((box3->y1-box3->y0+1)
1354
+ *(box3->x1-box3->x0+1)>=MaxBox)? PICTURE : UNKNOWN);
1355
+ list_app(&(job->res.boxlist), box3); // append to list
1356
+ // ToDo: debug
1357
+ // if (job->cfg.verbose && box3->y0==29) out_x(box3);
1358
+ }
1359
+ if(job->res.numC){
1360
+ if (job->cfg.verbose)
1361
+ fprintf(stderr," nC= %3d avD= %2d %2d\n",job->res.numC,
1362
+ (job->res.sumX+job->res.numC/2)/job->res.numC,
1363
+ (job->res.sumY+job->res.numC/2)/job->res.numC);
1364
+ }
1365
+ return job->res.numC;
1366
+ }
1367
+
1368
/* compare ints for sorting (qsort/bsearch comparator). Return -1, 0, or 1
   according to whether *vr < *vs, *vr == *vs, or *vr > *vs.
   The arguments are only read, so constness is kept. */
int
intcompare (const void *vr, const void *vs)
{
  const int r = *(const int *)vr;
  const int s = *(const int *)vs;

  /* no subtraction here: r-s could overflow */
  return (r > s) - (r < s);
}
1380
+
1381
+ /*
1382
+ * measure_pitch - detect monospaced font and measure the pitch
1383
+ * measure overall pitch for difficult lines,
1384
+ * after that measure pitch per line
1385
+ * dists arrays are limited to 1024 elements to reduce
1386
+ * cpu usage for qsort on images with extreme high number of objects
1387
+ * insert space if dist>=pitch in list_insert_spaces()
1388
+ * ToDo: ???
1389
+ * - min/max distance-matrix a-a,a-b,a-c,a-d ... etc; td,rd > ie,el,es
1390
+ * - OR measuring distance as min. pixel distance instead of box distance
1391
+ * especially useful for italic font!
1392
+ * - Kerning detection? minspace<=0 ???
1393
+ * - iterate minMono+maxMonoWidth and count fitting and misfitting pairs
1394
+ * Lit:
1395
+ * http://en.wikibooks.org/wiki/LaTeX/Formatting
1396
+ * #The_Space_between_Words_and_Sentences
1397
+ * \frenchspacing == no extra space after periods (word vs. sentences)
1398
+ * \sloppypar == some spaces between words may be to large
1399
+ * inter word space
1400
+ * http://en.wikipedia.org/wiki/Space_(punctuation)
1401
+ * Variable-width general-purpose space == 1/5-em to 1/3-em
1402
+ * http://en.wikipedia.org/wiki/Em_(typography)
1403
+ * em = absolute maximum high,
1404
+ * median cap height=0.70em,
1405
+ * x-height=1ex=0.45..0.48..0.5em
1406
+ * http://en.wikipedia.org/wiki/En_(typography) = n-width=0.5em
1407
+ * http://pfaedit.sourceforge.net/glossary.html#overshoot
1408
+ * i: left + right side bearing (character specifique, may be negative: VA)
1409
+ * http://en.wikipedia.org/wiki/Typeface
1410
+ * http://en.wikipedia.org/wiki/Letter-spacing
1411
+ * http://en.wikipedia.org/wiki/Tracking_(typography) # Overlap VA
1412
+ * http://en.wikipedia.org/wiki/Kerning # Overlap VA AT Tx etc.
1413
+ * similar blank 2D-area between pairs of characters
1414
+ * Helvetica: ry=+30 AV=-80 units?
1415
+ *
1416
+ */
1417
+ void measure_pitch( job_t *job ){ /* word spacing */
1418
+ int numdists=0, spc=0, /* number of stored distances */
1419
+ pitch_p=2, pdist, pdists[1024], /* proportional distances */
1420
+ pitch_m=10, /* monospaced em width */
1421
+ monospaced=1, l1, char_width_min=1023, char_width_max=0,
1422
+ mono_em_min=0, // maximum monospace char width + 1 2010-09-25
1423
+ mono_em_max=2047, // minimum distance left side of two chars
1424
+ d1l, d1r; // left-left and right-right distance of 2 chars
1425
+ int d1, d2; // temporary vars, d1l + d1r sorted
1426
+ struct box *box2, *pre1=NULL, *pre2=NULL;
1427
+
1428
+ if(job->cfg.verbose){ fprintf(stderr,"# check for word pitch"); }
1429
+ for (l1=0; l1<job->res.lines.num; l1++)
1430
+ { /* 0 means all lines */
1431
+ if(job->cfg.verbose){ fprintf(stderr,"\n# line %2d\n# ...",l1); }
1432
+ numdists = 0; /* clear distance lists */
1433
+ monospaced=1; mono_em_min=0; mono_em_max=2047; // reset, 2010-09-28
1434
+ char_width_min=1023; char_width_max=0; // reset, 2010-09-28
1435
+ for_each_data(&(job->res.boxlist)) {
1436
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
1437
+ if (l1>0 && box2->line!=l1) continue; /* ignore other lines */
1438
+ /* ignore dots and pictures (min. font is 4x6) */
1439
+ if (box2->y1 - box2->y0 + 1 < 4 || box2->c==PICTURE) pre2=pre1=NULL;
1440
+ if (!pre1) { pre1=box2; continue; } /* we need a predecessor */
1441
+ /* use gap for proportional fonts */
1442
+ pdist = box2->x0 - pre1->x1 - 1; /* do not add 1, subtract 1 ! */
1443
+ if (pdist<0) { // new line
1444
+ pre2=NULL; pre1=box2; continue; }
1445
+ if ((box2->x1 - box2->x0 + 1)
1446
+ >2*(box2->y1 - box2->y0 + 1)) { // skip long object
1447
+ continue; }
1448
+ if ((pre1->x1 - pre1->x0 + 1)
1449
+ >2*(pre1->y1 - pre1->y0 + 1)) { // skip long object
1450
+ pre1=box2; continue; }
1451
+ // JS-2010-09 sample spaces20100910.jpg 7 chars, fix bad auto space
1452
+ if (char_width_min > box2->x1 - box2->x0 + 1)
1453
+ char_width_min = box2->x1 - box2->x0 + 1;
1454
+ if (box2->x1 - box2->x0 < 4*(pre1->x1 - pre1->x0)) // ~ big lines
1455
+ if (char_width_max < box2->x1 - box2->x0 + 1)
1456
+ char_width_max = box2->x1 - box2->x0 + 1;
1457
+ // may cause problems if "_" is of width em (not em-1 like mwMW etc.)
1458
+ if (mono_em_min < char_width_max + 1)
1459
+ mono_em_min = char_width_max + 1; // minimum monospaced width
1460
+
1461
+ // will fail on monospaced fonts where chars are not centered
1462
+ if (pre1) { // 2010-09-28
1463
+ d1l = box2->x0 - pre1->x0; // left to left distance
1464
+ d1r = box2->x1 - pre1->x1; // right to right distance
1465
+ if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1466
+ else { d1=d1l; d2=d1r; } // thicker char on the right
1467
+ /* d1 < 2*width && d2 < 2*width, may fail for "IIIM" d2<2*max OK */
1468
+ if (d1>0 && d1 < 2*char_width_max && d2 < 2*mono_em_max) {
1469
+ if (mono_em_min<d1-1) mono_em_min = d1; }
1470
+ if (d1>0) {
1471
+ if (mono_em_max>d2+2) mono_em_max = d2; } // not best, shifted ()
1472
+ // 2010-10-06 examples/ocr-b add -1 +2, bad for "()"
1473
+ #if 1
1474
+ if ((48 & job->cfg.verbose) == 48)
1475
+ if (monospaced && l1) // debugging until monospaced=0
1476
+ fprintf(stderr," L%02d DBG1 x %3d %+4d %3d %+4d d %3d %3d"
1477
+ " em %2d %2d ex %2d\n# ...",
1478
+ l1, pre1->x0, pre1->x1-pre1->x0+1,
1479
+ box2->x0, box2->x1-box2->x0+1, d1, d2,
1480
+ mono_em_min, mono_em_max, char_width_max);
1481
+ #endif
1482
+ }
1483
+ #if 1 // needed for correct spacing of last line of tmp08/0810CSchulze_crop
1484
+ if (pre2) {
1485
+ d1l = box2->x0 - pre2->x0; // left to left distance
1486
+ d1r = box2->x1 - pre2->x1; // right to right distance
1487
+ if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1488
+ else { d1=d1l; d2=d1r; } // thicker char on the right
1489
+ if (d1>0 && d1 < 3*char_width_max && d2 < 3*mono_em_max) {
1490
+ if (2*mono_em_min<d1) mono_em_min = (d1+1)/2; }
1491
+ if (d1>0) {
1492
+ if (2*mono_em_max>d2) mono_em_max = (d2+1)/2; }
1493
+ #if 1
1494
+ if ((48 & job->cfg.verbose) == 48)
1495
+ if (monospaced && l1) // debugging until monospaced=0
1496
+ fprintf(stderr," L%02d DBG2 x %3d %+4d %3d %+4d d %3d %3d"
1497
+ " em %2d %2d ex %2d\n# ...",
1498
+ l1, pre2->x0, pre2->x1-pre2->x0+1,
1499
+ box2->x0, box2->x1-box2->x0+1, d1, d2,
1500
+ mono_em_min, mono_em_max, char_width_max);
1501
+ #endif
1502
+ }
1503
+ #endif
1504
+
1505
+ // the upper part does good work, we do not need this stuff ... ???
1506
+ #if 0
1507
+ // min distance between next neighbours of pre
1508
+ if (pre2 && 1 < box2->x0 - pre2->x1)
1509
+ if (mono_em_max > box2->x0 - pre2->x1)
1510
+ mono_em_max = box2->x0 - pre2->x1;
1511
+ // ToDo: could be a problem for " ???
1512
+ if (pre2)
1513
+ if (pre1->x1 - pre1->x0 >= mono_em_min) // best max mono_dx
1514
+ if (pre1->x1 - pre1->x0 == box2->x1 - box2->x0) // best max mono_dx
1515
+ if (mono_em_max > box2->x0 - pre1->x0)
1516
+ mono_em_max = box2->x0 - pre1->x0;
1517
+ /* ToDo: better take 3 instead of 2 neighbours?, smallest font 4x6 */
1518
+ /* tmp08/gocr0801_bad5.jpg was not mono, need 2 to 3 chars */
1519
+ /* 2010-09-27 gives precise range! 16..22 to 16..17 */
1520
+ /* ToDo: no 2 char variant? */
1521
+ if (pre2 && 1 < box2->x0 - pre2->x1)
1522
+ if (box2->x0-pre1->x1+1 < mono_em_min) // no spc between char + pre1
1523
+ if (pre1->x0-pre2->x1+1 < mono_em_min) // no spc between pre1 + pre2
1524
+ {
1525
+ if (3*mono_em_min < box2->x1 - pre2->x0)
1526
+ mono_em_min = (box2->x1 - pre2->x0 + 2)/3;
1527
+ }
1528
+ #endif
1529
+ //# tmp09/oebb_teletext_836_0001_sw.png
1530
+ //# line 4 12 - 12 pre2 134 142 181 190
1531
+ //# 0 8 47 56
1532
+ //# 0 12 24 36 48
1533
+ // n=2: (n-1)*min < d1 <= (n )*max && (2*n+1)*max < (2*n+2)*min
1534
+ // (n )*min < d2 <= (n+1)*max && (2*n+2)*max < (2*n+3)*min
1535
+ if (monospaced && pre1) { // check 2 chars for non mono space within
1536
+ d1l = box2->x0 - pre1->x0; // left to left distance (do not + 1!)
1537
+ d1r = box2->x1 - pre1->x1; // right to right distance
1538
+ if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1539
+ else { d1=d1l; d2=d1r; } // thicker char on the right
1540
+ if ((box2->x0 - pre1->x1 <= mono_em_min
1541
+ && box2->x1 - pre1->x0 > 2*mono_em_max) // crossing 1 em border?
1542
+ || (box2->x0 - pre1->x1 > mono_em_max
1543
+ && box2->x0 - pre1->x1 <= 2*mono_em_min
1544
+ && box2->x1 - pre1->x0 > 3*mono_em_max)) { // crossing 2 em border?
1545
+ monospaced = 0; // can not be monospaced in that case 2010-09-25
1546
+ if (job->cfg.verbose)
1547
+ fprintf(stderr, " L%02d mono:=0 %d - %d pre1 %d %d %d %d\n# ...",
1548
+ l1, mono_em_min, mono_em_max,
1549
+ pre1->x0, pre1->x1, box2->x0, box2->x1);
1550
+ }
1551
+ }
1552
+ // n=3: (n-1)*min < d1 <= (n )*max && (2*n+1)*max < (2*n+2)*min
1553
+ // (n )*min < d2 <= (n+1)*max && (2*n+2)*max < (2*n+3)*min
1554
+ if (monospaced && pre2 && (2*2+2)*mono_em_max < (2*2+3)*mono_em_min)
1555
+ { // check 2 chars for non mono space within
1556
+ d1l = box2->x0 - pre2->x0; // left to left distance
1557
+ d1r = box2->x1 - pre2->x1; // right to right distance
1558
+ if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1559
+ else { d1=d1l; d2=d1r; } // thicker char on the right
1560
+ if ((box2->x0 - pre2->x1 > mono_em_max
1561
+ && box2->x0 - pre2->x1 <= 2*mono_em_min
1562
+ && box2->x1 - pre2->x0 > 3*mono_em_max) // crossing 2 em border?
1563
+ || (box2->x0 - pre2->x1 > 2*mono_em_max
1564
+ && box2->x0 - pre2->x1 <= 3*mono_em_min // ?????? ToDo oebb
1565
+ && box2->x1 - pre2->x0 > 4*mono_em_max)) { // crossing 3 em border?
1566
+ monospaced = 0; // can not be monospaced in that case 2010-09-25
1567
+ if (job->cfg.verbose)
1568
+ fprintf(stderr, " L%02d mono:=0 %d - %d pre2 %d %d %d %d\n# ...",
1569
+ l1, mono_em_min, mono_em_max,
1570
+ pre2->x0, pre2->x1, box2->x0, box2->x1);
1571
+ }
1572
+ }
1573
+ /* fonts are expected to be 6 to 60 pixels high, which is about
1574
+ 4 to 50 pixels wide. We allow some extra margin.
1575
+ space > 0 2010-09-27
1576
+ ToDo: compare left and right gap (or additional nearest 4 gaps)
1577
+ similar to mono space detection, check min distance
1578
+ between upper, middle and lower rightmost vector of prev char and
1579
+ leftmost vector of right char (hight is defined by the lower char)
1580
+ (if overlapping chars are detected! WAV,Te,...)
1581
+ */
1582
+ if (0 < pdist && pdist < 140) { /* better mdist < 3*Xaverage ? */
1583
+ // ignore extra wide spaces, tmp09/gocr_screen_capture* 2010-09-28
1584
+ if (2*pdist<5*char_width_max)
1585
+ /* two options for overflow: 1) ignore, 2) store randomly */
1586
+ if (numdists<1024) { /* we do ignore here */
1587
+ pdists[numdists] = pdist;
1588
+ numdists++;
1589
+ }
1590
+ }
1591
+ pre2 = pre1; pre1 = box2;
1592
+ } end_for_each(&(job->res.boxlist));
1593
+
1594
+ if (job->cfg.verbose)
1595
+ fprintf(stderr, " L%02d num_gaps= %2d x_width= %2d - %2d"
1596
+ " mono_em= %2d - %2d mono= %d",
1597
+ l1, numdists, char_width_min, char_width_max,
1598
+ mono_em_min, mono_em_max, monospaced);
1599
+ if (numdists<8) {
1600
+ if (job->cfg.verbose && l1==0) /* only for all lines */
1601
+ fprintf(stderr," (WARNING num_gaps<8)");
1602
+ }
1603
+ #if 1 /* debugging */
1604
+ if ((job->cfg.verbose&(32+16))==48) {
1605
+ int i;
1606
+ fprintf(stderr,"\n# ...");
1607
+ for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1608
+ fprintf(stderr," <- pdist[%d]\n# ...",l1);
1609
+ }
1610
+ #endif
1611
+ if (numdists>0) {
1612
+ int i, diff, ni_min, max, best_p, ni;
1613
+ /* aware: takes long time for big data sets */
1614
+ /* dilute? (german: ausduennen?) */
1615
+ qsort (pdists, numdists, sizeof (int), intcompare);
1616
+ /* the new method, div0? */
1617
+ best_p=4*numdists/5;
1618
+ /* try to find better pitch for monospaced font (ok for prop) */
1619
+ // tolerant to 090729num* tmp09/barcodes090916_interleaved*
1620
+ if (mono_em_min > mono_em_max+mono_em_min/32+1
1621
+ || mono_em_max>=2*mono_em_min)
1622
+ monospaced = 0;
1623
+ else
1624
+ pitch_m=((mono_em_max<3*mono_em_min)?
1625
+ (mono_em_max+3*mono_em_min)/4:mono_em_min);
1626
+ /* try to find better pitch for proportional font */
1627
+ // the largest diff could be the best, if diff is always 1,
1628
+ // take the diff with the lowest weight
1629
+ // JS-2010-09 add numdists<8 sample spaces20100908.jpg
1630
+ // todo: search most offen biggest gapdiff (ignore big table gaps)
1631
+ // mean gapdiff? gap[n-1-i]-gap[0+i] until gapdiff=0, skip table gaps
1632
+ // 2010-09-28 check until end of table, because old bad wide gaps are
1633
+ // no more added to the table
1634
+ for (ni=ni_min=1024,max=0,i=((numdists<8)?0:numdists/2+1);
1635
+ i<numdists;i++) {
1636
+ if (pdists[i]<=char_width_min/3) continue; // JS-2010-09
1637
+ if (pdists[i]> char_width_max*2) {
1638
+ /* set 2nd best which is numdists as default */; break; } // JS-2010-27 table gaps
1639
+ if (numdists<16) // single word?
1640
+ if (pdists[i]<=char_width_max/3) continue; // JS-2010-09
1641
+ diff=pdists[i]-pdists[i-1];
1642
+ if (diff>max) {
1643
+ max=diff; best_p=i-1;
1644
+ if ((job->cfg.verbose&(32+16))==48)
1645
+ fprintf(stderr," L%02d best_p= %3d + maxdiff=%3d\n# ...",
1646
+ l1, pdists[best_p], max);
1647
+ if (max>3 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1648
+ if (max>1 && 3*i>numdists*2 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1649
+ }
1650
+ if (diff) {
1651
+ if (ni<ni_min) {
1652
+ // do not try to divide one word per line
1653
+ ni_min=ni; if (max<=1 && numdists>16) best_p=i-1;
1654
+ if ((job->cfg.verbose&(32+16))==48)
1655
+ fprintf(stderr," L%02d best_p=%3d ni_min=%3d\n# ...",
1656
+ l1, pdists[best_p], ni_min);
1657
+ }
1658
+ ni=1;
1659
+ } else ni++;
1660
+ }
1661
+ if (numdists<16 && max<=1 && ni_min>1) best_p=numdists-1; // one word
1662
+ #if 1 /* debugging */
1663
+ if ((job->cfg.verbose&(32+16))==48) {
1664
+ // fprintf(stderr,"\n# ...");
1665
+ for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1666
+ fprintf(stderr," <- pdist[%d] sorted\n# ...",l1);
1667
+ fprintf(stderr," L%02d maxdiff=%d min_samediffs=%d", l1, max, ni_min);
1668
+ }
1669
+ #endif
1670
+ /* we measure spaces in two different ways (mono, prop) */
1671
+ /* prop: gap between boxes, mono: distance of middle */
1672
+ if (best_p<numdists-1) pitch_p = ((pdists[best_p]+pdists[best_p+1])/2+1);
1673
+ else pitch_p = (pdists[best_p]+1 );
1674
+ if (numdists)
1675
+ if ( pdists[numdists-1]*2 <= pdists[0]*3
1676
+ || pdists[numdists-1] <= pdists[0]+3) {
1677
+ /* line is just a single word */
1678
+ pitch_p = pdists[numdists-1]+10;
1679
+ }
1680
+ if (l1>0 && job->cfg.spc==0) {
1681
+ job->res.lines.pitch[l1]=(monospaced?pitch_m:pitch_p);
1682
+ job->res.lines.mono[l1]=monospaced;
1683
+ }
1684
+ if (job->cfg.verbose) {
1685
+ fprintf(stderr,"\n# ...");
1686
+ fprintf(stderr," L%02d mono: num=%3d min=%3d max=%3d pitch=%3d\n# ...",
1687
+ l1, numdists, mono_em_min,mono_em_max,pitch_m);
1688
+ fprintf(stderr," L%02d prop: num=%3d min=%3d max=%3d pitch=%3d @ %2d%%\n# ...",
1689
+ l1, numdists, pdists[0],pdists[numdists-1],pitch_p,best_p*100/numdists);
1690
+ fprintf(stderr," L%02d result: mono=%d distance >= %d considered as space\n# ...",
1691
+ l1, monospaced, job->res.lines.pitch[l1]);
1692
+ }
1693
+ } /* if (not) enough spaces */
1694
+ if (l1==0) { /* set default spaces to each line */
1695
+ int l2;
1696
+ spc = job->cfg.spc;
1697
+ if (spc==0) /* set only if not set by option */
1698
+ spc = ((monospaced)?pitch_m:pitch_p);
1699
+ for (l2=0; l2<job->res.lines.num; l2++ )
1700
+ job->res.lines.pitch[l2]=spc;
1701
+ }
1702
+ } /* each line */
1703
+ if (job->cfg.spc==0)
1704
+ job->cfg.spc = spc;
1705
+ if (job->cfg.verbose)
1706
+ fprintf(stderr," overall space width is %d %s\n",
1707
+ spc, ((monospaced)?"monospaced":"proportional"));
1708
+
1709
+
1710
+ }
1711
+
1712
+ /* ---- count subboxes (white holes within black area) --------
1713
+ * new: count boxes lying inside another box (usually holes, ex: "aeobdg")
1714
+ * needed for glue_boxes, dont joining textboxes, tables and other complex
1715
+ * objects
1716
+ * ToDo: count only frames of invers spin? do we need sorted list here? -> no
1717
+ */
1718
+ int count_subboxes( pix *pp ){
1719
+ int ii=0, num_mini=0, num_same=0, cnt=0;
1720
+ struct box *box2,*box4;
1721
+ job_t *job=OCR_JOB; /* fixme */
1722
+ progress_counter_t *pc = NULL;
1723
+ if (job->cfg.verbose) { fprintf(stderr,"# count subboxes\n# ..."); }
1724
+
1725
+ pc = open_progress(job->res.boxlist.n,"count_subboxes");
1726
+ for_each_data(&(job->res.boxlist)) {
1727
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
1728
+ box2->num_subboxes=0;
1729
+ progress(cnt++,pc);
1730
+ if ( (box2->x1 - box2->x0)<2
1731
+ || (box2->y1 - box2->y0)<2) continue; /* speedup for dotted bg */
1732
+ // holes inside box2 char, aoebdqg, 0.41
1733
+ for_each_data(&(job->res.boxlist)) {
1734
+ box4=(struct box *)list_get_current(&(job->res.boxlist));
1735
+ if (box4->y0 > box2->y1) break; // faster, but boxes need to be sorted
1736
+ // ToDo: better use binary tree (above/below x) to find near boxes?
1737
+ if (box4==box2) continue;
1738
+ if( box4->x0==box2->x0 && box4->x1==box2->x1
1739
+ && box4->y0==box2->y0 && box4->y1==box2->y1)
1740
+ num_same++; /* erroneous!? */
1741
+ if ( box4->x0 >= box2->x0 && box4->x1 <= box2->x1
1742
+ && box4->y0 >= box2->y0 && box4->y1 <= box2->y1
1743
+ && box4->num_subboxes==0 ) /* box4 inside box2? */
1744
+ {
1745
+ box2->num_subboxes++; ii++;
1746
+ if ((box4->x1 - box4->x0 + 1)
1747
+ *(box4->y1 - box4->y0 + 1)<17) num_mini++;
1748
+ }
1749
+ } end_for_each(&(job->res.boxlist));
1750
+ #if 0
1751
+ if (cnt < 1000 && job->cfg.verbose)
1752
+ fprintf(stderr," %4d box %4d %4d %+3d %+3d subboxes %4d\n# ...",
1753
+ cnt, box2->x0, box2->y0, box2->x1-box2->x0,
1754
+ box2->y1-box2->y0, box2->num_subboxes);
1755
+ #endif
1756
+ } end_for_each(&(job->res.boxlist));
1757
+ close_progress(pc);
1758
+ if (job->cfg.verbose)
1759
+ fprintf(stderr," %3d subboxes counted (mini=%d, same=%d) nC= %d\n",
1760
+ ii, num_mini, num_same/2 /* counted twice */, cnt);
1761
+ return 0;
1762
+ }
1763
+
1764
+ /* ---- join holes to chars( before step1 ) v0.42 -----------------------
1765
+ join boxes lying inside another box (usually holes, ex: "aeobdg46890")
1766
+ Dont add dust to a char!
1767
+ lines are not detected yet
1768
+ */
1769
+ int glue_holes_inside_chars( pix *pp ){
1770
+ int ii, cs, x0, y0, x1, y1, cnt=0,
1771
+ glued_same=0, glued_holes=0;
1772
+ struct box *box2, *box4;
1773
+ job_t *job=OCR_JOB; /* fixme */
1774
+ progress_counter_t *pc = NULL;
1775
+ cs=job->cfg.cs;
1776
+ {
1777
+ count_subboxes( pp ); /* move to pgm2asc() later */
1778
+
1779
+ pc = open_progress(job->res.boxlist.n,"glue_holes_inside_chars");
1780
+ if (job->cfg.verbose)
1781
+ fprintf(stderr,"# glue holes to chars nC= %d\n# ...",job->res.numC);
1782
+ ii=0;
1783
+ for_each_data(&(job->res.boxlist)) {
1784
+ // get the smaller box which may be extended by bigger boxes around it
1785
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
1786
+ x0 = box2->x0; x1 = box2->x1;
1787
+ y0 = box2->y0; y1 = box2->y1;
1788
+
1789
+ progress(cnt++,pc);
1790
+
1791
+ // would it better than moving vectors to build a sub-box-tree?
1792
+
1793
+ // do not remove chars inside pictures (car plates on photos)
1794
+ if( box2->c == PICTURE || box2->num_subboxes > 7) continue;
1795
+
1796
+ // holes inside char, aoebdqg, 0.41
1797
+ // dont merge boxes which have subboxes by itself!
1798
+ // search boxes inside box2
1799
+ // if (x1-x0+1>2 || y1-y0+1>2) /* skip tiny boxes, bad for 4x6 */
1800
+ for_each_data(&(job->res.boxlist)) {
1801
+ box4=(struct box *)list_get_current(&(job->res.boxlist));
1802
+ if(box4!=box2 && box4->c != PICTURE )
1803
+ {
1804
+ // ToDo: dont glue, if size differs by big factors (>16?)
1805
+ // box4 is of same size or smaller
1806
+ //if ((job->cfg.verbose & 48)==48
1807
+ // && abs(box4->x0-x0)<4 && abs(box4->y0-y0)<8)
1808
+ // { fprintf(stderr,"\n# DBG_glue");out_x(box2);out_x(box4); }
1809
+ if (abs(box4->frame_vol[0])
1810
+ >=abs(box2->frame_vol[0])/512) // 2010-10 bad invalid_ogv.jpg
1811
+ if ( ( box4->x0==x0 && box4->x1==x1
1812
+ && box4->y0==y0 && box4->y1==y1 ) /* do not happen !? */
1813
+ || ( box4->x0>=x0 && box4->x1<=x1
1814
+ && box4->y0>=y0 && box4->y1<=y1
1815
+ // 2010-09 subboxes==0 to subboxes<4 for 0 with dot in it
1816
+ && box4->num_subboxes<2 ) ) /* no or very small subboxes? */
1817
+ { // fkt melt(box2,box4)
1818
+ // same box, if very small but hollow char (4x5 o)
1819
+ if( box4->x0==x0 && box4->x1==x1
1820
+ && box4->y0==y0 && box4->y1==y1) glued_same++; else glued_holes++;
1821
+ // fprintf(stderr,"\n# DEBUG merge:");
1822
+ // out_x(box2); // small
1823
+ // out_x(box4); // big
1824
+ if ((job->cfg.verbose & 7)==7) // LEV3
1825
+ fprintf(stderr," join hole %4d %4d %+4d %+4d %+6d"
1826
+ " + %4d %4d %+4d %+4d %+6d %d\n# ...",
1827
+ x0, y0, x1-x0+1, y1-y0+1, box2->frame_vol[0],
1828
+ box4->x0, box4->y0,
1829
+ box4->x1-box4->x0+1, box4->y1-box4->y0+1,
1830
+ box4->frame_vol[0], glued_same);
1831
+ if ((box4->x1-box4->x0+1)< 8*(x1-x0+1)
1832
+ || (box4->y1-box4->y0+1)<12*(y1-y0+1)) // skip dust
1833
+ merge_boxes( box2, box4 ); // add box4 to bigger box2
1834
+ //if ((job->cfg.verbose & 48)==48)
1835
+ // { fprintf(stderr,"\n# DBG_glue_result");out_x(box2); }
1836
+ x0 = box2->x0; x1 = box2->x1;
1837
+ y0 = box2->y0; y1 = box2->y1;
1838
+ job->res.numC--; // dont count fragments as chars
1839
+ ii++; // count removed
1840
+ list_del(&(job->res.boxlist), box4); // remove box4
1841
+ free_box(box4);
1842
+ // now search another hole inside box2
1843
+ }
1844
+ }
1845
+ } end_for_each(&(job->res.boxlist));
1846
+
1847
+ } end_for_each(&(job->res.boxlist));
1848
+
1849
+ if (job->cfg.verbose)
1850
+ fprintf(stderr," joined: %3d holes, %3d same, nC= %d\n",
1851
+ glued_holes, glued_same, job->res.numC);
1852
+ close_progress(pc);
1853
+ }
1854
+ return 0;
1855
+ }
1856
+
1857
+
1858
+ /* ---- join broken chars ( before step1 ??? ) -----------------------
1859
+ use this carefully, do not destroy previous detection ~fi, broken K=k' g
1860
+ join if boxes are near or diagonally connected
1861
+ other strategy: mark boxes for deleting and delete in extra loop at end
1862
+ faster: check only next two following boxes because list is sorted!
1863
+ ToDo: store m4 of upper line to m4_of_prev_line, and check that "-points are below
1864
+ done: join boxes lying inside another box (usually holes, ex: "aeobdg")
1865
+ Dont add dust to a char!
1866
+ lines should be detected already (Test it for m1-m4 unknown)
1867
+ ToDo: divide in glue_idots, glue_thin_chars etc. and optimize it
1868
+ */
1869
+ int glue_broken_chars( job_t *job, pix *pp ){
1870
+ int ii, y, cs, x0, y0, x1, y1, cnt=0,
1871
+ num_frags=0, glued_frags=0, glued_hor=0,
1872
+ do_join=0; /* 1..n means we have a reason to join two objects to one */
1873
+ char *(join_reason)[5]={"no","\"A\"Uij\%","!?;\%","=:;","'',,"};
1874
+ struct box *box2, *box4;
1875
+ // job_t *job=OCR_JOB; /* fixme */
1876
+ progress_counter_t *pc = NULL;
1877
+ cs=job->cfg.cs;
1878
+ {
1879
+ count_subboxes( pp ); /* move to pgm2asc() later */
1880
+
1881
+ pc = open_progress(job->res.boxlist.n,"glue_broken_chars");
1882
+ if (job->cfg.verbose)
1883
+ fprintf(stderr,"# glue broken chars nC= %d avX= %d\n# ...",
1884
+ job->res.numC, job->res.avX);
1885
+ ii=0;
1886
+ for_each_data(&(job->res.boxlist)) {
1887
+ // get the box which may be extended by boxes around it
1888
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
1889
+ x0 = box2->x0; x1 = box2->x1;
1890
+ y0 = box2->y0; y1 = box2->y1;
1891
+ progress(cnt++,pc);
1892
+ do_join=0;
1893
+ // vertical broken (g965T umlauts etc.)
1894
+ // not: f,
1895
+ // would it better than moving vectors to build a sub-box-tree?
1896
+ // do not remove chars inside pictures (car plates on photos)
1897
+ if (box2->c == PICTURE || box2->num_subboxes > 7) continue;
1898
+ /* continue loop if box is below or above line = dust */
1899
+ if (box2->m4>0 && y0>box2->m4) continue; /* dust outside ? */
1900
+ if (box2->m1>0 && y0<box2->m1-(box2->m3-box2->m2)) continue;
1901
+ /* ToDo:
1902
+ * - check that y0 is greater as m3 of the char/line above
1903
+ */
1904
+ // --- variant 1 = ij-dots umlaut-dots :;= ---
1905
+ // check small boxes (box2) whether they belong
1906
+ // to near same size or bigger boxes (box4)
1907
+ if( 2*(y1-y0) < box2->m4 - box2->m1 // care for dots etc.
1908
+ && ( 2*y1<=(box2->m3+box2->m2) // upper fragments
1909
+ || 2*y0>=(box2->m3+box2->m2)) ) { // lower fragments
1910
+ struct box *box5=NULL, *box6=NULL; // nearest and next nearest box
1911
+ box4=NULL;
1912
+ num_frags++; /* count for debugging */
1913
+ // get the [2nd] next x-nearest box in the same line
1914
+ for_each_data(&(job->res.boxlist)) {
1915
+ box4=(struct box *)list_get_current(&(job->res.boxlist));
1916
+ if (box4 == box2 || box4->c == PICTURE) continue;
1917
+ /* 0.42 speed up for background pixel pattern, box4 to small */
1918
+ if ( box4->x1 - box4->x0 + 1 < x1-x0+1
1919
+ && box4->y1 - box4->y0 + 1 < y1-y0+1 ) continue;
1920
+ // have in mind that line number may be wrong for dust
1921
+ if (box4->line>=0 && box2->line>=0 && box4->line==box2->line)
1922
+ {
1923
+ if (!box5) box5=box4;
1924
+ if ( abs(box4->x0 + box4->x1 - 2*box2->x0)
1925
+ <abs(box5->x0 + box5->x1 - 2*box2->x0))
1926
+ { box6=box5; box5=box4; }
1927
+ }
1928
+ } end_for_each(&(job->res.boxlist));
1929
+ box4=box5; // next nearest box within the same line
1930
+ if (box4) {
1931
+ // do not glue "%^" in 0811qemu2.png 2010-09-28
1932
+ if (box4->x1 - box4->x0 + 1 > job->res.avX / 2
1933
+ && box2->x1 - box2->x0 + 1 > job->res.avX / 2
1934
+ && ( box2->x0 > box4->x1
1935
+ || box4->x0 > box2->x1)) continue;
1936
+ #if 0 /* set this to 1 for debugging of melting bugs */
1937
+ if (job->cfg.verbose & 7) {
1938
+ fprintf(stderr,"\n# next two boxes are candidates for joining");
1939
+ out_x(box2);
1940
+ out_x(box4); }
1941
+ #endif
1942
+ if ( /* umlaut "a "o "u, ij; box2 is the small dot, box4 the body */
1943
+ 4*y1 <= 3*box2->m2 + box2->m3 // y1=box2->y1, ocr-a %
1944
+ && 4*box4->y1 >= 3*box2->m2 + box2->m3 // dont join 2 dots
1945
+ && 2* y1 < box4->y1 + box4->y0 // box2 above box4
1946
+ && box4->x1 + job->res.avX/2 >= x0
1947
+ && box4->x0 - job->res.avX/2 <= x1
1948
+ && (y1 < box4->y0 || x0 < box4->x1) // dont melt "d'"
1949
+ && 3* ( y1 - box4->y0)
1950
+ <= 2* (box4->y1 - box4->y0) // too far away? dust!
1951
+ && 8* ( x1 - x0 + 1)
1952
+ >= (box4->x1 - box4->x0 + 1) // dot must have minimum size
1953
+ && 10* ( y1 - y0 + 1)
1954
+ >= (box4->y1 - box4->y0 + 1) // dot must have minimum size
1955
+ ) do_join=1;
1956
+ if ( (!do_join) /* !?; box2 is the dot, box4 the body */
1957
+ && 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
1958
+ && 2*box4->x0<=2*x1 /* +x0+1 Jan00 */
1959
+ && ( x1-x0 <= box4->x1-box4->x0+2 )
1960
+ && 2*y0>=box2->m2+box2->m3
1961
+ && 4*y1>=box2->m2+3*box2->m3
1962
+ && 4*(y1-y0)<box2->m4-box2->m1
1963
+ && (8*box4->y1 < box4->m2+7*box4->m3
1964
+ || box4->m4-box4->m1<16) /* Jan00 */
1965
+ ) do_join=2;
1966
+ if ( (!do_join) /* =;: box2 is the upper box, box4 the lower box */
1967
+ && 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
1968
+ && 2*box4->x0<=2*x1 /* +x0+1 */
1969
+ && ( x1-x0 <= box4->x1-box4->x0+4 )
1970
+ && ( 4*x0 <= 3*box4->x1+box4->x0 )
1971
+ && (( box2->m2 && box4->m2
1972
+ && y1< box2->m3
1973
+ && 2*box4->y1 > box4->m3+box4->m2 // can be bigger than m3
1974
+ && 4*box4->y0 >= 3*box4->m2+box4->m3
1975
+ && 2*box2->y0 < box2->m3+box2->m2
1976
+ )
1977
+ || ( (!box2->m2) || (!box4->m2) )
1978
+ )
1979
+ ) do_join=3;
1980
+ /* '' ,, tmp08/0811qemu2 2010-10-01 */
1981
+ if ( abs(box2->y1 - box4->y1) <= (y1-y0)/8+1 // same y1
1982
+ && abs(box2->y0 - box4->y0) <= (y1-y0)/8+1 // same y0
1983
+ && abs((box4->x1 - box4->x0) - (x1-x0)) <= (x1-x0)/8+1 // same dx
1984
+ && x1-x0 <= job->res.avX/2 // small width
1985
+ && ( abs(box4->x0 - x1 - 1) <= job->res.avX/2 // small gap
1986
+ || abs(x0 - box4->x1 - 1) <= job->res.avX/2) // ocr-b
1987
+ && ( 4*y1 <= 3*box2->m2 + box2->m3 // ''
1988
+ || 4*y0 >= 2*box2->m2 + 2*box2->m3 ) // ,,
1989
+ ) do_join=4;
1990
+ if (do_join>0) { // fkt melt(box2,box4)
1991
+ if (job->cfg.verbose & 7) // space "( " for better " x"-searching
1992
+ fprintf(stderr," join objects %3d %3d %+4d %+4d"
1993
+ " + %3d %3d %+4d %+4d %s\n# ...",
1994
+ x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
1995
+ box4->x1-box4->x0+1, box4->y1-box4->y0+1,join_reason[do_join]);
1996
+ // fprintf(stderr,"\n# DEBUG merge:"); // d=7x34 @ (109,51) ???
1997
+ // if (job->cfg.verbose & 4) out_x(box2);
1998
+ // if (job->cfg.verbose & 4) out_x(box4);
1999
+ merge_boxes( box2, box4 ); // add box4 to box2
2000
+ x0 = box2->x0; x1 = box2->x1;
2001
+ y0 = box2->y0; y1 = box2->y1;
2002
+ #if 0
2003
+ if (job->cfg.verbose & 7) //
2004
+ fprintf(stderr," join objects %3d %3d %+4d %+4d\n# ...",
2005
+ x0, y0, x1-x0+1, y1-y0+1);
2006
+ #endif
2007
+ // if (job->cfg.verbose & 4) out_x(box2);
2008
+ // 2010-09-24 hmm, correct overall hight here, later set bad???
2009
+ // job->res.numC--; // dont count fragments as chars
2010
+ ii++; glued_frags++; // remove
2011
+ // output_list(job);
2012
+ list_del(&(job->res.boxlist), box4); /* ret&1: error-message ??? */
2013
+ // output_list(job);
2014
+ free_box(box4);
2015
+ }
2016
+ }
2017
+ }
2018
+ // continue;
2019
+
2020
+ // horizontally broken w' K'
2021
+ if( 2*y1 < (box2->m3+box2->m2) )
2022
+ if( 2*(y1-y0) < (box2->m3+box2->m2) ) // fragment
2023
+ for_each_data(&(job->res.boxlist)) {
2024
+ box4=(struct box *)list_get_current(&(job->res.boxlist));
2025
+ if (box4!=box2 && box4->c != PICTURE)
2026
+ {
2027
+ if( box4->line>=0 && box4->line==box2->line
2028
+ && box4->x1>=x0-1 && box4->x1<x0 // do not glue 6-
2029
+ && box4->x0+3*box4->x1<4*x0)
2030
+ if( get_bw(x0 ,x0 ,y1,y1 ,pp,cs,1) == 1)
2031
+ if( get_bw(x0-2,x0-1,y1,y1+2,pp,cs,1) == 1)
2032
+ { // fkt melt(box2,box4)
2033
+ if (job->cfg.verbose & 7)
2034
+ fprintf(stderr," join objects %3d %3d %+4d %+4d"
2035
+ " + %3d %3d %+4d %+4d w'K'\n# ...",
2036
+ x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
2037
+ box4->x1-box4->x0+1, box4->y1-box4->y0+1);
2038
+ put(pp,x0,y1+1,~(128+64),0);
2039
+ merge_boxes( box2, box4 );
2040
+ x0 = box2->x0; x1 = box2->x1;
2041
+ y0 = box2->y0; y1 = box2->y1;
2042
+ job->res.numC--; ii++; // remove
2043
+ glued_hor++;
2044
+ list_del(&(job->res.boxlist), box4);
2045
+ free_box(box4);
2046
+ }
2047
+ }
2048
+ } end_for_each(&(job->res.boxlist));
2049
+
2050
+ // horizontally broken n h (h=l_) v0.2.5 Jun00
2051
+ if( abs(box2->m2-y0)<=(y1-y0)/8 )
2052
+ if( abs(box2->m3-y1)<=(y1-y0)/8 )
2053
+ if( num_cross(x0, x1,(y0+ y1)/2,(y0+ y1)/2,pp,cs) == 1)
2054
+ if( num_cross(x0, x1,(y0+3*y1)/4,(y0+3*y1)/4,pp,cs) == 1)
2055
+ if( get_bw((3*x0+x1)/4,(3*x0+x1)/4,(3*y0+y1)/4,y1,pp,cs,1) == 0)
2056
+ if( get_bw(x0,(3*x0+x1)/4,(3*y0+y1)/4,(y0+3*y1)/4,pp,cs,1) == 0)
2057
+ if( get_bw(x0, x0, y0,(3*y0+y1)/4,pp,cs,1) == 1)
2058
+ for_each_data(&(job->res.boxlist)) {
2059
+ box4=(struct box *)list_get_current(&(job->res.boxlist));
2060
+ if (box4!=box2 && box4->c != PICTURE)
2061
+ {
2062
+ if( box4->line>=0 && box4->line==box2->line
2063
+ && box4->x1>x0-3 && box4->x1-2<x0
2064
+ && abs(box4->y1-box2->m3)<2)
2065
+ { // fkt melt(box2,box4)
2066
+ if (job->cfg.verbose & 7)
2067
+ fprintf(stderr," join objects %3d %3d %+4d %+4d"
2068
+ " + %3d %3d %+4d %+4d nh\n# ...",
2069
+ x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
2070
+ box4->x1-box4->x0+1, box4->y1-box4->y0+1);
2071
+ y=loop(pp,x0,y0,y1-y0,cs,0,DO);if(2*y>y1-y0) continue;
2072
+ put(pp,x0-1,y0+y ,~(128+64),0);
2073
+ put(pp,x0-1,y0+y+1,~(128+64),0);
2074
+ merge_boxes( box2, box4 ); // add box4 to box2
2075
+ x0 = box2->x0; x1 = box2->x1;
2076
+ y0 = box2->y0; y1 = box2->y1;
2077
+ job->res.numC--; ii++; // remove
2078
+ glued_hor++;
2079
+ list_del(&(job->res.boxlist), box4);
2080
+ free_box(box4);
2081
+ }
2082
+ }
2083
+ } end_for_each(&(job->res.boxlist));
2084
+ } end_for_each(&(job->res.boxlist));
2085
+ if (job->cfg.verbose)
2086
+ fprintf(stderr," joined: %3d fragments (found %3d), %3d rest, nC= %d\n",
2087
+ glued_frags, num_frags, glued_hor, job->res.numC);
2088
+ close_progress(pc);
2089
+ }
2090
+ return 0;
2091
+ }
2092
+
2093
+ /*
2094
+ ** this is a simple way to improve results on noisy images:
2095
+ ** - find similar chars (build cluster of same chars)
2096
+ ** - analyze clusters (could be used for generating unknown font-base)
2097
+ ** - the quality of the result depends mainly on the distance function
2098
+ */
2099
+ // ---- analyse boxes, compare chars, compress picture ------------
2100
+ // ToDo: - error-correction only on large chars!
2101
+ int find_same_chars( pix *pp){
2102
+ int i,k,d,cs,dist,n1,dx; struct box *box2,*box3,*box4,*box5;
2103
+ pix p=(*pp);
2104
+ job_t *job=OCR_JOB; /* fixme */
2105
+ cs=job->cfg.cs;
2106
+ {
2107
+ if(job->cfg.verbose)fprintf(stderr,"# packing");
2108
+ i = list_total(&(job->res.boxlist));
2109
+ for_each_data(&(job->res.boxlist)) {
2110
+ box4 = box2 = (struct box *)list_get_current(&(job->res.boxlist));
2111
+ dist=1000; // 100% maximum
2112
+ dx = box2->x1 - box2->x0 + 1;
2113
+
2114
+ if(job->cfg.verbose)fprintf(stderr,"\r# packing %5d",i);
2115
+ if( dx>3 )
2116
+ for(box3=(struct box *)list_next(&(job->res.boxlist),box2);box3;
2117
+ box3=(struct box *)list_next(&(job->res.boxlist),box3)) {
2118
+ if(box2->num!=box3->num){
2119
+ int d=distance(&p,box2,&p,box3,cs);
2120
+ if ( d<dist ) { dist=d; box4=box3; } // best fit
2121
+ if ( d<5 ){ // good limit = 5% ???
2122
+ i--;n1=box3->num; // set all num==box2.num to box2.num
2123
+ for_each_data(&(job->res.boxlist)) {
2124
+ box5=(struct box *)(struct box *)list_get_current(&(job->res.boxlist));
2125
+ if(box5!=box2)
2126
+ if( box5->num==n1 ) box5->num=box2->num;
2127
+ } end_for_each(&(job->res.boxlist));
2128
+ // out_x2(box2,box5);
2129
+ // fprintf(stderr," dist=%d\n",d);
2130
+ }
2131
+ }
2132
+ }
2133
+ // nearest dist to box2 has box4
2134
+ // out_b2(box2,box4);
2135
+ // fprintf(stderr," dist=%d\n",dist);
2136
+ } end_for_each(&(job->res.boxlist));
2137
+ k=0;
2138
+ if(job->cfg.verbose)fprintf(stderr," %d different chars",i);
2139
+ for_each_data(&(job->res.boxlist)) {
2140
+ struct box *box3,*box4;
2141
+ int j,dist;
2142
+ box2=(struct box *)list_get_current(&(job->res.boxlist));
2143
+ for(box3=(struct box *)list_get_header(&(job->res.boxlist));
2144
+ box3!=box2 && box3!=NULL;
2145
+ box3=(struct box *)list_next(&(job->res.boxlist), box3))
2146
+ if(box3->num==box2->num)break;
2147
+ if(box3!=box2 && box3!=NULL)continue;
2148
+ i++;
2149
+ // count number of same chars
2150
+ dist=0;box4=box2;
2151
+
2152
+ for(box3=box2,j=0;box3;
2153
+ box3=(struct box *)list_next(&(job->res.boxlist), box3)) {
2154
+ if(box3->num==box2->num){
2155
+ j++;
2156
+ d=distance(&p,box2,&p,box3,cs);
2157
+ if ( d>dist ) { dist=d; box4=box3; } // worst fit
2158
+ }
2159
+ }
2160
+ if(job->cfg.verbose&8){
2161
+ out_x2(box2,box4);
2162
+ fprintf(stderr," no %d char %4d %5d times maxdist=%d\n",i,box2->num,j,dist);
2163
+ }
2164
+ // calculate mean-char (error-correction)
2165
+ // ToDo: calculate maxdist in group
2166
+ k+=j;
2167
+ // if(j>1)
2168
+ // out_b(box1,NULL,0,0,0,0,cs);
2169
+ if(job->cfg.verbose&8)
2170
+ fprintf(stderr," no %d char %4d %5d times sum=%d\n",i,box2->num,j,k);
2171
+ } end_for_each(&(job->res.boxlist));
2172
+ if(job->cfg.verbose)fprintf(stderr," ok\n");
2173
+ }
2174
+ return 0;
2175
+ }
2176
+
2177
+ /*
2178
+ ** call the first engine for all boxes and set box->c=result;
2179
+ **
2180
+ */
2181
+ int char_recognition( pix *pp, int mo){
2182
+ int i,ii,ni,cs,x0,y0,x1,y1;
2183
+ struct box *box2;
2184
+ progress_counter_t *pc;
2185
+ wchar_t cc;
2186
+ job_t *job=OCR_JOB; /* fixme */
2187
+ cs=job->cfg.cs;
2188
+ // ---- analyse boxes, find chars ---------------------------------
2189
+ if (job->cfg.verbose)
2190
+ fprintf(stderr,"# char recognition");
2191
+ i=ii=ni=0;
2192
+ for_each_data(&(job->res.boxlist)) { /* count boxes */
2193
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
2194
+ /* wew: isn't this just job->res.numC? */
2195
+ /* js: The program is very complex. I am not sure anymore
2196
+ wether numC is the number of boxes or the number of valid
2197
+ characters.
2198
+ Because its not time consuming I count the boxes here. */
2199
+ if (box2->c==UNKNOWN) i++;
2200
+ if (box2->c==PICTURE) ii++;
2201
+ ni++;
2202
+ } end_for_each(&(job->res.boxlist));
2203
+ if(job->cfg.verbose)
2204
+ fprintf(stderr," unknown= %d picts= %d boxes= %d\n# ",i,ii,ni);
2205
+ if (!ni) return 0;
2206
+ i=ii=0;
2207
+ pc = open_progress(ni,"char_recognition");
2208
+ for_each_data(&(job->res.boxlist)) {
2209
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
2210
+ x0=box2->x0;x1=box2->x1;
2211
+ y0=box2->y0;y1=box2->y1; // box
2212
+ cc=box2->c;
2213
+ if (cc==PICTURE) continue;
2214
+
2215
+ if ((mo&256)==0) { /* this case should be default (main engine) */
2216
+ if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<job->cfg.certainty)
2217
+ cc=whatletter(box2,cs ,0);
2218
+ }
2219
+
2220
+ if(mo&2)
2221
+ if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<job->cfg.certainty)
2222
+ cc=ocr_db(box2, job);
2223
+
2224
+
2225
+ // box2->c=cc; bad idea (May03 removed)
2226
+ // set(box2,cc,95); ToDo: is that better?
2227
+
2228
+ if(cc==UNKNOWN)
2229
+ i++;
2230
+ ii++;
2231
+
2232
+ if(job->cfg.verbose&8) {
2233
+ fprintf(stderr,"\n# code= %04lx %c",(long)cc,(char)((cc<255)?cc:'_'));
2234
+ out_b(box2,pp,x0,y0,x1-x0+1,y1-y0+1,cs);
2235
+ }
2236
+ progress(ii,pc); /* ii = 0..ni */
2237
+
2238
+ } end_for_each(&(job->res.boxlist));
2239
+ close_progress(pc);
2240
+ if(job->cfg.verbose)fprintf(stderr," %d of %d chars unidentified\n",i,ii);
2241
+ return 0;
2242
+ }
2243
+
2244
+
2245
+ /*
2246
+ ** compare unknown with known chars,
2247
+ ** very similar to the find_similar_char_function but here only to
2248
+ ** improve the result
2249
+ */
2250
+ int compare_unknown_with_known_chars(pix * pp, int mo) {
2251
+ job_t *job=OCR_JOB; /* fixme */
2252
+ int i, cs = job->cfg.cs, dist, d, ad, wac, ni, ii;
2253
+ struct box *box2, *box3, *box4;
2254
+ progress_counter_t *pc=NULL;
2255
+ wchar_t bc;
2256
+ i = ii = 0; // ---- -------------------------------
2257
+ if (job->cfg.verbose)
2258
+ fprintf(stderr, "# try to compare unknown with known chars !(mode&8)");
2259
+ if (!(mo & 8))
2260
+ {
2261
+ ii=ni=0;
2262
+ for_each_data(&(job->res.boxlist)) { ni++; } end_for_each(&(job->res.boxlist));
2263
+ pc = open_progress(ni,"compare_chars");
2264
+ for_each_data(&(job->res.boxlist)) {
2265
+ box2 = (struct box *)list_get_current(&(job->res.boxlist)); ii++;
2266
+ if (box2->c == UNKNOWN || (box2->num_ac>0 && box2->wac[0]<97))
2267
+ if (box2->y1 - box2->y0 > 4 && box2->x1 - box2->x0 > 1) { // no dots!
2268
+ box4 = (struct box *)list_get_header(&(job->res.boxlist));;
2269
+ dist = 1000; /* 100% maximum */
2270
+ bc = UNKNOWN; /* best fit char */
2271
+ for_each_data(&(job->res.boxlist)) {
2272
+ box3 = (struct box *)list_get_current(&(job->res.boxlist));
2273
+ wac=((box3->num_ac>0)?box3->wac[0]:100);
2274
+ if (box3 == box2 || box3->c == UNKNOWN
2275
+ || wac<job->cfg.certainty) continue;
2276
+ if (box2->y1 - box2->y0 < 5 || box2->x1 - box2->x0 < 3) continue;
2277
+ d = distance(pp, box2, pp, box3, cs);
2278
+ if (d < dist) {
2279
+ dist = d; bc = box3->c; box4 = box3;
2280
+ }
2281
+ } end_for_each(&(job->res.boxlist));
2282
+ if (dist < 10) {
2283
+ /* sureness can be maximal of box3 */
2284
+ if (box4->num_ac>0) ad = box4->wac[0];
2285
+ else ad = 97;
2286
+ ad-=dist; if(ad<1) ad=1;
2287
+ /* ToDo: ad should depend on ad of bestfit */
2288
+ setac(box2,(wchar_t)bc,ad);
2289
+ i++;
2290
+ } // limit as option???
2291
+ // => better max distance('e','e') ???
2292
+ if (dist < 50 && (job->cfg.verbose & 7)) { // only for debugging
2293
+ fprintf(stderr,"\n# L%02d xy= %4d %4d best fit was %04x=%c"
2294
+ " dist=%3d%% i=%d", box2->line, box2->x0, box2->y0,
2295
+ (int)bc, (char)((bc<128)?bc:'_'), dist, i);
2296
+ if (box4->num_ac>0) fprintf(stderr," w= %3d%%",box4->wac[0]);
2297
+ if ((job->cfg.verbose & 4) && dist < 10)
2298
+ out_x2(box2, box4);
2299
+ }
2300
+ progress(ii,pc);
2301
+ }
2302
+ } end_for_each(&(job->res.boxlist));
2303
+ close_progress(pc);
2304
+ }
2305
+ if (job->cfg.verbose)
2306
+ fprintf(stderr, " - found %d (nC=%d)\n", i, ii);
2307
+ return 0;
2308
+ }
2309
+
2310
+ /*
2311
+ // ---- divide overlapping chars which !strchr("_,.:;",c);
2312
+ // block-splitting (two ore three glued chars)
2313
+ // division if dots>0 does not work properly! ???
2314
+ //
2315
+ // what about glued "be"?
2316
+ // what about recursive division?
2317
+ // ToDo: mark divided boxes to give the engine a chance to
2318
+ // handle wrong divisions
2319
+ */
2320
+ int try_to_divide_boxes( pix *pp, int mo){
2321
+ struct box *box2, boxa, boxb;
2322
+ job_t *job=OCR_JOB; /* fixme */
2323
+ int cs=job->cfg.cs, ad=100,
2324
+ a2[8], ar, // certainty of each part, ar = product of all certainties
2325
+ cbest; // best certainty, skip search of certainty<cbest-1 for speed
2326
+ wchar_t ci[8], // split max. 8 chars
2327
+ s1[]={ UNKNOWN, '_', '.', ',', '\'', '!', ';', '?', ':', '-',
2328
+ '=', '(', ')', '/', '\\', '\0' }; // not accepted chars, \0-terminated!
2329
+ int x0, x1, y0, y1,
2330
+ xi[8+1]; // cutting positions
2331
+ int i, ii, n1, dy, dx;
2332
+ // pix p=(*pp); // remove!
2333
+ if (job->cfg.verbose)
2334
+ fprintf(stderr,"# try to divide unknown chars !(mode&16)");
2335
+ if(!(mo&16)) // put this to the caller
2336
+ for_each_data(&(job->res.boxlist)) {
2337
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
2338
+ // don't try to split simple structures (ex: 400x30 square)
2339
+ if ((!box2->num_frames)
2340
+ || box2->num_frame_vectors[ box2->num_frames-1 ]<9) continue;
2341
+ if((box2->c==UNKNOWN || (box2->num_ac && box2->wac[0]<job->cfg.certainty))
2342
+ && box2->x1-box2->x0>5 && box2->y1-box2->y0>4){
2343
+ x0=box2->x0; x1=box2->x1;
2344
+ y0=box2->y0; y1=box2->y1;
2345
+ ad=100;
2346
+ cbest=0;
2347
+
2348
+ /* get minimum vertical lines */
2349
+ n1 = num_cross(x0,x1,( y1+y0)/2,( y1+y0)/2,pp,cs);
2350
+ ii = num_cross(x0,x1,(3*y1+y0)/4,(3*y1+y0)/4,pp,cs); if (ii<n1) n1=ii;
2351
+ if (box2->m2 && box2->m3 > box2->m2+2)
2352
+ for (i=box2->m2+1;i<=box2->m3-1;i++) {
2353
+ if (loop(pp,x0+1,i,x1-x0,cs,1,RI) > (x1-x0-2)) continue; // ll
2354
+ ii = num_cross(x0,x1,i,i,pp,cs); if (ii<n1) n1=ii;
2355
+ } if (n1<2) continue; // seems to make no sense to divide
2356
+ if (n1<4) ad=99*ad/100; // not to strong because m2+m3 could be wrong
2357
+ if (n1<3) ad=99*ad/100;
2358
+
2359
+ if( 2*y1 < box2->m3+box2->m4 /* baseline char ? */
2360
+ && num_cross(x0,x1,y1-1,y1-1,pp,cs)==1 // -1 for slopes
2361
+ && num_cross((x0+2*x1)/3,(x0+3*x1)/4,y0,y1,pp,cs)<3 // not exclude tz
2362
+ && num_cross((3*x0+x1)/4,(2*x0+x1)/3,y0,y1,pp,cs)<3 // not exclude zl
2363
+ && loop(pp,x0,y1-(y1-y0)/32,x1-x0,cs,0,RI)
2364
+ +loop(pp,x1,y1-(y1-y0)/32,x1-x0,cs,0,LE) > (x1-x0+1)/2
2365
+ ) continue; /* do not try on bvdo"o etc. */
2366
+
2367
+ // one vertical line can not be two glued chars, lc?
2368
+ if ( num_cross(x0,x1,(y1+y0)/2,(y1+y0)/2,pp,cs)<=1 ) continue;
2369
+ { // doublet = 2 letters
2370
+ // char buf[4]="\0\0\0"; // 4th byte is string end == \0
2371
+ // buf[0]=c1; // c1 is wchar_t! (0xbf00 to 0) failes
2372
+ // buf[1]=c2;
2373
+ char buf[64]=""; // end == \0
2374
+ if (job->cfg.verbose&2){
2375
+ fprintf(stderr, "\n#\n# divide box: %4d %4d %3d %3d\n",
2376
+ x0, y0, x1-x0+1, y1-y0+1);
2377
+ if (job->cfg.verbose&4) out_x(box2);
2378
+ }
2379
+ // it would be better if testing is only if most right and left char
2380
+ // is has no horizontal gap (below m2) ex: be
2381
+ i=0; // num splittet chars
2382
+ xi[0]=x0; xi[1]=x0+3; xi[2]=x1;
2383
+ for ( ; ; xi[i+1]++) { // x[i] .. x[i+1], slower? but better v0.42
2384
+ /* break if x is to near to the right border */
2385
+ if (xi[i+1]>x1-3) { if (i==0) break; i--; xi[i+2]=x1; continue; }
2386
+ // ToDo: skip if not a local dy-min for speedup
2387
+ { int ymin=y1, ymax=y0, bow=0, // min max at cutting point
2388
+ max0=y0, max1=y0, // max y on left and right side
2389
+ min0=y1, min1=y1; // min y on left and right side
2390
+ for (dy=0,ii=0;ii<box2->num_frame_vectors[ 0 ];ii++) {
2391
+ int pre=ii-1, next=(ii+1)%box2->num_frame_vectors[ 0 ];
2392
+ if (pre<0) pre=box2->num_frame_vectors[ 0 ]-1;
2393
+ // check if vector is inside box to cut
2394
+ if ( box2->frame_vector[ii ][0]<=xi[i ]) continue;
2395
+ if ( box2->frame_vector[ii ][0]> xi[i+2]) continue;
2396
+ // 2nd derivation of y(x)
2397
+ if (abs(box2->frame_vector[ii ][0]-xi[i+1])<2) {
2398
+ dy= 2*box2->frame_vector[ii ][1]
2399
+ -box2->frame_vector[next][1]
2400
+ -box2->frame_vector[pre ][1];
2401
+ dx= box2->frame_vector[next][0]
2402
+ -box2->frame_vector[pre ][0];
2403
+ // rotate 180 degree if dx<0
2404
+ if (((dx>0)?dy:-dy)<-abs(dx)/2) { bow=1; }
2405
+ }
2406
+ // its not the best if we think on glued fi fo etc.
2407
+ if (( box2->frame_vector[pre ][0]<=xi[i+1]
2408
+ && box2->frame_vector[next][0]>=xi[i+1])
2409
+ || ( box2->frame_vector[pre ][0]>=xi[i+1]
2410
+ && box2->frame_vector[next][0]<=xi[i+1])) {
2411
+ if ( box2->frame_vector[ii ][1]>ymax)
2412
+ ymax= box2->frame_vector[ii ][1];
2413
+ if ( box2->frame_vector[ii ][1]<ymin)
2414
+ ymin= box2->frame_vector[ii ][1];
2415
+ }
2416
+ // min and max of left and right side
2417
+ if ( box2->frame_vector[ii ][1]>max0
2418
+ && box2->frame_vector[ii ][0]<=xi[i+1])
2419
+ max0=box2->frame_vector[ii ][1];
2420
+ if ( box2->frame_vector[ii ][1]>max1
2421
+ && box2->frame_vector[ii ][0]> xi[i+1])
2422
+ max1=box2->frame_vector[ii ][1];
2423
+ if ( box2->frame_vector[ii ][1]<min0
2424
+ && box2->frame_vector[ii ][0]<=xi[i+1])
2425
+ min0=box2->frame_vector[ii ][1];
2426
+ if ( box2->frame_vector[ii ][1]<min1
2427
+ && box2->frame_vector[ii ][0]> xi[i+1])
2428
+ min1=box2->frame_vector[ii ][1];
2429
+ }
2430
+ if(job->cfg.verbose&2)
2431
+ fprintf(stderr,"\n# test if to split at x%d= %2d %2d %2d"
2432
+ " bow,(max-min)[i,0,1] %d %3d %3d %3d"
2433
+ , i, xi[i]-x0, xi[i+1]-x0, xi[i+2]-x0, bow, ymax-ymin, max0-min0, max1-min1);
2434
+ /* skip if no local minimum at xi[i+1] or if its not thin enough */
2435
+ // 2010-10-11 failes for ke on tmp08/gocr0801_bad5.jpg ToDo!!!
2436
+ // if (bow==0 || 4*(ymax-ymin)>2*(y1-y0)) continue;
2437
+ if (bow==0) continue;
2438
+ // cuttet parts should have about the same height (max-min)
2439
+ // we dont want to cut an 'n' in three parts!
2440
+ if (2*(max0-min0+1)<(y1-y0+1)) continue; // left height
2441
+ if (2*(max1-min1+1)<(y1-y0+1)) continue; // right height
2442
+ // ToDo: thickness on xi[i+1]?
2443
+ }
2444
+ // try to split successive right box if left box is recognised,
2445
+ // else shift the splitting point further to the right border
2446
+ // removing ->dots if dot only above one char !!! ??? not implemented
2447
+ if(job->cfg.verbose&2)
2448
+ fprintf(stderr,"\n# try to split, newbox[%d].x= %2d ... %2d "
2449
+ "dy= %d ", i, xi[i]-x0, xi[i+1]-x0, dy);
2450
+ boxa=*box2; // copy contents, ToDo: reset ac-list (in cut_box?)
2451
+ boxa.x=xi[i]; boxa.y=y0; // obsolete? mark pixel, overlap?
2452
+ boxa.x0=xi[i];boxa.x1=xi[i+1]; // new horizontal box range
2453
+ cut_box(&boxa); boxa.num_ac=0; // ToDo: add box2 as src argument?
2454
+ // out_x(&boxa);
2455
+ // get wchar + certainty
2456
+ ci[i]=whatletter(&boxa,cs,0); a2[i]=testac(&boxa,ci[i]);
2457
+ if(job->cfg.verbose&2)
2458
+ fprintf(stderr,"\n# certainty %d limit= %d cbest= %d ",
2459
+ a2[i], job->cfg.certainty, cbest);
2460
+ if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
2461
+ || wcschr(s1,ci[i]) ) { continue; } // dont split here
2462
+
2463
+ for (ar=ad,ii=0;ii<=i;ii++) {
2464
+ ar=a2[ii]*ar/100; } // multiply all probabilities
2465
+ if (ar<98*job->cfg.certainty/100 || ar<cbest) {
2466
+ continue; } // dont go deeper, no longer string
2467
+
2468
+ i++; if (i==8) break; // maximum splits
2469
+ if (i==4) break; // at the moment its to slow to go further
2470
+ if (i+1<8) xi[i+1]=x1; // right border of next box
2471
+ if (i+2<8) xi[i+2]=x1;
2472
+
2473
+ if(job->cfg.verbose&2)
2474
+ fprintf(stderr,"\n try end split [%d]=%d [%d]=%d ",
2475
+ i, xi[i]-x0, i+1, xi[i+1]-x0);
2476
+ boxb=*box2; // try rest if it has to be split again
2477
+ boxb.x=xi[i]+1; boxb.y=y0;
2478
+ boxb.x0=xi[i]+1;boxb.x1=xi[i+1];
2479
+ cut_box(&boxb); boxb.num_ac=0;
2480
+ ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
2481
+ if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
2482
+ || wcschr(s1,ci[i]) ) { xi[i+1]=xi[i]+2; continue; } // split rest
2483
+ // now we have everything splittet
2484
+
2485
+ if(job->cfg.verbose&2) {
2486
+ fprintf(stderr,"\n split at/to: ");
2487
+ for (ii=0;ii<=i;ii++)
2488
+ fprintf(stderr," %2d %s (%3d)", xi[ii+1]-x0,
2489
+ decode(ci[ii],ASCII), a2[ii]);
2490
+ fprintf(stderr,"\n");
2491
+ }
2492
+ // boxa..c changed!!! dots should be modified!!!
2493
+ // Question: cut it into boxes v0.40 or set a string v0.41?
2494
+ // new way of building a string v0.41 (can call setas multiple)
2495
+ // usefull if compare unknown with known strings (except barcode?)
2496
+ // ToDo: also create alternate variants? ex: I <-> l
2497
+ for (buf[0]=0,ar=ad,ii=0;ii<=i;ii++) {
2498
+ ar=a2[ii]*ar/100; // multiply all probabilities
2499
+ if (i>0 && ci[ii]=='n' && ci[ii-1]=='r') ar--; // m == rn
2500
+ strncat(buf,decode(ci[ii],job->cfg.out_format),20);
2501
+ }
2502
+
2503
+ if (ar>cbest) cbest=ar; // best (highest) certainty found
2504
+ // reduce, but not if we cross certainty border
2505
+ if (99*ar/100 > job->cfg.certainty) ar=99*ar/100;
2506
+ if (job->cfg.verbose&2)
2507
+ fprintf(stderr,"\n split result= %s (%3d) ",buf, ar);
2508
+ setas(box2,buf,ar); // char *, does it disturb further splitting?
2509
+ buf[0]=0;
2510
+ i--; xi[i+2]=x1;
2511
+ }
2512
+ }
2513
+ }
2514
+ } end_for_each(&(job->res.boxlist));
2515
+ if (job->cfg.verbose) fprintf(stderr,", numC %d\n",job->res.numC);
2516
+ return 0;
2517
+ }
2518
+
2519
+ /*
2520
+ // ---- divide vertical glued boxes (ex: g above T);
2521
+ */
2522
+ int divide_vert_glued_boxes( pix *pp, int mo){
2523
+ struct box *box2,*box3,*box4;
2524
+ job_t *job=OCR_JOB; /* fixme */
2525
+ int y0,y1,y,dy,flag_found,dx;
2526
+ if(job->cfg.verbose)fprintf(stderr,"# divide vertical glued boxes");
2527
+ for_each_data(&(job->res.boxlist)) {
2528
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
2529
+ if (box2->c != UNKNOWN) continue; /* dont try on pictures */
2530
+ y0=box2->y0; y1=box2->y1; dy=y1-y0+1;
2531
+ dx=4*(job->res.avX+box2->x1-box2->x0+1); // we want to be sure to look at 4ex distance
2532
+ if ( dy>2*job->res.avY && dy<6*job->res.avY && box2->m1
2533
+ && y0<=box2->m2+2 && y0>=box2->m1-2
2534
+ && y1>=box2->m4+job->res.avY-2)
2535
+ { // test if lower end fits one of the other lines?
2536
+ box4=box2; flag_found=0;
2537
+ for_each_data(&(job->res.boxlist)) {
2538
+ box4 = (struct box *)list_get_current(&(job->res.boxlist));
2539
+ if (box4->c != UNKNOWN) continue; /* dont try on pictures */
2540
+ if (box4->x1<box2->x0-dx || box4->x0>box2->x1+dx) continue; // ignore far boxes
2541
+ if (box4->line==box2->line ) flag_found|=1; // near char on same line
2542
+ if (box4->line==box2->line+1) flag_found|=2; // near char on next line
2543
+ if (flag_found==3) break; // we have two vertical glued chars
2544
+ } end_for_each(&(job->res.boxlist));
2545
+ if (flag_found!=3) continue; // do not divide big chars or special symbols
2546
+ y=box2->m4; // lower end of the next line
2547
+ if(job->cfg.verbose&2){
2548
+ fprintf(stderr,"\n# divide box below y=%4d",y-y0);
2549
+ if(job->cfg.verbose&6)out_x(box2);
2550
+ }
2551
+ // --- insert box3 before box2
2552
+ box3= (struct box *) malloc_box(box2);
2553
+ box3->y1=y;
2554
+ box2->y0=y+1; box2->line++; // m1..m4 should be corrected!
2555
+ if (box4->line == box2->line){
2556
+ box2->m1=box4->m1; box2->m2=box4->m2;
2557
+ box2->m3=box4->m3; box2->m4=box4->m4;
2558
+ }
2559
+ box3->num=job->res.numC;
2560
+ if (list_ins(&(job->res.boxlist), box2, box3)) {
2561
+ fprintf(stderr,"ERROR list_ins\n"); };
2562
+ job->res.numC++;
2563
+ }
2564
+ } end_for_each(&(job->res.boxlist));
2565
+ if(job->cfg.verbose)fprintf(stderr,", numC %d\n",job->res.numC);
2566
+ return 0;
2567
+ }
2568
+
2569
+
2570
+ /*
2571
+ on some systems isupper(>255) cause a segmentation fault SIGSEGV
2572
+ therefore this function
2573
+ ToDo: should be replaced (?) by wctype if available on every system
2574
+ */
2575
/* isupper() restricted to 0..127; returns 0 for every other value.
   The unsigned comparison also rejects negative wchar_t values, which
   must not be passed to the <ctype.h> functions. */
int wisupper(wchar_t cc){ return (((unsigned long)cc<128UL)?isupper((int)cc):0); }
2576
/* islower() restricted to 0..127; returns 0 for every other value.
   The unsigned comparison also rejects negative wchar_t values, which
   must not be passed to the <ctype.h> functions. */
int wislower(wchar_t cc){ return (((unsigned long)cc<128UL)?islower((int)cc):0); }
2577
/* isalpha() restricted to 0..127; returns 0 for every other value.
   The unsigned comparison also rejects negative wchar_t values, which
   must not be passed to the <ctype.h> functions. */
int wisalpha(wchar_t cc){ return (((unsigned long)cc<128UL)?isalpha((int)cc):0); }
2578
/* isdigit() restricted to 0..127; returns 0 for every other value.
   The unsigned comparison also rejects negative wchar_t values, which
   must not be passed to the <ctype.h> functions. */
int wisdigit(wchar_t cc){ return (((unsigned long)cc<128UL)?isdigit((int)cc):0); }
2579
/* isspace() restricted to 0..127; returns 0 for every other value.
   The unsigned comparison also rejects negative wchar_t values, which
   must not be passed to the <ctype.h> functions. */
int wisspace(wchar_t cc){ return (((unsigned long)cc<128UL)?isspace((int)cc):0); }
2580
+
2581
+ /* set box2->c to cc if cc is in the ac-list of box2, return 1 on success */
2582
+ int setc(struct box *box2, wchar_t cc){
2583
+ int ret=0, w1, w2;
2584
+ w1=((box2->num_ac) ? box2->wac[0] : 0); // weight of replaced char
2585
+ w2=testac(box2,cc);
2586
+ if (OCR_JOB->cfg.verbose) {
2587
+ if (box2->num_ac<2)
2588
+ fprintf(stderr, "\n# change [%d] %s %3d to %s %3d at %4d %4d",
2589
+ box2->num_ac, decode(box2->c,ASCII), w1,
2590
+ decode(cc,ASCII), (100+w2+1)/2, box2->x0, box2->y0);
2591
+ else
2592
+ fprintf(stderr, "\n# change [%d] %s %s %3d %3d to %s %3d at %4d %4d",
2593
+ box2->num_ac, decode(box2->c,ASCII),
2594
+ decode(box2->tac[1],ASCII), box2->wac[0], box2->wac[1],
2595
+ decode(cc,ASCII), (100+w2+1)/2, box2->x0, box2->y0);
2596
+ }
2597
+ if (w2) { if (box2->c!=cc) { ret=1; setac(box2,cc,(100+w2+1)/2); } }
2598
+ // if(OCR_JOB->cfg.verbose & 4) out_x(box2);
2599
+ // ToDo: modify per setac (shift ac)
2600
+ return ret;
2601
+ }
2602
+
2603
+
2604
+ /* ---- proof difficult chars Il1 by context view ----
2605
+ context: separator, number, vowel, nonvowel, upper case ????
2606
+ could be also used to find unknown chars if the environment (nonumbers)
2607
+ can be found in other places!
2608
+ ToDo:
2609
+ - box->tac[] as set of possible chars, ac set by engine, example:
2610
+ ac="l/" (not "Il|/\" because serifs detected and slant>0)
2611
+ correction only to one of the ac-set (alternative chars)!
2612
+ - should be language-settable; Unicode compatible
2613
+ - box2->ad and wac should be changed? (not proper yet)
2614
+ * ------------- */
2615
+ int context_correction( job_t *job ) {
2616
+ // const static char
2617
+ char *l_vowel="aeiouy";
2618
+ // *l_Vowel="AEIOU",chars if the environment (nonumbers)
2619
+ char *l_nonvo = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ";
2620
+ int hexdigits = 0, hexdivpos = 0; // "O0lI123456789ABCDEFabcdef:"
2621
+ struct box *box3, *box2, *prev, *next, *pre2, *pre3, *pre4;
2622
+ // pix *pp = &(job->src.p);
2623
+ int nc=0, ns=0; // num corrections
2624
+ wchar_t last_double_quotation=0; // correction of different quotations "
2625
+ pre4=pre3=pre2=prev=next=NULL;
2626
+
2627
+ if (job->cfg.verbose)
2628
+ fprintf(stderr, "# context correction Il1 0O");
2629
+
2630
+ for_each_data(&(job->res.boxlist)) {
2631
+ pre4=pre3; pre3 = pre2; pre2 = prev; // 2010-10-01 tmp08/080916_JL*_150
2632
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
2633
+ prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2634
+ next = (struct box *)list_get_cur_next(&(job->res.boxlist));
2635
+ // ToDo: count last_upper, lower, digits, hexdigits
2636
+ // 2010-10-10 hex-mode tmp08/gocr0801_bad5
2637
+ if (strchr("O0lI123456789ABCDEFabcdef",box2->c)) hexdigits++;
2638
+ else if (strchr(": ",box2->c) && prev && prev->c!=box2->c
2639
+ && (hexdigits-hexdivpos==2 || hexdigits-hexdivpos==4))
2640
+ hexdivpos=hexdigits;
2641
+ else { hexdigits=0; hexdivpos=0; }
2642
+ if (box2->c==' ' && prev && prev->c==' ') hexdigits=0;
2643
+ if (box2->c==':' && pre3 && pre3->c!=':') hexdigits=0; // :89:AB:CD:
2644
+ if (strchr("O0",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'0');
2645
+ if (strchr("l1",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'1');
2646
+ // 2010-10-01 sample tmp08/0811CSchulze_crop
2647
+ if (box2->c==DOUBLE_LOW_9_QUOTATION_MARK) {
2648
+ last_double_quotation = box2->tac[0];
2649
+ fprintf(stderr,"\n# ... found DOUBLE_LOW_9_QUOTATION_MARK");
2650
+ }
2651
+ if (box2->c==QUOTATION_MARK // 0x22 = ""
2652
+ && last_double_quotation == DOUBLE_LOW_9_QUOTATION_MARK) {
2653
+ last_double_quotation = 0;
2654
+ box2->c = box2->tac[0] = DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK;
2655
+ if (job->cfg.verbose)
2656
+ fprintf(stderr, "\n# change [%d] %s %3d to %s %3d at %3d %3d",
2657
+ box2->num_ac, "\"", box2->wac[0],
2658
+ decode(box2->c,ASCII), box2->wac[0], box2->x0, box2->y0);
2659
+ }
2660
+
2661
+ if ( box2->c > 0xFF ) continue; // temporary UNICODE fix 1
2662
+ if ((prev) && (prev->c > 0xFF)) continue; // temporary UNICODE fix 2
2663
+ if ((next) && (next->c > 0xFF)) continue; // temporary UNICODE fix 3
2664
+ if (box2->num_ac<2) continue; // no alternatives
2665
+ if (box2->wac[0]==100 && box2->wac[1]<100) continue;
2666
+ if (box2->num_ac && box2->tas[0]) continue; // buggy space_remove 0.42
2667
+
2668
+ /* check for Il1| which are general difficult to distinguish */
2669
+ /* bbg: not very good. Should add some tests to check if is preceded by '.',
2670
+ spelling, etc */
2671
+ /* ToDo: only correct if not 100% sure (wac[i]<100)
2672
+ and new char is in wat[] */
2673
+ if (strchr("Il1|", box2->c) && next && prev) {
2674
+ // if( strchr(" \n",prev->c) // SPC
2675
+ // && strchr(" \n",next->c) ) box2->c='I'; else // bad idea! I have ...
2676
+ if (wisalpha(next->c) && next->c!='i' &&
2677
+ ( prev->c == '\n' ||
2678
+ ( prev->c == ' ' && pre2->c == '.' ) ) )
2679
+ { nc+=setc(box2,(wchar_t)'I'); }
2680
+ else if (
2681
+ ( box2->c!='1' /* lnt => Int, but 1st */
2682
+ && strchr(l_nonvo,next->c)
2683
+ && strchr("\" \n",prev->c))
2684
+ || (prev && ((!pre2) || wisupper(pre2->c) || strchr(" \n",pre2->c))
2685
+ && wisupper(prev->c)
2686
+ && box2->num_frame_vectors[0]==4
2687
+ && box2->frame_vector[0][0]==box2->x0
2688
+ && box2->frame_vector[1][0]==box2->x0
2689
+ && box2->frame_vector[2][0]==box2->x1
2690
+ && box2->frame_vector[3][0]==box2->x1
2691
+ )) // " DI*"
2692
+ /* do not change he'll to he'Il! */
2693
+ { nc+=setc(box2,(wchar_t)'I'); } // set box2->c to 'I' if 'I' is in the ac-list
2694
+ else if (strchr(l_vowel,next->c)) /* unusual? Ii Ie Ia Iy Iu */
2695
+ /* && strchr("KkBbFfgGpP",prev->c)) */ /* kle Kla Kli */
2696
+ { nc+=setc(box2,(wchar_t)'l'); }
2697
+ else if (wisupper(next->c) // ToDo: check 6 neighbours for upper+spaces
2698
+ && !strchr("O0I123456789",next->c)
2699
+ && !strchr("O0I123456789",prev->c)) /* avoid lO => IO (10) */
2700
+ { nc+=setc(box2,(wchar_t)'I'); }
2701
+ else if (prev && wislower(prev->c))
2702
+ { nc+=setc(box2,(wchar_t)'l'); }
2703
+ else if (wisdigit(prev->c)
2704
+ || wisdigit(next->c)
2705
+ || (next && strchr(":-",next->c) && pre2 && pre2->c==next->c
2706
+ && prev && strchr("0123456789ABCDabcd",prev->c)) // hex 2010-10
2707
+ || (next->c=='O' && !wisalpha(prev->c))) /* lO => 10 */
2708
+ { nc+=setc(box2,(wchar_t)'1'); }
2709
+ }
2710
+ // JS-2010-09 (ToDo: only if I is an alternate char!?)
2711
+ if (strchr("Il|", box2->c) && next && !prev) { // first char?
2712
+ if (wisalpha(next->c) && next->c!='i' && !strchr(l_vowel,next->c))
2713
+ { nc+=setc(box2,(wchar_t)'I'); }
2714
+ else if (wisupper(next->c)
2715
+ && !strchr("O0I123456789",next->c)) /* avoid lO => IO (10) */
2716
+ { nc+=setc(box2,(wchar_t)'I'); }
2717
+ }
2718
+
2719
+ // ToDo: count width of all "0O" to decide between wide and narrow O's
2720
+ /* check for O0 */
2721
+ else if (strchr("O0", box2->c)) {
2722
+ if (((!prev) || wisspace(prev->c))
2723
+ && next && ( wisalpha(next->c))
2724
+ && (!wisdigit(next->c)))
2725
+ { nc+=setc(box2,(wchar_t)'O'); } // first letter ?
2726
+ else if (prev && (!wisdigit(prev->c)) && wisupper(prev->c)
2727
+ && next && (!wisdigit(next->c)) && wisupper(next->c))
2728
+ { nc+=setc(box2,(wchar_t)'O'); } // upper case word
2729
+ // ! " Otto"
2730
+ else if (((!prev && next && wisdigit(next->c))
2731
+ || (prev && wisdigit(prev->c))
2732
+ || (prev && pre2 && wisspace(prev->c)) // tmp10/barcode_code128_10*
2733
+ || (prev && strchr("/:.,-+O", prev->c)) // tmp09/barcode090916*
2734
+ || (prev && strchr("abcdABCD", prev->c)
2735
+ && pre2 && next && pre2->c==next->c
2736
+ && strchr("/:- ", pre2->c)) // hex?
2737
+ )
2738
+ && ((next && wisdigit(next->c))
2739
+ || (next && strchr("\n /:.,-+O",next->c)) // 2010-10-01
2740
+ || ((next && strchr("abcdABCD", next->c)
2741
+ && pre2 && wisdigit(pre2->c)
2742
+ && prev && strchr("/:- ", prev->c))) // hex?
2743
+ || !next)
2744
+ && (!pre2
2745
+ || (pre2 && strchr("/:.,-+O",pre2->c)) // 2010-10-01
2746
+ || (pre2 && wisdigit(pre2->c)) // 2010-10-01
2747
+ || (pre2 && wisspace(pre2->c))) // 2010-10-01
2748
+ && ( (prev && wisdigit(prev->c))
2749
+ || (pre2 && wisdigit(pre2->c))
2750
+ || (pre3 && wisdigit(pre3->c))
2751
+ || (next && wisdigit(next->c))) // one near digit required
2752
+ )
2753
+ { nc+=setc(box2,(wchar_t)'0'); }
2754
+ }
2755
+
2756
+ /* check for 5S */
2757
+ else if (strchr("5S", box2->c) && next && prev) {
2758
+ if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */
2759
+ { nc+=setc(box2,(wchar_t)'S'); }
2760
+ else if (wisalpha(prev->c) && wisalpha(next->c)
2761
+ && wisupper(next->c)) /* word in upper case */
2762
+ { nc+=setc(box2,(wchar_t)'S'); }
2763
+ else if (wisdigit(prev->c) || wisdigit(next->c))
2764
+ { nc+=setc(box2,(wchar_t)'5'); }
2765
+ }
2766
+
2767
+ /* was a space not found? xXx => x Xx ??? */
2768
+ if (wisupper(box2->c) && next && prev) {
2769
+ if (wislower(prev->c) && wislower(next->c)
2770
+ && 2 * (box2->x0 - prev->x1) > 3 * (next->x0 - box2->x1)) {
2771
+ struct box *box3 = malloc_box((struct box *) NULL);
2772
+ box3->x0 = prev->x1 + 2;
2773
+ box3->x1 = box2->x0 - 2;
2774
+ box3->y0 = box2->y0;
2775
+ box3->y1 = box2->y1;
2776
+ box3->x = box2->x0 - 1;
2777
+ box3->y = box2->y0;
2778
+ box3->dots = 0;
2779
+ box3->num_boxes = 0;
2780
+ box3->num_subboxes = 0;
2781
+ box3->c = ' ';
2782
+ box3->modifier = 0;
2783
+ setac(box3,' ',99); /* ToDo: weight depends from distance */
2784
+ box3->num = -1;
2785
+ box3->line = prev->line;
2786
+ box3->m1 = box3->m2 = box3->m3 = box3->m4 = 0;
2787
+ box3->p = &(job->src.p);
2788
+ list_ins(&(job->res.boxlist), box2, box3);
2789
+ }
2790
+ }
2791
+
2792
+ /* a space before punctuation? but not " ./file" */
2793
+ if ( prev && next)
2794
+ if (prev->c == ' ' && strchr(" \n" , next->c)
2795
+ && strchr(".,;:!?)", box2->c))
2796
+ if (prev->x1 - prev->x0 < 2 * job->res.avX) { // carefully on tables
2797
+ box3 = prev;
2798
+ if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3);
2799
+ prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2800
+ ns++;
2801
+ }
2802
+
2803
+ /* \'\' to \" */
2804
+ if ( prev )
2805
+ if ( (prev->c == '`' || prev->c == '\'')
2806
+ && (box2->c == '`' || box2->c == '\'') )
2807
+ if (prev->x1 - box2->x0 < job->res.avX) { // carefully on tables
2808
+ box2->c='\"';
2809
+ box3 = prev;
2810
+ list_del(&(job->res.boxlist), box3);
2811
+ free_box(box3);
2812
+ }
2813
+ } end_for_each(&(job->res.boxlist));
2814
+ if (job->cfg.verbose)
2815
+ fprintf(stderr, " num_corrected= %d removed_spaces= %d\n", nc, ns);
2816
+ return 0;
2817
+ }
2818
+
2819
+
2820
+ /* ---- insert spaces ----
2821
+ * depends strongly from the outcome of measure_pitch()
2822
+ * ------------------------ */
2823
+ int list_insert_spaces( pix *pp, job_t *job ) {
2824
+ int i=0, j1, j2, i1, maxline=-1, dy=0, num_nl=0, num_spc=0, min_x0=1023;
2825
+ char cc;
2826
+ struct box *box2, *box3=NULL, *box4=NULL;
2827
+
2828
+ // measure mean line height
2829
+ for(i1=1;i1<job->res.lines.num;i1++) {
2830
+ dy+=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
2831
+ if (min_x0>job->res.lines.x0[i1])
2832
+ min_x0=job->res.lines.x0[i1]; // 2010-09-30
2833
+ } if (job->res.lines.num>1) dy/=(job->res.lines.num-1);
2834
+ i=0; j2=0;
2835
+ for(i1=1;i1<job->res.lines.num;i1++) {
2836
+ j1=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
2837
+ if (j1>dy*120/100 || j1<dy*80/100) continue; // only most frequently
2838
+ j2+=j1; i++;
2839
+ } if (i>0 && j2/i>7) dy=j2/i;
2840
+ if( job->cfg.verbose&1 )
2841
+ fprintf(stderr,"# insert space between words (dy=%d) ...",dy);
2842
+ if (!dy) dy=(job->res.avY)*110/100+1;
2843
+
2844
+ if (min_x0 < 4) min_x0 = 0; // tmp09/oebb_teletext* monospaced first gap
2845
+ // ToDo: rewrite, replace cc by num_spc + num_nl
2846
+ i=0;
2847
+ for_each_data(&(job->res.boxlist)) {
2848
+ int thispitch=0, thismono=0, pdist=0; // spacing paras per line
2849
+ box2 =(struct box *)list_get_current(&(job->res.boxlist));
2850
+ cc=0; num_nl=0; num_spc=0;
2851
+ box3 = (struct box *)list_prev(&(job->res.boxlist), box2);
2852
+ if (box2->line > maxline) { // new line, lines and chars must be sorted!
2853
+ int ydist=0, ypitch=0;
2854
+ if (maxline>=0) {
2855
+ // num_nl = 1; // ToDo: allow multiple newlines
2856
+ if (box2->line>1)
2857
+ ydist = job->res.lines.m1[ box2->line ]
2858
+ -job->res.lines.m1[ box2->line-1 ]; // 2010-09-26
2859
+ ypitch = job->res.lines.m4[ box2->line ]
2860
+ -job->res.lines.m1[ box2->line ];
2861
+ if (ypitch>4) num_nl = ydist / (2*ypitch); // ToDo: improve it!
2862
+ if (!num_nl) num_nl=1;
2863
+ }
2864
+ maxline=box2->line;
2865
+ }
2866
+ if (box2->line==maxline) { // lines and chars must be sorted!
2867
+ thispitch = job->res.lines.pitch[box2->line];
2868
+ thismono = job->res.lines.mono[ box2->line];
2869
+ if (box3) pdist = box2->x0 - box3->x1 - 1; // 2010-09-26
2870
+ if (pdist < 0) pdist = 0; // overlap like proportional: "VA"
2871
+ if (num_nl || !box3)
2872
+ pdist = box2->x0 - min_x0; // first char of new line
2873
+ // if (pdist >= thispitch) cc=' '; // 2010-09-24 ???
2874
+ if (thismono) num_spc = pdist / thispitch;
2875
+ else num_spc = pdist*2 / (3*job->res.avX); // ToDo: use 1em!
2876
+ if (pdist>=thispitch && !num_spc) num_spc = 1; // proportional font
2877
+ // ToDo: multi spaces for proportional font
2878
+ }
2879
+
2880
+ #if 0
2881
+ if ((job->cfg.verbose&48)==48)
2882
+ fprintf(stderr,"\n# DBG L%02d %d mono=%d %d pitch= %2d"
2883
+ " pdist= %2d nl %d spc %d", maxline, box2->line, thismono,
2884
+ job->res.lines.mono[ box2->line], thispitch, pdist, num_nl, num_spc);
2885
+ #endif
2886
+
2887
+ // call this multiple times
2888
+ for (i1=0;i1<num_nl+num_spc;i1++) {
2889
+ int mdist=0;
2890
+ box4=(struct box *)list_prev(&(job->res.boxlist), box2);
2891
+ if (box4) mdist = box2->x0 - box4->x1 + 1; // 2010-09
2892
+ else mdist = 0;
2893
+ if (mdist<0) mdist=0;
2894
+ box3=(struct box *)malloc_box(NULL);
2895
+ box3->x0=box2->x0-2+((num_spc)?-mdist+ i1 *mdist/num_spc:0);
2896
+ box3->x1=box2->x0-2+((num_spc)?-mdist+(i1+1)*mdist/num_spc:0);
2897
+ box3->y0=box2->y0;
2898
+ box3->y1=box2->y1;
2899
+ if (i1>=num_nl && box4)
2900
+ box3->x0 = box4->x1+2+((num_spc)?i1*mdist/num_spc:0);
2901
+ if (i1< num_nl || !box4)
2902
+ box3->x0 = job->res.lines.x0[box2->line];
2903
+ if (i1< num_nl && box4){
2904
+ box3->y0=box4->y1; // better use lines.y1[box2->pre] ???
2905
+ box3->y1=box2->y0;
2906
+ }
2907
+ box3->x = box3->x0; // 2010-09
2908
+ box3->y = box2->y0;
2909
+ box3->dots = 0;
2910
+ box3->c = cc = ((i1<num_nl)?'\n':' ');
2911
+ box3->num_boxes = 0;
2912
+ box3->num_subboxes = 0;
2913
+ box3->modifier = '\0';
2914
+ box3->num=-1; box3->line=box2->line;
2915
+ box3->m1=box2->m1; box3->m2=box2->m2;
2916
+ box3->m3=box2->m3; box3->m4=box2->m4;
2917
+ box3->p=pp;
2918
+ setac(box3,cc,100); /* ToDo: weight depends from distance */
2919
+ list_ins(&(job->res.boxlist),box2,box3); // insert box3 before box2
2920
+ if( job->cfg.verbose&1 ) {
2921
+ fprintf(stderr,"\n# insert space &%d; at %4d %4d box= %p"
2922
+ " mono %d dx %2d pdx,mdx %2d %2d",
2923
+ (int)box3->c, box3->x0, box3->y0, (void*)box3,
2924
+ thismono, thispitch, pdist, mdist);
2925
+ /* out_x(box3); */
2926
+ }
2927
+ i++;
2928
+ }
2929
+ } end_for_each(&(job->res.boxlist));
2930
+ if( job->cfg.verbose&1 ) fprintf(stderr,"\n# ... found %d spaces\n",i);
2931
+ return 0;
2932
+ }
2933
+
2934
+
2935
+ /*
2936
+ add infos where the box is positioned to the box
2937
+ this is useful for better recognition
2938
+ */
2939
+ int add_line_info( job_t *job /* , List *boxlist2 */){
2940
+ struct tlines *lines = &job->res.lines;
2941
+ struct box *box2;
2942
+ int i,xx,m1,m2,m3,m4,num_line_members=0,num_rest=0;
2943
+ if (job->cfg.verbose&1) fprintf(stderr,"# add_line_info to boxes ...");
2944
+ for_each_data(&(job->res.boxlist)) {
2945
+ box2 =(struct box *)list_get_current(&(job->res.boxlist));
2946
+ for (i=1;i<job->res.lines.num;i++) /* line 0 is a place holder */
2947
+ { // add rotated image correction dy(x)
2948
+ if (lines->dx) xx=lines->dy*((box2->x1+box2->x0)/2)/lines->dx;
2949
+ else xx=0;
2950
+ m1= lines->m1[i]+xx;
2951
+ m2= lines->m2[i]+xx;
2952
+ m3= lines->m3[i]+xx;
2953
+ m4= lines->m4[i]+xx;
2954
+ // fprintf(stderr," test line %d m1=%d %d %d %d\n",i,m1,m2,m3,m4);
2955
+ if (m4-m1==0) continue; /* no text line (line==0) */
2956
+ #if 0
2957
+ if( box2->y1+2*job->res.avY >= m1
2958
+ && box2->y0-2*job->res.avY <= m4 ) /* not to far away */
2959
+ #endif
2960
+ /* give also a comma or dot behind the line a chance */
2961
+ if ( box2->x0 >= lines->x0[i]
2962
+ && box2->x1 <= lines->x1[i]+job->res.avX )
2963
+ if ( box2->y0 <= m4 + 2*job->res.avY // 2010-10-01+09 0811qemu2
2964
+ && box2->y1 >= m1 - job->res.avY/2 // give "a "o ... a chance
2965
+ && box2->y1 <= m4 + 2*job->res.avY ) // 2010-10-09 ocr-b-'_'
2966
+ if ( box2->m2==0 // already put to a line? check y-distance
2967
+ // || abs(box2->y0 - box2->m2) > abs(box2->y0 - m2)
2968
+ // ToDo: m3 vs m2
2969
+ || (abs(box2->y1 - box2->m3) > abs(box2->y1 - m1)
2970
+ && box2->y0 > box2->m4)
2971
+ )
2972
+ { /* found nearest line */
2973
+ box2->m1= m1;
2974
+ box2->m2= m2;
2975
+ box2->m3= m3;
2976
+ box2->m4= m4;
2977
+ box2->line= i;
2978
+ }
2979
+ } // i=1..lines
2980
+ if( box2->y1+2 < box2->m1
2981
+ || box2->y0 < box2->m1 - (box2->m3-box2->m1)/2
2982
+ || box2->y0-2 > box2->m4 + (box2->m3-box2->m2)/2 // bad m4 + ,._ ocr-b
2983
+ || box2->y1 > box2->m3 + (box2->m3-box2->m1)
2984
+ ) /* to far away */
2985
+ { /* reset */
2986
+ box2->m1= 0;
2987
+ box2->m2= 0;
2988
+ box2->m3= 0;
2989
+ box2->m4= 0;
2990
+ box2->line= 0;
2991
+ num_rest++;
2992
+ } else num_line_members++;
2993
+ } end_for_each(&(job->res.boxlist));
2994
+ if (job->cfg.verbose&1)
2995
+ fprintf(stderr," done, num_line_chars=%d rest=%d\n",
2996
+ num_line_members, num_rest);
2997
+ return 0;
2998
+ }
2999
+
3000
+
3001
+ /*
3002
+ * bring the boxes in right order
3003
+ * add_line_info must be executed first!
3004
+ */
3005
+ int sort_box_func (const void *a, const void *b) {
3006
+ struct box *boxa, *boxb;
3007
+
3008
+ boxa = (struct box *)a;
3009
+ boxb = (struct box *)b;
3010
+
3011
+ if ( ( boxb->line < boxa->line ) ||
3012
+ ( boxb->line == boxa->line && boxb->x0 < boxa->x0 ) )
3013
+ return 1;
3014
+ return -1;
3015
+ }
3016
+
3017
+ // -------------------------------------------------------------
3018
+ // ------ use this for entry from other programs
3019
+ // include pnm.h pgm2asc.h
3020
+ // -------------------------------------------------------------
3021
+ // entry point for gocr.c or if it is used as lib
3022
+ // better name is call_ocr ???
3023
+ // jb: OLD COMMENT: not removed due to set_options_* ()
3024
+ // args after pix *pp should be removed and new functions
3025
+ // set_option_mode(int mode), set_option_spacewidth() .... etc.
3026
+ // should be used instead, before calling pgm2asc(pix *pp)
3027
+ // ! change if you can ! - used by X11 frontend
3028
+ int pgm2asc(job_t *job)
3029
+ {
3030
+ pix *pp;
3031
+ progress_counter_t *pc;
3032
+ static int multi_image_count=0; /* number of image within multi-image */
3033
+ int orig_cs=0;
3034
+
3035
+ if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */
3036
+
3037
+ multi_image_count++;
3038
+
3039
+ assert(job);
3040
+ /* FIXME jb: remove pp */
3041
+ pp = &(job->src.p);
3042
+
3043
+ pc = open_progress(100,"pgm2asc_main");
3044
+ progress(0,pc); /* start progress output 0% 0% */
3045
+ #if 0 /* dont vast memory */
3046
+ /* FIXME jb: malloc */
3047
+ if ( job->cfg.verbose & 32 ) {
3048
+ // generate 2nd imagebuffer for debugging output
3049
+ job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
3050
+ // buffer
3051
+ assert(job->tmp.ppo.p);
3052
+ copybox(&job->src.p,
3053
+ 0, 0, job->src.p.x, job->src.p.y,
3054
+ &job->tmp.ppo,
3055
+ job->src.p.x * job->src.p.y);
3056
+ }
3057
+ #else
3058
+ job->tmp.ppo=job->src.p; /* temporarely, removed later */
3059
+ #endif
3060
+ // if (job->cfg.verbose&32) debug_img("out000.ppm",job,0);
3061
+
3062
+ /* ----- count colors ------ create histogram -------
3063
+ - this should be used to create a upper and lower limit for cs
3064
+ - cs is the optimum gray value between cs_min and cs_max
3065
+ - also inverse scans could be detected here later */
3066
+ if (orig_cs==0)
3067
+ job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
3068
+ else // dont set cs, output stats + do inversion if needed 2010-10-07
3069
+ otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
3070
+ // if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);
3071
+ /* renormalize the image and set the normalized threshold value */
3072
+ job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
3073
+ if( job->cfg.verbose )
3074
+ fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
3075
+ // if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);
3076
+
3077
+ progress(5,pc); /* progress is only estimated */
3078
+
3079
+
3080
+ /* this is first step for reorganize the PG
3081
+ ---- look for letters, put rectangular frames around letters
3082
+ letter = connected points near color F
3083
+ should be used by dust removing (faster) and line detection!
3084
+ ---- 0..cs = black letters, last change = Mai99 */
3085
+
3086
+ progress(8,pc); /* progress is only estimated */
3087
+
3088
+ // if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
3089
+ scan_boxes( job, pp );
3090
+ if ( !job->res.numC ){
3091
+ fprintf( stderr,"# no boxes found - stopped\n" );
3092
+ if(job->cfg.verbose&32) debug_img("out01",job,8);
3093
+ /***** should free stuff, etc) */
3094
+ return(1);
3095
+ }
3096
+ // tmp10/bug100818a.pgm creates artefacts on image
3097
+ // if (job->cfg.verbose&32) debug_img("out00",job,4+8);
3098
+
3099
+ progress(10,pc); /* progress is only estimated */
3100
+ // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
3101
+ // output_list(job); // for debugging
3102
+ // ToDo: matrix printer preprocessing
3103
+
3104
+ remove_dust( job ); /* from the &(job->res.boxlist)! */
3105
+ // if(job->cfg.verbose&32) debug_img("out02",job,4+8);
3106
+ // output_list(job); // for debugging
3107
+ #if 0 // ToDo 2010-10-15 destroys QR-barcodes
3108
+ smooth_borders( job ); /* only for big chars */
3109
+ #endif
3110
+ progress(12,pc); /* progress is only estimated */
3111
+ // if(job->cfg.verbose&32) debug_img("out03",job,4+8);
3112
+ // output_list(job); // for debugging
3113
+
3114
+ detect_barcode( job ); /* mark barcode */
3115
+ // if(job->cfg.verbose&32) debug_img("out04",job,4+8);
3116
+ // output_list(job); // for debugging
3117
+
3118
+ detect_pictures( job ); /* mark pictures */
3119
+ // if(job->cfg.verbose&32) debug_img("out05",job,4+8);
3120
+ // output_list(job); // for debugging
3121
+
3122
+ remove_pictures( job ); /* do this as early as possible, before layout */
3123
+ // if(job->cfg.verbose&32) debug_img("out06",job,4+8);
3124
+ // output_list(job); // for debugging
3125
+
3126
+ glue_holes_inside_chars( pp ); /* including count subboxes (holes) */
3127
+
3128
+ detect_rotation_angle( job );
3129
+
3130
+ #if 1 /* Rotate the whole picture! move boxes */
3131
+ if( job->res.lines.dy!=0 ){ // move down lowest first, move up highest first
3132
+ // in work! ??? (at end set dy=0) think on ppo!
3133
+ }
3134
+ #endif
3135
+ detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
3136
+ // if(job->cfg.verbose&32) debug_img("out07",job,4+8);
3137
+ progress(20,pc); /* progress is only estimated */
3138
+
3139
+ add_line_info( job /* , &(job->res.boxlist) */);
3140
+ if (job->cfg.verbose&32) debug_img("out10",job,4+8);
3141
+
3142
+ divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
3143
+ // if(job->cfg.verbose&32) debug_img("out11",job,0);
3144
+
3145
+ remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
3146
+ /* list_ins seems to sort in the boxes on the wrong place ??? */
3147
+ // if(job->cfg.verbose&32) debug_img("out12",job,4+8);
3148
+
3149
+ glue_broken_chars( job, pp ); /* 2nd glue */
3150
+ // if(job->cfg.verbose&32) debug_img("out14",job,4+8);
3151
+ // 2010-09-24 overall box size is correct here, but later broken
3152
+
3153
+ remove_rest_of_dust( job );
3154
+ // if(job->cfg.verbose&32) debug_img("out15",job,4+8);
3155
+
3156
+ /* better sort after dust is removed (slow for lot of pixels) */
3157
+ list_sort(&(job->res.boxlist), sort_box_func);
3158
+
3159
+ measure_pitch( job );
3160
+
3161
+ if(job->cfg.mode&64) find_same_chars( pp );
3162
+ progress(30,pc); /* progress is only estimated */
3163
+ // if(job->cfg.verbose&32) debug_img("out16",job,4+8);
3164
+
3165
+ char_recognition( pp, job->cfg.mode);
3166
+ progress(60,pc); /* progress is only estimated */
3167
+ // if(job->cfg.verbose&32) debug_img("out17",job,4+8);
3168
+
3169
+ if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
3170
+ /* may be, characters/pictures have changed line number */
3171
+ list_sort(&(job->res.boxlist), sort_box_func);
3172
+ // 2nd recognition call if lines are adjusted
3173
+ char_recognition( pp, job->cfg.mode);
3174
+ }
3175
+
3176
+ #define BlownUpDrawing 1 /* german: Explosionszeichnung, temporarly */
3177
+ #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
3178
+ { /* just for debugging */
3179
+ int i,ii,ni; struct box *box2;
3180
+ i=ii=ni=0;
3181
+ for_each_data(&(job->res.boxlist)) { /* count boxes */
3182
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
3183
+ if (box2->c==UNKNOWN) i++;
3184
+ if (box2->c==PICTURE) ii++;
3185
+ ni++;
3186
+ } end_for_each(&(job->res.boxlist));
3187
+ if (job->cfg.verbose)
3188
+ fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
3189
+ }
3190
+ #endif
3191
+ // ----------- write out20.pgm ----------- mark lines + boxes
3192
+ if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);
3193
+
3194
+ compare_unknown_with_known_chars( pp, job->cfg.mode);
3195
+ progress(70,pc); /* progress is only estimated */
3196
+
3197
+ try_to_divide_boxes( pp, job->cfg.mode);
3198
+ progress(80,pc); /* progress is only estimated */
3199
+
3200
+ /* --- list output ---- for debugging --- */
3201
+ if (job->cfg.verbose&6) output_list(job);
3202
+
3203
+ /* ---- insert spaces ---- */
3204
+ list_insert_spaces( pp , job );
3205
+
3206
+ // ---- proof difficult chars Il1 by context view ----
3207
+ if (job->cfg.verbose)
3208
+ fprintf(stderr,"# context correction if !(mode&32)\n");
3209
+ if (!(job->cfg.mode&32)) context_correction( job );
3210
+
3211
+ store_boxtree_lines( job, job->cfg.mode );
3212
+ progress(90,pc); /* progress is only estimated */
3213
+
3214
+ /* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
3215
+ * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
3216
+ * awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
3217
+ * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
3218
+ * 9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
3219
+ * 1*1 1*7 not recognized (Oct04)
3220
+ * 33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
3221
+ */
3222
+ #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
3223
+ { /* just for debugging */
3224
+ int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
3225
+ i=ii=ni=0;
3226
+ for_each_data(&(job->res.boxlist)) { /* count boxes */
3227
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
3228
+ if (box2->c==UNKNOWN) i++;
3229
+ if (box2->c==PICTURE) ii++;
3230
+ if (box2->c>' ' && box2->c<='z') ni++;
3231
+ } end_for_each(&(job->res.boxlist));
3232
+ if(job->cfg.verbose)
3233
+ fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
3234
+ for (i=0;i<20;i++) {
3235
+ ni=0;
3236
+ for_each_data(&(job->res.boxlist)) { /* count boxes */
3237
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
3238
+ if (box2->c==testc[i]) ni++;
3239
+ } end_for_each(&(job->res.boxlist));
3240
+ if(job->cfg.verbose && ni>0)
3241
+ fprintf(stderr," (%c)=%d",testc[i],ni);
3242
+ }
3243
+ if(job->cfg.verbose)
3244
+ fprintf(stderr,"\n");
3245
+ }
3246
+ #endif
3247
+
3248
+ // ---- frame-size-histogram
3249
+ // ---- (my own defined) distance between letters
3250
+ // ---- write internal picture of textsite
3251
+ // ----------- write out30.pgm -----------
3252
+ if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);
3253
+
3254
+ progress(100,pc); /* progress is only estimated */
3255
+
3256
+ close_progress(pc);
3257
+
3258
+ return 0; /* what should I return? error-state? num-of-chars? */
3259
+ }