gocr-ruby 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +21 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +49 -0
  8. data/ext/gocr/Makefile +141 -0
  9. data/ext/gocr/Makefile.in +140 -0
  10. data/ext/gocr/amiga.h +31 -0
  11. data/ext/gocr/barcode.c +2108 -0
  12. data/ext/gocr/barcode.h +11 -0
  13. data/ext/gocr/box.c +496 -0
  14. data/ext/gocr/config.h +37 -0
  15. data/ext/gocr/config.h.in +36 -0
  16. data/ext/gocr/database.c +468 -0
  17. data/ext/gocr/detect.c +1003 -0
  18. data/ext/gocr/extconf.rb +6 -0
  19. data/ext/gocr/gocr.c +436 -0
  20. data/ext/gocr/gocr.h +290 -0
  21. data/ext/gocr/jconv.c +168 -0
  22. data/ext/gocr/job.c +92 -0
  23. data/ext/gocr/lines.c +364 -0
  24. data/ext/gocr/list.c +334 -0
  25. data/ext/gocr/list.h +91 -0
  26. data/ext/gocr/ocr0.c +7312 -0
  27. data/ext/gocr/ocr0.h +63 -0
  28. data/ext/gocr/ocr0n.c +1527 -0
  29. data/ext/gocr/ocr1.c +85 -0
  30. data/ext/gocr/ocr1.h +3 -0
  31. data/ext/gocr/otsu.c +310 -0
  32. data/ext/gocr/otsu.h +23 -0
  33. data/ext/gocr/output.c +291 -0
  34. data/ext/gocr/output.h +37 -0
  35. data/ext/gocr/pcx.c +153 -0
  36. data/ext/gocr/pcx.h +9 -0
  37. data/ext/gocr/pgm2asc.c +3259 -0
  38. data/ext/gocr/pgm2asc.h +105 -0
  39. data/ext/gocr/pixel.c +538 -0
  40. data/ext/gocr/pnm.c +538 -0
  41. data/ext/gocr/pnm.h +35 -0
  42. data/ext/gocr/progress.c +87 -0
  43. data/ext/gocr/progress.h +42 -0
  44. data/ext/gocr/remove.c +715 -0
  45. data/ext/gocr/tga.c +87 -0
  46. data/ext/gocr/tga.h +6 -0
  47. data/ext/gocr/unicode.c +1318 -0
  48. data/ext/gocr/unicode.h +62 -0
  49. data/ext/gocr/unicode_defs.h +1245 -0
  50. data/ext/gocr/version.h +2 -0
  51. data/gocr-ruby.gemspec +28 -0
  52. data/image.png +0 -0
  53. data/lib/gocr.rb +6 -0
  54. data/lib/gocr/image.rb +8 -0
  55. data/lib/gocr/version.rb +3 -0
  56. metadata +156 -0
@@ -0,0 +1,37 @@
1
+ /* include/config.h. Generated by configure. */
2
+ /* include/config.h.in. Generated automatically from configure.in by autoheader. */
3
+
4
+ /* Define to empty if the keyword does not work. */
5
+ /* #undef const */
6
+
7
+ /* Define if the setvbuf function takes the buffering type as its second
8
+ argument and the buffer pointer as the third, as on System V
9
+ before release 3. */
10
+ /* #undef SETVBUF_REVERSED */
11
+
12
+ /* Define if you have the ANSI C header files. */
13
+ #define STDC_HEADERS 1
14
+
15
+ /* Define if you have the gettimeofday function. */
16
+ #define HAVE_GETTIMEOFDAY 1
17
+
18
+ /* Define if you have the popen function. */
19
+ #define HAVE_POPEN 1
20
+
21
+ /* Define if you have the wcschr function. */
22
+ #define HAVE_WCSCHR 1
23
+
24
+ /* Define if you have the wcsdup function. */
25
+ #define HAVE_WCSDUP 1
26
+
27
+ /* Define if you have the <pam.h> header file. */
28
+ /* #undef HAVE_PAM_H */
29
+
30
+ /* Define if you have the <pnm.h> header file. */
31
+ /* #undef HAVE_PNM_H */
32
+
33
+ /* Define if you have the <unistd.h> header file. */
34
+ #define HAVE_UNISTD_H 1
35
+
36
+ /* Define if you have the <wchar.h> header file. */
37
+ #define HAVE_WCHAR_H 1
@@ -0,0 +1,36 @@
1
+ /* include/config.h.in. Generated automatically from configure.in by autoheader. */
2
+
3
+ /* Define to empty if the keyword does not work. */
4
+ #undef const
5
+
6
+ /* Define if the setvbuf function takes the buffering type as its second
7
+ argument and the buffer pointer as the third, as on System V
8
+ before release 3. */
9
+ #undef SETVBUF_REVERSED
10
+
11
+ /* Define if you have the ANSI C header files. */
12
+ #undef STDC_HEADERS
13
+
14
+ /* Define if you have the gettimeofday function. */
15
+ #undef HAVE_GETTIMEOFDAY
16
+
17
+ /* Define if you have the popen function. */
18
+ #undef HAVE_POPEN
19
+
20
+ /* Define if you have the wcschr function. */
21
+ #undef HAVE_WCSCHR
22
+
23
+ /* Define if you have the wcsdup function. */
24
+ #undef HAVE_WCSDUP
25
+
26
+ /* Define if you have the <pam.h> header file. */
27
+ #undef HAVE_PAM_H
28
+
29
+ /* Define if you have the <pnm.h> header file. */
30
+ #undef HAVE_PNM_H
31
+
32
+ /* Define if you have the <unistd.h> header file. */
33
+ #undef HAVE_UNISTD_H
34
+
35
+ /* Define if you have the <wchar.h> header file. */
36
+ #undef HAVE_WCHAR_H
@@ -0,0 +1,468 @@
1
+ /*
2
+ This is an Optical-Character-Recognition program
3
+ Copyright (C) GPLv2 2000-2013 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL address
20
+ */
21
+
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include "gocr.h"
25
+ #include "pnm.h"
26
+ #include "pgm2asc.h"
27
+ #include "unicode_defs.h" /* macro UNKNOWN */
28
+ #include <string.h>
29
+ #include <time.h>
30
+
31
+ #define Blen 256
32
+
33
+ // load boxes from database into boxlist (for faster access)
34
+ // used as alternate engine, comparing chars with database
35
+ // uses readpnm() and would conflict with multi images
36
+ int load_db(job_t *job) { // called by gocr.c main()
37
+ FILE *f1;
38
+ char s1[Blen+1],
39
+ s2[Blen+1] = "./db/", /* ToDo: replace by constant! by configure */
40
+ *s3;
41
+ int i, j, ii, i2, line;
42
+ struct box *box1;
43
+ pix *pp;
44
+
45
+ if( job->cfg.db_path ) strncpy(s2, job->cfg.db_path, Blen-1);
46
+ i2=strlen(s2);
47
+ if (job->cfg.verbose)
48
+ fprintf(stderr, "# load database %s %s ... ", s2, job->cfg.db_path);
49
+
50
+ strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
51
+ f1 = fopen(s2, "r");
52
+ if (!f1) {
53
+ fprintf(stderr, " DB %s not found\n",s2);
54
+ return 1;
55
+ }
56
+
57
+ line = 0; /* line counter for better error report */
58
+ for (ii = 0; !feof(f1); ii++) {
59
+ /* bbg: should write a better input routine */
60
+ if (!fgets(s1, Blen, f1)) break; line++;
61
+ j = strlen(s1);
62
+ /* remove carriage return sequences from line */
63
+ while (j > 0 && (s1[j - 1] == '\r' || s1[j - 1] == '\n'))
64
+ s1[--j] = 0;
65
+ if (!j) continue; /* skip empty line */
66
+ if (s1[0]=='#') continue; /* skip comments (v0.44) */
67
+ /* copy file name */
68
+ for (i = 0; i < j && i+i2 < Blen && strchr(" \t,;",s1[i]) == 0; i++)
69
+ s2[i2 + i] = s1[i];
70
+ s2[i2+i]=0;
71
+ /* skip spaces */
72
+ for (; i < j && strchr(" \t",s1[i]) != 0; i++);
73
+ /* by now: read pix, fill box, goto next ??? */
74
+ pp = (pix *)malloc(sizeof(pix));
75
+ if( !pp ) fprintf(stderr,"malloc error in load_db pix\n");
76
+
77
+ // if (job->cfg.verbose) fprintf(stderr,"\n# readpgm %s ",s2);
78
+ if (readpgm(s2, pp, 0 * job->cfg.verbose)!=0) {
79
+ fprintf(stderr,"\ndatabase error: readpgm %s\n", s2);
80
+ exit(-1);
81
+ }
82
+
83
+ box1 = (struct box *)malloc_box(NULL);
84
+ if(!box1) fprintf(stderr,"malloc error in load_db box1\n");
85
+ box1->x0 = 0;
86
+ box1->x1 = pp->x-1; // white border 1 pixel width
87
+ box1->y0 = 0;
88
+ box1->y1 = pp->y-1;
89
+ box1->x = 1;
90
+ box1->y = 1;
91
+ box1->dots = 0;
92
+ box1->c = 0;
93
+ box1->modifier = 0; /* ToDo: obsolete */
94
+ box1->tas[0]=NULL;
95
+ box1->tac[0]=0;
96
+ box1->wac[0]=100; /* really 100% sure? */
97
+ box1->num_ac=1;
98
+ if (s1[i]=='"'){ /* parse a string */
99
+ j=strrchr(s1+i+1,'"')-(s1+i+1); /* we only look for first and last "" */
100
+ if (j>=1) {
101
+ s3=(char *)malloc(j+1);
102
+ if (!s3) fprintf (stderr, "malloc error in load_db s3\n");
103
+ if (s3) {
104
+ memcpy(s3,s1+i+1,j);
105
+ s3[j]=0;
106
+ box1->tas[0]=s3;
107
+ // fprintf(stderr,"\nstring=%s",s3);
108
+ }
109
+ } else { fprintf(stderr,"load_db: string parse error L%d\n",line); }
110
+ } else {
111
+ box1->tac[0] = box1->c = s1[i]; /* try to interpret as ASCII */
112
+ /* we can live without hexcode in future if we use UTF8-strings */
113
+ s3=s1+i;
114
+ j=strtol( s1+i, &s3, 16); /* try to read 4 to 8 digit hex unicode */
115
+ /* if its an hexcode, ASCII interpretation is overwritten */
116
+ if( j && i+3<=Blen && s3-s1-i>3 ) box1->tac[0] = box1->c = j;
117
+ // fprintf(stderr,"\nhexcode=%04x=%04x %d",(int)j,(int)box1->c,s3-s1-i);
118
+ }
119
+ box1->num = 0;
120
+ box1->line = -1;
121
+ box1->m1 = 0; /* ToDo: should be given too in the database! */
122
+ box1->m2 = 0;
123
+ box1->m3 = 0;
124
+ box1->m4 = 0;
125
+ box1->p = pp;
126
+ list_app(&job->tmp.dblist, box1); // append to list
127
+ #if 0
128
+ out_x(box1);
129
+ #endif
130
+ }
131
+ fclose(f1);
132
+ if (job->cfg.verbose)
133
+ fprintf(stderr, " %d chars loaded\n", ii);
134
+ return 0;
135
+ }
136
+
137
+ // expand database from box/boxlist name=db_$utime.pbm
138
+ // this is added in version v0.3.3
139
+ int store_db(struct box *box1, job_t *job) {
140
+ FILE *f1;
141
+ char s2[Blen+1] = "./db/", s3[Blen+1];
142
+ int i2, dx, dy;
143
+ unsigned c_out;
144
+ pix b; /* temporary mini page */
145
+
146
+ if( job->cfg.db_path ) strncpy(s2,job->cfg.db_path,Blen-1);
147
+ i2=strlen(s2);
148
+
149
+ /* add (first) char and time to the file name for better debugging */
150
+
151
+ /* decide between 7bit ASCII and UTF8-char or string */
152
+ c_out = ((box1->num_ac && box1->tas[0]) ?
153
+ (unsigned char )box1->tas[0][0] /* char */ :
154
+ box1->c /* wchar */);
155
+ /* (unsigned int)(( char)0x80) = 0xffffff80 */
156
+ /* (unsigned int)((unsigned char)0x80) = 0x00000080 */
157
+
158
+ /* name generation can cause problems, if called twice within a second */
159
+ sprintf(s3,"db_%04x_%08lx.pbm", c_out, (unsigned long)time(NULL));
160
+ /* ToDo: the file name may be not unique */
161
+
162
+ strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
163
+ f1 = fopen(s2, "a");
164
+ if (!f1) {
165
+ fprintf(stderr, " could not access %s\n",s2);
166
+ return 1;
167
+ }
168
+ strncpy(s2+i2,s3,strlen(s3)); s2[i2+strlen(s3)]=0;
169
+ /* store image and infos about the char */
170
+ /* ToDo: store the vector list instead of the pixelarray */
171
+
172
+ if (job->cfg.verbose)
173
+ fprintf(stderr, "store_db: add file %s to database (nac=%d c=%04x)"
174
+ "\n#",s3, box1->num_ac, c_out);
175
+
176
+ dx=box1->x1-box1->x0+1;
177
+ dy=box1->y1-box1->y0+1;
178
+ b.p = (unsigned char *) malloc( dx * dy );
179
+ if( !b.p ){
180
+ fprintf( stderr, "\nFATAL: malloc failed, skip store_db" );
181
+ fclose(f1);
182
+ return 2;
183
+ }
184
+ if (copybox(box1->p, box1->x0, box1->y0, dx, dy, &b, dx * dy)) {
185
+ fclose(f1);
186
+ return -1;
187
+ }
188
+
189
+ writepbm(s2,&b); /* What is to do on error? */
190
+ free(b.p);
191
+
192
+ /* store the database line */
193
+ /* some infos about box1->m1,..,m4 should added (base line, high etc.) */
194
+ if (box1->num_ac && box1->tas[0]) {
195
+ fprintf(f1, "%s \"%s\"\n",s3,box1->tas[0]);
196
+ /* ToDo: what if tas contains '"'? */
197
+ } else {
198
+ if( (box1->c >= '0' && box1->c <= '9')
199
+ || (box1->c >= 'A' && box1->c <= 'Z')
200
+ || (box1->c >= 'a' && box1->c <= 'z') )
201
+ fprintf(f1, "%s %c\n",s3,(char)box1->c);
202
+ else {
203
+ if (((box1->c)>>16)>>16)
204
+ fprintf(f1, "%s %08x\n",s3,(unsigned int)box1->c);
205
+ else
206
+ fprintf(f1, "%s %04x\n",s3,(unsigned int)box1->c);
207
+ }
208
+ }
209
+ fclose(f1);
210
+ return 0;
211
+ }
212
+
213
+ /* function is only for user prompt on console to identify chars
214
+ it prints out a part of pixmap b at point x0,y0 to stderr
215
+ using dots .,; if no pixel, and @xoO for pixels
216
+ */
217
+ void out_env(struct box *px, job_t *job){
218
+ int x0,y0,x1,y1,dx,dy,x,y,x2,y2,yy0,tx,ty,i,cs;
219
+ char c1, c2; pix *b;
220
+ cs=job->cfg.cs;
221
+ yy0=px->y0;
222
+ { /* overwrite rest of arguments */
223
+ b=px->p;
224
+ x0=px->x0; x1=px->x1; dx=x1-x0+1;
225
+ y0=px->y0; y1=px->y1; dy=y1-y0+1;
226
+ y0-=2; y1+=2;
227
+ if (px->m4 && y0>px->m1) y0=px->m1;
228
+ if (px->m4 && y1<px->m4) y1=px->m4;
229
+ if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
230
+ if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
231
+ if (x1-x0+1<62) { x0-=5; x1+=5; }
232
+ if (y1-y0+1<10) { y0-= 4; y1+= 4; } /* fragment? */
233
+ if (x0<0) x0=0; if (x1>=b->x) x1=b->x-1;
234
+ if (y0<0) y0=0; if (y1>=b->y) y1=b->y-1;
235
+ dx=x1-x0+1;
236
+ dy=y1-y0+1; yy0=y0;
237
+ fprintf(stderr,"\n# show box + environment");
238
+ fprintf(stderr,"\n# show box x= %4d %4d d= %3d %3d r= %d %d",
239
+ px->x0, px->y0, px->x1 - px->x0 + 1, px->y1 - px->y0 + 1,
240
+ px->x - px->x0, px->y - px->y0);
241
+ if (px->num_ac){ /* output table of chars and its probabilities */
242
+ fprintf(stderr,"\n# list box char: ");
243
+ for(i=0;i<px->num_ac && i<NumAlt;i++)
244
+ /* output the (xml-)string (picture position, barcodes, glyphs, ...) */
245
+ if (px->tas[i])
246
+ fprintf(stderr," %s(%d)", px->tas[i] ,px->wac[i]);
247
+ else
248
+ fprintf(stderr," %s(%d)",decode(px->tac[i],ASCII),px->wac[i]);
249
+ }
250
+ fprintf(stderr,"\n");
251
+ if (px->dots && px->m2 && px->m1<y0) { yy0=px->m1; dy=px->y1-yy0+1; }
252
+ }
253
+ tx=dx/80+1;
254
+ ty=dy/40+1; // step, usually 1, but greater on large maps
255
+ fprintf(stderr,"# show pattern x= %4d %4d d= %3d %3d t= %d %d\n",
256
+ x0,y0,dx,dy,tx,ty);
257
+ if (dx>0)
258
+ for(y=yy0;y<yy0+dy;y+=ty) { /* reduce the output to max 78x40 */
259
+
260
+ /* image is the boxframe + environment in the original bitmap */
261
+ for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */
262
+ c1='.';
263
+ for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */
264
+ for(x2=x;x2<x+tx && x2<x0+dx;x2++)
265
+ { if((getpixel(b,x2,y2)<cs)) c1='#'; }
266
+ // show pixels outside the box thinner/weaker
267
+ if (x+tx-1 < px->x0 || x > px->x1
268
+ || y+ty-1 < px->y0 || y > px->y1) c1=((c1=='#')?'O':',');
269
+ fprintf(stderr,"%c", c1 );
270
+ }
271
+
272
+ c1=c2=' ';
273
+ /* mark lines with < */
274
+ if (px) if (y==px->m1 || y==px->m2 || y==px->m3 || y==px->m4) c1='<';
275
+ if (y==px->y0 || y==px->y1) c2='-'; /* boxmarks */
276
+ fprintf(stderr,"%c%c\n",c1,c2);
277
+ }
278
+ }
279
+
280
+
281
+ /*
282
+ // second variant, for database (with slightly other behaviour)
283
+ // new variant
284
+ // look at the environment of the pixel too (contrast etc.)
285
+ // detailed analysis only of diff pixels!
286
+ //
287
+ // 100% * distance, 0 is best fit
288
+ // = similarity of 2 chars for recognition of noisy chars
289
+ // weight of pixels with only one same neighbour set to 0
290
+ // look at contours too!
291
+ ToDo: especially on small boxes distance should only be 0 if
292
+ characters are 100% identical!
293
+ */
294
+ // #define DEBUG 2
295
+ int distance2( pix *p1, struct box *box1,
296
+ pix *p2, struct box *box2, int cs, int vvv){
297
+ int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,
298
+ x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2,tx,ty;
299
+ #if DEBUG == 2
300
+ if (vvv)
301
+ fprintf(stderr," DEBUG: distance2\n");
302
+ #endif
303
+ x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
304
+ dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);dx=dx1;
305
+ dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);dy=dy1;
306
+ if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) rbad++; // how to weight?
307
+ // compare relations to baseline and upper line
308
+ if(box1->m4>0 && box2->m4>0){ // used ???
309
+ if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
310
+ if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
311
+ }
312
+ tx=dx/16; if(dx<17)tx=1; // raster
313
+ ty=dy/32; if(dy<33)ty=1;
314
+ // compare pixels
315
+ for( y=0;y<dy;y+=ty )
316
+ for( x=0;x<dx;x+=tx ) { // try global shift too ???
317
+ v1=((getpixel(p1,x1+x*dx1/dx,y1+y*dy1/dy)<cs)?1:0); i1=8; // better gray?
318
+ v2=((getpixel(p2,x2+x*dx2/dx,y2+y*dy2/dy)<cs)?1:0); i2=8; // better gray?
319
+ if(v1==v2) { rgood+=16; continue; } // all things are right!
320
+ // what about different pixel???
321
+ // test overlapp of surounding pixels ???
322
+ v1=1; rbad+=4;
323
+ v1=-1;
324
+ for(i1=-1;i1<2;i1++)
325
+ for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
326
+ if( ((getpixel(p1,x1+x*dx1/dx+i1*(1+dx1/32),y1+y*dy1/dy+i2*(1+dy1/32))<cs)?1:0)
327
+ !=((getpixel(p2,x2+x*dx2/dx+i1*(1+dx2/32),y2+y*dy2/dy+i2*(1+dy2/32))<cs)?1:0) ) v1++;
328
+ }
329
+ if(v1>0)
330
+ rbad+=16*v1;
331
+ }
332
+ if(rgood+rbad) rc= 100*rbad/(rgood+rbad); else rc=99;
333
+ /* if width/high is not correct add badness */
334
+ rc += ( abs(dx1*dy2-dx2*dy1) * 10 ) / (dy1*dy2);
335
+ if (rc>100) rc=100;
336
+ if(/* rc<10 && */ vvv /* &1024 */){
337
+ #if DEBUG == 2
338
+ fprintf(stderr," distance2 rc=%d rgood=%d rbad=%d\n",rc,rgood,rbad);
339
+ // out_b(NULL,p1,box1->x0,box1->y0,box1->x1-box1->x0+1,
340
+ // box1->y1-box1->y0+1,cs);
341
+ // out_b(NULL,p2,box2->x0,box2->y0,box2->x1-box2->x0+1,
342
+ // box2->y1-box2->y0+1,cs);
343
+ out_x(box1);
344
+ out_x(box2);
345
+ #endif
346
+ }
347
+ return rc;
348
+ }
349
+
350
+ wchar_t ocr_db(struct box *box1, job_t *job) {
351
+ int dd = 1000, dist = 1000;
352
+ wchar_t c = UNKNOWN;
353
+ unsigned char buf[200]; /* Oct08 JS: add unsigned to avoid UTF problems */
354
+ Box *box2, *box3;
355
+
356
+ if (!list_empty(&job->tmp.dblist)){
357
+ box3 = (Box *)list_get_header(&job->tmp.dblist);
358
+ if(job->cfg.verbose)
359
+ fprintf(stderr,"\n#DEBUG: ocr_db (%d,%d) ",box1->x0, box1->y0);
360
+
361
+ for_each_data(&job->tmp.dblist) {
362
+ box2 = (Box *)list_get_current(&job->tmp.dblist);
363
+ /* do preselect!!! distance() slowly */
364
+ dd = distance2( box2->p, box2, box1->p, box1,
365
+ job->cfg.cs, job->cfg.verbose);
366
+ if (dd <= dist) { /* new best fit */
367
+ dist = dd;
368
+ box3 = box2; /* box3 is a pointer and not copied box2 */
369
+
370
+ if (dist<100 && 100-dist >= job->cfg.certainty) {
371
+ /* some deviation of the pattern is tolerated */
372
+ int i, wa;
373
+ for (i=0;i<box3->num_ac;i++) {
374
+ wa = (100-dist)*box3->wac[i]/100; /* weight *= (100-dist) */
375
+ if (box3->tas[i]) setas(box1,box3->tas[i],wa);
376
+ else setac(box1,box3->tac[i],wa);
377
+ }
378
+ if (box3->num_ac) c=box3->tac[0]; /* 0 for strings (!UNKNOWN) */
379
+ if (job->cfg.verbose)
380
+ fprintf(stderr, " dist=%4d c= %c 0x%02x %s wc= %3d", dist,
381
+ ((box3->c>32 && box3->c<127) ? (char) box3->c : '.'),
382
+ (int)box3->c, ((box3->tas[0])?box3->tas[0]:""), box3->wac[0]);
383
+ }
384
+ if (dd<=0 && ((box3->num_ac && box3->tas[0]) || box3->c >= 128
385
+ || !strchr ("l1|I0O", box3->c)))
386
+ break; /* speedup if found */
387
+ }
388
+ } end_for_each(&job->tmp.dblist);
389
+
390
+ }
391
+
392
+ if( (job->cfg.mode&128) != 0 && c == UNKNOWN ) { /* prompt the user */
393
+ /* should the output go to stderr or special pipe??? */
394
+ int utf8_ok=0; /* trigger this flag if input is ok */
395
+ int i, endchar; /* index */
396
+ out_env(box1, job); /* old: out_x(box1); */
397
+ fprintf(stderr,"The above pattern was not recognized.\n"
398
+ "Enter UTF8 char or string for above pattern. Leave empty if unsure.\n"
399
+ "Press RET at the end (ALT+RET to store into RAM only) : "
400
+ ); /* ToDo: empty + alt-return (0x1b 0x0a) for help? ^a for skip all */
401
+ /* UTF-8 (man 7 utf-8):
402
+ * 7bit = 0xxxxxxx (0000-007F)
403
+ * 11bit = 110xxxxx 10xxxxxx (0080-07FF)
404
+ * 16bit = 1110xxxx 10xxxxxx 10xxxxxx (0800-FFFF)
405
+ * 21bit = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
406
+ * 26bit = 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
407
+ * 31bit = 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
408
+ */
409
+ buf[0]=0;
410
+ /* shift/ctrl/altgr-enter acts like enter or ^j or ^m,
411
+ * alt-enter returns 0x1b 0x0a and returns from fgets()
412
+ * ^d (EOF) returns (nil) from fgets()
413
+ * x+(2*)ctrl-d returns from fgets() without returning a 0x0a
414
+ * if not UTF-input-mode, we are in trouble?
415
+ * ^a=0x01, ^b=0x02, ^e=05, ..., ToDo: meaning of no-input or <=space
416
+ */
417
+ fgets((char *)buf,200,stdin); /* including \n=0x0a */
418
+ dd=strlen((char *)buf);
419
+ /* output hexcode if verbose set */
420
+ if (job->cfg.verbose) {
421
+ fprintf(stderr, "\n# fgets [%d]:", dd);
422
+ for(i=0; i<dd; i++)
423
+ fprintf(stderr, " %02x", (unsigned)((unsigned char)buf[i]));
424
+ fprintf(stderr, "\n#");
425
+ }
426
+ /* we dont accept chars which could destroy database file */
427
+ for (i=0; i<dd; i++) if (buf[i]<32) break; /* need unsigned char here */
428
+ endchar=buf[i]; /* last char is 0x0a (ret) 0x00 (EOF) or 0x1b (alt+ret) */
429
+ if (endchar==0x01) { i=0;job->cfg.mode&=~128; } /* skip all */
430
+ buf[dd=i]=0; /* replace final 0x0a or other special codes */
431
+ if (dd==1 && !(buf[0]&128)) { c=buf[0]; utf8_ok=1; } /* single char */
432
+ if (dd>1 && dd<7) { /* try to decode single wide char (utf8) */
433
+ int u0, u1; /* define UTF8-start sequences, u0=0bits u1=1bits */
434
+ u0= 1<<(7-dd); /* compute start byte from UTF8-length */
435
+ u1=255&~((1<<(8-dd))-1);
436
+ /* count number of following 10xxxxxx bytes to i */
437
+ for (i=1;i<dd;i++) if ((buf[i]&0xc0)!=0x80) break; /* 10xxxxxx */
438
+ if (i==dd && (buf[0]&(u0|u1))==u1) { utf8_ok=1;
439
+ c=buf[0]&(u0-1); /* 11..0x.. */
440
+ for (i=1;i<dd;i++) { c<<=6; c|=buf[i]&0x3F; } /* 10xxxxxx */
441
+ }
442
+ }
443
+ if (dd>0){ /* ToDo: skip space and tab too? */
444
+ if (utf8_ok==1) { setac(box1, c, 100); } /* store single wchar */
445
+ if (utf8_ok==0) { /* store a string of chars (UTF8-string) */
446
+ c='_'; /* what should we do with c? probably a bad idea? */
447
+ setas(box1, (char *)buf, 100);
448
+ }
449
+ /* decide between
450
+ * 0) just help gocr to find the results and (dont remember, 0x01)
451
+ * 1) help and remember in the same run (store to memory, 0x1b)
452
+ * 2) expand the database (dont store ugly chars to the database!)
453
+ */
454
+ if (endchar!=0x01){ /* ^a before hit return */
455
+ /* is there a reason to dont store to memory? */
456
+ list_app(&job->tmp.dblist, box1); /* append to list for 1+2 */
457
+ }
458
+ if (endchar!=0x01 && endchar!=0x1b){
459
+ store_db(box1, job); /* store to disk for 2 */
460
+ }
461
+ if (job->cfg.verbose)
462
+ fprintf(stderr, " got char= %c 16bit= 0x%04x string= \"%s\"\n",
463
+ ((c>32 && c<127)?(char)c:'.'), (int)c, buf);
464
+ }
465
+ }
466
+
467
+ return c;
468
+ }