gocr-ruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +21 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +49 -0
  8. data/ext/gocr/Makefile +141 -0
  9. data/ext/gocr/Makefile.in +140 -0
  10. data/ext/gocr/amiga.h +31 -0
  11. data/ext/gocr/barcode.c +2108 -0
  12. data/ext/gocr/barcode.h +11 -0
  13. data/ext/gocr/box.c +496 -0
  14. data/ext/gocr/config.h +37 -0
  15. data/ext/gocr/config.h.in +36 -0
  16. data/ext/gocr/database.c +468 -0
  17. data/ext/gocr/detect.c +1003 -0
  18. data/ext/gocr/extconf.rb +6 -0
  19. data/ext/gocr/gocr.c +436 -0
  20. data/ext/gocr/gocr.h +290 -0
  21. data/ext/gocr/jconv.c +168 -0
  22. data/ext/gocr/job.c +92 -0
  23. data/ext/gocr/lines.c +364 -0
  24. data/ext/gocr/list.c +334 -0
  25. data/ext/gocr/list.h +91 -0
  26. data/ext/gocr/ocr0.c +7312 -0
  27. data/ext/gocr/ocr0.h +63 -0
  28. data/ext/gocr/ocr0n.c +1527 -0
  29. data/ext/gocr/ocr1.c +85 -0
  30. data/ext/gocr/ocr1.h +3 -0
  31. data/ext/gocr/otsu.c +310 -0
  32. data/ext/gocr/otsu.h +23 -0
  33. data/ext/gocr/output.c +291 -0
  34. data/ext/gocr/output.h +37 -0
  35. data/ext/gocr/pcx.c +153 -0
  36. data/ext/gocr/pcx.h +9 -0
  37. data/ext/gocr/pgm2asc.c +3259 -0
  38. data/ext/gocr/pgm2asc.h +105 -0
  39. data/ext/gocr/pixel.c +538 -0
  40. data/ext/gocr/pnm.c +538 -0
  41. data/ext/gocr/pnm.h +35 -0
  42. data/ext/gocr/progress.c +87 -0
  43. data/ext/gocr/progress.h +42 -0
  44. data/ext/gocr/remove.c +715 -0
  45. data/ext/gocr/tga.c +87 -0
  46. data/ext/gocr/tga.h +6 -0
  47. data/ext/gocr/unicode.c +1318 -0
  48. data/ext/gocr/unicode.h +62 -0
  49. data/ext/gocr/unicode_defs.h +1245 -0
  50. data/ext/gocr/version.h +2 -0
  51. data/gocr-ruby.gemspec +28 -0
  52. data/image.png +0 -0
  53. data/lib/gocr.rb +6 -0
  54. data/lib/gocr/image.rb +8 -0
  55. data/lib/gocr/version.rb +3 -0
  56. metadata +156 -0
@@ -0,0 +1,37 @@
1
+ /* include/config.h. Generated by configure. */
2
+ /* include/config.h.in. Generated automatically from configure.in by autoheader. */
3
+
4
+ /* Define to empty if the keyword does not work. */
5
+ /* #undef const */
6
+
7
+ /* Define if the setvbuf function takes the buffering type as its second
8
+ argument and the buffer pointer as the third, as on System V
9
+ before release 3. */
10
+ /* #undef SETVBUF_REVERSED */
11
+
12
+ /* Define if you have the ANSI C header files. */
13
+ #define STDC_HEADERS 1
14
+
15
+ /* Define if you have the gettimeofday function. */
16
+ #define HAVE_GETTIMEOFDAY 1
17
+
18
+ /* Define if you have the popen function. */
19
+ #define HAVE_POPEN 1
20
+
21
+ /* Define if you have the wcschr function. */
22
+ #define HAVE_WCSCHR 1
23
+
24
+ /* Define if you have the wcsdup function. */
25
+ #define HAVE_WCSDUP 1
26
+
27
+ /* Define if you have the <pam.h> header file. */
28
+ /* #undef HAVE_PAM_H */
29
+
30
+ /* Define if you have the <pnm.h> header file. */
31
+ /* #undef HAVE_PNM_H */
32
+
33
+ /* Define if you have the <unistd.h> header file. */
34
+ #define HAVE_UNISTD_H 1
35
+
36
+ /* Define if you have the <wchar.h> header file. */
37
+ #define HAVE_WCHAR_H 1
@@ -0,0 +1,36 @@
1
+ /* include/config.h.in. Generated automatically from configure.in by autoheader. */
2
+
3
+ /* Define to empty if the keyword does not work. */
4
+ #undef const
5
+
6
+ /* Define if the setvbuf function takes the buffering type as its second
7
+ argument and the buffer pointer as the third, as on System V
8
+ before release 3. */
9
+ #undef SETVBUF_REVERSED
10
+
11
+ /* Define if you have the ANSI C header files. */
12
+ #undef STDC_HEADERS
13
+
14
+ /* Define if you have the gettimeofday function. */
15
+ #undef HAVE_GETTIMEOFDAY
16
+
17
+ /* Define if you have the popen function. */
18
+ #undef HAVE_POPEN
19
+
20
+ /* Define if you have the wcschr function. */
21
+ #undef HAVE_WCSCHR
22
+
23
+ /* Define if you have the wcsdup function. */
24
+ #undef HAVE_WCSDUP
25
+
26
+ /* Define if you have the <pam.h> header file. */
27
+ #undef HAVE_PAM_H
28
+
29
+ /* Define if you have the <pnm.h> header file. */
30
+ #undef HAVE_PNM_H
31
+
32
+ /* Define if you have the <unistd.h> header file. */
33
+ #undef HAVE_UNISTD_H
34
+
35
+ /* Define if you have the <wchar.h> header file. */
36
+ #undef HAVE_WCHAR_H
@@ -0,0 +1,468 @@
1
+ /*
2
+ This is an Optical-Character-Recognition program
3
+ Copyright (C) GPLv2 2000-2013 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL address
20
+ */
21
+
22
+ #include <stdio.h>
23
+ #include <stdlib.h>
24
+ #include "gocr.h"
25
+ #include "pnm.h"
26
+ #include "pgm2asc.h"
27
+ #include "unicode_defs.h" /* macro UNKNOWN */
28
+ #include <string.h>
29
+ #include <time.h>
30
+
31
+ #define Blen 256
32
+
33
+ // load boxes from database into boxlist (for faster access)
34
+ // used as alternate engine, comparing chars with database
35
+ // uses readpnm() and would conflict with multi images
36
+ int load_db(job_t *job) { // called by gocr.c main()
37
+ FILE *f1;
38
+ char s1[Blen+1],
39
+ s2[Blen+1] = "./db/", /* ToDo: replace by constant! by configure */
40
+ *s3;
41
+ int i, j, ii, i2, line;
42
+ struct box *box1;
43
+ pix *pp;
44
+
45
+ if( job->cfg.db_path ) strncpy(s2, job->cfg.db_path, Blen-1);
46
+ i2=strlen(s2);
47
+ if (job->cfg.verbose)
48
+ fprintf(stderr, "# load database %s %s ... ", s2, job->cfg.db_path);
49
+
50
+ strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
51
+ f1 = fopen(s2, "r");
52
+ if (!f1) {
53
+ fprintf(stderr, " DB %s not found\n",s2);
54
+ return 1;
55
+ }
56
+
57
+ line = 0; /* line counter for better error report */
58
+ for (ii = 0; !feof(f1); ii++) {
59
+ /* bbg: should write a better input routine */
60
+ if (!fgets(s1, Blen, f1)) break; line++;
61
+ j = strlen(s1);
62
+ /* remove carriage return sequences from line */
63
+ while (j > 0 && (s1[j - 1] == '\r' || s1[j - 1] == '\n'))
64
+ s1[--j] = 0;
65
+ if (!j) continue; /* skip empty line */
66
+ if (s1[0]=='#') continue; /* skip comments (v0.44) */
67
+ /* copy file name */
68
+ for (i = 0; i < j && i+i2 < Blen && strchr(" \t,;",s1[i]) == 0; i++)
69
+ s2[i2 + i] = s1[i];
70
+ s2[i2+i]=0;
71
+ /* skip spaces */
72
+ for (; i < j && strchr(" \t",s1[i]) != 0; i++);
73
+ /* by now: read pix, fill box, goto next ??? */
74
+ pp = (pix *)malloc(sizeof(pix));
75
+ if( !pp ) fprintf(stderr,"malloc error in load_db pix\n");
76
+
77
+ // if (job->cfg.verbose) fprintf(stderr,"\n# readpgm %s ",s2);
78
+ if (readpgm(s2, pp, 0 * job->cfg.verbose)!=0) {
79
+ fprintf(stderr,"\ndatabase error: readpgm %s\n", s2);
80
+ exit(-1);
81
+ }
82
+
83
+ box1 = (struct box *)malloc_box(NULL);
84
+ if(!box1) fprintf(stderr,"malloc error in load_db box1\n");
85
+ box1->x0 = 0;
86
+ box1->x1 = pp->x-1; // white border 1 pixel width
87
+ box1->y0 = 0;
88
+ box1->y1 = pp->y-1;
89
+ box1->x = 1;
90
+ box1->y = 1;
91
+ box1->dots = 0;
92
+ box1->c = 0;
93
+ box1->modifier = 0; /* ToDo: obsolete */
94
+ box1->tas[0]=NULL;
95
+ box1->tac[0]=0;
96
+ box1->wac[0]=100; /* really 100% sure? */
97
+ box1->num_ac=1;
98
+ if (s1[i]=='"'){ /* parse a string */
99
+ j=strrchr(s1+i+1,'"')-(s1+i+1); /* we only look for first and last "" */
100
+ if (j>=1) {
101
+ s3=(char *)malloc(j+1);
102
+ if (!s3) fprintf (stderr, "malloc error in load_db s3\n");
103
+ if (s3) {
104
+ memcpy(s3,s1+i+1,j);
105
+ s3[j]=0;
106
+ box1->tas[0]=s3;
107
+ // fprintf(stderr,"\nstring=%s",s3);
108
+ }
109
+ } else { fprintf(stderr,"load_db: string parse error L%d\n",line); }
110
+ } else {
111
+ box1->tac[0] = box1->c = s1[i]; /* try to interpret as ASCII */
112
+ /* we can live without hexcode in future if we use UTF8-strings */
113
+ s3=s1+i;
114
+ j=strtol( s1+i, &s3, 16); /* try to read 4 to 8 digit hex unicode */
115
+ /* if its an hexcode, ASCII interpretation is overwritten */
116
+ if( j && i+3<=Blen && s3-s1-i>3 ) box1->tac[0] = box1->c = j;
117
+ // fprintf(stderr,"\nhexcode=%04x=%04x %d",(int)j,(int)box1->c,s3-s1-i);
118
+ }
119
+ box1->num = 0;
120
+ box1->line = -1;
121
+ box1->m1 = 0; /* ToDo: should be given too in the database! */
122
+ box1->m2 = 0;
123
+ box1->m3 = 0;
124
+ box1->m4 = 0;
125
+ box1->p = pp;
126
+ list_app(&job->tmp.dblist, box1); // append to list
127
+ #if 0
128
+ out_x(box1);
129
+ #endif
130
+ }
131
+ fclose(f1);
132
+ if (job->cfg.verbose)
133
+ fprintf(stderr, " %d chars loaded\n", ii);
134
+ return 0;
135
+ }
136
+
137
+ // expand database from box/boxlist name=db_$utime.pbm
138
+ // this is added in version v0.3.3
139
+ int store_db(struct box *box1, job_t *job) {
140
+ FILE *f1;
141
+ char s2[Blen+1] = "./db/", s3[Blen+1];
142
+ int i2, dx, dy;
143
+ unsigned c_out;
144
+ pix b; /* temporary mini page */
145
+
146
+ if( job->cfg.db_path ) strncpy(s2,job->cfg.db_path,Blen-1);
147
+ i2=strlen(s2);
148
+
149
+ /* add (first) char and time to the file name for better debugging */
150
+
151
+ /* decide between 7bit ASCII and UTF8-char or string */
152
+ c_out = ((box1->num_ac && box1->tas[0]) ?
153
+ (unsigned char )box1->tas[0][0] /* char */ :
154
+ box1->c /* wchar */);
155
+ /* (unsigned int)(( char)0x80) = 0xffffff80 */
156
+ /* (unsigned int)((unsigned char)0x80) = 0x00000080 */
157
+
158
+ /* name generation can cause problems, if called twice within a second */
159
+ sprintf(s3,"db_%04x_%08lx.pbm", c_out, (unsigned long)time(NULL));
160
+ /* ToDo: the file name may be not unique */
161
+
162
+ strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
163
+ f1 = fopen(s2, "a");
164
+ if (!f1) {
165
+ fprintf(stderr, " could not access %s\n",s2);
166
+ return 1;
167
+ }
168
+ strncpy(s2+i2,s3,strlen(s3)); s2[i2+strlen(s3)]=0;
169
+ /* store image and infos about the char */
170
+ /* ToDo: store the vector list instead of the pixelarray */
171
+
172
+ if (job->cfg.verbose)
173
+ fprintf(stderr, "store_db: add file %s to database (nac=%d c=%04x)"
174
+ "\n#",s3, box1->num_ac, c_out);
175
+
176
+ dx=box1->x1-box1->x0+1;
177
+ dy=box1->y1-box1->y0+1;
178
+ b.p = (unsigned char *) malloc( dx * dy );
179
+ if( !b.p ){
180
+ fprintf( stderr, "\nFATAL: malloc failed, skip store_db" );
181
+ fclose(f1);
182
+ return 2;
183
+ }
184
+ if (copybox(box1->p, box1->x0, box1->y0, dx, dy, &b, dx * dy)) {
185
+ fclose(f1);
186
+ return -1;
187
+ }
188
+
189
+ writepbm(s2,&b); /* What is to do on error? */
190
+ free(b.p);
191
+
192
+ /* store the database line */
193
+ /* some infos about box1->m1,..,m4 should added (base line, high etc.) */
194
+ if (box1->num_ac && box1->tas[0]) {
195
+ fprintf(f1, "%s \"%s\"\n",s3,box1->tas[0]);
196
+ /* ToDo: what if tas contains '"'? */
197
+ } else {
198
+ if( (box1->c >= '0' && box1->c <= '9')
199
+ || (box1->c >= 'A' && box1->c <= 'Z')
200
+ || (box1->c >= 'a' && box1->c <= 'z') )
201
+ fprintf(f1, "%s %c\n",s3,(char)box1->c);
202
+ else {
203
+ if (((box1->c)>>16)>>16)
204
+ fprintf(f1, "%s %08x\n",s3,(unsigned int)box1->c);
205
+ else
206
+ fprintf(f1, "%s %04x\n",s3,(unsigned int)box1->c);
207
+ }
208
+ }
209
+ fclose(f1);
210
+ return 0;
211
+ }
212
+
213
+ /* function is only for user prompt on console to identify chars
214
+ it prints out a part of pixmap b at point x0,y0 to stderr
215
+ using dots .,; if no pixel, and @xoO for pixels
216
+ */
217
+ void out_env(struct box *px, job_t *job){
218
+ int x0,y0,x1,y1,dx,dy,x,y,x2,y2,yy0,tx,ty,i,cs;
219
+ char c1, c2; pix *b;
220
+ cs=job->cfg.cs;
221
+ yy0=px->y0;
222
+ { /* overwrite rest of arguments */
223
+ b=px->p;
224
+ x0=px->x0; x1=px->x1; dx=x1-x0+1;
225
+ y0=px->y0; y1=px->y1; dy=y1-y0+1;
226
+ y0-=2; y1+=2;
227
+ if (px->m4 && y0>px->m1) y0=px->m1;
228
+ if (px->m4 && y1<px->m4) y1=px->m4;
229
+ if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
230
+ if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
231
+ if (x1-x0+1<62) { x0-=5; x1+=5; }
232
+ if (y1-y0+1<10) { y0-= 4; y1+= 4; } /* fragment? */
233
+ if (x0<0) x0=0; if (x1>=b->x) x1=b->x-1;
234
+ if (y0<0) y0=0; if (y1>=b->y) y1=b->y-1;
235
+ dx=x1-x0+1;
236
+ dy=y1-y0+1; yy0=y0;
237
+ fprintf(stderr,"\n# show box + environment");
238
+ fprintf(stderr,"\n# show box x= %4d %4d d= %3d %3d r= %d %d",
239
+ px->x0, px->y0, px->x1 - px->x0 + 1, px->y1 - px->y0 + 1,
240
+ px->x - px->x0, px->y - px->y0);
241
+ if (px->num_ac){ /* output table of chars and its probabilities */
242
+ fprintf(stderr,"\n# list box char: ");
243
+ for(i=0;i<px->num_ac && i<NumAlt;i++)
244
+ /* output the (xml-)string (picture position, barcodes, glyphs, ...) */
245
+ if (px->tas[i])
246
+ fprintf(stderr," %s(%d)", px->tas[i] ,px->wac[i]);
247
+ else
248
+ fprintf(stderr," %s(%d)",decode(px->tac[i],ASCII),px->wac[i]);
249
+ }
250
+ fprintf(stderr,"\n");
251
+ if (px->dots && px->m2 && px->m1<y0) { yy0=px->m1; dy=px->y1-yy0+1; }
252
+ }
253
+ tx=dx/80+1;
254
+ ty=dy/40+1; // step, usually 1, but greater on large maps
255
+ fprintf(stderr,"# show pattern x= %4d %4d d= %3d %3d t= %d %d\n",
256
+ x0,y0,dx,dy,tx,ty);
257
+ if (dx>0)
258
+ for(y=yy0;y<yy0+dy;y+=ty) { /* reduce the output to max 78x40 */
259
+
260
+ /* image is the boxframe + environment in the original bitmap */
261
+ for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */
262
+ c1='.';
263
+ for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */
264
+ for(x2=x;x2<x+tx && x2<x0+dx;x2++)
265
+ { if((getpixel(b,x2,y2)<cs)) c1='#'; }
266
+ // show pixels outside the box thinner/weaker
267
+ if (x+tx-1 < px->x0 || x > px->x1
268
+ || y+ty-1 < px->y0 || y > px->y1) c1=((c1=='#')?'O':',');
269
+ fprintf(stderr,"%c", c1 );
270
+ }
271
+
272
+ c1=c2=' ';
273
+ /* mark lines with < */
274
+ if (px) if (y==px->m1 || y==px->m2 || y==px->m3 || y==px->m4) c1='<';
275
+ if (y==px->y0 || y==px->y1) c2='-'; /* boxmarks */
276
+ fprintf(stderr,"%c%c\n",c1,c2);
277
+ }
278
+ }
279
+
280
+
281
+ /*
282
+ // second variant, for database (with slightly other behaviour)
283
+ // new variant
284
+ // look at the environment of the pixel too (contrast etc.)
285
+ // detailed analysis only of diff pixels!
286
+ //
287
+ // 100% * distance, 0 is best fit
288
+ // = similarity of 2 chars for recognition of noisy chars
289
+ // weigth of pixels with only one same neighbour set to 0
290
+ // look at contours too!
291
+ ToDo: especially on small boxes distance should only be 0 if
292
+ characters are 100% identical!
293
+ */
294
+ // #define DEBUG 2
295
+ int distance2( pix *p1, struct box *box1,
296
+ pix *p2, struct box *box2, int cs, int vvv){
297
+ int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,
298
+ x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2,tx,ty;
299
+ #if DEBUG == 2
300
+ if (vvv)
301
+ fprintf(stderr," DEBUG: distance2\n");
302
+ #endif
303
+ x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
304
+ dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);dx=dx1;
305
+ dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);dy=dy1;
306
+ if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) rbad++; // how to weight?
307
+ // compare relations to baseline and upper line
308
+ if(box1->m4>0 && box2->m4>0){ // used ???
309
+ if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
310
+ if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
311
+ }
312
+ tx=dx/16; if(dx<17)tx=1; // raster
313
+ ty=dy/32; if(dy<33)ty=1;
314
+ // compare pixels
315
+ for( y=0;y<dy;y+=ty )
316
+ for( x=0;x<dx;x+=tx ) { // try global shift too ???
317
+ v1=((getpixel(p1,x1+x*dx1/dx,y1+y*dy1/dy)<cs)?1:0); i1=8; // better gray?
318
+ v2=((getpixel(p2,x2+x*dx2/dx,y2+y*dy2/dy)<cs)?1:0); i2=8; // better gray?
319
+ if(v1==v2) { rgood+=16; continue; } // all things are right!
320
+ // what about different pixel???
321
+ // test overlapp of surounding pixels ???
322
+ v1=1; rbad+=4;
323
+ v1=-1;
324
+ for(i1=-1;i1<2;i1++)
325
+ for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
326
+ if( ((getpixel(p1,x1+x*dx1/dx+i1*(1+dx1/32),y1+y*dy1/dy+i2*(1+dy1/32))<cs)?1:0)
327
+ !=((getpixel(p2,x2+x*dx2/dx+i1*(1+dx2/32),y2+y*dy2/dy+i2*(1+dy2/32))<cs)?1:0) ) v1++;
328
+ }
329
+ if(v1>0)
330
+ rbad+=16*v1;
331
+ }
332
+ if(rgood+rbad) rc= 100*rbad/(rgood+rbad); else rc=99;
333
+ /* if width/high is not correct add badness */
334
+ rc += ( abs(dx1*dy2-dx2*dy1) * 10 ) / (dy1*dy2);
335
+ if (rc>100) rc=100;
336
+ if(/* rc<10 && */ vvv /* &1024 */){
337
+ #if DEBUG == 2
338
+ fprintf(stderr," distance2 rc=%d rgood=%d rbad=%d\n",rc,rgood,rbad);
339
+ // out_b(NULL,p1,box1->x0,box1->y0,box1->x1-box1->x0+1,
340
+ // box1->y1-box1->y0+1,cs);
341
+ // out_b(NULL,p2,box2->x0,box2->y0,box2->x1-box2->x0+1,
342
+ // box2->y1-box2->y0+1,cs);
343
+ out_x(box1);
344
+ out_x(box2);
345
+ #endif
346
+ }
347
+ return rc;
348
+ }
349
+
350
+ wchar_t ocr_db(struct box *box1, job_t *job) {
351
+ int dd = 1000, dist = 1000;
352
+ wchar_t c = UNKNOWN;
353
+ unsigned char buf[200]; /* Oct08 JS: add unsigned to avoid UTF problems */
354
+ Box *box2, *box3;
355
+
356
+ if (!list_empty(&job->tmp.dblist)){
357
+ box3 = (Box *)list_get_header(&job->tmp.dblist);
358
+ if(job->cfg.verbose)
359
+ fprintf(stderr,"\n#DEBUG: ocr_db (%d,%d) ",box1->x0, box1->y0);
360
+
361
+ for_each_data(&job->tmp.dblist) {
362
+ box2 = (Box *)list_get_current(&job->tmp.dblist);
363
+ /* do preselect!!! distance() slowly */
364
+ dd = distance2( box2->p, box2, box1->p, box1,
365
+ job->cfg.cs, job->cfg.verbose);
366
+ if (dd <= dist) { /* new best fit */
367
+ dist = dd;
368
+ box3 = box2; /* box3 is a pointer and not copied box2 */
369
+
370
+ if (dist<100 && 100-dist >= job->cfg.certainty) {
371
+ /* some deviation of the pattern is tolerated */
372
+ int i, wa;
373
+ for (i=0;i<box3->num_ac;i++) {
374
+ wa = (100-dist)*box3->wac[i]/100; /* weight *= (100-dist) */
375
+ if (box3->tas[i]) setas(box1,box3->tas[i],wa);
376
+ else setac(box1,box3->tac[i],wa);
377
+ }
378
+ if (box3->num_ac) c=box3->tac[0]; /* 0 for strings (!UNKNOWN) */
379
+ if (job->cfg.verbose)
380
+ fprintf(stderr, " dist=%4d c= %c 0x%02x %s wc= %3d", dist,
381
+ ((box3->c>32 && box3->c<127) ? (char) box3->c : '.'),
382
+ (int)box3->c, ((box3->tas[0])?box3->tas[0]:""), box3->wac[0]);
383
+ }
384
+ if (dd<=0 && ((box3->num_ac && box3->tas[0]) || box3->c >= 128
385
+ || !strchr ("l1|I0O", box3->c)))
386
+ break; /* speedup if found */
387
+ }
388
+ } end_for_each(&job->tmp.dblist);
389
+
390
+ }
391
+
392
+ if( (job->cfg.mode&128) != 0 && c == UNKNOWN ) { /* prompt the user */
393
+ /* should the output go to stderr or special pipe??? */
394
+ int utf8_ok=0; /* trigger this flag if input is ok */
395
+ int i, endchar; /* index */
396
+ out_env(box1, job); /* old: out_x(box1); */
397
+ fprintf(stderr,"The above pattern was not recognized.\n"
398
+ "Enter UTF8 char or string for above pattern. Leave empty if unsure.\n"
399
+ "Press RET at the end (ALT+RET to store into RAM only) : "
400
+ ); /* ToDo: empty + alt-return (0x1b 0x0a) for help? ^a for skip all */
401
+ /* UTF-8 (man 7 utf-8):
402
+ * 7bit = 0xxxxxxx (0000-007F)
403
+ * 11bit = 110xxxxx 10xxxxxx (0080-07FF)
404
+ * 16bit = 1110xxxx 10xxxxxx 10xxxxxx (0800-FFFF)
405
+ * 21bit = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
406
+ * 26bit = 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
407
+ * 31bit = 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
408
+ */
409
+ buf[0]=0;
410
+ /* shift/ctrl/altgr-enter acts like enter or ^j or ^m,
411
+ * alt-enter returns 0x1b 0x0a and returns from fgets()
412
+ * ^d (EOF) returns (nil) from fgets()
413
+ * x+(2*)ctrl-d returns from fgets() without returning a 0x0a
414
+ * if not UTF-input-mode, we are in trouble?
415
+ * ^a=0x01, ^b=0x02, ^e=05, ..., ToDo: meaning of no-input or <=space
416
+ */
417
+ fgets((char *)buf,200,stdin); /* including \n=0x0a */
418
+ dd=strlen((char *)buf);
419
+ /* output hexcode if verbose set */
420
+ if (job->cfg.verbose) {
421
+ fprintf(stderr, "\n# fgets [%d]:", dd);
422
+ for(i=0; i<dd; i++)
423
+ fprintf(stderr, " %02x", (unsigned)((unsigned char)buf[i]));
424
+ fprintf(stderr, "\n#");
425
+ }
426
+ /* we dont accept chars which could destroy database file */
427
+ for (i=0; i<dd; i++) if (buf[i]<32) break; /* need unsigned char here */
428
+ endchar=buf[i]; /* last char is 0x0a (ret) 0x00 (EOF) or 0x1b (alt+ret) */
429
+ if (endchar==0x01) { i=0;job->cfg.mode&=~128; } /* skip all */
430
+ buf[dd=i]=0; /* replace final 0x0a or other special codes */
431
+ if (dd==1 && !(buf[0]&128)) { c=buf[0]; utf8_ok=1; } /* single char */
432
+ if (dd>1 && dd<7) { /* try to decode single wide char (utf8) */
433
+ int u0, u1; /* define UTF8-start sequences, u0=0bits u1=1bits */
434
+ u0= 1<<(7-dd); /* compute start byte from UTF8-length */
435
+ u1=255&~((1<<(8-dd))-1);
436
+ /* count number of following 10xxxxxx bytes to i */
437
+ for (i=1;i<dd;i++) if ((buf[i]&0xc0)!=0x80) break; /* 10xxxxxx */
438
+ if (i==dd && (buf[0]&(u0|u1))==u1) { utf8_ok=1;
439
+ c=buf[0]&(u0-1); /* 11..0x.. */
440
+ for (i=1;i<dd;i++) { c<<=6; c|=buf[i]&0x3F; } /* 10xxxxxx */
441
+ }
442
+ }
443
+ if (dd>0){ /* ToDo: skip space and tab too? */
444
+ if (utf8_ok==1) { setac(box1, c, 100); } /* store single wchar */
445
+ if (utf8_ok==0) { /* store a string of chars (UTF8-string) */
446
+ c='_'; /* what should we do with c? probably a bad idea? */
447
+ setas(box1, (char *)buf, 100);
448
+ }
449
+ /* decide between
450
+ * 0) just help gocr to find the results and (dont remember, 0x01)
451
+ * 1) help and remember in the same run (store to memory, 0x1b)
452
+ * 2) expand the database (dont store ugly chars to the database!)
453
+ */
454
+ if (endchar!=0x01){ /* ^a before hit return */
455
+ /* is there a reason to dont store to memory? */
456
+ list_app(&job->tmp.dblist, box1); /* append to list for 1+2 */
457
+ }
458
+ if (endchar!=0x01 && endchar!=0x1b){
459
+ store_db(box1, job); /* store to disk for 2 */
460
+ }
461
+ if (job->cfg.verbose)
462
+ fprintf(stderr, " got char= %c 16bit= 0x%04x string= \"%s\"\n",
463
+ ((c>32 && c<127)?(char)c:'.'), (int)c, buf);
464
+ }
465
+ }
466
+
467
+ return c;
468
+ }