gocr-ruby 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +49 -0
- data/ext/gocr/Makefile +141 -0
- data/ext/gocr/Makefile.in +140 -0
- data/ext/gocr/amiga.h +31 -0
- data/ext/gocr/barcode.c +2108 -0
- data/ext/gocr/barcode.h +11 -0
- data/ext/gocr/box.c +496 -0
- data/ext/gocr/config.h +37 -0
- data/ext/gocr/config.h.in +36 -0
- data/ext/gocr/database.c +468 -0
- data/ext/gocr/detect.c +1003 -0
- data/ext/gocr/extconf.rb +6 -0
- data/ext/gocr/gocr.c +436 -0
- data/ext/gocr/gocr.h +290 -0
- data/ext/gocr/jconv.c +168 -0
- data/ext/gocr/job.c +92 -0
- data/ext/gocr/lines.c +364 -0
- data/ext/gocr/list.c +334 -0
- data/ext/gocr/list.h +91 -0
- data/ext/gocr/ocr0.c +7312 -0
- data/ext/gocr/ocr0.h +63 -0
- data/ext/gocr/ocr0n.c +1527 -0
- data/ext/gocr/ocr1.c +85 -0
- data/ext/gocr/ocr1.h +3 -0
- data/ext/gocr/otsu.c +310 -0
- data/ext/gocr/otsu.h +23 -0
- data/ext/gocr/output.c +291 -0
- data/ext/gocr/output.h +37 -0
- data/ext/gocr/pcx.c +153 -0
- data/ext/gocr/pcx.h +9 -0
- data/ext/gocr/pgm2asc.c +3259 -0
- data/ext/gocr/pgm2asc.h +105 -0
- data/ext/gocr/pixel.c +538 -0
- data/ext/gocr/pnm.c +538 -0
- data/ext/gocr/pnm.h +35 -0
- data/ext/gocr/progress.c +87 -0
- data/ext/gocr/progress.h +42 -0
- data/ext/gocr/remove.c +715 -0
- data/ext/gocr/tga.c +87 -0
- data/ext/gocr/tga.h +6 -0
- data/ext/gocr/unicode.c +1318 -0
- data/ext/gocr/unicode.h +62 -0
- data/ext/gocr/unicode_defs.h +1245 -0
- data/ext/gocr/version.h +2 -0
- data/gocr-ruby.gemspec +28 -0
- data/image.png +0 -0
- data/lib/gocr.rb +6 -0
- data/lib/gocr/image.rb +8 -0
- data/lib/gocr/version.rb +3 -0
- metadata +156 -0
data/ext/gocr/config.h
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
/* include/config.h. Generated by configure. */
|
2
|
+
/* include/config.h.in. Generated automatically from configure.in by autoheader. */
|
3
|
+
|
4
|
+
/* Define to empty if the keyword does not work. */
|
5
|
+
/* #undef const */
|
6
|
+
|
7
|
+
/* Define if the setvbuf function takes the buffering type as its second
|
8
|
+
argument and the buffer pointer as the third, as on System V
|
9
|
+
before release 3. */
|
10
|
+
/* #undef SETVBUF_REVERSED */
|
11
|
+
|
12
|
+
/* Define if you have the ANSI C header files. */
|
13
|
+
#define STDC_HEADERS 1
|
14
|
+
|
15
|
+
/* Define if you have the gettimeofday function. */
|
16
|
+
#define HAVE_GETTIMEOFDAY 1
|
17
|
+
|
18
|
+
/* Define if you have the popen function. */
|
19
|
+
#define HAVE_POPEN 1
|
20
|
+
|
21
|
+
/* Define if you have the wcschr function. */
|
22
|
+
#define HAVE_WCSCHR 1
|
23
|
+
|
24
|
+
/* Define if you have the wcsdup function. */
|
25
|
+
#define HAVE_WCSDUP 1
|
26
|
+
|
27
|
+
/* Define if you have the <pam.h> header file. */
|
28
|
+
/* #undef HAVE_PAM_H */
|
29
|
+
|
30
|
+
/* Define if you have the <pnm.h> header file. */
|
31
|
+
/* #undef HAVE_PNM_H */
|
32
|
+
|
33
|
+
/* Define if you have the <unistd.h> header file. */
|
34
|
+
#define HAVE_UNISTD_H 1
|
35
|
+
|
36
|
+
/* Define if you have the <wchar.h> header file. */
|
37
|
+
#define HAVE_WCHAR_H 1
|
@@ -0,0 +1,36 @@
|
|
1
|
+
/* include/config.h.in. Generated automatically from configure.in by autoheader. */
|
2
|
+
|
3
|
+
/* Define to empty if the keyword does not work. */
|
4
|
+
#undef const
|
5
|
+
|
6
|
+
/* Define if the setvbuf function takes the buffering type as its second
|
7
|
+
argument and the buffer pointer as the third, as on System V
|
8
|
+
before release 3. */
|
9
|
+
#undef SETVBUF_REVERSED
|
10
|
+
|
11
|
+
/* Define if you have the ANSI C header files. */
|
12
|
+
#undef STDC_HEADERS
|
13
|
+
|
14
|
+
/* Define if you have the gettimeofday function. */
|
15
|
+
#undef HAVE_GETTIMEOFDAY
|
16
|
+
|
17
|
+
/* Define if you have the popen function. */
|
18
|
+
#undef HAVE_POPEN
|
19
|
+
|
20
|
+
/* Define if you have the wcschr function. */
|
21
|
+
#undef HAVE_WCSCHR
|
22
|
+
|
23
|
+
/* Define if you have the wcsdup function. */
|
24
|
+
#undef HAVE_WCSDUP
|
25
|
+
|
26
|
+
/* Define if you have the <pam.h> header file. */
|
27
|
+
#undef HAVE_PAM_H
|
28
|
+
|
29
|
+
/* Define if you have the <pnm.h> header file. */
|
30
|
+
#undef HAVE_PNM_H
|
31
|
+
|
32
|
+
/* Define if you have the <unistd.h> header file. */
|
33
|
+
#undef HAVE_UNISTD_H
|
34
|
+
|
35
|
+
/* Define if you have the <wchar.h> header file. */
|
36
|
+
#undef HAVE_WCHAR_H
|
data/ext/gocr/database.c
ADDED
@@ -0,0 +1,468 @@
|
|
1
|
+
/*
|
2
|
+
This is an Optical-Character-Recognition program
|
3
|
+
Copyright (C) GPLv2 2000-2013 Joerg Schulenburg
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU General Public License
|
7
|
+
as published by the Free Software Foundation; either version 2
|
8
|
+
of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
18
|
+
|
19
|
+
see README for EMAIL address
|
20
|
+
*/
|
21
|
+
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <stdlib.h>
|
24
|
+
#include "gocr.h"
|
25
|
+
#include "pnm.h"
|
26
|
+
#include "pgm2asc.h"
|
27
|
+
#include "unicode_defs.h" /* macro UNKNOWN */
|
28
|
+
#include <string.h>
|
29
|
+
#include <time.h>
|
30
|
+
|
31
|
+
#define Blen 256
|
32
|
+
|
33
|
+
// load boxes from database into boxlist (for faster access)
|
34
|
+
// used as alternate engine, comparing chars with database
|
35
|
+
// uses readpnm() and would conflict with multi images
|
36
|
+
int load_db(job_t *job) { // called by gocr.c main()
|
37
|
+
FILE *f1;
|
38
|
+
char s1[Blen+1],
|
39
|
+
s2[Blen+1] = "./db/", /* ToDo: replace by constant! by configure */
|
40
|
+
*s3;
|
41
|
+
int i, j, ii, i2, line;
|
42
|
+
struct box *box1;
|
43
|
+
pix *pp;
|
44
|
+
|
45
|
+
if( job->cfg.db_path ) strncpy(s2, job->cfg.db_path, Blen-1);
|
46
|
+
i2=strlen(s2);
|
47
|
+
if (job->cfg.verbose)
|
48
|
+
fprintf(stderr, "# load database %s %s ... ", s2, job->cfg.db_path);
|
49
|
+
|
50
|
+
strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
|
51
|
+
f1 = fopen(s2, "r");
|
52
|
+
if (!f1) {
|
53
|
+
fprintf(stderr, " DB %s not found\n",s2);
|
54
|
+
return 1;
|
55
|
+
}
|
56
|
+
|
57
|
+
line = 0; /* line counter for better error report */
|
58
|
+
for (ii = 0; !feof(f1); ii++) {
|
59
|
+
/* bbg: should write a better input routine */
|
60
|
+
if (!fgets(s1, Blen, f1)) break; line++;
|
61
|
+
j = strlen(s1);
|
62
|
+
/* remove carriage return sequences from line */
|
63
|
+
while (j > 0 && (s1[j - 1] == '\r' || s1[j - 1] == '\n'))
|
64
|
+
s1[--j] = 0;
|
65
|
+
if (!j) continue; /* skip empty line */
|
66
|
+
if (s1[0]=='#') continue; /* skip comments (v0.44) */
|
67
|
+
/* copy file name */
|
68
|
+
for (i = 0; i < j && i+i2 < Blen && strchr(" \t,;",s1[i]) == 0; i++)
|
69
|
+
s2[i2 + i] = s1[i];
|
70
|
+
s2[i2+i]=0;
|
71
|
+
/* skip spaces */
|
72
|
+
for (; i < j && strchr(" \t",s1[i]) != 0; i++);
|
73
|
+
/* by now: read pix, fill box, goto next ??? */
|
74
|
+
pp = (pix *)malloc(sizeof(pix));
|
75
|
+
if( !pp ) fprintf(stderr,"malloc error in load_db pix\n");
|
76
|
+
|
77
|
+
// if (job->cfg.verbose) fprintf(stderr,"\n# readpgm %s ",s2);
|
78
|
+
if (readpgm(s2, pp, 0 * job->cfg.verbose)!=0) {
|
79
|
+
fprintf(stderr,"\ndatabase error: readpgm %s\n", s2);
|
80
|
+
exit(-1);
|
81
|
+
}
|
82
|
+
|
83
|
+
box1 = (struct box *)malloc_box(NULL);
|
84
|
+
if(!box1) fprintf(stderr,"malloc error in load_db box1\n");
|
85
|
+
box1->x0 = 0;
|
86
|
+
box1->x1 = pp->x-1; // white border 1 pixel width
|
87
|
+
box1->y0 = 0;
|
88
|
+
box1->y1 = pp->y-1;
|
89
|
+
box1->x = 1;
|
90
|
+
box1->y = 1;
|
91
|
+
box1->dots = 0;
|
92
|
+
box1->c = 0;
|
93
|
+
box1->modifier = 0; /* ToDo: obsolete */
|
94
|
+
box1->tas[0]=NULL;
|
95
|
+
box1->tac[0]=0;
|
96
|
+
box1->wac[0]=100; /* really 100% sure? */
|
97
|
+
box1->num_ac=1;
|
98
|
+
if (s1[i]=='"'){ /* parse a string */
|
99
|
+
j=strrchr(s1+i+1,'"')-(s1+i+1); /* we only look for first and last "" */
|
100
|
+
if (j>=1) {
|
101
|
+
s3=(char *)malloc(j+1);
|
102
|
+
if (!s3) fprintf (stderr, "malloc error in load_db s3\n");
|
103
|
+
if (s3) {
|
104
|
+
memcpy(s3,s1+i+1,j);
|
105
|
+
s3[j]=0;
|
106
|
+
box1->tas[0]=s3;
|
107
|
+
// fprintf(stderr,"\nstring=%s",s3);
|
108
|
+
}
|
109
|
+
} else { fprintf(stderr,"load_db: string parse error L%d\n",line); }
|
110
|
+
} else {
|
111
|
+
box1->tac[0] = box1->c = s1[i]; /* try to interpret as ASCII */
|
112
|
+
/* we can live without hexcode in future if we use UTF8-strings */
|
113
|
+
s3=s1+i;
|
114
|
+
j=strtol( s1+i, &s3, 16); /* try to read 4 to 8 digit hex unicode */
|
115
|
+
/* if its an hexcode, ASCII interpretation is overwritten */
|
116
|
+
if( j && i+3<=Blen && s3-s1-i>3 ) box1->tac[0] = box1->c = j;
|
117
|
+
// fprintf(stderr,"\nhexcode=%04x=%04x %d",(int)j,(int)box1->c,s3-s1-i);
|
118
|
+
}
|
119
|
+
box1->num = 0;
|
120
|
+
box1->line = -1;
|
121
|
+
box1->m1 = 0; /* ToDo: should be given too in the database! */
|
122
|
+
box1->m2 = 0;
|
123
|
+
box1->m3 = 0;
|
124
|
+
box1->m4 = 0;
|
125
|
+
box1->p = pp;
|
126
|
+
list_app(&job->tmp.dblist, box1); // append to list
|
127
|
+
#if 0
|
128
|
+
out_x(box1);
|
129
|
+
#endif
|
130
|
+
}
|
131
|
+
fclose(f1);
|
132
|
+
if (job->cfg.verbose)
|
133
|
+
fprintf(stderr, " %d chars loaded\n", ii);
|
134
|
+
return 0;
|
135
|
+
}
|
136
|
+
|
137
|
+
// expand database from box/boxlist name=db_$utime.pbm
|
138
|
+
// this is added in version v0.3.3
|
139
|
+
int store_db(struct box *box1, job_t *job) {
|
140
|
+
FILE *f1;
|
141
|
+
char s2[Blen+1] = "./db/", s3[Blen+1];
|
142
|
+
int i2, dx, dy;
|
143
|
+
unsigned c_out;
|
144
|
+
pix b; /* temporary mini page */
|
145
|
+
|
146
|
+
if( job->cfg.db_path ) strncpy(s2,job->cfg.db_path,Blen-1);
|
147
|
+
i2=strlen(s2);
|
148
|
+
|
149
|
+
/* add (first) char and time to the file name for better debugging */
|
150
|
+
|
151
|
+
/* decide between 7bit ASCII and UTF8-char or string */
|
152
|
+
c_out = ((box1->num_ac && box1->tas[0]) ?
|
153
|
+
(unsigned char )box1->tas[0][0] /* char */ :
|
154
|
+
box1->c /* wchar */);
|
155
|
+
/* (unsigned int)(( char)0x80) = 0xffffff80 */
|
156
|
+
/* (unsigned int)((unsigned char)0x80) = 0x00000080 */
|
157
|
+
|
158
|
+
/* name generation can cause problems, if called twice within a second */
|
159
|
+
sprintf(s3,"db_%04x_%08lx.pbm", c_out, (unsigned long)time(NULL));
|
160
|
+
/* ToDo: the file name may be not unique */
|
161
|
+
|
162
|
+
strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0;
|
163
|
+
f1 = fopen(s2, "a");
|
164
|
+
if (!f1) {
|
165
|
+
fprintf(stderr, " could not access %s\n",s2);
|
166
|
+
return 1;
|
167
|
+
}
|
168
|
+
strncpy(s2+i2,s3,strlen(s3)); s2[i2+strlen(s3)]=0;
|
169
|
+
/* store image and infos about the char */
|
170
|
+
/* ToDo: store the vector list instead of the pixelarray */
|
171
|
+
|
172
|
+
if (job->cfg.verbose)
|
173
|
+
fprintf(stderr, "store_db: add file %s to database (nac=%d c=%04x)"
|
174
|
+
"\n#",s3, box1->num_ac, c_out);
|
175
|
+
|
176
|
+
dx=box1->x1-box1->x0+1;
|
177
|
+
dy=box1->y1-box1->y0+1;
|
178
|
+
b.p = (unsigned char *) malloc( dx * dy );
|
179
|
+
if( !b.p ){
|
180
|
+
fprintf( stderr, "\nFATAL: malloc failed, skip store_db" );
|
181
|
+
fclose(f1);
|
182
|
+
return 2;
|
183
|
+
}
|
184
|
+
if (copybox(box1->p, box1->x0, box1->y0, dx, dy, &b, dx * dy)) {
|
185
|
+
fclose(f1);
|
186
|
+
return -1;
|
187
|
+
}
|
188
|
+
|
189
|
+
writepbm(s2,&b); /* What is to do on error? */
|
190
|
+
free(b.p);
|
191
|
+
|
192
|
+
/* store the database line */
|
193
|
+
/* some infos about box1->m1,..,m4 should added (base line, high etc.) */
|
194
|
+
if (box1->num_ac && box1->tas[0]) {
|
195
|
+
fprintf(f1, "%s \"%s\"\n",s3,box1->tas[0]);
|
196
|
+
/* ToDo: what if tas contains '"'? */
|
197
|
+
} else {
|
198
|
+
if( (box1->c >= '0' && box1->c <= '9')
|
199
|
+
|| (box1->c >= 'A' && box1->c <= 'Z')
|
200
|
+
|| (box1->c >= 'a' && box1->c <= 'z') )
|
201
|
+
fprintf(f1, "%s %c\n",s3,(char)box1->c);
|
202
|
+
else {
|
203
|
+
if (((box1->c)>>16)>>16)
|
204
|
+
fprintf(f1, "%s %08x\n",s3,(unsigned int)box1->c);
|
205
|
+
else
|
206
|
+
fprintf(f1, "%s %04x\n",s3,(unsigned int)box1->c);
|
207
|
+
}
|
208
|
+
}
|
209
|
+
fclose(f1);
|
210
|
+
return 0;
|
211
|
+
}
|
212
|
+
|
213
|
+
/* function is only for user prompt on console to identify chars
|
214
|
+
it prints out a part of pixmap b at point x0,y0 to stderr
|
215
|
+
using dots .,; if no pixel, and @xoO for pixels
|
216
|
+
*/
|
217
|
+
void out_env(struct box *px, job_t *job){
|
218
|
+
int x0,y0,x1,y1,dx,dy,x,y,x2,y2,yy0,tx,ty,i,cs;
|
219
|
+
char c1, c2; pix *b;
|
220
|
+
cs=job->cfg.cs;
|
221
|
+
yy0=px->y0;
|
222
|
+
{ /* overwrite rest of arguments */
|
223
|
+
b=px->p;
|
224
|
+
x0=px->x0; x1=px->x1; dx=x1-x0+1;
|
225
|
+
y0=px->y0; y1=px->y1; dy=y1-y0+1;
|
226
|
+
y0-=2; y1+=2;
|
227
|
+
if (px->m4 && y0>px->m1) y0=px->m1;
|
228
|
+
if (px->m4 && y1<px->m4) y1=px->m4;
|
229
|
+
if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
|
230
|
+
if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */
|
231
|
+
if (x1-x0+1<62) { x0-=5; x1+=5; }
|
232
|
+
if (y1-y0+1<10) { y0-= 4; y1+= 4; } /* fragment? */
|
233
|
+
if (x0<0) x0=0; if (x1>=b->x) x1=b->x-1;
|
234
|
+
if (y0<0) y0=0; if (y1>=b->y) y1=b->y-1;
|
235
|
+
dx=x1-x0+1;
|
236
|
+
dy=y1-y0+1; yy0=y0;
|
237
|
+
fprintf(stderr,"\n# show box + environment");
|
238
|
+
fprintf(stderr,"\n# show box x= %4d %4d d= %3d %3d r= %d %d",
|
239
|
+
px->x0, px->y0, px->x1 - px->x0 + 1, px->y1 - px->y0 + 1,
|
240
|
+
px->x - px->x0, px->y - px->y0);
|
241
|
+
if (px->num_ac){ /* output table of chars and its probabilities */
|
242
|
+
fprintf(stderr,"\n# list box char: ");
|
243
|
+
for(i=0;i<px->num_ac && i<NumAlt;i++)
|
244
|
+
/* output the (xml-)string (picture position, barcodes, glyphs, ...) */
|
245
|
+
if (px->tas[i])
|
246
|
+
fprintf(stderr," %s(%d)", px->tas[i] ,px->wac[i]);
|
247
|
+
else
|
248
|
+
fprintf(stderr," %s(%d)",decode(px->tac[i],ASCII),px->wac[i]);
|
249
|
+
}
|
250
|
+
fprintf(stderr,"\n");
|
251
|
+
if (px->dots && px->m2 && px->m1<y0) { yy0=px->m1; dy=px->y1-yy0+1; }
|
252
|
+
}
|
253
|
+
tx=dx/80+1;
|
254
|
+
ty=dy/40+1; // step, usually 1, but greater on large maps
|
255
|
+
fprintf(stderr,"# show pattern x= %4d %4d d= %3d %3d t= %d %d\n",
|
256
|
+
x0,y0,dx,dy,tx,ty);
|
257
|
+
if (dx>0)
|
258
|
+
for(y=yy0;y<yy0+dy;y+=ty) { /* reduce the output to max 78x40 */
|
259
|
+
|
260
|
+
/* image is the boxframe + environment in the original bitmap */
|
261
|
+
for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */
|
262
|
+
c1='.';
|
263
|
+
for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */
|
264
|
+
for(x2=x;x2<x+tx && x2<x0+dx;x2++)
|
265
|
+
{ if((getpixel(b,x2,y2)<cs)) c1='#'; }
|
266
|
+
// show pixels outside the box thinner/weaker
|
267
|
+
if (x+tx-1 < px->x0 || x > px->x1
|
268
|
+
|| y+ty-1 < px->y0 || y > px->y1) c1=((c1=='#')?'O':',');
|
269
|
+
fprintf(stderr,"%c", c1 );
|
270
|
+
}
|
271
|
+
|
272
|
+
c1=c2=' ';
|
273
|
+
/* mark lines with < */
|
274
|
+
if (px) if (y==px->m1 || y==px->m2 || y==px->m3 || y==px->m4) c1='<';
|
275
|
+
if (y==px->y0 || y==px->y1) c2='-'; /* boxmarks */
|
276
|
+
fprintf(stderr,"%c%c\n",c1,c2);
|
277
|
+
}
|
278
|
+
}
|
279
|
+
|
280
|
+
|
281
|
+
/*
|
282
|
+
// second variant, for database (with slightly other behaviour)
|
283
|
+
// new variant
|
284
|
+
// look at the environment of the pixel too (contrast etc.)
|
285
|
+
// detailed analysis only of diff pixels!
|
286
|
+
//
|
287
|
+
// 100% * distance, 0 is best fit
|
288
|
+
// = similarity of 2 chars for recognition of noisy chars
|
289
|
+
// weigth of pixels with only one same neighbour set to 0
|
290
|
+
// look at contours too!
|
291
|
+
ToDo: especially on small boxes distance should only be 0 if
|
292
|
+
characters are 100% identical!
|
293
|
+
*/
|
294
|
+
// #define DEBUG 2
|
295
|
+
int distance2( pix *p1, struct box *box1,
|
296
|
+
pix *p2, struct box *box2, int cs, int vvv){
|
297
|
+
int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,
|
298
|
+
x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2,tx,ty;
|
299
|
+
#if DEBUG == 2
|
300
|
+
if (vvv)
|
301
|
+
fprintf(stderr," DEBUG: distance2\n");
|
302
|
+
#endif
|
303
|
+
x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
|
304
|
+
dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);dx=dx1;
|
305
|
+
dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);dy=dy1;
|
306
|
+
if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) rbad++; // how to weight?
|
307
|
+
// compare relations to baseline and upper line
|
308
|
+
if(box1->m4>0 && box2->m4>0){ // used ???
|
309
|
+
if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
|
310
|
+
if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
|
311
|
+
}
|
312
|
+
tx=dx/16; if(dx<17)tx=1; // raster
|
313
|
+
ty=dy/32; if(dy<33)ty=1;
|
314
|
+
// compare pixels
|
315
|
+
for( y=0;y<dy;y+=ty )
|
316
|
+
for( x=0;x<dx;x+=tx ) { // try global shift too ???
|
317
|
+
v1=((getpixel(p1,x1+x*dx1/dx,y1+y*dy1/dy)<cs)?1:0); i1=8; // better gray?
|
318
|
+
v2=((getpixel(p2,x2+x*dx2/dx,y2+y*dy2/dy)<cs)?1:0); i2=8; // better gray?
|
319
|
+
if(v1==v2) { rgood+=16; continue; } // all things are right!
|
320
|
+
// what about different pixel???
|
321
|
+
// test overlapp of surounding pixels ???
|
322
|
+
v1=1; rbad+=4;
|
323
|
+
v1=-1;
|
324
|
+
for(i1=-1;i1<2;i1++)
|
325
|
+
for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
|
326
|
+
if( ((getpixel(p1,x1+x*dx1/dx+i1*(1+dx1/32),y1+y*dy1/dy+i2*(1+dy1/32))<cs)?1:0)
|
327
|
+
!=((getpixel(p2,x2+x*dx2/dx+i1*(1+dx2/32),y2+y*dy2/dy+i2*(1+dy2/32))<cs)?1:0) ) v1++;
|
328
|
+
}
|
329
|
+
if(v1>0)
|
330
|
+
rbad+=16*v1;
|
331
|
+
}
|
332
|
+
if(rgood+rbad) rc= 100*rbad/(rgood+rbad); else rc=99;
|
333
|
+
/* if width/high is not correct add badness */
|
334
|
+
rc += ( abs(dx1*dy2-dx2*dy1) * 10 ) / (dy1*dy2);
|
335
|
+
if (rc>100) rc=100;
|
336
|
+
if(/* rc<10 && */ vvv /* &1024 */){
|
337
|
+
#if DEBUG == 2
|
338
|
+
fprintf(stderr," distance2 rc=%d rgood=%d rbad=%d\n",rc,rgood,rbad);
|
339
|
+
// out_b(NULL,p1,box1->x0,box1->y0,box1->x1-box1->x0+1,
|
340
|
+
// box1->y1-box1->y0+1,cs);
|
341
|
+
// out_b(NULL,p2,box2->x0,box2->y0,box2->x1-box2->x0+1,
|
342
|
+
// box2->y1-box2->y0+1,cs);
|
343
|
+
out_x(box1);
|
344
|
+
out_x(box2);
|
345
|
+
#endif
|
346
|
+
}
|
347
|
+
return rc;
|
348
|
+
}
|
349
|
+
|
350
|
+
/* Ask the user on the console for the meaning of the pattern in box1.
 * Only called by ocr_db() for patterns which are still UNKNOWN.
 *
 * The answer is one line read from stdin, cut at the first byte < 32:
 *   0x0a (RET)      store the answer to box1, to the list and to disk
 *   0x1b (ALT+RET)  store the answer to box1 and to the list (RAM) only
 *   0x01 (^a)       skip, and switch off further prompts (mode bit 128)
 * A single 7bit char or a single UTF8 sequence is stored as wide char,
 * anything else is stored as string and '_' is returned.
 *
 * returns the char entered, UNKNOWN if nothing was entered
 */
static wchar_t ocr_db_prompt_user(struct box *box1, job_t *job) {
  unsigned char buf[200];  /* Oct08 JS: add unsigned to avoid UTF problems */
  wchar_t c = UNKNOWN;
  int single_char = 0;	/* set if input is exactly one (wide) char */
  int len, pos, endchar;

  /* should the output go to stderr or special pipe??? */
  out_env(box1, job);	/* old: out_x(box1); */
  fprintf(stderr,"The above pattern was not recognized.\n"
    "Enter UTF8 char or string for above pattern. Leave empty if unsure.\n"
    "Press RET at the end (ALT+RET to store into RAM only) : "
  );  /* ToDo: empty + alt-return (0x1b 0x0a) for help? ^a for skip all */

  /* shift/ctrl/altgr-enter acts like enter or ^j or ^m,
   * alt-enter returns 0x1b 0x0a and returns from fgets()
   * ^d (EOF) returns (nil) from fgets()
   * x+(2*)ctrl-d returns from fgets() without returning a 0x0a
   * if not UTF-input-mode, we are in trouble?
   * ^a=0x01, ^b=0x02, ^e=05, ..., ToDo: meaning of no-input or <=space
   */
  buf[0] = 0;
  fgets((char *)buf, 200, stdin);	/* including \n=0x0a */
  len = strlen((char *)buf);

  /* output hexcode if verbose set */
  if (job->cfg.verbose) {
    fprintf(stderr, "\n# fgets [%d]:", len);
    for (pos = 0; pos < len; pos++)
      fprintf(stderr, " %02x", (unsigned)((unsigned char)buf[pos]));
    fprintf(stderr, "\n#");
  }

  /* we dont accept chars which could destroy database file */
  pos = 0;
  while (pos < len && buf[pos] >= 32) pos++;	/* need unsigned char here */
  endchar = buf[pos];  /* last char is 0x0a (ret) 0x00 (EOF) or 0x1b (alt+ret) */
  if (endchar == 0x01) {			/* skip all */
    pos = 0;
    job->cfg.mode &= ~128;
  }
  len = pos;
  buf[len] = 0;		/* replace final 0x0a or other special codes */

  /* UTF-8 (man 7 utf-8):
   *  7bit = 0xxxxxxx (0000-007F)
   * 11bit = 110xxxxx 10xxxxxx (0080-07FF)
   * 16bit = 1110xxxx 10xxxxxx 10xxxxxx (0800-FFFF)
   * 21bit = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 26bit = 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 31bit = 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   */
  if (len == 1 && !(buf[0] & 128)) {		/* single char */
    c = buf[0];
    single_char = 1;
  } else if (len > 1 && len < 7) {  /* try to decode single wide char (utf8) */
    /* start byte expected for a sequence of len bytes:
     * all bits of lead_ones set, the bit zero_bit cleared */
    int zero_bit  = 1 << (7 - len);
    int lead_ones = 255 & ~((1 << (8 - len)) - 1);
    /* count number of following 10xxxxxx bytes */
    pos = 1;
    while (pos < len && (buf[pos] & 0xc0) == 0x80) pos++;
    if (pos == len && (buf[0] & (zero_bit | lead_ones)) == lead_ones) {
      single_char = 1;
      c = buf[0] & (zero_bit - 1);		/* 11..0x.. */
      for (pos = 1; pos < len; pos++) {		/* 10xxxxxx */
        c <<= 6;
        c |= buf[pos] & 0x3F;
      }
    }
  }

  if (len > 0) {	/* ToDo: skip space and tab too? */
    if (single_char) {
      setac(box1, c, 100);		/* store single wchar */
    } else {		/* store a string of chars (UTF8-string) */
      c = '_';		/* what should we do with c? probably a bad idea? */
      setas(box1, (char *)buf, 100);
    }
    /* decide between
     * 0) just help gocr to find the results and (dont remember, 0x01)
     * 1) help and remember in the same run (store to memory, 0x1b)
     * 2) expand the database (dont store ugly chars to the database!)
     */
    if (endchar != 0x01) {		/* ^a before hit return */
      /* is there a reason to dont store to memory? */
      list_app(&job->tmp.dblist, box1);	/* append to list for 1+2 */
      if (endchar != 0x1b)
        store_db(box1, job);		/* store to disk for 2 */
    }
    if (job->cfg.verbose)
      fprintf(stderr, " got char= %c 16bit= 0x%04x string= \"%s\"\n",
        ((c>32 && c<127)?(char)c:'.'), (int)c, buf);
  }
  return c;
}

/* Recognize the pattern of box1 by comparing it with all patterns of
 * the database list job->tmp.dblist using distance2().
 * Every new best fit which reaches job->cfg.certainty adds its
 * alternatives (weighted by 100-distance) to box1.
 * If nothing is found and mode bit 128 is set, the user is asked.
 *
 * returns the char found (0 for strings) or UNKNOWN
 */
wchar_t ocr_db(struct box *box1, job_t *job) {
  int dd = 1000;	/* distance to the current pattern */
  int best = 1000;	/* smallest distance so far */
  int k, weight;
  wchar_t c = UNKNOWN;
  Box *cand, *match;

  if (!list_empty(&job->tmp.dblist)) {
    match = (Box *)list_get_header(&job->tmp.dblist);
    if (job->cfg.verbose)
      fprintf(stderr,"\n#DEBUG: ocr_db (%d,%d) ",box1->x0, box1->y0);

    for_each_data(&job->tmp.dblist) {
      cand = (Box *)list_get_current(&job->tmp.dblist);
      /* do preselect!!! distance() slowly */
      dd = distance2( cand->p, cand, box1->p, box1,
                      job->cfg.cs, job->cfg.verbose);
      if (dd <= best) {		/* new best fit */
        best = dd;
        match = cand;		/* match is a pointer and not a copy */

        if (best < 100 && 100 - best >= job->cfg.certainty) {
          /* some deviation of the pattern is tolerated */
          for (k = 0; k < match->num_ac; k++) {
            weight = (100 - best) * match->wac[k] / 100;
            if (match->tas[k])
              setas(box1, match->tas[k], weight);
            else
              setac(box1, match->tac[k], weight);
          }
          if (match->num_ac) c = match->tac[0]; /* 0 for strings (!UNKNOWN) */
          if (job->cfg.verbose)
            fprintf(stderr, " dist=%4d c= %c 0x%02x %s wc= %3d", best,
              ((match->c>32 && match->c<127) ? (char) match->c : '.'),
              (int)match->c, ((match->tas[0])?match->tas[0]:""), match->wac[0]);
        }
        /* speedup if found, but go on for chars which are easy to mix up */
        if (dd <= 0 && ((match->num_ac && match->tas[0]) || match->c >= 128
                        || !strchr ("l1|I0O", match->c)))
          break;
      }
    } end_for_each(&job->tmp.dblist);

  }

  if ((job->cfg.mode & 128) != 0 && c == UNKNOWN)	/* prompt the user */
    c = ocr_db_prompt_user(box1, job);

  return c;
}
|