gocr-ruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +49 -0
- data/ext/gocr/Makefile +141 -0
- data/ext/gocr/Makefile.in +140 -0
- data/ext/gocr/amiga.h +31 -0
- data/ext/gocr/barcode.c +2108 -0
- data/ext/gocr/barcode.h +11 -0
- data/ext/gocr/box.c +496 -0
- data/ext/gocr/config.h +37 -0
- data/ext/gocr/config.h.in +36 -0
- data/ext/gocr/database.c +468 -0
- data/ext/gocr/detect.c +1003 -0
- data/ext/gocr/extconf.rb +6 -0
- data/ext/gocr/gocr.c +436 -0
- data/ext/gocr/gocr.h +290 -0
- data/ext/gocr/jconv.c +168 -0
- data/ext/gocr/job.c +92 -0
- data/ext/gocr/lines.c +364 -0
- data/ext/gocr/list.c +334 -0
- data/ext/gocr/list.h +91 -0
- data/ext/gocr/ocr0.c +7312 -0
- data/ext/gocr/ocr0.h +63 -0
- data/ext/gocr/ocr0n.c +1527 -0
- data/ext/gocr/ocr1.c +85 -0
- data/ext/gocr/ocr1.h +3 -0
- data/ext/gocr/otsu.c +310 -0
- data/ext/gocr/otsu.h +23 -0
- data/ext/gocr/output.c +291 -0
- data/ext/gocr/output.h +37 -0
- data/ext/gocr/pcx.c +153 -0
- data/ext/gocr/pcx.h +9 -0
- data/ext/gocr/pgm2asc.c +3259 -0
- data/ext/gocr/pgm2asc.h +105 -0
- data/ext/gocr/pixel.c +538 -0
- data/ext/gocr/pnm.c +538 -0
- data/ext/gocr/pnm.h +35 -0
- data/ext/gocr/progress.c +87 -0
- data/ext/gocr/progress.h +42 -0
- data/ext/gocr/remove.c +715 -0
- data/ext/gocr/tga.c +87 -0
- data/ext/gocr/tga.h +6 -0
- data/ext/gocr/unicode.c +1318 -0
- data/ext/gocr/unicode.h +62 -0
- data/ext/gocr/unicode_defs.h +1245 -0
- data/ext/gocr/version.h +2 -0
- data/gocr-ruby.gemspec +28 -0
- data/image.png +0 -0
- data/lib/gocr.rb +6 -0
- data/lib/gocr/image.rb +8 -0
- data/lib/gocr/version.rb +3 -0
- metadata +156 -0
data/ext/gocr/pnm.h
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
/* Handle PNM-files Dez98 JS
|
2
|
+
* 0,0 = left up
|
3
|
+
* PAM-formats
|
4
|
+
* PAM any P7
|
5
|
+
* PNM-formats
|
6
|
+
* PGM gray ASCII=P2 RAW=P5 dx dy col gray
|
7
|
+
* PPM RGB ASCII=P3 RAW=P6 dx dy col RGB
|
8
|
+
* PBM B/W ASCII=P1 RAW=P4 dx dy bitmap
|
9
|
+
*/
|
10
|
+
|
11
|
+
#ifndef GOCR_PNM_H
|
12
|
+
#define GOCR_PNM_H 1
|
13
|
+
|
14
|
+
#include "config.h"
|
15
|
+
|
16
|
+
/* Descriptor of an image buffer as used by the PNM read/write prototypes
 * declared in this header.  `pix` is the short alias for `struct pixmap`.
 */
typedef struct pixmap {
  unsigned char *p;  /* pointer to the image buffer (pixmap) */
  int x;             /* xsize */
  int y;             /* ysize */
  int bpp;           /* bytes per pixel: 1=gray 3=rgb */
} pix;
|
23
|
+
|
24
|
+
/* return 1 on multiple images (holding file open), 0 else */
|
25
|
+
int readpgm(char *name, pix *p, int vvv);
|
26
|
+
|
27
|
+
/* write pgm-map to pnm-file */
|
28
|
+
int writepgm(char *nam, pix *p);
|
29
|
+
int writepbm(char *nam, pix *p);
|
30
|
+
int writeppm(char *nam, pix *p); /* use lowest 3 bits for farbcoding */
|
31
|
+
|
32
|
+
/* ----- count colors ------ create histogram ------- */
|
33
|
+
void makehisto(pix p, unsigned col[256], int vvv);
|
34
|
+
|
35
|
+
#endif
|
data/ext/gocr/progress.c
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
/* ---------------------------- progress output ---------------------- */
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "progress.h"
|
5
|
+
|
6
|
+
FILE *fp=NULL; /* output stream for progress info */
time_t printinterval = 10; /* approx. seconds between printouts, 1.. */

/* Initialization of progress output.
 *   fname = "-"           progress goes to stdout
 *   fname = "<fileID>"    decimal file descriptor, 1..255; only honoured
 *                         when the fdopen branch is compiled in
 *                         (__USE_POSIX defined), otherwise it is treated
 *                         like a file name
 *   fname = "<filename>"  progress goes to that file
 *   fname = NULL or ""    progress output stays switched off
 * A stream opened by an earlier call is closed first.  stdout is never
 * closed here: it was not opened by this module, and closing it would
 * break every later write of the program to standard output.
 * Returns 0 on success, -1 if the requested stream could not be opened.
 */
int ini_progress(char *fname){
  int fd;
  if (fp) {
    if (fp != stdout) fclose(fp);  /* only close what we opened ourselves */
    fp=NULL;
  }
  if (!fname || !fname[0]) return 0;  /* nothing requested, no error */
  fd=atoi(fname);
  /* reject numbers above 255 and names with characters after the digits */
  if(fd>255 || fname[((fd>99)?3:((fd>9)?2:1))]) fd=-1; /* be sure */
  if (fname[0]=='-' && fname[1]==0) { fp=stdout; }
#ifdef __USE_POSIX
  else if (fd>0) { fp=fdopen(fd,"w"); } /* not sure that "w" is ok ???? */
#endif
  else { fp=fopen(fname,"w");if(!fp)fp=fopen(fname,"a"); }
  if (!fp) {
    fprintf(stderr,"could not open %s for progress output\n",fname);
    return -1; /* no success */
  }
  /* fprintf(stderr,"# progress: fd=%d\n",fileno(fp)); */
  return 0; /* no error */
}
|
29
|
+
|
30
|
+
progress_counter_t *open_progress(int maxcount, const char *name){
|
31
|
+
progress_counter_t *pc;
|
32
|
+
pc = (progress_counter_t*) malloc( sizeof(progress_counter_t) );
|
33
|
+
if (!pc) return 0; /* nonfatal */
|
34
|
+
pc->starttime = time(NULL);
|
35
|
+
pc->maxcount = maxcount;
|
36
|
+
pc->numskip = 0;
|
37
|
+
pc->lastprintcount = -1;
|
38
|
+
pc->name = name;
|
39
|
+
pc->lastprinttime = pc->starttime;
|
40
|
+
return pc;
|
41
|
+
}
|
42
|
+
/* free counter */
|
43
|
+
int close_progress(progress_counter_t *counter){
|
44
|
+
if (counter) free(counter);
|
45
|
+
return 0;
|
46
|
+
}
|
47
|
+
/* progress meter output
|
48
|
+
* only 1output/10s, + estimated endtime (test on pixelfields)
|
49
|
+
* ToDo: to stderr by default? remove subprogress, ini_progress? rm_progress?
|
50
|
+
* test on tcl
|
51
|
+
*/
|
52
|
+
int progress(int counter, progress_counter_t *pc){
|
53
|
+
/* we try to save computing time, so we skip early */
|
54
|
+
if ((!fp) || counter - pc->lastprintcount <= pc->numskip) return 0;
|
55
|
+
{
|
56
|
+
char cr='\n';
|
57
|
+
time_t now = time(NULL);
|
58
|
+
#if 0 /* debugging */
|
59
|
+
if (counter)
|
60
|
+
fprintf(fp," progress %s %3d / %d time %d skip %d\n",
|
61
|
+
pc->name,counter,pc->maxcount,(int)(now - pc->starttime),
|
62
|
+
pc->numskip); fflush(fp);
|
63
|
+
#endif
|
64
|
+
if (5*(now - pc->lastprinttime) < 2*printinterval
|
65
|
+
&& counter - pc->lastprintcount >= pc->numskip) { /* save for tests */
|
66
|
+
if (pc->numskip < 1024) pc->numskip += pc->numskip+1;
|
67
|
+
}
|
68
|
+
if (3*(now - pc->lastprinttime) < 2*printinterval ) {
|
69
|
+
return 0; /* to early for printing */
|
70
|
+
}
|
71
|
+
if (2*(now - pc->lastprinttime) > 3*printinterval ) {
|
72
|
+
pc->numskip >>= 1; /* to late for printing */
|
73
|
+
}
|
74
|
+
if (fileno(fp)<3) cr='\r'; /* may be choosen in ini? */
|
75
|
+
if (counter)
|
76
|
+
fprintf(fp," progress %s %5d / %d time[s] %5d / %5d (skip=%d)%c",
|
77
|
+
pc->name,counter,pc->maxcount,
|
78
|
+
(int)(now - pc->starttime), /* time gone since start */
|
79
|
+
(int)(now - pc->starttime)*pc->maxcount/(counter), /* estimated */
|
80
|
+
pc->numskip, cr);
|
81
|
+
fflush(fp);
|
82
|
+
pc->lastprintcount=counter;
|
83
|
+
pc->lastprinttime=now;
|
84
|
+
}
|
85
|
+
return 0; /* no error */
|
86
|
+
}
|
87
|
+
/* --------------------- end of progress output ---------------------- */
|
data/ext/gocr/progress.h
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
/*
|
2
|
+
---------------------- progress output ----------------------
|
3
|
+
output progress for GUIs to a pipe
|
4
|
+
format: "counter_name" counter maxcounter time estimated_time \r|\n
|
5
|
+
*/
|
6
|
+
#ifndef GOCR_PROGRESS_H
|
7
|
+
#define GOCR_PROGRESS_H "Oct06"
|
8
|
+
#include <time.h>
|
9
|
+
|
10
|
+
/* initialization of progress output, fname="<fileID>","<filename>","-" */
|
11
|
+
int ini_progress(char *fname);
|
12
|
+
|
13
|
+
/* ToDo: add by open_* and close_* */
|
14
|
+
/* State of one progress counter.  progress() is called often, so the
 * values needed to avoid asking for the system time on every call are
 * kept here.
 */
typedef struct progress_counter {
  /* name of counter */
  const char *name;
  /* last counter printed for extrapolation */
  int lastprintcount;
  /* max counter */
  int maxcount;
  /* num of counts to skip before timecall 0..maxcount */
  int numskip;
  /* start time of this counter */
  time_t starttime;
  /* last time printed in seconds */
  time_t lastprinttime;
} progress_counter_t;
|
26
|
+
|
27
|
+
/* progress output p1=main_progress_0..100% p2=sub_progress_0..100% */
|
28
|
+
/* ToDo: improved_progress: counter, maxcount(ini), counter_name(ini),
|
29
|
+
* printinterval=10 # time before printing out progressmeter
|
30
|
+
* *numskip=1 # if (counter-lastprintcounter<numskip) return; gettime() ...
|
31
|
+
* *startutime, *lastprintutime, *lastprintcounter # numskip*=2 or /=2
|
32
|
+
* only 1output/10s, + estimated endtime (test on pixelfields)
|
33
|
+
* to stderr by default? remove subprogress, ini_progress? rm_progress?
|
34
|
+
* test on tcl
|
35
|
+
*/
|
36
|
+
progress_counter_t *open_progress(int maxcount, const char *name);
|
37
|
+
/* free counter */
|
38
|
+
int close_progress(progress_counter_t *counter);
|
39
|
+
/* output progress for pc */
|
40
|
+
int progress(int counter, progress_counter_t *pc);
|
41
|
+
/* --------------------- end of progress output ---------------------- */
|
42
|
+
#endif
|
data/ext/gocr/remove.c
ADDED
@@ -0,0 +1,715 @@
|
|
1
|
+
/*
|
2
|
+
This is a Optical-Character-Recognition program
|
3
|
+
Copyright (C) 2000-2010 Joerg Schulenburg
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU General Public License
|
7
|
+
as published by the Free Software Foundation; either version 2
|
8
|
+
of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
18
|
+
|
19
|
+
see README for EMAIL-address
|
20
|
+
*/
|
21
|
+
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <stdio.h>
|
24
|
+
#include "pgm2asc.h"
|
25
|
+
#include "gocr.h"
|
26
|
+
#include "progress.h"
|
27
|
+
#include "unicode_defs.h" /* UNKNOWN + PICTURE */
|
28
|
+
|
29
|
+
/* measure mean thickness as an criteria for big chars */
|
30
|
+
int mean_thickness( struct box *box2 ){
|
31
|
+
int mt=0, i, y, dx=box2->x1-box2->x0+1, dy, cs=OCR_JOB->cfg.cs;
|
32
|
+
for (y=box2->y0+1; y<box2->y1; y++) {
|
33
|
+
i=loop(box2->p,box2->x0+0,y,dx,cs,0,RI);
|
34
|
+
i=loop(box2->p,box2->x0+i,y,dx,cs,1,RI);
|
35
|
+
mt+=i;
|
36
|
+
}
|
37
|
+
dy = box2->y1 - box2->y0 - 1;
|
38
|
+
if (dy) mt=(mt+dy/2)/dy;
|
39
|
+
return mt;
|
40
|
+
}
|
41
|
+
|
42
|
+
/* ---- remove dust ---------------------------------
|
43
|
+
What is dust? I think, this is a very small pixel cluster without
|
44
|
+
neighbours. Of course not all dust clusters can be detected correct.
|
45
|
+
This feature should be possible to switch off via option.
|
46
|
+
-> may be, all clusters should be stored here?
|
47
|
+
speed is very slow, I know, but I am happy that it is working well
|
48
|
+
*/
|
49
|
+
int remove_dust( job_t *job ){
|
50
|
+
/* new dust removing */
|
51
|
+
/* FIXME jb:remove pp */
|
52
|
+
pix *pp = &job->src.p;
|
53
|
+
int i1,i,j,x,y,x0,x1,y0,y1,nC,sX,sY,sP, cs,vvv=job->cfg.verbose;
|
54
|
+
struct box *box2;
|
55
|
+
#define HISTSIZE 220 /* histogramm size */
|
56
|
+
int histo[HISTSIZE];
|
57
|
+
cs=job->cfg.cs; sP=sX=sY=nC=0;
|
58
|
+
/*
|
59
|
+
* count number of black pixels within a box and store it in .dots
|
60
|
+
* later .dots is re-used for number of objects belonging to the character
|
61
|
+
* should be done in the flood-fill algorithm
|
62
|
+
* volume of white pixels is estimated to big here (left/right rot)
|
63
|
+
* ToDo: mean thickness of char lines?
|
64
|
+
* or interval nesting (minP..maxP) to remove outriders
|
65
|
+
*/
|
66
|
+
j=0;
|
67
|
+
for (i1=0;i1<HISTSIZE;i1++) histo[i1]=0;
|
68
|
+
/* mean value over every black object which is big enough */
|
69
|
+
for_each_data(&(job->res.boxlist)) {
|
70
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
71
|
+
if (!box2->num_frames) continue;
|
72
|
+
// bad for big fonts 2010-10 invalid_ogv
|
73
|
+
// if (box2->frame_vol[0]<0) continue; /* don't count inner holes */
|
74
|
+
j = abs(box2->frame_vol[0]);
|
75
|
+
if ((box2->y1-box2->y0+1)>3) {
|
76
|
+
nC++; /* only count potential chars v0.42 */
|
77
|
+
sX+=box2->x1 - box2->x0 + 1;
|
78
|
+
sY+=box2->y1 - box2->y0 + 1;
|
79
|
+
sP+=j;
|
80
|
+
}
|
81
|
+
if (j<HISTSIZE) histo[j]++;
|
82
|
+
} end_for_each(&(job->res.boxlist));
|
83
|
+
|
84
|
+
if (job->cfg.dust_size < 0 && nC > 0) { /* auto detection */
|
85
|
+
/* this formula is empirically, high resolution scans have bigger dust */
|
86
|
+
/* maximum allowed dustsize (min=4*7 ca. 32)
|
87
|
+
* does not work for background pattern!
|
88
|
+
*/
|
89
|
+
job->cfg.dust_size = ( ( sX/nC ) * ( sY/nC ) + 16) / 32; // maximum
|
90
|
+
if (sY/nC<10) job->cfg.dust_size=1; // 2010-09-24 6x9 gocr0801_bad5_FP
|
91
|
+
if (vvv) fprintf(stderr, "# remove.c remove_dust():");
|
92
|
+
if (vvv) fprintf(stderr, "\n# dust size detection, vol num"
|
93
|
+
" obj=%d maxDust=%d mP= %3d mXY= %2d %2d",
|
94
|
+
nC, job->cfg.dust_size, sP/nC, sX/nC, sY/nC);
|
95
|
+
/* we assume that for random dust applies histo[i+1]<histo[i] */
|
96
|
+
for (i=1;i+3<HISTSIZE;i++){
|
97
|
+
if (vvv) fprintf(stderr,"\n# dust size histogram %3d %5d",i,histo[i]);
|
98
|
+
if (histo[i]>=nC) continue; /* v0.42 lot of pixels -> bg pattern < 3 */
|
99
|
+
if (i>=job->cfg.dust_size) break; /* maximum = mean size / 32 */
|
100
|
+
if (i<job->cfg.dust_size/16 && i<9) continue; /* giant font? tmp10/invalid_olv 2010-10 */
|
101
|
+
if (histo[i/*+1*/]==0) break; /* bad statistic */
|
102
|
+
if ((histo[i+2]+histo[i+3])
|
103
|
+
>=(histo[i] +histo[i+1])) break; /* no noise, but to late? */
|
104
|
+
if ( histo[i-1] > 1024*histo[i] &&
|
105
|
+
2*histo[i+1] >=histo[i]) break; /* bg pattern */
|
106
|
+
}
|
107
|
+
if (vvv) fprintf(stderr," break");
|
108
|
+
if (vvv) for (i1=0,j=i+1;j<HISTSIZE;j++) {
|
109
|
+
/* compressed, output only if something is changing */
|
110
|
+
if (j==HISTSIZE-1 || histo[j]!=histo[j-1] || histo[j]!=histo[j+1]) {
|
111
|
+
fprintf(stderr,"\n# dust size histogram %3d %5d",j,histo[j]);
|
112
|
+
if (++i1>20) break; /* dont do excessive output */
|
113
|
+
}
|
114
|
+
}
|
115
|
+
job->cfg.dust_size=i-1;
|
116
|
+
//
|
117
|
+
/* what is the statistic of random dust?
|
118
|
+
* if we have p pixels on a x*y image we should have
|
119
|
+
* (p/(x*y))^1 * (x*y) = p singlets
|
120
|
+
* (p/(x*y))^2 * (x*y) = p^2/(x*y) doublets and
|
121
|
+
* (p/(x*y))^3 * (x*y) = p^3/(x*y)^2 triplets
|
122
|
+
*/
|
123
|
+
if (vvv) fprintf(stderr,"\n# auto dust size = %d nC= %3d .. %3d"
|
124
|
+
" avD= %2d %2d .. %2d %2d\n",
|
125
|
+
job->cfg.dust_size, nC, job->res.numC,
|
126
|
+
(job->res.sumX+job->res.numC/2)/job->res.numC,
|
127
|
+
(job->res.sumY+job->res.numC/2)/job->res.numC, sX/nC, sY/nC);
|
128
|
+
}
|
129
|
+
if (job->cfg.dust_size)
|
130
|
+
{ i=0;
|
131
|
+
if(vvv){
|
132
|
+
fprintf(stderr,"# remove dust of size %2d",job->cfg.dust_size);
|
133
|
+
/* Warning: better use (1/(x*y))^2 as 1/((x*y)^2),
|
134
|
+
* because (x*y)^2 may overflow */
|
135
|
+
fprintf(stderr," histo=%d,%d(?=%d),%d(?=%d),...\n# ...",
|
136
|
+
histo[1],histo[2],histo[1]*histo[1]/(pp->x*pp->y),
|
137
|
+
histo[3], histo[1]*histo[1]/(pp->x*pp->y)
|
138
|
+
*histo[1]/(pp->x*pp->y));
|
139
|
+
}
|
140
|
+
i = 0;
|
141
|
+
for_each_data(&(job->res.boxlist)) {
|
142
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
143
|
+
x0=box2->x0;x1=box2->x1;y0=box2->y0;y1=box2->y1; /* box */
|
144
|
+
j=abs(box2->frame_vol[0]);
|
145
|
+
if(j<=job->cfg.dust_size) /* remove this tiny object */
|
146
|
+
{ /* here we should distinguish dust and i-dots,
|
147
|
+
* may be we should sort out dots to a seperate dot list and
|
148
|
+
* after line detection decide, which is dust and which not
|
149
|
+
* dust should be removed to make recognition easier (ToDo)
|
150
|
+
*/
|
151
|
+
#if 0
|
152
|
+
if(get_bw((3*x0+x1)/4,(x0+3*x1)/4,y1+y1-y0+1,y1+8*(y1-y0+1),pp,cs,1))
|
153
|
+
continue; /* this idea was to simple, see kscan003.jpg sample */
|
154
|
+
#endif
|
155
|
+
/* remove from average */
|
156
|
+
job->res.numC--;
|
157
|
+
job->res.sumX-=x1-x0+1;
|
158
|
+
job->res.sumY-=y1-y0+1;
|
159
|
+
/* remove pixels (should only be done with dust) */
|
160
|
+
for(x=x0;x<=x1;x++)
|
161
|
+
for(y=y0;y<=y1;y++){ put(pp,x,y,0,255&~7); }
|
162
|
+
/* remove from list */
|
163
|
+
list_del(&(job->res.boxlist),box2);
|
164
|
+
/* free memory */
|
165
|
+
free_box(box2);
|
166
|
+
i++; /* count as dust particle */
|
167
|
+
continue;
|
168
|
+
}
|
169
|
+
} end_for_each(&(job->res.boxlist));
|
170
|
+
if(vvv)fprintf(stderr," %3d cluster removed, nC= %3d\n",i,job->res.numC);
|
171
|
+
}
|
172
|
+
/* reset dots to 0 and remove white pixels (new) */
|
173
|
+
i=0;
|
174
|
+
for_each_data(&(job->res.boxlist)) {
|
175
|
+
box2 = ((struct box *)list_get_current(&(job->res.boxlist)));
|
176
|
+
if (box2->frame_vol[0]<0) continue; /* for black areas only */
|
177
|
+
x0=box2->x0;x1=box2->x1;y0=box2->y0;y1=box2->y1; /* box */
|
178
|
+
if (x1-x0>16 && y1-y0>30) /* only on large enough chars */
|
179
|
+
for(x=x0+1;x<=x1-1;x++)
|
180
|
+
for(y=y0+1;y<=y1-1;y++){
|
181
|
+
if( pixel_atp(pp,x ,y )>=cs
|
182
|
+
&& pixel_atp(pp,x-1,y ) <cs
|
183
|
+
&& pixel_atp(pp,x+1,y ) <cs
|
184
|
+
&& pixel_atp(pp,x ,y-1) <cs
|
185
|
+
&& pixel_atp(pp,x ,y+1) <cs ) /* remove it */
|
186
|
+
{
|
187
|
+
put(pp,x,y,0,0); i++; /* (x and 0) or 0 */
|
188
|
+
}
|
189
|
+
}
|
190
|
+
} end_for_each(&(job->res.boxlist));
|
191
|
+
if (vvv) fprintf(stderr,"# ... %3d white pixels removed, cs=%d nC= %3d\n",
|
192
|
+
i,cs,job->res.numC);
|
193
|
+
return 0;
|
194
|
+
}
|
195
|
+
|
196
|
+
/* ---- smooth borders ---------------------------------
|
197
|
+
* Big chars often do not have smooth borders, which let fail
|
198
|
+
* the engine. Here we smooth the borders of big chars (>7x16).
|
199
|
+
* Smoothing is important for b/w scans, where we often have
|
200
|
+
* comb like pattern on a vertikal border. I also received
|
201
|
+
* samples with lot of white pixels (sample: 04/02/25).
|
202
|
+
* ToDo: obsolete if vector code is complete
|
203
|
+
*/
|
204
|
+
int smooth_borders( job_t *job ){
|
205
|
+
pix *pp = &job->src.p;
|
206
|
+
int ii=0,x,y,x0,x1,y0,y1,dx,dy,cs,i0,i1,i2,i3,i4,n1,n2,
|
207
|
+
cn[8],cm,vvv=job->cfg.verbose; /* dust found */
|
208
|
+
struct box *box2;
|
209
|
+
cs=job->cfg.cs; n1=n2=0;
|
210
|
+
if(vvv){ fprintf(stderr,"# smooth_borders of big chars 7x16 cs=%d",cs); }
|
211
|
+
/* filter for each big box */
|
212
|
+
for_each_data(&(job->res.boxlist)) { n2++; /* count boxes */
|
213
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
214
|
+
/* do not touch small characters! but how we define small characters? */
|
215
|
+
if (box2->x1-box2->x0+1<7 || box2->y1-box2->y0+1<16 ) continue;
|
216
|
+
if (box2->c==PICTURE) continue;
|
217
|
+
if (mean_thickness(box2)<3) continue;
|
218
|
+
n1++; /* count boxes matching big-char criteria */
|
219
|
+
x0=box2->x0; y0=box2->y0;
|
220
|
+
x1=box2->x1; y1=box2->y1;
|
221
|
+
dx=x1-x0+1; dy=y1-y0-1;
|
222
|
+
/* out_x(box2);
|
223
|
+
* dont change to much! only change if absolutely sure!
|
224
|
+
* ....... 1 2 3
|
225
|
+
* ex: .?##### 0 * 4
|
226
|
+
* ....... 7 6 5
|
227
|
+
* we should also avoid removing lines by sytematic remove
|
228
|
+
* from left end to the right, so we concern also about distance>1
|
229
|
+
*/
|
230
|
+
for(x=box2->x0;x<=box2->x1;x++)
|
231
|
+
for(y=box2->y0;y<=box2->y1;y++){ /* filter out high frequencies */
|
232
|
+
/* this is a very primitive solution, only for learning */
|
233
|
+
cn[0]=getpixel(pp,x-1,y);
|
234
|
+
cn[4]=getpixel(pp,x+1,y); /* horizontal */
|
235
|
+
cn[2]=getpixel(pp,x,y-1);
|
236
|
+
cn[6]=getpixel(pp,x,y+1); /* vertical */
|
237
|
+
cn[1]=getpixel(pp,x-1,y-1);
|
238
|
+
cn[3]=getpixel(pp,x+1,y-1); /* diagonal */
|
239
|
+
cn[7]=getpixel(pp,x-1,y+1);
|
240
|
+
cn[5]=getpixel(pp,x+1,y+1);
|
241
|
+
cm=getpixel(pp,x,y);
|
242
|
+
/* check for 5 other and 3 same surrounding pixels */
|
243
|
+
for (i0=0;i0<8;i0++)
|
244
|
+
if ((cn[i0 ]<cs)==(cm<cs)
|
245
|
+
&& (cn[(i0+7) & 7]<cs)!=(cm<cs)) break; /* first same */
|
246
|
+
for (i1=0;i1<8;i1++)
|
247
|
+
if ((cn[(i0+i1) & 7]<cs)!=(cm<cs)) break; /* num same */
|
248
|
+
for (i2=0;i2<8;i2++)
|
249
|
+
if ((cn[(i0+i1+i2) & 7]<cs)==(cm<cs)) break; /* num other */
|
250
|
+
cn[0]=getpixel(pp,x-2,y);
|
251
|
+
cn[4]=getpixel(pp,x+2,y); /* horizontal */
|
252
|
+
cn[2]=getpixel(pp,x,y-2);
|
253
|
+
cn[6]=getpixel(pp,x,y+2); /* vertical */
|
254
|
+
cn[1]=getpixel(pp,x-2,y-2);
|
255
|
+
cn[3]=getpixel(pp,x+2,y-2); /* diagonal */
|
256
|
+
cn[7]=getpixel(pp,x-2,y+2);
|
257
|
+
cn[5]=getpixel(pp,x+2,y+2);
|
258
|
+
/* check for 5 other and 3 same surrounding pixels */
|
259
|
+
for (i0=0;i0<8;i0++)
|
260
|
+
if ((cn[i0 ]<cs)==(cm<cs)
|
261
|
+
&& (cn[(i0+7) & 7]<cs)!=(cm<cs)) break; /* first same */
|
262
|
+
for (i3=0;i3<8;i3++)
|
263
|
+
if ((cn[(i0+i3) & 7]<cs)!=(cm<cs)) break; /* num same */
|
264
|
+
for (i4=0;i4<8;i4++)
|
265
|
+
if ((cn[(i0+i3+i4) & 7]<cs)==(cm<cs)) break; /* num other */
|
266
|
+
if (i1<=3 && i2>=5 && i3>=3 && i4>=3) { /* change only on borders */
|
267
|
+
ii++; /* white : black */
|
268
|
+
put(pp,x,y,7,((cm<cs)?(cs|32):cs/2)&~7);
|
269
|
+
#if 0
|
270
|
+
printf(" x y i0 i1 i2 i3 i4 cm new cs %3d %3d"
|
271
|
+
" %3d %3d %3d %3d %3d %3d %3d %3d\n",
|
272
|
+
x-box2->x0,y-box2->y0,i0,i1,i2,i3,i3,cm,getpixel(pp,x,y),cs);
|
273
|
+
#endif
|
274
|
+
}
|
275
|
+
}
|
276
|
+
#if 0 /* debugging */
|
277
|
+
out_x(box2);
|
278
|
+
#endif
|
279
|
+
} end_for_each(&(job->res.boxlist));
|
280
|
+
if(vvv)fprintf(stderr," ... %3d changes in %d of %d\n",ii,n1,n2);
|
281
|
+
return 0;
|
282
|
+
}
|
283
|
+
|
284
|
+
/* test if a corner of box1 is within box2 */
|
285
|
+
int box_nested( struct box *box1, struct box *box2){
|
286
|
+
/* box1 in box2, +1..-1 frame for pixel-patterns */
|
287
|
+
if ( ( ( box1->x0>=box2->x0-1 && box1->x0<=box2->x1+1 )
|
288
|
+
|| ( box1->x1>=box2->x0-1 && box1->x1<=box2->x1+1 ) )
|
289
|
+
&& ( ( box1->y0>=box2->y0-1 && box1->y0<=box2->y1+1 )
|
290
|
+
|| ( box1->y1>=box2->y0-1 && box1->y1<=box2->y1+1 ) ) )
|
291
|
+
return 1;
|
292
|
+
return 0;
|
293
|
+
}
|
294
|
+
|
295
|
+
/* test if box1 is within box2 */
|
296
|
+
int box_covered( struct box *box1, struct box *box2){
|
297
|
+
/* box1 in box2, +1..-1 frame for pixel-patterns */
|
298
|
+
if ( ( box1->x0>=box2->x0-1 && box1->x1<=box2->x1+1 )
|
299
|
+
&& ( box1->y0>=box2->y0-1 && box1->y1<=box2->y1+1 ) )
|
300
|
+
return 1;
|
301
|
+
return 0;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* ---- remove pictures ------------------------------------------
|
305
|
+
* may be, not deleting or moving to another list is much better!
|
306
|
+
* should be renamed to remove_pictures and border boxes
|
307
|
+
*/
|
308
|
+
int remove_pictures( job_t *job){
|
309
|
+
struct box *box4,*box2;
|
310
|
+
int j=0, j2=0, num_del=0;
|
311
|
+
|
312
|
+
if (job->cfg.verbose)
|
313
|
+
fprintf(stderr, "# "__FILE__" L%d: remove pictures\n# ...",
|
314
|
+
__LINE__);
|
315
|
+
|
316
|
+
/* ToDo: output a list for picture handle scripts */
|
317
|
+
j=0; j2=0;
|
318
|
+
if(job->cfg.verbose)
|
319
|
+
for_each_data(&(job->res.boxlist)) {
|
320
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
321
|
+
if (box4->c==PICTURE) j++; else j2++;
|
322
|
+
} end_for_each(&(job->res.boxlist));
|
323
|
+
if (job->cfg.verbose)
|
324
|
+
fprintf(stderr," status: pictures= %d other= %d nC= %d\n# ...",
|
325
|
+
j, j2, job->res.numC);
|
326
|
+
|
327
|
+
/* remove table frames */
|
328
|
+
if (job->res.numC > 8)
|
329
|
+
for_each_data(&(job->res.boxlist)) {
|
330
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
331
|
+
if (box2->c==PICTURE
|
332
|
+
&& box2->num_ac==0 /* dont remove barcodes */
|
333
|
+
&& box2->x1-box2->x0+1>box2->p->x/2 /* big table? */
|
334
|
+
&& box2->y1-box2->y0+1>box2->p->y/2 ){ j=0;
|
335
|
+
/* count boxes nested with the picture */
|
336
|
+
for_each_data(&(job->res.boxlist)) {
|
337
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
338
|
+
if( box4 != box2 ) /* not count itself */
|
339
|
+
if (box_nested(box4,box2)) j++; /* box4 in box2 */
|
340
|
+
} end_for_each(&(job->res.boxlist));
|
341
|
+
if( j>8 ){ /* remove box if more than 8 chars are within box */
|
342
|
+
list_del(&(job->res.boxlist), box2); /* does not work proper ?! */
|
343
|
+
free_box(box2); num_del++;
|
344
|
+
}
|
345
|
+
}
|
346
|
+
} end_for_each(&(job->res.boxlist));
|
347
|
+
if (job->cfg.verbose)
|
348
|
+
fprintf(stderr, " deleted= %d pictures (table frames)\n# ...",
|
349
|
+
num_del);
|
350
|
+
num_del=0;
|
351
|
+
|
352
|
+
/* remove dark-border-boxes (typical for hard copy of book site,
|
353
|
+
* or spam random border) */
|
354
|
+
if (job->res.numC > 1) /* dont remove the only char */
|
355
|
+
for_each_data(&(job->res.boxlist)) {
|
356
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
357
|
+
if (box2->c!=PICTURE) continue; // ToDo: PICTUREs set already?
|
358
|
+
if ( box2->x1-box2->x0+1 > box2->p->x/2
|
359
|
+
&& box2->y1-box2->y0+1 > box2->p->y/2 ) continue;
|
360
|
+
j=0;
|
361
|
+
if (box2->x0==0) j++;
|
362
|
+
if (box2->y0==0) j++; /* on border? */
|
363
|
+
if (box2->x1==box2->p->x-1) j++;
|
364
|
+
if (box2->y1==box2->p->y-1) j++;
|
365
|
+
if (j>2){ /* ToDo: check corner pixel */
|
366
|
+
int cs=job->cfg.cs;
|
367
|
+
j=0;
|
368
|
+
if (getpixel(box2->p,box2->x0,box2->y0)<cs) j++;
|
369
|
+
if (getpixel(box2->p,box2->x1,box2->y0)<cs) j++;
|
370
|
+
if (getpixel(box2->p,box2->x0,box2->y1)<cs) j++;
|
371
|
+
if (getpixel(box2->p,box2->x1,box2->y1)<cs) j++;
|
372
|
+
if (j>2) {
|
373
|
+
list_del(&(job->res.boxlist), box2);
|
374
|
+
free_box(box2); num_del++;
|
375
|
+
}
|
376
|
+
}
|
377
|
+
} end_for_each(&(job->res.boxlist));
|
378
|
+
if (job->cfg.verbose)
|
379
|
+
fprintf(stderr, " deleted= %d pictures (on border)\n# ...",
|
380
|
+
num_del);
|
381
|
+
num_del=0;
|
382
|
+
|
383
|
+
j=0; j2=0;
|
384
|
+
if(job->cfg.verbose)
|
385
|
+
for_each_data(&(job->res.boxlist)) {
|
386
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
387
|
+
if( box4->c==PICTURE ) j++; else j2++;
|
388
|
+
} end_for_each(&(job->res.boxlist));
|
389
|
+
if (job->cfg.verbose)
|
390
|
+
fprintf(stderr," status: pictures= %d other= %d nC= %d\n# ...",
|
391
|
+
j, j2, job->res.numC);
|
392
|
+
|
393
|
+
for(j=1;j;){ j=0; /* this is only because list_del does not work */
|
394
|
+
/* can be slow on gray images */
|
395
|
+
for_each_data(&(job->res.boxlist)) {
|
396
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
397
|
+
if( box2->c==PICTURE && box2->num_ac==0)
|
398
|
+
for(j=1;j;){ /* let it grow to max before leave */
|
399
|
+
j=0; box4=NULL;
|
400
|
+
/* find boxes nested with the picture and remove */
|
401
|
+
/* its for pictures build by compounds */
|
402
|
+
for_each_data(&(job->res.boxlist)) {
|
403
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
404
|
+
if( box4!=box2 /* not destroy self */
|
405
|
+
&& (box4->num_ac==0) /* dont remove barcodes etc. */
|
406
|
+
&& (/* box4->c==UNKNOWN || */
|
407
|
+
box4->c==PICTURE) ) /* dont remove valid chars */
|
408
|
+
if(
|
409
|
+
/* box4 in box2, +1..-1 frame for pixel-patterns */
|
410
|
+
box_nested(box4,box2)
|
411
|
+
/* or box2 in box4 */
|
412
|
+
|| box_nested(box2,box4) /* same? */
|
413
|
+
)
|
414
|
+
if ( box4->x1-box4->x0+1>2*job->res.avX
|
415
|
+
|| box4->x1-box4->x0+1<job->res.avX/2
|
416
|
+
|| box4->y1-box4->y0+1>2*job->res.avY
|
417
|
+
|| box4->y1-box4->y0+1<job->res.avY/2
|
418
|
+
|| box_covered(box4,box2) ) /* box4 completely within box2 */
|
419
|
+
/* dont remove chars! see rotate45.fig */
|
420
|
+
{
|
421
|
+
/* do not remove boxes in inner loop (bug?) ToDo: check why! */
|
422
|
+
/* instead we leave inner loop and mark box4 as valid */
|
423
|
+
if( box4->x0<box2->x0 ) box2->x0=box4->x0;
|
424
|
+
if( box4->x1>box2->x1 ) box2->x1=box4->x1;
|
425
|
+
if( box4->y0<box2->y0 ) box2->y0=box4->y0;
|
426
|
+
if( box4->y1>box2->y1 ) box2->y1=box4->y1;
|
427
|
+
j=1; /* mark box4 as valid */
|
428
|
+
break; /* and leave inner loop */
|
429
|
+
}
|
430
|
+
} end_for_each(&(job->res.boxlist));
|
431
|
+
if (j!=0 && box4!=NULL) { /* check for valid box4 */
|
432
|
+
/* ToDo: melt */
|
433
|
+
list_del(&(job->res.boxlist), box4); /* does not work proper ?! */
|
434
|
+
free_box(box4); /* break; ToDo: necessary to leave after del??? */
|
435
|
+
num_del++;
|
436
|
+
}
|
437
|
+
|
438
|
+
}
|
439
|
+
} end_for_each(&(job->res.boxlist));
|
440
|
+
}
|
441
|
+
|
442
|
+
if (job->cfg.verbose)
|
443
|
+
fprintf(stderr, " deleted= %d nested pictures\n# ...", num_del);
|
444
|
+
|
445
|
+
/* output a list for picture handle scripts */
|
446
|
+
j=0; j2=0;
|
447
|
+
if(job->cfg.verbose)
|
448
|
+
for_each_data(&(job->res.boxlist)) {
|
449
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
450
|
+
if( box4->c==PICTURE ) {
|
451
|
+
fprintf(stderr," found picture at %4d %4d size %4d %4d\n# ...",
|
452
|
+
box4->x0, box4->y0, box4->x1-box4->x0+1, box4->y1-box4->y0+1 );
|
453
|
+
j++;
|
454
|
+
} else j2++;
|
455
|
+
} end_for_each(&(job->res.boxlist));
|
456
|
+
if (job->cfg.verbose)
|
457
|
+
fprintf(stderr," status: pictures= %d other= %d nC= %d\n",
|
458
|
+
j, j2, job->res.numC);
|
459
|
+
return 0;
|
460
|
+
}
|
461
|
+
|
462
|
+
|
463
|
+
|
464
|
+
/* ---- remove melted serifs --------------------------------- v0.2.5
|
465
|
+
>>v<<
|
466
|
+
##########.######## <-y0
|
467
|
+
################### like X VW etc.
|
468
|
+
...###.......###... <-y
|
469
|
+
...###......###....
|
470
|
+
j1 j2 j3
|
471
|
+
- can generate new boxes if two characters were glued
|
472
|
+
*/
|
473
|
+
int remove_melted_serifs( job_t *job, pix *pp ){
|
474
|
+
int x,y,j1,j2,j3,j4,i2,i3,i,ii,ni,cs,x0,x1,xa,xb,y0,y1,vvv;
|
475
|
+
struct box *box2, *box3;
|
476
|
+
// job_t *job=OCR_JOB;
|
477
|
+
progress_counter_t *pc = NULL;
|
478
|
+
|
479
|
+
vvv=job->cfg.verbose; cs=job->cfg.cs; i=0; ii=0; ni=0;
|
480
|
+
for_each_data(&(job->res.boxlist)) {
|
481
|
+
ni++;
|
482
|
+
} end_for_each(&(job->res.boxlist));
|
483
|
+
pc = open_progress(ni,"remove_melted_serifs");
|
484
|
+
ni = 0;
|
485
|
+
|
486
|
+
if(vvv){ fprintf(stderr,"# searching melted serifs ..."); }
|
487
|
+
for_each_data(&(job->res.boxlist)) {
|
488
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
489
|
+
if (box2->c != UNKNOWN) continue; /* dont try on pictures */
|
490
|
+
x0=box2->x0; x1=box2->x1;
|
491
|
+
y0=box2->y0; y1=box2->y1; /* box */
|
492
|
+
/* upper serifs */
|
493
|
+
for(j1=x0;j1+4<x1;){
|
494
|
+
j1+=loop(pp,j1,y0 ,x1-x0,cs,0,RI);
|
495
|
+
x =loop(pp,j1,y0 ,x1-x0,cs,1,RI); if(j1+x>x1+1) break;
|
496
|
+
y =loop(pp,j1,y0+1,x1-x0,cs,1,RI); if(y>x) x=y; if(j1+x>x1+1) break;
|
497
|
+
/* measure mean thickness of serif pos: (j1,y0)-(j1+x,y0) */
|
498
|
+
for(j2=j3=j4=0,i2=j1;i2<j1+x;i2++){
|
499
|
+
/* 2009-07: bug, j1 used instead of i2 */
|
500
|
+
i3 =loop(pp,i2,y0 ,y1-y0,cs,0,DO); if(8*i3>y1-y0) break;
|
501
|
+
i3+=loop(pp,i2,y0+i3,y1-y0,cs,1,DO); if(8*i3>y1-y0) continue;
|
502
|
+
if(8*i3<y1-y0){ j2+=i3; j3++; } /* sum vert. thickness */
|
503
|
+
} if(j3==0){ j1+=x; continue; } /* no serif, skip this object */
|
504
|
+
y = y0+(j2+j3-1)/j3+(y1-y0+1)/32; /* y0 + mean thickness + dy/32 + 1 */
|
505
|
+
if (vvv&1)
|
506
|
+
fprintf(stderr, "\n# upper serif x0,y0,j1-x0+x,y-y0 %4d %4d %3d + %2d %2d",
|
507
|
+
x0,y0,j1-x0,x,y-y0);
|
508
|
+
|
509
|
+
/* check if really melted serifs */
|
510
|
+
if (loop(pp,j1,y,x1-x0,cs,0,RI)<1) { j1+=x; continue; }
|
511
|
+
if(num_cross(j1 ,j1+x,y,y,pp,cs) < 2 ){ j1+=x;continue; }
|
512
|
+
if (vvv&1)
|
513
|
+
fprintf(stderr, " ok1");
|
514
|
+
j2 = j1 + loop(pp,j1,y,x1-x0,cs,0,RI);
|
515
|
+
j2 = j2 + loop(pp,j2,y,x1-x0,cs,1,RI);
|
516
|
+
i3 = loop(pp,j2,y,x1-x0,cs,0,RI); if(i3<2){j1+=x;continue;}
|
517
|
+
j2 += i3/2;
|
518
|
+
j3 = j2 + loop(pp,j2,y ,x1-j2,cs,0,RI);
|
519
|
+
i3 = j2 + loop(pp,j2,y+1,x1-j2,cs,0,RI); if(i3>j3)j3=i3;
|
520
|
+
j3 = j3 + loop(pp,j3,y ,x1-j3,cs,1,RI);
|
521
|
+
i3 = loop(pp,j3,y ,x1-j3,cs,0,RI);
|
522
|
+
if(i3<2 || j3>=j1+x){j1+=x;continue;}
|
523
|
+
j3 += i3/2;
|
524
|
+
|
525
|
+
if(x>5)
|
526
|
+
{
|
527
|
+
i++; /* snip! */
|
528
|
+
for(y=0;y<(y1-y0+1+4)/8;y++)put(pp,j2,y0+y,255,128+64); /* clear highest bit */
|
529
|
+
if(vvv&4){
|
530
|
+
fprintf(stderr,"\n");
|
531
|
+
out_x(box2);
|
532
|
+
fprintf(stderr,"# melted serifs corrected on %d %d j1=%d j3=%d",
|
533
|
+
j2-x0, y, j1-x0, j3-x0);
|
534
|
+
// ToDo: vector cut with line from xa,ya to xb,yb
|
535
|
+
// two frames of double melted MN become one frame if cut one
|
536
|
+
// of the melted serifs (new function cut_frames_at_line())
|
537
|
+
}
|
538
|
+
for(xb=0,xa=0;xa<(x1-x0+4)/8;xa++){ /* detect vertical gap */
|
539
|
+
i3=y1;
|
540
|
+
if(box2->m3>y0 && 2*y1>box2->m3+box2->m4) i3=box2->m3; /* some IJ */
|
541
|
+
if( loop(pp,j2-xa,i3,i3-y0,cs,0,UP) > (y1-y0+1)/2
|
542
|
+
&& loop(pp,j2,(y0+y1)/2,xa+1,cs,0,LE) >=xa ){ xb=-xa; break; }
|
543
|
+
if( loop(pp,j2+xa,i3,i3-y0,cs,0,UP) > (y1-y0+1)/2
|
544
|
+
&& loop(pp,j2,(y0+y1)/2,xa+1,cs,0,RI) >=xa ){ xb= xa; break; }
|
545
|
+
}
|
546
|
+
if( get_bw(j2 ,j2 ,y0,(y0+y1)/2,pp,cs,1) == 0
|
547
|
+
&& get_bw(j2+xb,j2+xb,(y0+y1)/2,i3,pp,cs,1) == 0 )
|
548
|
+
{ /* divide */
|
549
|
+
box3=malloc_box(box2);
|
550
|
+
box3->x1=j2-1;
|
551
|
+
box2->x0=j2+1; x1=box2->x1;
|
552
|
+
cut_box(box2); /* cut vectors outside the box, see box.c */
|
553
|
+
cut_box(box3);
|
554
|
+
box3->num=job->res.numC;
|
555
|
+
list_ins(&(job->res.boxlist),box2,box3);
|
556
|
+
job->res.numC++; ii++; /* insert box3 before box2 */
|
557
|
+
if(vvv&4) fprintf(stderr," => splitted");
|
558
|
+
j1=x0=box2->x0; x=0; /* hopefully ok, UVW */
|
559
|
+
}
|
560
|
+
}
|
561
|
+
j1+=x;
|
562
|
+
}
|
563
|
+
/* same on lower serifs -- change this later to better function
|
564
|
+
// #### ###
|
565
|
+
// #### v ### # <-y
|
566
|
+
// #################### <-y1
|
567
|
+
// j1 j2 j3
|
568
|
+
*/
|
569
|
+
for(j1=x0;j1<x1;){
|
570
|
+
j1+=loop(pp,j1,y1 ,x1-x0,cs,0,RI);
|
571
|
+
x =loop(pp,j1,y1 ,x1-x0,cs,1,RI); if(j1+x>x1+1) break;
|
572
|
+
y =loop(pp,j1,y1-1,x1-x0,cs,1,RI); if(y>x) x=y; if(j1+x>x1+1) break;
|
573
|
+
/* measure mean thickness of serif */
|
574
|
+
for(j2=j3=j4=0,i2=j1;i2<j1+x;i2++){
|
575
|
+
/* 2009-07: bug, j1 used instead of i2 */
|
576
|
+
i3 =loop(pp,i2,y1 ,y1-y0,cs,0,UP); if(8*i3>y1-y0) break;
|
577
|
+
i3+=loop(pp,i2,y1-i3,y1-y0,cs,1,UP); if(8*i3>y1-y0) continue;
|
578
|
+
if(8*i3<y1-y0){ j2+=i3; j3++; }
|
579
|
+
} if(j3==0){ j1+=x; continue; }
|
580
|
+
y = y1-(j2+j3-1)/j3-(y1-y0+1)/32;
|
581
|
+
if (vvv&1)
|
582
|
+
fprintf(stderr, "\n# lower serif x0,y0,j1-x0+x,y1-y %4d %4d %3d + %2d %2d",
|
583
|
+
x0,y0,j1-x0,x,y1-y);
|
584
|
+
|
585
|
+
/* check if really melted serifs */
|
586
|
+
if( loop(pp,j1,y,x1-x0,cs,0,RI)<1 ) { j1+=x; continue; }
|
587
|
+
if(num_cross(j1 ,j1+x,y,y,pp,cs) < 2 ){ j1+=x;continue; }
|
588
|
+
if (vvv&1) fprintf(stderr, " ok1");
|
589
|
+
j2 = j1 + loop(pp,j1,y,x1-x0,cs,0,RI);
|
590
|
+
j2 = j2 + loop(pp,j2,y,x1-x0,cs,1,RI);
|
591
|
+
i3 = loop(pp,j2,y,x1-x0,cs,0,RI); if(i3<2){j1+=x;continue;}
|
592
|
+
j2 += i3/2;
|
593
|
+
j3 = j2 + loop(pp,j2,y ,x1-j2,cs,0,RI);
|
594
|
+
i3 = j2 + loop(pp,j2,y-1,x1-j2,cs,0,RI); if(i3>j3)j3=i3;
|
595
|
+
j3 = j3 + loop(pp,j3,y ,x1-j3,cs,1,RI);
|
596
|
+
i3 = loop(pp,j3,y,x1-j3,cs,0,RI);
|
597
|
+
if(i3<2 || j3>=j1+x){j1+=x;continue;}
|
598
|
+
j3 += i3/2;
|
599
|
+
|
600
|
+
/* y =y1-(y1-y0+1+4)/8; */
|
601
|
+
if(x>5)
|
602
|
+
{
|
603
|
+
i++; /* snip! */
|
604
|
+
for(i3=0;i3<(y1-y0+1+4)/8;i3++)
|
605
|
+
put(pp,j2,y1-i3,255,128+64); /* clear highest bit */
|
606
|
+
if(vvv&4){
|
607
|
+
fprintf(stderr,"\n");
|
608
|
+
out_x(box2);
|
609
|
+
fprintf(stderr,"# melted serifs corrected on %d %d j1=%d j3=%d",j2-x0,y-y0,j1-x0,j3-x0);
|
610
|
+
}
|
611
|
+
for(xb=0,xa=0;xa<(x1-x0+4)/8;xa++){ /* detect vertical gap */
|
612
|
+
if( loop(pp,j2-xa,y0,y1-y0,cs,0,DO) > (y1-y0+1)/2
|
613
|
+
&& loop(pp,j2,(y0+y1)/2,xa+1,cs,0,LE) >=xa ){ xb=-xa; break; }
|
614
|
+
if( loop(pp,j2+xa,y0,y1-y0,cs,0,DO) > (y1-y0+1)/2
|
615
|
+
&& loop(pp,j2,(y0+y1)/2,xa+1,cs,0,RI) >=xa ){ xb= xa; break; }
|
616
|
+
}
|
617
|
+
if( get_bw(j2 ,j2 ,(y0+y1)/2,y1,pp,cs,1) == 0
|
618
|
+
&& get_bw(j2+xb,j2+xb,y0,(y0+y1)/2,pp,cs,1) == 0 )
|
619
|
+
{ /* divide */
|
620
|
+
box3=malloc_box(box2);
|
621
|
+
box3->x1=j2-1;
|
622
|
+
box2->x0=j2; x1=box2->x1;
|
623
|
+
cut_box(box2); /* cut vectors outside the box */
|
624
|
+
cut_box(box3);
|
625
|
+
box3->num=job->res.numC;
|
626
|
+
list_ins(&(job->res.boxlist),box2,box3); job->res.numC++; ii++;
|
627
|
+
/* box3,box2 in correct order??? */
|
628
|
+
if(vvv&4) fprintf(stderr," => splitted");
|
629
|
+
j1=x0=box2->x0; x=0; /* hopefully ok, NMK */
|
630
|
+
}
|
631
|
+
}
|
632
|
+
j1+=x;
|
633
|
+
}
|
634
|
+
progress(ni++,pc);
|
635
|
+
} end_for_each(&(job->res.boxlist));
|
636
|
+
close_progress(pc);
|
637
|
+
if(vvv)fprintf(stderr," %3d cluster corrected, %d new boxes\n",i,ii);
|
638
|
+
return 0;
|
639
|
+
}
|
640
|
+
|
641
|
+
/* remove black borders often seen on bad scanned copies of books
|
642
|
+
- dust around the border
|
643
|
+
*/
|
644
|
+
int remove_rest_of_dust( job_t *job ) {
|
645
|
+
// job_t *job=OCR_JOB;
|
646
|
+
int i1, i2, vvv = job->cfg.verbose, x0, x1, y0, y1, cnt=0;
|
647
|
+
struct box *box2, *box4;
|
648
|
+
progress_counter_t *pc = NULL;
|
649
|
+
|
650
|
+
i1 = i2 = 0; /* counter for removed boxes */
|
651
|
+
if (vvv)
|
652
|
+
fprintf(stderr, "# remove_rest_of_dust (avX,nC), ... ");
|
653
|
+
/* remove fragments from border */
|
654
|
+
for_each_data(&(job->res.boxlist)) {
|
655
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
656
|
+
if (box2->c == UNKNOWN) {
|
657
|
+
x0 = box2->x0; x1 = box2->x1;
|
658
|
+
y0 = box2->y0; y1 = box2->y1; /* box */
|
659
|
+
/* box in char ??? */
|
660
|
+
if ( 2 * job->res.numC * (y1 - y0 + 1) < 3 * job->res.sumY
|
661
|
+
&& ( y1 < box2->p->y/4 || y0 > 3*box2->p->y/4 ) /* not single line */
|
662
|
+
&& job->res.numC > 1 /* do not remove everything */
|
663
|
+
&& ( box2->m4 == 0 ) ) /* remove this */
|
664
|
+
{
|
665
|
+
job->res.numC--; /* ToDo: dont count tiny pixels */
|
666
|
+
/* ToDo: res.sumX,Y must also be corrected */
|
667
|
+
i1++;
|
668
|
+
list_del(&(job->res.boxlist), box2);
|
669
|
+
free_box(box2);
|
670
|
+
if (vvv) fprintf(stderr,"\n# remove1 %3d %3d ", x0, y0);
|
671
|
+
}
|
672
|
+
}
|
673
|
+
} end_for_each(&(job->res.boxlist));
|
674
|
+
|
675
|
+
pc = open_progress(job->res.boxlist.n,"remove_dust2");
|
676
|
+
for_each_data(&(job->res.boxlist)) {
|
677
|
+
box2 = (struct box *)list_get_current(&(job->res.boxlist));
|
678
|
+
progress(cnt++,pc);
|
679
|
+
if (box2->c == PICTURE) continue;
|
680
|
+
x0 = box2->x0; x1 = box2->x1;
|
681
|
+
y0 = box2->y0; y1 = box2->y1; /* box */
|
682
|
+
if (box2->m2 && 4*y0>box2->m2+3*box2->m3 && 2*y1<box2->m3+box2->m4)
|
683
|
+
continue; // dont remove . 2010-10-09 qemu
|
684
|
+
/* remove tiny box2 if to far away from bigger boxes */
|
685
|
+
/* ToDo: remove clouds of tiny pixels (count near small, compare with num bigger) */
|
686
|
+
/* 0.42: remove far away pixel? ToDo: do it at earlier? */
|
687
|
+
if (x1-x0+1<3 && y1-y0+1<3){
|
688
|
+
int xn, yn, xs, ys;
|
689
|
+
int found=0; /* nearest bigger box */
|
690
|
+
/* search near bigger box */
|
691
|
+
for_each_data(&(job->res.boxlist)) {
|
692
|
+
box4 = (struct box *)list_get_current(&(job->res.boxlist));
|
693
|
+
if (found || box4 == box2) continue;
|
694
|
+
if (box4->x1-box4->x0+1<3 && box4->y1-box4->y0+1<3) continue;
|
695
|
+
xs = box4->x1-box4->x0+1;
|
696
|
+
ys = box4->y1-box4->y0+1;
|
697
|
+
xn = abs((box4->x0+box4->x1)/2 - box2->x0);
|
698
|
+
yn = abs((box4->y0+box4->y1)/2 - box2->y0);
|
699
|
+
if (2*xn < 3*xs && 2*yn < 3*ys) { found=1; }
|
700
|
+
} end_for_each(&(job->res.boxlist));
|
701
|
+
if (!found) { /* found nothing, box2 to far from big boxes */
|
702
|
+
i2++;
|
703
|
+
list_del(&(job->res.boxlist), box2);
|
704
|
+
free_box(box2);
|
705
|
+
if (vvv) fprintf(stderr,"\n# remove2 %3d %3d ", x0, y0);
|
706
|
+
}
|
707
|
+
}
|
708
|
+
} end_for_each(&(job->res.boxlist));
|
709
|
+
close_progress(pc);
|
710
|
+
if (vvv)
|
711
|
+
fprintf(stderr, " %3d + %3d boxes deleted, nC= %d ?\n",
|
712
|
+
i1, i2, job->res.numC);
|
713
|
+
|
714
|
+
return 0;
|
715
|
+
}
|