gocr-ruby 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +49 -0
- data/ext/gocr/Makefile +141 -0
- data/ext/gocr/Makefile.in +140 -0
- data/ext/gocr/amiga.h +31 -0
- data/ext/gocr/barcode.c +2108 -0
- data/ext/gocr/barcode.h +11 -0
- data/ext/gocr/box.c +496 -0
- data/ext/gocr/config.h +37 -0
- data/ext/gocr/config.h.in +36 -0
- data/ext/gocr/database.c +468 -0
- data/ext/gocr/detect.c +1003 -0
- data/ext/gocr/extconf.rb +6 -0
- data/ext/gocr/gocr.c +436 -0
- data/ext/gocr/gocr.h +290 -0
- data/ext/gocr/jconv.c +168 -0
- data/ext/gocr/job.c +92 -0
- data/ext/gocr/lines.c +364 -0
- data/ext/gocr/list.c +334 -0
- data/ext/gocr/list.h +91 -0
- data/ext/gocr/ocr0.c +7312 -0
- data/ext/gocr/ocr0.h +63 -0
- data/ext/gocr/ocr0n.c +1527 -0
- data/ext/gocr/ocr1.c +85 -0
- data/ext/gocr/ocr1.h +3 -0
- data/ext/gocr/otsu.c +310 -0
- data/ext/gocr/otsu.h +23 -0
- data/ext/gocr/output.c +291 -0
- data/ext/gocr/output.h +37 -0
- data/ext/gocr/pcx.c +153 -0
- data/ext/gocr/pcx.h +9 -0
- data/ext/gocr/pgm2asc.c +3259 -0
- data/ext/gocr/pgm2asc.h +105 -0
- data/ext/gocr/pixel.c +538 -0
- data/ext/gocr/pnm.c +538 -0
- data/ext/gocr/pnm.h +35 -0
- data/ext/gocr/progress.c +87 -0
- data/ext/gocr/progress.h +42 -0
- data/ext/gocr/remove.c +715 -0
- data/ext/gocr/tga.c +87 -0
- data/ext/gocr/tga.h +6 -0
- data/ext/gocr/unicode.c +1318 -0
- data/ext/gocr/unicode.h +62 -0
- data/ext/gocr/unicode_defs.h +1245 -0
- data/ext/gocr/version.h +2 -0
- data/gocr-ruby.gemspec +28 -0
- data/image.png +0 -0
- data/lib/gocr.rb +6 -0
- data/lib/gocr/image.rb +8 -0
- data/lib/gocr/version.rb +3 -0
- metadata +156 -0
data/ext/gocr/jconv.c
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
/* OCR Aug00 JS
|
2
|
+
// PGM gray ASCII=P2 RAW=P5
|
3
|
+
// PPM RGB ASCII=P3 RAW=P6
|
4
|
+
// PBM B/W ASCII=P1 RAW=P4
|
5
|
+
// ToDo:
|
6
|
+
// - pbm-raw to pgm also for x!=0 (mod 8)
|
7
|
+
// v0.01 bug eliminated
|
8
|
+
// v0.02 convert renamed into jconv because ImageMagick uses same name
|
9
|
+
// v0.03 code review bbg
|
10
|
+
// program is not used anymore, use "convert -verbose -crop 0x0+1+1" instead
|
11
|
+
*/
|
12
|
+
|
13
|
+
// #include <iostream.h>
|
14
|
+
#include "config.h"
|
15
|
+
#include <stdio.h>
|
16
|
+
#include <stdlib.h>
|
17
|
+
#include <assert.h>
|
18
|
+
#include <string.h>
|
19
|
+
#include "pnm.h"
|
20
|
+
#ifdef HAVE_PAM_H
|
21
|
+
# include <pam.h>
|
22
|
+
#endif
|
23
|
+
#include "pcx.h"
|
24
|
+
#include "tga.h"
|
25
|
+
|
26
|
+
/* Print the jconv usage text to stdout and terminate with exit status 1. */
void help( void ) {
  static const char usage[] =
    "jconv version Aug2000 JS (pnm-raw,pcx8,tga24)\n"
    "use: jconv [options] ?infile.pnm? ?outfile.pgm? ?ox? ?oy? ?dx? ?dy?\n"
    "options: -shrink -pbm -? -help\n"
    "example: jconv -shrink -pbm font.pbm font.pbm 0 0 0 0\n";

  fputs(usage, stdout);
  exit(1);
}
|
33
|
+
|
34
|
+
int main(int argn, char *argv[])
|
35
|
+
{
|
36
|
+
char *inam, *onam;
|
37
|
+
pix bild;
|
38
|
+
int ox, oy, dx, dy, x, y, i, vvv = 0;
|
39
|
+
|
40
|
+
#ifdef HAVE_PAM_H
|
41
|
+
pnm_init(&argn, argv);
|
42
|
+
#endif
|
43
|
+
// skip options
|
44
|
+
for (i = 1; i < argn; i++) {
|
45
|
+
if (argv[i][0] != '-')
|
46
|
+
break;
|
47
|
+
if (!strcmp(argv[i], "-?"))
|
48
|
+
help();
|
49
|
+
else if (!strcmp(argv[i], "-help"))
|
50
|
+
help();
|
51
|
+
else if (!strcmp(argv[i], "-shrink"))
|
52
|
+
vvv |= 2;
|
53
|
+
else if (!strcmp(argv[i], "-pbm"))
|
54
|
+
vvv |= 4;
|
55
|
+
else
|
56
|
+
printf("unknown option: %s\n", argv[i]);
|
57
|
+
}
|
58
|
+
|
59
|
+
if (argn - i != 6)
|
60
|
+
help();
|
61
|
+
inam = argv[i++];
|
62
|
+
onam = argv[i++];
|
63
|
+
ox = atoi(argv[i++]);
|
64
|
+
oy = atoi(argv[i++]);
|
65
|
+
dx = atoi(argv[i++]);
|
66
|
+
dy = atoi(argv[i++]);
|
67
|
+
printf("# in=%s out=%s offs=%d,%d len=%d,%d vvv=%d\n",
|
68
|
+
inam, onam, ox, oy, dx, dy, vvv);
|
69
|
+
|
70
|
+
// ----- read picture
|
71
|
+
if (strstr(inam, ".pbm") ||
|
72
|
+
strstr(inam, ".pgm") ||
|
73
|
+
strstr(inam, ".ppm") ||
|
74
|
+
strstr(inam, ".pnm") ||
|
75
|
+
strstr(inam, ".pam"))
|
76
|
+
readpgm(inam, &bild, 1);
|
77
|
+
else if (strstr(inam, ".pcx"))
|
78
|
+
readpcx(inam, &bild, 1);
|
79
|
+
else if (strstr(inam, ".tga"))
|
80
|
+
readtga(inam, &bild, ((vvv > 1) ? 0 : 1));
|
81
|
+
else {
|
82
|
+
printf("Error: unknown suffix\n");
|
83
|
+
exit(1);
|
84
|
+
}
|
85
|
+
if (ox < 0 || ox >= bild.x)
|
86
|
+
ox = 0;
|
87
|
+
if (oy < 0 || ox >= bild.y)
|
88
|
+
oy = 0;
|
89
|
+
if (dx <= 0 || ox + dx > bild.x)
|
90
|
+
dx = bild.x - ox;
|
91
|
+
if (dy <= 0 || oy + dy > bild.y)
|
92
|
+
dy = bild.y - oy;
|
93
|
+
if ((vvv & 2) == 2 && bild.bpp == 1) { // -shrink
|
94
|
+
int x, y;
|
95
|
+
printf("# shrinking PGM: offs=%d,%d len=%d,%d\n", ox, oy, dx, dy);
|
96
|
+
for (y = 0; y < dy; y++) { // shrink upper border
|
97
|
+
for (x = 0; x < dx; x++)
|
98
|
+
if (bild.p[x + ox + (y + oy) * bild.x] < 127)
|
99
|
+
break;
|
100
|
+
if (x < dx) {
|
101
|
+
if (y > 0)
|
102
|
+
y--;
|
103
|
+
oy += y;
|
104
|
+
dy -= y;
|
105
|
+
break;
|
106
|
+
}
|
107
|
+
}
|
108
|
+
for (y = 0; y < dy; y++) { // shrink lower border
|
109
|
+
for (x = 0; x < dx; x++)
|
110
|
+
if (bild.p[ox + x + (oy + dy - y - 1) * bild.x] < 127)
|
111
|
+
break;
|
112
|
+
if (x < dx) {
|
113
|
+
if (y > 0)
|
114
|
+
y--;
|
115
|
+
dy -= y;
|
116
|
+
break;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
for (x = 0; x < dx; x++) { // shrink left border
|
120
|
+
for (y = 0; y < dy; y++)
|
121
|
+
if (bild.p[x + ox + (y + oy) * bild.x] < 127)
|
122
|
+
break;
|
123
|
+
if (y < dy) {
|
124
|
+
if (x > 0)
|
125
|
+
x--;
|
126
|
+
ox += x;
|
127
|
+
dx -= x;
|
128
|
+
break;
|
129
|
+
}
|
130
|
+
}
|
131
|
+
for (x = 0; x < dx; x++) { // shrink right border
|
132
|
+
for (y = 0; y < dy; y++)
|
133
|
+
if (bild.p[ox + dx - x - 1 + (oy + y) * bild.x] < 127)
|
134
|
+
break;
|
135
|
+
if (y < dy) {
|
136
|
+
if (x > 0)
|
137
|
+
x--;
|
138
|
+
dx -= x;
|
139
|
+
break;
|
140
|
+
}
|
141
|
+
}
|
142
|
+
}
|
143
|
+
printf("# final dimension: offs=%d,%d len=%d,%d bpp=%d\n",
|
144
|
+
ox, oy, dx, dy, bild.bpp);
|
145
|
+
|
146
|
+
/* bbg: could be changed to memmoves */
|
147
|
+
// ---- new size
|
148
|
+
for (y = 0; y < dy; y++)
|
149
|
+
for (x = 0; x < dx; x++)
|
150
|
+
for (i = 0; i < 3; i++)
|
151
|
+
bild.p[i + bild.bpp * (x + dx * y)] =
|
152
|
+
bild.p[i + bild.bpp * (x + ox + (y + oy) * bild.x)];
|
153
|
+
bild.x = dx;
|
154
|
+
bild.y = dy;
|
155
|
+
// ---- write internal picture of textsite
|
156
|
+
printf("# write %s\n", onam);
|
157
|
+
if (strstr(onam, ".pbm"))
|
158
|
+
writepbm(onam, &bild);
|
159
|
+
else if (strstr(onam, ".pgm"))
|
160
|
+
writepgm(onam, &bild);
|
161
|
+
else if (strstr(onam, ".ppm"))
|
162
|
+
writeppm(onam, &bild);
|
163
|
+
else if (strstr(onam, ".pnm"))
|
164
|
+
writepgm(onam, &bild);
|
165
|
+
else
|
166
|
+
printf("Error: unknown suffix");
|
167
|
+
free( bild.p );
|
168
|
+
}
|
data/ext/gocr/job.c
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
/*
|
2
|
+
This is a Optical-Character-Recognition program
|
3
|
+
Copyright (C) 2000-2010 Joerg Schulenburg
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU General Public License
|
7
|
+
as published by the Free Software Foundation; either version 2
|
8
|
+
of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
18
|
+
|
19
|
+
see README for email address */
|
20
|
+
|
21
|
+
#include "pgm2asc.h"
|
22
|
+
#include "gocr.h"
|
23
|
+
|
24
|
+
/* initialize job structure cfg and db (for all images of a multiimage) */
|
25
|
+
/* Set the per-job defaults (source name, db list, configuration) that
 * are shared by all images of a multi-image run. */
void job_init(job_t *job) {
  /* source file name default */
  job->src.fname = "-";

  /* temporaries */
  list_init( &job->tmp.dblist );

  /* configuration: string and pointer settings */
  job->cfg.lc           = "_";
  job->cfg.unrec_marker = "_";
  job->cfg.db_path      = NULL;
  job->cfg.cfilter      = NULL;

  /* configuration: numeric settings */
  job->cfg.cs           = 0;
  job->cfg.spc          = 0;
  job->cfg.mode         = 0;
  job->cfg.only_numbers = 0;
  job->cfg.verbose      = 0;
  job->cfg.dust_size    = -1;    /* auto detect */
  job->cfg.certainty    = 95;
  job->cfg.out_format   = UTF8;  /* old: ISO8859_1; */
}
|
46
|
+
|
47
|
+
/* initialize job structure for every image (multi-images) */
|
48
|
+
/* Reset the per-image part of the job: source pixel pointer, result
 * lists and statistics, and temporaries.  Called for every image of a
 * multi-image run. */
void job_init_image(job_t *job) {
  /* FIXME jb: init pix */
  job->src.p.p = NULL;

  /* results: empty lists ... */
  list_init( &job->res.boxlist );
  list_init( &job->res.linelist );
  /* ... start values for the averages ... */
  job->res.avX = 5;
  job->res.avY = 8;
  /* ... and zeroed counters */
  job->res.sumX = job->res.sumY = 0;
  job->res.numC = 0;
  job->res.lines.dy = 0;
  job->res.lines.num = 0;

  /* temporaries */
  job->tmp.n_run = 0;
  /* FIXME jb: init ppo */
  job->tmp.ppo.p = NULL;
  job->tmp.ppo.x = job->tmp.ppo.y = 0;
}
|
72
|
+
|
73
|
+
/* free job structure */
|
74
|
+
/* Release the per-image data of a job: the box list with its boxes,
 * the source pixel buffer and the temporary pixel buffer.
 * NOTE: res.linelist and tmp.dblist are not freed here (open FIXME). */
void job_free_image(job_t *job) {
  /* tmp may be just a copy of the pointer to the original image;
   * drop the alias so the buffer is released only once */
  if (job->src.p.p == job->tmp.ppo.p)
    job->tmp.ppo.p = NULL;

  list_and_data_free(&(job->res.boxlist), (void (*)(void *))free_box);

  /* free(NULL) is a no-op, so no guard is needed */
  free(job->src.p.p);
  job->src.p.p = NULL;

  free(job->tmp.ppo.p);
  job->tmp.ppo.p = NULL;
}
|
data/ext/gocr/lines.c
ADDED
@@ -0,0 +1,364 @@
|
|
1
|
+
/*
|
2
|
+
This is a Optical-Character-Recognition program
|
3
|
+
Copyright (C) 2000-2010 Joerg Schulenburg
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU General Public License
|
7
|
+
as published by the Free Software Foundation; either version 2
|
8
|
+
of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
18
|
+
|
19
|
+
see README for EMAIL-address
|
20
|
+
*/
|
21
|
+
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <stdio.h>
|
24
|
+
#include <string.h>
|
25
|
+
#include <limits.h>
|
26
|
+
#include <assert.h>
|
27
|
+
#include "pgm2asc.h"
|
28
|
+
#include "gocr.h"
|
29
|
+
#include "unicode.h" /* decode() */
|
30
|
+
#include "unicode_defs.h" /* UNKNOWN */
|
31
|
+
|
32
|
+
/*
 * Return the text stored at position `line` (0-based) of `linelist`,
 * or NULL if the list is NULL or `line` is outside 0 .. total-1.
 *
 * The returned pointer is the element's data; it is not copied.
 */
const char *getTextLine (List *linelist, int line) {
  Element *elem;
  int i;

  /* fixed off-by-one: line == list_total() is already past the end */
  if (linelist == NULL || line < 0 || line >= list_total(linelist))
    return NULL;

  elem = linelist->start.next;
  for (i = 0; i < line && elem != NULL; i++)
    elem = elem->next;

  return (elem != NULL) ? (const char *)elem->data : NULL;
}
|
48
|
+
|
49
|
+
/* Free the data of every element of linelist, then the list itself
 * (list macros/functions come from list.h). */
void free_textlines(List *linelist) {
  for_each_data(linelist) {
    void *text = list_get_current(linelist);
    free(text);                 /* free(NULL) is harmless */
  } end_for_each(linelist);
  list_free(linelist);          /* free list structure */
}
|
56
|
+
|
57
|
+
/* append a string (s1) to the string buffer (buffer) of length (len)
|
58
|
+
* if buffer is too small or len==0 realloc buffer, len+=512
|
59
|
+
*/
|
60
|
+
/*
 * Append the string s1 to the NUL-terminated text in buffer.
 *
 * buffer -- heap buffer holding the text so far (may be NULL if *len is 0)
 * s1     -- string to append; NULL or "" is reported on stderr and ignored
 * len    -- in/out: allocated size of buffer; grown in multiples of 512
 *
 * Returns the (possibly reallocated) buffer.  If realloc fails, the
 * original buffer and *len are returned unchanged and nothing is appended.
 */
char *append_to_line(char *buffer, const char *s1, int *len) {
  char *bigger;
  int used, add, grow;

  if (!s1 || *s1 == '\0') {
    fprintf(stderr,"\n#BUG: appending 0 to a line makes no sense!");
    return buffer;
  }

  /* bytes already used by the buffered string; 0 for an empty buffer */
  used = (*len > 0) ? (int)strlen(buffer) : 0;
  add = (int)strlen(s1);

  if (used + add + 1 >= *len) {
    /* enlarge by the next multiple of 512 above the appended size */
    grow = ((add + 1) / 512 + 1) * 512;
    bigger = (char *)realloc(buffer, *len + grow);
    if (bigger == NULL) {
      fprintf(stderr,"realloc failed!\n");
      return buffer;            /* buffer and *len left untouched */
    }
    buffer = bigger;
    *len += grow;
  }

  /* copy including the terminating '\0' */
  memcpy(buffer + used, s1, add + 1);
  return buffer;
}
|
80
|
+
|
81
|
+
int calc_median_gap(struct tlines * lines) {
|
82
|
+
int gaps[MAXlines], l;
|
83
|
+
if (lines->num<2) return 0;
|
84
|
+
for (l = 0; l < lines->num - 1; l++)
|
85
|
+
gaps[l] = lines->m2[l + 1] - lines->m3[l];
|
86
|
+
qsort(gaps, lines->num - 1, sizeof(gaps[0]), intcompare);
|
87
|
+
return gaps[(lines->num - 1) / 2];
|
88
|
+
}
|
89
|
+
|
90
|
+
/*
|
91
|
+
* Return the indent in pixels of the least-indented line.
|
92
|
+
* Will be subtracted as base_indent to avoid negative indent.
|
93
|
+
*
|
94
|
+
* This is adjusted to account for an angle on the page as
|
95
|
+
* a whole. For instance, if the page is rotated clockwise,
|
96
|
+
* lower lines may be physically closer to the left edge
|
97
|
+
* than higher lines that are logically less indented.
|
98
|
+
* We rotate around (0,0). Note that this rotation could
|
99
|
+
* rotate lines "off the left margin", leading to a negative
|
100
|
+
* indent.
|
101
|
+
*
|
102
|
+
* boxlist -- list of character boxes.
|
103
|
+
* dx, dy -- rotation angle as vector
|
104
|
+
*/
|
105
|
+
/*
 * Find the smallest rotation-adjusted x0 among all boxes of boxlist.
 *
 * boxlist -- list of character boxes; boxes with num == -1 (space or
 *            newline boxes inserted by list_insert_spaces) are ignored
 * dx, dy  -- rotation vector; x0 is shifted by y0*dy/dx when dx != 0
 * verbose -- non-zero: trace to stderr
 *
 * Returns the minimum found, or INT_MAX if no box qualified.
 */
int get_least_line_indent(List * boxlist, int dx, int dy, int verbose) {
  int least = INT_MAX;

  if (verbose)
    fprintf(stderr, "get_least_line_indent: rot.vector dxdy %d %d\n",
            dx, dy);

  for_each_data(boxlist) {
    struct box *cur = (struct box *)list_get_current(boxlist);
    if (cur->num != -1) {
      /* x position corrected for the page angle */
      int shifted = dx ? cur->x0 + cur->y0 * dy / dx : cur->x0;
      if (shifted < least) {
        least = shifted;
        if (verbose && dy != 0)
          fprintf(stderr,
                  "# Line %2d, unadjusted xy %3d %3d, adjusted x %2d\n",
                  cur->line, cur->x0, cur->y0, shifted);
      }
    }
  } end_for_each(boxlist);

  if (verbose)
    fprintf(stderr, "# Minimum adjusted x: %d (min_indent)\n", least);
  return least;
}
|
132
|
+
|
133
|
+
/* collect all the chars from the box tree and write them to a string buffer
|
134
|
+
mo is the mode: mode&8 means, use chars even if unsure recognized
|
135
|
+
ToDo: store full text(?), store decoded text+boxes+position chars (v0.4)
|
136
|
+
(HTML,UTF,ASCII,XML), not wchar incl. descriptions (at<95% in red)
|
137
|
+
remove decode(*c, job->cfg.out_format) from gocr.c!
|
138
|
+
XML add alternate-tags, format tags and position tags
|
139
|
+
ToDo: better output XML to stdout instead of circumstantial store to lines
|
140
|
+
not all texts/images follow the line concept?
|
141
|
+
Better use a tree of objects where leafes are chars instead of simple list.
|
142
|
+
Chars or objects are taken into account. Objects can be text strings
|
143
|
+
or XML strings.
|
144
|
+
|
145
|
+
ToDo: replacing by output_to_stream(FILE *ostream, int mo) ??
|
146
|
+
can be used via pipes (if library) ???
|
147
|
+
*/
|
148
|
+
/*
 * Walk job->res.boxlist and turn the boxes into text lines, appending
 * one heap-allocated string per line to job->res.linelist.
 *
 * job -- job whose boxlist is read and whose linelist is filled;
 *        cfg.out_format selects plain text or XML framing, and boxes
 *        whose best alternative is below cfg.certainty are reset to
 *        UNKNOWN
 * mo  -- mode flag; currently not evaluated by this function
 *
 * On malloc failure of the working buffer a message is printed to
 * stderr and nothing is stored.
 */
void store_boxtree_lines(job_t *job, int mo) {
  char *buffer;          /* temp buffer for text */
  char s1[256];          /* scratch for formatted XML fragments */
  int i = 0, j = 0;      /* i: boxes processed, j: chars in current line */
  int len = 1024;        // initial buffer length for text line
  struct box *box2;
  int median_gap = 0;
  int max_single_space_gap = 0;
  /* read line geometry through a pointer instead of copying the whole
   * struct for every box; it is not modified inside the loop */
  const struct tlines *line_info = &job->res.lines;
  int line, line_gap, oldline=-1;
  int left_margin;
  int i1=0, i2=0;        /* non-space chars seen / actually written */

  (void)mo;

  buffer = (char *)malloc(len);
  if ( !buffer ) {
    fprintf(stderr,"malloc failed!\n"); // ToDo: index_to_error_list
    return;
  }
  *buffer = 0;

  if ( job->cfg.verbose&1 )
    fprintf(stderr,"# store boxtree to lines ...");

  /* wew: calculate the median line gap, to determine line spacing
   * for the text output. The line gap used is between one line's
   * m3 (baseline) and the next line's m2 (height of non-rising
   * lowercase). We use these lines as they are the least likely
   * to vary according to actual character content of lines.
   */
  median_gap = calc_median_gap(&job->res.lines);
  if (median_gap <= 0) {
    if ( job->cfg.verbose&1 )
      fprintf(stderr, "# Warning: non-positive median line gap of %d\n",
              median_gap);
    median_gap = 8;
    max_single_space_gap = 12; /* arbitrary */
  } else {
    max_single_space_gap = median_gap * 7 / 4;
  }

  // Will be subtracted as base_indent to avoid negative indent.
  left_margin = get_least_line_indent(&job->res.boxlist,
                                      job->res.lines.dx,
                                      job->res.lines.dy, job->cfg.verbose);

  if (job->cfg.out_format==XML) {  /* subject of change */
    /* output lot of useful information for XML filter */
    snprintf(s1, sizeof s1, "<page x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
             0,0,0,0);
    buffer=append_to_line(buffer,s1,&len);
    snprintf(s1, sizeof s1, "<block x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
             0,0,0,0);
    buffer=append_to_line(buffer,s1,&len);
  }

  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    line = box2->line;
    /* reset the output char if certainty is below the limit v0.44 */
    if (box2->num_ac && box2->wac[0]<job->cfg.certainty) box2->c=UNKNOWN;

    if (line!=oldline) {
      if (job->cfg.out_format==XML && oldline>-1) { /* subject of change */
        /* close and store the previous XML line */
        buffer=append_to_line(buffer,"</line>\n",&len);
        list_app( &(job->res.linelist), (void *)strdup(buffer) ); // wcsdup
        memset(buffer, 0, len);
        j=0;  // reset counter for new line
      }
      if (job->cfg.out_format==XML) { /* subject of change */
        /* output lot of useful information for XML filter */
        snprintf(s1, sizeof s1,
                 "<line x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"%d\">\n",
                 line_info->x0[line],line_info->m1[line],
                 line_info->x1[line]-line_info->x0[line]+1,
                 line_info->m4[line]-line_info->m1[line],line);
        buffer=append_to_line(buffer,s1,&len);
      }
      oldline=line;
    }

    if (box2->c > ' ' &&
        box2->c <= 'z') i1++; /* count non-space chars */

    if (box2->c == '\n') {
      if (job->cfg.out_format!=XML) { /* subject of change */
        if (line > 0) {
          /* emit extra "\n" for vertical gaps larger than one line */
          line_gap = line_info->m2[line] - line_info->m3[line - 1];
          for (line_gap -= max_single_space_gap; line_gap > 0;
               line_gap -= median_gap) {
            buffer=append_to_line(buffer,"\n",&len);
            j++;  /* count chars in line */
          }
        }
        list_app( &(job->res.linelist), (void *)strdup(buffer) ); // wcsdup
        memset(buffer, 0, len);
        j=0;  // reset counter for new line
      }
    }

    if (box2->c == ' ')  // fill large gaps with spaces
    {
      if (job->res.avX) { /* avoid SIGFPE */
        if (job->cfg.out_format==XML) { /* subject of change */
          /* output lot of useful information for XML filter */
          snprintf(s1, sizeof s1,
                   " <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
                   box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
          buffer=append_to_line(buffer,s1,&len);
        } else
        // multi spacing is done now in pgm2asc using insert spaces 2010-09-28
        {
          buffer=append_to_line(buffer," ",&len);
          j++; /* number of chars in line */
        }
      }
    }
    else if (box2->c != '\n') {
      if (j==0 && job->res.avX) /* first char in new line? */ {
        int indent = box2->x0 - job->res.lines.x0[box2->line];
        /* correct for angle of page as a whole. */
        if (job->res.lines.dx)
          indent += box2->y0 * job->res.lines.dy / job->res.lines.dx;
        /* subtract the base margin. */
        indent -= left_margin;
        if (job->cfg.out_format==XML) { /* subject of change */
          /* output lot of useful information for XML filter */
          snprintf(s1, sizeof s1,
                   " <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
                   box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
          buffer=append_to_line(buffer,s1,&len);
        } else {
          // ToDo: do the multi "\n" earlier in pgm2asc (like multi spacing)
          /* own loop variable: this loop used to run on `i` and thereby
           * destroyed the box counter reported in the verbose summary */
          int pad;
          for (pad = indent / job->res.avX; pad > 0; pad--) {
            buffer=append_to_line(buffer," ",&len); j++;
          }
        }
      }
      if (job->cfg.out_format==XML) { /* subject of change */
        /* output lot of useful information for XML filter */
        snprintf(s1, sizeof s1,
                 " <box x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"",
                 box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
        buffer=append_to_line(buffer,s1,&len);
        /* ToDo: output a list of alternatives if box2->num_ac>1 */
      }
      if (box2->c != UNKNOWN && box2->c != 0) {
        buffer=
          append_to_line(buffer,decode(box2->c,job->cfg.out_format),&len);
        if (box2->c > ' ' &&
            box2->c <= 'z') i2++; /* count non-space chars */
      } else { /* c == UNKNOWN or 0 */
        if (box2->num_ac>0 && box2->tas[0]
         && (/* job->cfg.out_format!=XML || 2010-10 */ box2->tas[0][0]!='<')
           ) { // 2010-10 output XML code after XML frame, see below!
          /* output glued chars or ... (?) Jan08 */
          buffer=append_to_line(buffer,box2->tas[0],&len);
          j+=strlen(box2->tas[0]);
        } // else 2010-10-07
        if (box2->num_ac == 0 || box2->c == UNKNOWN) {
          /* ToDo: leave string empty? set placeholder per option */
          /* output dummy string to mark UNKNOWN */
          if (job->cfg.unrec_marker[0])
            buffer = append_to_line(buffer, job->cfg.unrec_marker, &len);
        }
      }
      if (job->cfg.out_format==XML) {
        if (box2->num_ac>0) {
          /* output alist ToDo: separate <altbox ...> */
          int k;  /* index over alternatives; no longer shadows i1 */
          snprintf(s1, sizeof s1, "\" numac=\"%d\" weights=\"",box2->num_ac);
          buffer=append_to_line(buffer,s1,&len);
          for (k=0;k<box2->num_ac;k++) {
            snprintf(s1, sizeof s1, "%d",box2->wac[k]);
            buffer=append_to_line(buffer,s1,&len);
            if (k+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
          }
          if (box2->num_ac>1)
            buffer=append_to_line(buffer,"\" achars=\"",&len);
          for (k=1;k<box2->num_ac;k++) {
            if (box2->tas[k] && box2->tas[k][0]!='<')
              buffer=append_to_line(buffer,box2->tas[k],&len);
            else
              buffer=append_to_line(buffer,
                       decode(box2->tac[k],job->cfg.out_format),&len);
            // ToDo: add tas[] (achars->avalues or alternate_strings?
            if (k+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
          }
        }
        buffer=append_to_line(buffer,"\" />\n",&len);
      }
      if (box2->num_ac && box2->tas[0]) {
        if (box2->tas[0][0]=='<') { /* output special XML object */
          buffer=append_to_line(buffer,box2->tas[0],&len);
          if (job->cfg.out_format==XML) // 2010-10-07
            buffer=append_to_line(buffer,"\n",&len);
          j+=strlen(box2->tas[0]);
        }
      }
      j++; /* number of chars in line */
    }
    i++;
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.out_format==XML && oldline>-1) { /* subject of change */
    buffer=append_to_line(buffer,"</line>\n",&len);
  }
  if (job->cfg.out_format==XML) { /* subject of change */
    buffer=append_to_line(buffer,"</block>\n</page>\n",&len);
  }

  /* do not forget last line */
  // is there no \n in the last line? If there is, delete next line.
  list_app( &(job->res.linelist), (void *)strdup(buffer) );
  free(buffer);
  if( job->cfg.verbose&1 )
    fprintf(stderr,"... %d lines, boxes= %d, chars= %d\n",i,i1,i2);
}
|