gocr-ruby 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +21 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +49 -0
  8. data/ext/gocr/Makefile +141 -0
  9. data/ext/gocr/Makefile.in +140 -0
  10. data/ext/gocr/amiga.h +31 -0
  11. data/ext/gocr/barcode.c +2108 -0
  12. data/ext/gocr/barcode.h +11 -0
  13. data/ext/gocr/box.c +496 -0
  14. data/ext/gocr/config.h +37 -0
  15. data/ext/gocr/config.h.in +36 -0
  16. data/ext/gocr/database.c +468 -0
  17. data/ext/gocr/detect.c +1003 -0
  18. data/ext/gocr/extconf.rb +6 -0
  19. data/ext/gocr/gocr.c +436 -0
  20. data/ext/gocr/gocr.h +290 -0
  21. data/ext/gocr/jconv.c +168 -0
  22. data/ext/gocr/job.c +92 -0
  23. data/ext/gocr/lines.c +364 -0
  24. data/ext/gocr/list.c +334 -0
  25. data/ext/gocr/list.h +91 -0
  26. data/ext/gocr/ocr0.c +7312 -0
  27. data/ext/gocr/ocr0.h +63 -0
  28. data/ext/gocr/ocr0n.c +1527 -0
  29. data/ext/gocr/ocr1.c +85 -0
  30. data/ext/gocr/ocr1.h +3 -0
  31. data/ext/gocr/otsu.c +310 -0
  32. data/ext/gocr/otsu.h +23 -0
  33. data/ext/gocr/output.c +291 -0
  34. data/ext/gocr/output.h +37 -0
  35. data/ext/gocr/pcx.c +153 -0
  36. data/ext/gocr/pcx.h +9 -0
  37. data/ext/gocr/pgm2asc.c +3259 -0
  38. data/ext/gocr/pgm2asc.h +105 -0
  39. data/ext/gocr/pixel.c +538 -0
  40. data/ext/gocr/pnm.c +538 -0
  41. data/ext/gocr/pnm.h +35 -0
  42. data/ext/gocr/progress.c +87 -0
  43. data/ext/gocr/progress.h +42 -0
  44. data/ext/gocr/remove.c +715 -0
  45. data/ext/gocr/tga.c +87 -0
  46. data/ext/gocr/tga.h +6 -0
  47. data/ext/gocr/unicode.c +1318 -0
  48. data/ext/gocr/unicode.h +62 -0
  49. data/ext/gocr/unicode_defs.h +1245 -0
  50. data/ext/gocr/version.h +2 -0
  51. data/gocr-ruby.gemspec +28 -0
  52. data/image.png +0 -0
  53. data/lib/gocr.rb +6 -0
  54. data/lib/gocr/image.rb +8 -0
  55. data/lib/gocr/version.rb +3 -0
  56. metadata +156 -0
@@ -0,0 +1,168 @@
1
+ /* OCR Aug00 JS
2
+ // PGM gray ASCII=P2 RAW=P5
3
+ // PPM RGB ASCII=P3 RAW=P6
4
+ // PBM B/W ASCII=P1 RAW=P4
5
+ // ToDo:
6
+ // - pbm-raw to pgm also for x!=0 (mod 8)
7
+ // v0.01 bug eliminated
8
+ // v0.02 convert renamed into jconv because ImageMagick uses same name
9
+ // v0.03 code review bbg
10
+ // program is not used anymore, use "convert -verbose -crop 0x0+1+1" instead
11
+ */
12
+
13
+ // #include <iostream.h>
14
+ #include "config.h"
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <assert.h>
18
+ #include <string.h>
19
+ #include "pnm.h"
20
+ #ifdef HAVE_PAM_H
21
+ # include <pam.h>
22
+ #endif
23
+ #include "pcx.h"
24
+ #include "tga.h"
25
+
26
/* Print the command line synopsis of jconv on stdout and terminate the
 * process with exit status 1. This function never returns. */
void help( void ) {
  static const char usage[] =
    "jconv version Aug2000 JS (pnm-raw,pcx8,tga24)\n"
    "use: jconv [options] ?infile.pnm? ?outfile.pgm? ?ox? ?oy? ?dx? ?dy?\n"
    "options: -shrink -pbm -? -help\n"
    "example: jconv -shrink -pbm font.pbm font.pbm 0 0 0 0\n";

  fputs(usage, stdout);
  exit(1);
}
33
+
34
+ int main(int argn, char *argv[])
35
+ {
36
+ char *inam, *onam;
37
+ pix bild;
38
+ int ox, oy, dx, dy, x, y, i, vvv = 0;
39
+
40
+ #ifdef HAVE_PAM_H
41
+ pnm_init(&argn, argv);
42
+ #endif
43
+ // skip options
44
+ for (i = 1; i < argn; i++) {
45
+ if (argv[i][0] != '-')
46
+ break;
47
+ if (!strcmp(argv[i], "-?"))
48
+ help();
49
+ else if (!strcmp(argv[i], "-help"))
50
+ help();
51
+ else if (!strcmp(argv[i], "-shrink"))
52
+ vvv |= 2;
53
+ else if (!strcmp(argv[i], "-pbm"))
54
+ vvv |= 4;
55
+ else
56
+ printf("unknown option: %s\n", argv[i]);
57
+ }
58
+
59
+ if (argn - i != 6)
60
+ help();
61
+ inam = argv[i++];
62
+ onam = argv[i++];
63
+ ox = atoi(argv[i++]);
64
+ oy = atoi(argv[i++]);
65
+ dx = atoi(argv[i++]);
66
+ dy = atoi(argv[i++]);
67
+ printf("# in=%s out=%s offs=%d,%d len=%d,%d vvv=%d\n",
68
+ inam, onam, ox, oy, dx, dy, vvv);
69
+
70
+ // ----- read picture
71
+ if (strstr(inam, ".pbm") ||
72
+ strstr(inam, ".pgm") ||
73
+ strstr(inam, ".ppm") ||
74
+ strstr(inam, ".pnm") ||
75
+ strstr(inam, ".pam"))
76
+ readpgm(inam, &bild, 1);
77
+ else if (strstr(inam, ".pcx"))
78
+ readpcx(inam, &bild, 1);
79
+ else if (strstr(inam, ".tga"))
80
+ readtga(inam, &bild, ((vvv > 1) ? 0 : 1));
81
+ else {
82
+ printf("Error: unknown suffix\n");
83
+ exit(1);
84
+ }
85
+ if (ox < 0 || ox >= bild.x)
86
+ ox = 0;
87
+ if (oy < 0 || ox >= bild.y)
88
+ oy = 0;
89
+ if (dx <= 0 || ox + dx > bild.x)
90
+ dx = bild.x - ox;
91
+ if (dy <= 0 || oy + dy > bild.y)
92
+ dy = bild.y - oy;
93
+ if ((vvv & 2) == 2 && bild.bpp == 1) { // -shrink
94
+ int x, y;
95
+ printf("# shrinking PGM: offs=%d,%d len=%d,%d\n", ox, oy, dx, dy);
96
+ for (y = 0; y < dy; y++) { // shrink upper border
97
+ for (x = 0; x < dx; x++)
98
+ if (bild.p[x + ox + (y + oy) * bild.x] < 127)
99
+ break;
100
+ if (x < dx) {
101
+ if (y > 0)
102
+ y--;
103
+ oy += y;
104
+ dy -= y;
105
+ break;
106
+ }
107
+ }
108
+ for (y = 0; y < dy; y++) { // shrink lower border
109
+ for (x = 0; x < dx; x++)
110
+ if (bild.p[ox + x + (oy + dy - y - 1) * bild.x] < 127)
111
+ break;
112
+ if (x < dx) {
113
+ if (y > 0)
114
+ y--;
115
+ dy -= y;
116
+ break;
117
+ }
118
+ }
119
+ for (x = 0; x < dx; x++) { // shrink left border
120
+ for (y = 0; y < dy; y++)
121
+ if (bild.p[x + ox + (y + oy) * bild.x] < 127)
122
+ break;
123
+ if (y < dy) {
124
+ if (x > 0)
125
+ x--;
126
+ ox += x;
127
+ dx -= x;
128
+ break;
129
+ }
130
+ }
131
+ for (x = 0; x < dx; x++) { // shrink right border
132
+ for (y = 0; y < dy; y++)
133
+ if (bild.p[ox + dx - x - 1 + (oy + y) * bild.x] < 127)
134
+ break;
135
+ if (y < dy) {
136
+ if (x > 0)
137
+ x--;
138
+ dx -= x;
139
+ break;
140
+ }
141
+ }
142
+ }
143
+ printf("# final dimension: offs=%d,%d len=%d,%d bpp=%d\n",
144
+ ox, oy, dx, dy, bild.bpp);
145
+
146
+ /* bbg: could be changed to memmoves */
147
+ // ---- new size
148
+ for (y = 0; y < dy; y++)
149
+ for (x = 0; x < dx; x++)
150
+ for (i = 0; i < 3; i++)
151
+ bild.p[i + bild.bpp * (x + dx * y)] =
152
+ bild.p[i + bild.bpp * (x + ox + (y + oy) * bild.x)];
153
+ bild.x = dx;
154
+ bild.y = dy;
155
+ // ---- write internal picture of textsite
156
+ printf("# write %s\n", onam);
157
+ if (strstr(onam, ".pbm"))
158
+ writepbm(onam, &bild);
159
+ else if (strstr(onam, ".pgm"))
160
+ writepgm(onam, &bild);
161
+ else if (strstr(onam, ".ppm"))
162
+ writeppm(onam, &bild);
163
+ else if (strstr(onam, ".pnm"))
164
+ writepgm(onam, &bild);
165
+ else
166
+ printf("Error: unknown suffix");
167
+ free( bild.p );
168
+ }
@@ -0,0 +1,92 @@
1
+ /*
2
+ This is a Optical-Character-Recognition program
3
+ Copyright (C) 2000-2010 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for email address */
20
+
21
+ #include "pgm2asc.h"
22
+ #include "gocr.h"
23
+
24
+ /* initialize job structure cfg and db (for all images of a multiimage) */
25
+ void job_init(job_t *job) {
26
+ /* init source */
27
+ job->src.fname = "-";
28
+
29
+ /* init temporaries */
30
+ list_init( &job->tmp.dblist );
31
+
32
+ /* init cfg */
33
+ job->cfg.cs = 0;
34
+ job->cfg.spc = 0;
35
+ job->cfg.mode = 0;
36
+ job->cfg.dust_size = -1; /* auto detect */
37
+ job->cfg.only_numbers = 0;
38
+ job->cfg.verbose = 0;
39
+ job->cfg.out_format = UTF8; /* old: ISO8859_1; */
40
+ job->cfg.lc = "_";
41
+ job->cfg.db_path = (char*)NULL;
42
+ job->cfg.cfilter = (char*)NULL;
43
+ job->cfg.certainty = 95;
44
+ job->cfg.unrec_marker = "_";
45
+ }
46
+
47
+ /* initialize job structure for every image (multi-images) */
48
+ void job_init_image(job_t *job) {
49
+
50
+ /* FIXME jb: init pix */
51
+ job->src.p.p = NULL;
52
+
53
+ /* init results */
54
+ list_init( &job->res.boxlist );
55
+ list_init( &job->res.linelist );
56
+ job->res.avX = 5;
57
+ job->res.avY = 8;
58
+ job->res.sumX = 0;
59
+ job->res.sumY = 0;
60
+ job->res.numC = 0;
61
+ job->res.lines.dy=0;
62
+ job->res.lines.num=0;
63
+
64
+ /* init temporaries */
65
+ job->tmp.n_run = 0;
66
+ /* FIXME jb: init ppo */
67
+ job->tmp.ppo.p = NULL;
68
+ job->tmp.ppo.x = 0;
69
+ job->tmp.ppo.y = 0;
70
+
71
+ }
72
+
73
+ /* free job structure */
74
+ void job_free_image(job_t *job) {
75
+
76
+ /* if tmp is just a copy of the pointer to the original image */
77
+ if (job->tmp.ppo.p==job->src.p.p) job->tmp.ppo.p=NULL;
78
+
79
+ /* FIMXE jb: free lists
80
+ * list_free( &job->res.linelist );
81
+ * list_free( &job->tmp.dblist );
82
+ */
83
+
84
+ list_and_data_free(&(job->res.boxlist), (void (*)(void *))free_box);
85
+
86
+ /* FIXME jb: free pix */
87
+ if (job->src.p.p) { free(job->src.p.p); job->src.p.p=NULL; }
88
+
89
+ /* FIXME jb: free pix */
90
+ if (job->tmp.ppo.p) { free(job->tmp.ppo.p); job->tmp.ppo.p=NULL; }
91
+
92
+ }
@@ -0,0 +1,364 @@
1
+ /*
2
+ This is a Optical-Character-Recognition program
3
+ Copyright (C) 2000-2010 Joerg Schulenburg
4
+
5
+ This program is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU General Public License
7
+ as published by the Free Software Foundation; either version 2
8
+ of the License, or (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
+
19
+ see README for EMAIL-address
20
+ */
21
+
22
+ #include <stdlib.h>
23
+ #include <stdio.h>
24
+ #include <string.h>
25
+ #include <limits.h>
26
+ #include <assert.h>
27
+ #include "pgm2asc.h"
28
+ #include "gocr.h"
29
+ #include "unicode.h" /* decode() */
30
+ #include "unicode_defs.h" /* UNKNOWN */
31
+
32
+ const char *getTextLine (List *linelist, int line) {
33
+ int i;
34
+ Element *elem;
35
+
36
+ if (line < 0 || line > list_total(linelist))
37
+ return NULL;
38
+
39
+ for ( i = 0, elem = linelist->start.next;
40
+ i < line && elem != NULL; i++ )
41
+ elem = elem->next;
42
+
43
+ if ( elem != NULL )
44
+ return (const char *)elem->data;
45
+
46
+ return NULL;
47
+ }
48
+
49
+ void free_textlines(List *linelist) { // list.h
50
+ for_each_data(linelist) {
51
+ if (list_get_current(linelist))
52
+ free(list_get_current(linelist)); // free list element
53
+ } end_for_each(linelist);
54
+ list_free(linelist); // free list structure
55
+ }
56
+
57
/* Append the string s1 to the string held in buffer.
 *
 * buffer -- heap block of capacity *len holding a NUL-terminated string,
 *           or NULL if nothing has been allocated yet
 * s1     -- text to append; NULL or "" is reported as a bug on stderr and
 *           leaves buffer untouched
 * len    -- in/out: capacity of buffer in bytes
 *
 * If the result would not fit, the buffer is enlarged with realloc() by
 * the next multiple of 512 bytes above strlen(s1)+1 and *len is updated.
 * With *len == 0 the old content of buffer is ignored.
 *
 * Returns the (possibly moved) buffer. If realloc() fails a message is
 * printed, nothing is appended and the old buffer is returned with *len
 * unchanged.
 */
char *append_to_line(char *buffer, const char *s1, int *len) {
  char *temp;
  int slen = 0, alen, grow;

  if (s1 == NULL || s1[0] == 0) {
    fprintf(stderr,"\n#BUG: appending 0 to a line makes no sense!");
    return buffer;
  }

  /* no storage behind the pointer (or a nonsensical capacity): start from
   * an empty buffer instead of calling strlen()/memcpy() on NULL */
  if (buffer == NULL || *len < 0)
    *len = 0;

  if (*len > 0)
    slen = (int)strlen(buffer);         /* used part of the buffer */
  alen = (int)strlen(s1);

  if (slen + alen + 1 >= *len) {
    grow = (((alen + 1) >> 9) + 1) << 9;  /* round up to next 512 bytes */
    temp = realloc(buffer, (size_t)(*len + grow));
    if (!temp) {
      fprintf(stderr,"realloc failed!\n");
      return buffer;                    /* old buffer and *len stay valid */
    }
    buffer = temp;                      /* buffer successfully enlarged */
    *len += grow;
  }

  /* copy behind the existing text, including the terminating '\0' */
  memcpy(buffer + slen, s1, (size_t)alen + 1);
  return buffer;
}
80
+
81
+ int calc_median_gap(struct tlines * lines) {
82
+ int gaps[MAXlines], l;
83
+ if (lines->num<2) return 0;
84
+ for (l = 0; l < lines->num - 1; l++)
85
+ gaps[l] = lines->m2[l + 1] - lines->m3[l];
86
+ qsort(gaps, lines->num - 1, sizeof(gaps[0]), intcompare);
87
+ return gaps[(lines->num - 1) / 2];
88
+ }
89
+
90
+ /*
91
+ * Return the indent in pixels of the least-indented line.
92
+ * Will be subtracted as base_indent to avoid negativ indent.
93
+ *
94
+ * This is adjusted to account for an angle on the page as
95
+ * a whole. For instance, if the page is rotated clockwise,
96
+ * lower lines may be physically closer to the left edge
97
+ * than higher lines that are logically less indented.
98
+ * We rotate around (0,0). Note that this rotation could
99
+ * rotate lines "off the left margin", leading to a negative
100
+ * indent.
101
+ *
102
+ * boxlist -- list of character boxes.
103
+ * dx, dy -- rotation angle as vector
104
+ */
105
+ int get_least_line_indent(List * boxlist, int dx, int dy, int verbose) {
106
+ int min_indent = INT_MAX;
107
+ int adjusted_indent;
108
+ struct box * box2;
109
+ if (verbose)
110
+ fprintf(stderr, "get_least_line_indent: rot.vector dxdy %d %d\n",
111
+ dx, dy);
112
+ for_each_data(boxlist) {
113
+ box2 = (struct box *)list_get_current(boxlist);
114
+ /* if num == -1, indicates this is a space or newline box,
115
+ * inserted in list_insert_spaces. */
116
+ if (box2->num != -1) {
117
+ adjusted_indent = box2->x0;
118
+ if (dx) adjusted_indent += box2->y0 * dy / dx;
119
+ if (adjusted_indent < min_indent) {
120
+ min_indent = adjusted_indent;
121
+ if (dy!=0 && verbose)
122
+ fprintf(stderr,
123
+ "# Line %2d, unadjusted xy %3d %3d, adjusted x %2d\n",
124
+ box2->line, box2->x0, box2->y0, adjusted_indent);
125
+ }
126
+ }
127
+ } end_for_each(boxlist);
128
+ if (verbose)
129
+ fprintf(stderr, "# Minimum adjusted x: %d (min_indent)\n", min_indent);
130
+ return min_indent;
131
+ }
132
+
133
+ /* collect all the chars from the box tree and write them to a string buffer
134
+ mo is the mode: mode&8 means, use chars even if unsure recognized
135
+ ToDo: store full text(?), store decoded text+boxes+position chars (v0.4)
136
+ (HTML,UTF,ASCII,XML), not wchar incl. descriptions (at<95% in red)
137
+ remove decode(*c, job->cfg.out_format) from gocr.c!
138
+ XML add alternate-tags, format tags and position tags
139
+ ToDo: better output XML to stdout instead of circumstantial store to lines
140
+ not all texts/images follow the line concept?
141
+ Better use a tree of objects where leafes are chars instead of simple list.
142
+ Chars or objects are taken into account. Objects can be text strings
143
+ or XML strings.
144
+
145
+ ToDo: replacing by output_to_stream(FILE *ostream, int mo) ??
146
+ can be used via pipes (if library) ???
147
+ */
148
+ void store_boxtree_lines(job_t *job, int mo) {
149
+ char *buffer; /* temp buffer for text */
150
+ int i = 0, j = 0;
151
+ int len = 1024; // initial buffer length for text line
152
+ struct box *box2;
153
+ int median_gap = 0;
154
+ int max_single_space_gap = 0;
155
+ struct tlines line_info;
156
+ int line, line_gap, oldline=-1;
157
+ int left_margin;
158
+ int i1=0, i2=0;
159
+
160
+ buffer = (char *)malloc(len);
161
+ if ( !buffer ) {
162
+ fprintf(stderr,"malloc failed!\n"); // ToDo: index_to_error_list
163
+ return;
164
+ }
165
+ *buffer = 0;
166
+
167
+ if ( job->cfg.verbose&1 )
168
+ fprintf(stderr,"# store boxtree to lines ...");
169
+
170
+ /* wew: calculate the median line gap, to determine line spacing
171
+ * for the text output. The line gap used is between one line's
172
+ * m3 (baseline) and the next line's m2 (height of non-rising
173
+ * lowercase). We use these lines as they are the least likely
174
+ * to vary according to actual character content of lines.
175
+ */
176
+ median_gap = calc_median_gap(&job->res.lines);
177
+ if (median_gap <= 0) {
178
+ if ( job->cfg.verbose&1 )
179
+ fprintf(stderr, "# Warning: non-positive median line gap of %d\n",
180
+ median_gap);
181
+ median_gap = 8;
182
+ max_single_space_gap = 12; /* arbitrary */
183
+ } else {
184
+ max_single_space_gap = median_gap * 7 / 4;
185
+ }
186
+
187
+ // Will be subtracted as base_indent to avoid negativ indent.
188
+ left_margin = get_least_line_indent(&job->res.boxlist,
189
+ job->res.lines.dx,
190
+ job->res.lines.dy, job->cfg.verbose);
191
+
192
+ if (job->cfg.out_format==XML) { /* subject of change */
193
+ char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
194
+ /* output lot of usefull information for XML filter */
195
+ sprintf(s1,"<page x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
196
+ 0,0,0,0);
197
+ buffer=append_to_line(buffer,s1,&len);
198
+ sprintf(s1,"<block x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n",
199
+ 0,0,0,0);
200
+ buffer=append_to_line(buffer,s1,&len);
201
+ }
202
+
203
+ for_each_data(&(job->res.boxlist)) {
204
+ box2 = (struct box *)list_get_current(&(job->res.boxlist));
205
+ line = box2->line;
206
+ line_info = job->res.lines;
207
+ /* reset the output char if certainty is below the limit v0.44 */
208
+ if (box2->num_ac && box2->wac[0]<job->cfg.certainty) box2->c=UNKNOWN;
209
+ if (line!=oldline) {
210
+ if (job->cfg.out_format==XML && oldline>-1) { /* subject of change */
211
+ buffer=append_to_line(buffer,"</line>\n",&len);
212
+ list_app( &(job->res.linelist), (void *)strdup(buffer) ); // wcsdup
213
+ memset(buffer, 0, len);
214
+ j=0; // reset counter for new line
215
+ }
216
+ if (job->cfg.out_format==XML) { /* subject of change */
217
+ char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
218
+ /* output lot of usefull information for XML filter */
219
+ sprintf(s1,"<line x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"%d\">\n",
220
+ line_info.x0[line],line_info.m1[line],
221
+ line_info.x1[line]-line_info.x0[line]+1,
222
+ line_info.m4[line]-line_info.m1[line],line);
223
+ buffer=append_to_line(buffer,s1,&len);
224
+ }
225
+ oldline=line;
226
+ }
227
+ if (box2->c > ' ' &&
228
+ box2->c <= 'z') i1++; /* count non-space chars */
229
+ if (box2->c == '\n') {
230
+ if (job->cfg.out_format!=XML) { /* subject of change */
231
+ line_info = job->res.lines;
232
+ line = box2->line;
233
+ if (line > 0) {
234
+ line_gap = line_info.m2[line] - line_info.m3[line - 1];
235
+ for (line_gap -= max_single_space_gap; line_gap > 0;
236
+ line_gap -= median_gap) {
237
+ buffer=append_to_line(buffer,"\n",&len);
238
+ j++; /* count chars in line */
239
+ }
240
+ }
241
+ list_app( &(job->res.linelist), (void *)strdup(buffer) ); // wcsdup
242
+ memset(buffer, 0, len);
243
+ j=0; // reset counter for new line
244
+ }
245
+ }
246
+ if (box2->c == ' ') // fill large gaps with spaces
247
+ {
248
+ if (job->res.avX) { /* avoid SIGFPE */
249
+ if (job->cfg.out_format==XML) { /* subject of change */
250
+ char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
251
+ /* output lot of usefull information for XML filter */
252
+ sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
253
+ box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
254
+ buffer=append_to_line(buffer,s1,&len);
255
+ } else
256
+ // multi spacing is done now in pgm2asc using insert spaces 2010-09-28
257
+ // for (i = (box2->x1 - box2->x0 + 1) / (2 * job->res.avX) + 1; i > 0; i--)
258
+ {
259
+ buffer=append_to_line(buffer," ",&len);
260
+ j++; /* number of chars in line */
261
+ }
262
+ }
263
+ }
264
+ else if (box2->c != '\n') {
265
+ if (j==0 && job->res.avX) /* first char in new line? */ {
266
+ int indent = box2->x0 - job->res.lines.x0[box2->line];
267
+ /* correct for angle of page as a whole. */
268
+ if (job->res.lines.dx)
269
+ indent += box2->y0 * job->res.lines.dy / job->res.lines.dx;
270
+ /* subtract the base margin. */
271
+ indent -= left_margin;
272
+ if (job->cfg.out_format==XML) { /* subject of change */
273
+ char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
274
+ /* output lot of usefull information for XML filter */
275
+ sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n",
276
+ box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
277
+ buffer=append_to_line(buffer,s1,&len);
278
+ } else
279
+ // ToDo: do the multi "\n" earlier in pgm2asc (like multi spacing)
280
+ for (i = indent / job->res.avX; i > 0; i--) {
281
+ buffer=append_to_line(buffer," ",&len); j++;
282
+ }
283
+ }
284
+ if (job->cfg.out_format==XML) { /* subject of change */
285
+ char s1[255]; /* ToDo: avoid potential buffer overflow !!! */
286
+ /* output lot of usefull information for XML filter */
287
+ sprintf(s1," <box x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"",
288
+ box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1);
289
+ buffer=append_to_line(buffer,s1,&len);
290
+ if (box2->num_ac>1) { /* ToDo: output a list of alternatives */
291
+ }
292
+ }
293
+ if (box2->c != UNKNOWN && box2->c != 0) {
294
+ buffer=
295
+ append_to_line(buffer,decode(box2->c,job->cfg.out_format),&len);
296
+ if (box2->c > ' ' &&
297
+ box2->c <= 'z') i2++; /* count non-space chars */
298
+ } else { /* c == UNKNOWN or 0 */
299
+ wchar_t cc; cc=box2->c;
300
+ if (box2->num_ac>0 && box2->tas[0]
301
+ && (/* job->cfg.out_format!=XML || 2010-10 */ box2->tas[0][0]!='<')
302
+ ) { // 2010-10 output XML code after XML frame, see below!
303
+ /* output glued chars or ... (?) Jan08 */
304
+ buffer=append_to_line(buffer,box2->tas[0],&len);
305
+ j+=strlen(box2->tas[0]);
306
+ } // else 2010-10-07
307
+ if (box2->num_ac == 0 || box2->c == UNKNOWN) {
308
+ /* ToDo: leave string empty? set placeholder per option */
309
+ /* output dummy string to mark UNKNOWN */
310
+ if (job->cfg.unrec_marker[0])
311
+ buffer = append_to_line(buffer, job->cfg.unrec_marker, &len);
312
+ }
313
+ }
314
+ if (job->cfg.out_format==XML) {
315
+ if (box2->num_ac>0) {
316
+ /* output alist ToDo: separate <altbox ...> */
317
+ int i1; char s1[256];
318
+ sprintf(s1,"\" numac=\"%d\" weights=\"",box2->num_ac);
319
+ buffer=append_to_line(buffer,s1,&len);
320
+ for (i1=0;i1<box2->num_ac;i1++) {
321
+ sprintf(s1,"%d",box2->wac[i1]);
322
+ buffer=append_to_line(buffer,s1,&len);
323
+ if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
324
+ }
325
+ if (box2->num_ac>1)
326
+ buffer=append_to_line(buffer,"\" achars=\"",&len);
327
+ for (i1=1;i1<box2->num_ac;i1++) {
328
+ if (box2->tas[i1] && box2->tas[i1][0]!='<')
329
+ buffer=append_to_line(buffer,box2->tas[i1],&len);
330
+ else
331
+ buffer=append_to_line(buffer,
332
+ decode(box2->tac[i1],job->cfg.out_format),&len);
333
+ // ToDo: add tas[] (achars->avalues or alternate_strings?
334
+ if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len);
335
+ }
336
+ }
337
+ buffer=append_to_line(buffer,"\" />\n",&len);
338
+ }
339
+ if (box2->num_ac && box2->tas[0]) {
340
+ if (box2->tas[0][0]=='<') { /* output special XML object */
341
+ buffer=append_to_line(buffer,box2->tas[0],&len);
342
+ if (job->cfg.out_format==XML) // 2010-10-07
343
+ buffer=append_to_line(buffer,"\n",&len);
344
+ j+=strlen(box2->tas[0]);
345
+ }
346
+ }
347
+ j++; /* number of chars in line */
348
+ }
349
+ i++;
350
+ } end_for_each(&(job->res.boxlist));
351
+ if (job->cfg.out_format==XML && oldline>-1) { /* subject of change */
352
+ buffer=append_to_line(buffer,"</line>\n",&len);
353
+ }
354
+ if (job->cfg.out_format==XML) { /* subject of change */
355
+ buffer=append_to_line(buffer,"</block>\n</page>\n",&len);
356
+ }
357
+
358
+ /* do not forget last line */
359
+ // is there no \n in the last line? If there is, delete next line.
360
+ list_app( &(job->res.linelist), (void *)strdup(buffer) );
361
+ free(buffer);
362
+ if( job->cfg.verbose&1 )
363
+ fprintf(stderr,"... %d lines, boxes= %d, chars= %d\n",i,i1,i2);
364
+ }