isbn 2.0.4 → 2.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/{README → README.md} +5 -11
- data/Rakefile +20 -14
- data/isbn.gemspec +23 -0
- data/lib/isbn.rb +2 -0
- data/test/isbn_spec.rb +1 -1
- metadata +29 -316
- data/VERSION +0 -1
- data/src/gocr-0.48/.cvsignore +0 -6
- data/src/gocr-0.48/AUTHORS +0 -7
- data/src/gocr-0.48/BUGS +0 -55
- data/src/gocr-0.48/CREDITS +0 -17
- data/src/gocr-0.48/HISTORY +0 -243
- data/src/gocr-0.48/INSTALL +0 -83
- data/src/gocr-0.48/Makefile +0 -193
- data/src/gocr-0.48/Makefile.in +0 -193
- data/src/gocr-0.48/README +0 -165
- data/src/gocr-0.48/READMEde.txt +0 -80
- data/src/gocr-0.48/REMARK.txt +0 -18
- data/src/gocr-0.48/REVIEW +0 -538
- data/src/gocr-0.48/TODO +0 -65
- data/src/gocr-0.48/bin/.cvsignore +0 -2
- data/src/gocr-0.48/bin/create_db +0 -38
- data/src/gocr-0.48/bin/gocr.tcl +0 -527
- data/src/gocr-0.48/bin/gocr_chk.sh +0 -44
- data/src/gocr-0.48/configure +0 -4689
- data/src/gocr-0.48/configure.in +0 -71
- data/src/gocr-0.48/doc/.#Makefile.1.6 +0 -39
- data/src/gocr-0.48/doc/.cvsignore +0 -2
- data/src/gocr-0.48/doc/Makefile +0 -39
- data/src/gocr-0.48/doc/Makefile.in +0 -39
- data/src/gocr-0.48/doc/example.dtd +0 -53
- data/src/gocr-0.48/doc/example.xml +0 -21
- data/src/gocr-0.48/doc/examples.txt +0 -67
- data/src/gocr-0.48/doc/gocr.html +0 -578
- data/src/gocr-0.48/doc/unicode.txt +0 -57
- data/src/gocr-0.48/examples/.#Makefile.1.22 +0 -166
- data/src/gocr-0.48/examples/4x6.png +0 -0
- data/src/gocr-0.48/examples/4x6.txt +0 -2
- data/src/gocr-0.48/examples/5x7.png +0 -0
- data/src/gocr-0.48/examples/5x7.png.txt +0 -2
- data/src/gocr-0.48/examples/5x8.png +0 -0
- data/src/gocr-0.48/examples/5x8.png.txt +0 -2
- data/src/gocr-0.48/examples/Makefile +0 -166
- data/src/gocr-0.48/examples/color.fig +0 -20
- data/src/gocr-0.48/examples/ex.fig +0 -16
- data/src/gocr-0.48/examples/font.tex +0 -22
- data/src/gocr-0.48/examples/font1.tex +0 -46
- data/src/gocr-0.48/examples/font2.fig +0 -27
- data/src/gocr-0.48/examples/font_nw.tex +0 -24
- data/src/gocr-0.48/examples/handwrt1.jpg +0 -0
- data/src/gocr-0.48/examples/handwrt1.txt +0 -10
- data/src/gocr-0.48/examples/inverse.fig +0 -20
- data/src/gocr-0.48/examples/matrix.jpg +0 -0
- data/src/gocr-0.48/examples/ocr-a-subset.png +0 -0
- data/src/gocr-0.48/examples/ocr-a-subset.png.txt +0 -4
- data/src/gocr-0.48/examples/ocr-a.png +0 -0
- data/src/gocr-0.48/examples/ocr-a.txt +0 -6
- data/src/gocr-0.48/examples/ocr-b.png +0 -0
- data/src/gocr-0.48/examples/ocr-b.png.txt +0 -4
- data/src/gocr-0.48/examples/polish.tex +0 -28
- data/src/gocr-0.48/examples/rotate45.fig +0 -14
- data/src/gocr-0.48/examples/score +0 -36
- data/src/gocr-0.48/examples/text.tex +0 -28
- data/src/gocr-0.48/gpl.html +0 -537
- data/src/gocr-0.48/include/.cvsignore +0 -2
- data/src/gocr-0.48/include/config.h +0 -36
- data/src/gocr-0.48/include/config.h.in +0 -36
- data/src/gocr-0.48/include/version.h +0 -2
- data/src/gocr-0.48/install-sh +0 -3
- data/src/gocr-0.48/make.bat +0 -57
- data/src/gocr-0.48/man/.cvsignore +0 -2
- data/src/gocr-0.48/man/Makefile +0 -29
- data/src/gocr-0.48/man/Makefile.in +0 -29
- data/src/gocr-0.48/man/man1/gocr.1 +0 -166
- data/src/gocr-0.48/src/.cvsignore +0 -4
- data/src/gocr-0.48/src/Makefile +0 -132
- data/src/gocr-0.48/src/Makefile.in +0 -132
- data/src/gocr-0.48/src/amiga.h +0 -31
- data/src/gocr-0.48/src/barcode.c +0 -846
- data/src/gocr-0.48/src/barcode.c.orig +0 -593
- data/src/gocr-0.48/src/barcode.h +0 -11
- data/src/gocr-0.48/src/box.c +0 -372
- data/src/gocr-0.48/src/database.c +0 -462
- data/src/gocr-0.48/src/detect.c +0 -943
- data/src/gocr-0.48/src/gocr.c +0 -373
- data/src/gocr-0.48/src/gocr.h +0 -288
- data/src/gocr-0.48/src/jconv.c +0 -168
- data/src/gocr-0.48/src/job.c +0 -84
- data/src/gocr-0.48/src/lines.c +0 -350
- data/src/gocr-0.48/src/list.c +0 -334
- data/src/gocr-0.48/src/list.h +0 -90
- data/src/gocr-0.48/src/ocr0.c +0 -6756
- data/src/gocr-0.48/src/ocr0.h +0 -63
- data/src/gocr-0.48/src/ocr0n.c +0 -1475
- data/src/gocr-0.48/src/ocr1.c +0 -85
- data/src/gocr-0.48/src/ocr1.h +0 -3
- data/src/gocr-0.48/src/otsu.c +0 -289
- data/src/gocr-0.48/src/otsu.h +0 -23
- data/src/gocr-0.48/src/output.c +0 -289
- data/src/gocr-0.48/src/output.h +0 -37
- data/src/gocr-0.48/src/pcx.c +0 -153
- data/src/gocr-0.48/src/pcx.h +0 -9
- data/src/gocr-0.48/src/pgm2asc.c +0 -2893
- data/src/gocr-0.48/src/pgm2asc.h +0 -105
- data/src/gocr-0.48/src/pixel.c +0 -537
- data/src/gocr-0.48/src/pnm.c +0 -533
- data/src/gocr-0.48/src/pnm.h +0 -35
- data/src/gocr-0.48/src/progress.c +0 -87
- data/src/gocr-0.48/src/progress.h +0 -42
- data/src/gocr-0.48/src/remove.c +0 -703
- data/src/gocr-0.48/src/tga.c +0 -87
- data/src/gocr-0.48/src/tga.h +0 -6
- data/src/gocr-0.48/src/unicode.c +0 -1314
- data/src/gocr-0.48/src/unicode.h +0 -1257
- data/src/jpeg-7/Makefile.am +0 -133
- data/src/jpeg-7/Makefile.in +0 -1089
- data/src/jpeg-7/README +0 -322
- data/src/jpeg-7/aclocal.m4 +0 -8990
- data/src/jpeg-7/ansi2knr.1 +0 -36
- data/src/jpeg-7/ansi2knr.c +0 -739
- data/src/jpeg-7/cderror.h +0 -132
- data/src/jpeg-7/cdjpeg.c +0 -181
- data/src/jpeg-7/cdjpeg.h +0 -187
- data/src/jpeg-7/change.log +0 -270
- data/src/jpeg-7/cjpeg.1 +0 -325
- data/src/jpeg-7/cjpeg.c +0 -616
- data/src/jpeg-7/ckconfig.c +0 -402
- data/src/jpeg-7/coderules.txt +0 -118
- data/src/jpeg-7/config.guess +0 -1561
- data/src/jpeg-7/config.sub +0 -1686
- data/src/jpeg-7/configure +0 -17139
- data/src/jpeg-7/configure.ac +0 -317
- data/src/jpeg-7/depcomp +0 -630
- data/src/jpeg-7/djpeg.1 +0 -251
- data/src/jpeg-7/djpeg.c +0 -617
- data/src/jpeg-7/example.c +0 -433
- data/src/jpeg-7/filelist.txt +0 -215
- data/src/jpeg-7/install-sh +0 -520
- data/src/jpeg-7/install.txt +0 -1097
- data/src/jpeg-7/jaricom.c +0 -148
- data/src/jpeg-7/jcapimin.c +0 -282
- data/src/jpeg-7/jcapistd.c +0 -161
- data/src/jpeg-7/jcarith.c +0 -921
- data/src/jpeg-7/jccoefct.c +0 -453
- data/src/jpeg-7/jccolor.c +0 -459
- data/src/jpeg-7/jcdctmgr.c +0 -482
- data/src/jpeg-7/jchuff.c +0 -1612
- data/src/jpeg-7/jcinit.c +0 -65
- data/src/jpeg-7/jcmainct.c +0 -293
- data/src/jpeg-7/jcmarker.c +0 -667
- data/src/jpeg-7/jcmaster.c +0 -770
- data/src/jpeg-7/jcomapi.c +0 -106
- data/src/jpeg-7/jconfig.bcc +0 -48
- data/src/jpeg-7/jconfig.cfg +0 -45
- data/src/jpeg-7/jconfig.dj +0 -38
- data/src/jpeg-7/jconfig.mac +0 -43
- data/src/jpeg-7/jconfig.manx +0 -43
- data/src/jpeg-7/jconfig.mc6 +0 -52
- data/src/jpeg-7/jconfig.sas +0 -43
- data/src/jpeg-7/jconfig.st +0 -42
- data/src/jpeg-7/jconfig.txt +0 -155
- data/src/jpeg-7/jconfig.vc +0 -45
- data/src/jpeg-7/jconfig.vms +0 -37
- data/src/jpeg-7/jconfig.wat +0 -38
- data/src/jpeg-7/jcparam.c +0 -632
- data/src/jpeg-7/jcprepct.c +0 -358
- data/src/jpeg-7/jcsample.c +0 -545
- data/src/jpeg-7/jctrans.c +0 -381
- data/src/jpeg-7/jdapimin.c +0 -396
- data/src/jpeg-7/jdapistd.c +0 -275
- data/src/jpeg-7/jdarith.c +0 -762
- data/src/jpeg-7/jdatadst.c +0 -151
- data/src/jpeg-7/jdatasrc.c +0 -212
- data/src/jpeg-7/jdcoefct.c +0 -736
- data/src/jpeg-7/jdcolor.c +0 -396
- data/src/jpeg-7/jdct.h +0 -393
- data/src/jpeg-7/jddctmgr.c +0 -382
- data/src/jpeg-7/jdhuff.c +0 -1309
- data/src/jpeg-7/jdinput.c +0 -384
- data/src/jpeg-7/jdmainct.c +0 -512
- data/src/jpeg-7/jdmarker.c +0 -1360
- data/src/jpeg-7/jdmaster.c +0 -663
- data/src/jpeg-7/jdmerge.c +0 -400
- data/src/jpeg-7/jdpostct.c +0 -290
- data/src/jpeg-7/jdsample.c +0 -361
- data/src/jpeg-7/jdtrans.c +0 -136
- data/src/jpeg-7/jerror.c +0 -252
- data/src/jpeg-7/jerror.h +0 -304
- data/src/jpeg-7/jfdctflt.c +0 -174
- data/src/jpeg-7/jfdctfst.c +0 -230
- data/src/jpeg-7/jfdctint.c +0 -4348
- data/src/jpeg-7/jidctflt.c +0 -242
- data/src/jpeg-7/jidctfst.c +0 -368
- data/src/jpeg-7/jidctint.c +0 -5137
- data/src/jpeg-7/jinclude.h +0 -91
- data/src/jpeg-7/jmemansi.c +0 -167
- data/src/jpeg-7/jmemdos.c +0 -638
- data/src/jpeg-7/jmemdosa.asm +0 -379
- data/src/jpeg-7/jmemmac.c +0 -289
- data/src/jpeg-7/jmemmgr.c +0 -1118
- data/src/jpeg-7/jmemname.c +0 -276
- data/src/jpeg-7/jmemnobs.c +0 -109
- data/src/jpeg-7/jmemsys.h +0 -198
- data/src/jpeg-7/jmorecfg.h +0 -369
- data/src/jpeg-7/jpegint.h +0 -395
- data/src/jpeg-7/jpeglib.h +0 -1135
- data/src/jpeg-7/jpegtran.1 +0 -272
- data/src/jpeg-7/jpegtran.c +0 -546
- data/src/jpeg-7/jquant1.c +0 -856
- data/src/jpeg-7/jquant2.c +0 -1310
- data/src/jpeg-7/jutils.c +0 -179
- data/src/jpeg-7/jversion.h +0 -14
- data/src/jpeg-7/libjpeg.map +0 -4
- data/src/jpeg-7/libjpeg.txt +0 -3067
- data/src/jpeg-7/ltmain.sh +0 -8406
- data/src/jpeg-7/makcjpeg.st +0 -36
- data/src/jpeg-7/makdjpeg.st +0 -36
- data/src/jpeg-7/makeadsw.vc6 +0 -77
- data/src/jpeg-7/makeasln.vc9 +0 -33
- data/src/jpeg-7/makecdep.vc6 +0 -82
- data/src/jpeg-7/makecdsp.vc6 +0 -130
- data/src/jpeg-7/makecmak.vc6 +0 -159
- data/src/jpeg-7/makecvcp.vc9 +0 -186
- data/src/jpeg-7/makeddep.vc6 +0 -82
- data/src/jpeg-7/makeddsp.vc6 +0 -130
- data/src/jpeg-7/makedmak.vc6 +0 -159
- data/src/jpeg-7/makedvcp.vc9 +0 -186
- data/src/jpeg-7/makefile.ansi +0 -220
- data/src/jpeg-7/makefile.bcc +0 -291
- data/src/jpeg-7/makefile.dj +0 -226
- data/src/jpeg-7/makefile.manx +0 -220
- data/src/jpeg-7/makefile.mc6 +0 -255
- data/src/jpeg-7/makefile.mms +0 -224
- data/src/jpeg-7/makefile.sas +0 -258
- data/src/jpeg-7/makefile.unix +0 -234
- data/src/jpeg-7/makefile.vc +0 -217
- data/src/jpeg-7/makefile.vms +0 -142
- data/src/jpeg-7/makefile.wat +0 -239
- data/src/jpeg-7/makejdep.vc6 +0 -423
- data/src/jpeg-7/makejdsp.vc6 +0 -285
- data/src/jpeg-7/makejdsw.vc6 +0 -29
- data/src/jpeg-7/makejmak.vc6 +0 -425
- data/src/jpeg-7/makejsln.vc9 +0 -17
- data/src/jpeg-7/makejvcp.vc9 +0 -328
- data/src/jpeg-7/makeproj.mac +0 -213
- data/src/jpeg-7/makerdep.vc6 +0 -6
- data/src/jpeg-7/makerdsp.vc6 +0 -78
- data/src/jpeg-7/makermak.vc6 +0 -110
- data/src/jpeg-7/makervcp.vc9 +0 -133
- data/src/jpeg-7/maketdep.vc6 +0 -43
- data/src/jpeg-7/maketdsp.vc6 +0 -122
- data/src/jpeg-7/maketmak.vc6 +0 -131
- data/src/jpeg-7/maketvcp.vc9 +0 -178
- data/src/jpeg-7/makewdep.vc6 +0 -6
- data/src/jpeg-7/makewdsp.vc6 +0 -78
- data/src/jpeg-7/makewmak.vc6 +0 -110
- data/src/jpeg-7/makewvcp.vc9 +0 -133
- data/src/jpeg-7/makljpeg.st +0 -68
- data/src/jpeg-7/maktjpeg.st +0 -30
- data/src/jpeg-7/makvms.opt +0 -4
- data/src/jpeg-7/missing +0 -376
- data/src/jpeg-7/rdbmp.c +0 -439
- data/src/jpeg-7/rdcolmap.c +0 -253
- data/src/jpeg-7/rdgif.c +0 -38
- data/src/jpeg-7/rdjpgcom.1 +0 -63
- data/src/jpeg-7/rdjpgcom.c +0 -515
- data/src/jpeg-7/rdppm.c +0 -459
- data/src/jpeg-7/rdrle.c +0 -387
- data/src/jpeg-7/rdswitch.c +0 -365
- data/src/jpeg-7/rdtarga.c +0 -500
- data/src/jpeg-7/structure.txt +0 -945
- data/src/jpeg-7/testimg.bmp +0 -0
- data/src/jpeg-7/testimg.jpg +0 -0
- data/src/jpeg-7/testimg.ppm +0 -4
- data/src/jpeg-7/testimgp.jpg +0 -0
- data/src/jpeg-7/testorig.jpg +0 -0
- data/src/jpeg-7/testprog.jpg +0 -0
- data/src/jpeg-7/transupp.c +0 -1533
- data/src/jpeg-7/transupp.h +0 -205
- data/src/jpeg-7/usage.txt +0 -605
- data/src/jpeg-7/wizard.txt +0 -211
- data/src/jpeg-7/wrbmp.c +0 -442
- data/src/jpeg-7/wrgif.c +0 -399
- data/src/jpeg-7/wrjpgcom.1 +0 -103
- data/src/jpeg-7/wrjpgcom.c +0 -583
- data/src/jpeg-7/wrppm.c +0 -269
- data/src/jpeg-7/wrrle.c +0 -305
- data/src/jpeg-7/wrtarga.c +0 -253
data/src/gocr-0.48/configure.in
DELETED
@@ -1,71 +0,0 @@
|
|
1
|
-
# see /usr/share/info/standards.info,autoconf.info (autoconf 2.57)
|
2
|
-
dnl Process this file with autoconf to produce a configure script.
|
3
|
-
dnl obsolete: AC_INIT(src/pgm2asc.c)
|
4
|
-
AC_INIT(gocr,0.48,,)
|
5
|
-
AC_PREREQ(2.50)
|
6
|
-
AC_CONFIG_HEADERS([include/config.h])
|
7
|
-
dnl AC_EXEEXT is obsolete now
|
8
|
-
|
9
|
-
dnl Checks for programs.
|
10
|
-
AC_PROG_CC
|
11
|
-
AC_PROG_INSTALL
|
12
|
-
AC_PROG_MAKE_SET
|
13
|
-
dnl only needed for libPgm2asc.a, not for the rpm/ebuild
|
14
|
-
dnl AC_PROG_RANLIB
|
15
|
-
dnl AC_CHECK_PROG(AR,ar,ar)
|
16
|
-
dnl needed for developers to make examples, not for the rpm/ebuild
|
17
|
-
dnl AC_CHECK_PROG(FIG2DEV,fig2dev,fig2dev)
|
18
|
-
|
19
|
-
dnl Check for optional debug mode
|
20
|
-
dnl debug makes program slow, but is very useful for developers
|
21
|
-
dnl ToDo: how to check that c-flags are available?
|
22
|
-
AC_ARG_WITH(debug,
|
23
|
-
[ --with-debug switching on debugging (more verbose output)],
|
24
|
-
[ CPPFLAGS="-Wall -g -fexceptions -DDO_DEBUG=1 $CPPFLAGS" ])
|
25
|
-
if test "$with_debug"; then echo "debugging enabled"; fi
|
26
|
-
|
27
|
-
dnl Check for optional netpbm PACKAGE: --with-netpbm=no == --without-netpbm
|
28
|
-
dnl LDFLAGS+=-R$withval/lib compiles the search path into the file ???
|
29
|
-
AC_ARG_WITH(netpbm,
|
30
|
-
[ --with-netpbm=PATH enter the PATH to netpbm package],
|
31
|
-
[ if test "$withval" != "no"; then
|
32
|
-
LDFLAGS="-L$withval/lib $LDFLAGS";\
|
33
|
-
CPPFLAGS="-I$withval/include $CPPFLAGS";\
|
34
|
-
fi ])
|
35
|
-
if test -n "$with_netpbm"; then echo "option: with_netpbm $with_netpbm"; fi
|
36
|
-
|
37
|
-
dnl Checks for libraries.
|
38
|
-
if test "$with_netpbm" != "no"; then
|
39
|
-
# netpbm-10.26 + SuSE-10.0: netpbm needs mathlib -lm
|
40
|
-
# netpbm-10 : libnetpbm + pam.h (+ p[bgpn]m.h, libp[bgpn]m.so as links)
|
41
|
-
# netpbm-9 : libpnm + pnm.h (+ p[bgp]m.h)
|
42
|
-
# pnm_readpaminit (netpbm-10)
|
43
|
-
# pnm_readpnminit (netpbm-9,10)
|
44
|
-
# ToDo: how to check that -lm is needed?
|
45
|
-
LDFLAGS="-lm $LDFLAGS"
|
46
|
-
AC_SEARCH_LIBS(pnm_readpnminit,[netpbm pnm],[check_netpbm_h="pam.h pnm.h"],
|
47
|
-
[ echo " * * * try option --with-netpbm=PATH"])
|
48
|
-
fi
|
49
|
-
|
50
|
-
dnl Checks for header files.
|
51
|
-
AC_HEADER_STDC
|
52
|
-
AC_CHECK_HEADERS([unistd.h wchar.h ${check_netpbm_h}])
|
53
|
-
|
54
|
-
dnl Checks for typedefs, structures, and compiler characteristics.
|
55
|
-
AC_C_CONST
|
56
|
-
dnl AC_CHECK_TYPE(wchar_t,unsigned)
|
57
|
-
|
58
|
-
dnl Checks for library functions.
|
59
|
-
dnl this macro produces a warning: AC_TRY_RUN called without default ...
|
60
|
-
dnl The message can be ignored as long as you don't configure gOCR for
|
61
|
-
dnl cross-compiling.
|
62
|
-
AC_FUNC_SETVBUF_REVERSED
|
63
|
-
AC_CHECK_FUNCS(wcschr wcsdup gettimeofday popen)
|
64
|
-
|
65
|
-
dnl Checks for system services
|
66
|
-
|
67
|
-
dnl obsolete: AC_OUTPUT(Makefile src/Makefile doc/Makefile man/Makefile)
|
68
|
-
dnl the light weight version of package comes without src/api
|
69
|
-
AC_CONFIG_FILES([Makefile src/Makefile doc/Makefile man/Makefile])
|
70
|
-
dnl AC_CONFIG_COMMANDS([default],,)
|
71
|
-
AC_OUTPUT
|
@@ -1,39 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Makefile for ./doc path, used by configure
|
3
|
-
#
|
4
|
-
|
5
|
-
# these two lines are for cross-compiling, not tested
|
6
|
-
#srcdir = .
|
7
|
-
#VPATH = .
|
8
|
-
|
9
|
-
# changed to html, tex is not used anymore
|
10
|
-
#LATEX=@LATEX@
|
11
|
-
#DVIPS=@DVIPS@
|
12
|
-
|
13
|
-
OCRDOC=ocr
|
14
|
-
# add other source file to documentation here
|
15
|
-
# SRC=$(OCRDOC).tex
|
16
|
-
|
17
|
-
.PHONY : all clean proper install uninstall
|
18
|
-
default: all
|
19
|
-
|
20
|
-
all: # do nothing!
|
21
|
-
|
22
|
-
#$(OCRDOC).ps: $(OCRDOC).dvi
|
23
|
-
# $(DVIPS) -o $(OCRDOC).ps $?
|
24
|
-
#
|
25
|
-
#$(OCRDOC).dvi: $(SRC)
|
26
|
-
# $(LATEX) $(OCRDOC).tex
|
27
|
-
|
28
|
-
install: all
|
29
|
-
echo "Copy gocr.html to your document path."
|
30
|
-
|
31
|
-
uninstall:
|
32
|
-
echo "Remove gocr.html from your document path."
|
33
|
-
|
34
|
-
clean:
|
35
|
-
-rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
|
36
|
-
|
37
|
-
proper: clean
|
38
|
-
-rm -f $(OCRDOC).ps
|
39
|
-
|
data/src/gocr-0.48/doc/Makefile
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Makefile for ./doc path, used by configure
|
3
|
-
#
|
4
|
-
|
5
|
-
# these two lines are for cross-compiling, not tested
|
6
|
-
#srcdir = .
|
7
|
-
#VPATH = .
|
8
|
-
|
9
|
-
# changed to html, tex is not used anymore
|
10
|
-
#LATEX=@LATEX@
|
11
|
-
#DVIPS=@DVIPS@
|
12
|
-
|
13
|
-
OCRDOC=ocr
|
14
|
-
# add other source file to documentation here
|
15
|
-
# SRC=$(OCRDOC).tex
|
16
|
-
|
17
|
-
.PHONY : all clean proper install uninstall
|
18
|
-
default: all
|
19
|
-
|
20
|
-
all: # do nothing!
|
21
|
-
|
22
|
-
#$(OCRDOC).ps: $(OCRDOC).dvi
|
23
|
-
# $(DVIPS) -o $(OCRDOC).ps $?
|
24
|
-
#
|
25
|
-
#$(OCRDOC).dvi: $(SRC)
|
26
|
-
# $(LATEX) $(OCRDOC).tex
|
27
|
-
|
28
|
-
install: all
|
29
|
-
echo "Copy gocr.html to your document path."
|
30
|
-
|
31
|
-
uninstall:
|
32
|
-
echo "Remove gocr.html from your document path."
|
33
|
-
|
34
|
-
clean:
|
35
|
-
-rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
|
36
|
-
|
37
|
-
proper: clean
|
38
|
-
-rm -f $(OCRDOC).ps
|
39
|
-
|
@@ -1,39 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Makefile for ./doc path, used by configure
|
3
|
-
#
|
4
|
-
|
5
|
-
# these two lines are for cross-compiling, not tested
|
6
|
-
#srcdir = @srcdir@
|
7
|
-
#VPATH = @srcdir@
|
8
|
-
|
9
|
-
# changed to html, tex is not used anymore
|
10
|
-
#LATEX=@LATEX@
|
11
|
-
#DVIPS=@DVIPS@
|
12
|
-
|
13
|
-
OCRDOC=ocr
|
14
|
-
# add other source file to documentation here
|
15
|
-
# SRC=$(OCRDOC).tex
|
16
|
-
|
17
|
-
.PHONY : all clean proper install uninstall
|
18
|
-
default: all
|
19
|
-
|
20
|
-
all: # do nothing!
|
21
|
-
|
22
|
-
#$(OCRDOC).ps: $(OCRDOC).dvi
|
23
|
-
# $(DVIPS) -o $(OCRDOC).ps $?
|
24
|
-
#
|
25
|
-
#$(OCRDOC).dvi: $(SRC)
|
26
|
-
# $(LATEX) $(OCRDOC).tex
|
27
|
-
|
28
|
-
install: all
|
29
|
-
echo "Copy gocr.html to your document path."
|
30
|
-
|
31
|
-
uninstall:
|
32
|
-
echo "Remove gocr.html from your document path."
|
33
|
-
|
34
|
-
clean:
|
35
|
-
-rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
|
36
|
-
|
37
|
-
proper: clean
|
38
|
-
-rm -f $(OCRDOC).ps
|
39
|
-
|
@@ -1,53 +0,0 @@
|
|
1
|
-
<?xml version="1.0"?>
|
2
|
-
<!--
|
3
|
-
first draft by Volker Simonis, reviewed by Joerg Schulenburg
|
4
|
-
It's not ready for use!
|
5
|
-
ToDo:
|
6
|
-
- lynx/links/w3c should show xml like a html file
|
7
|
-
value as <character ...>CharText</character>
|
8
|
-
or <word><character ...></character><...>WordText</word>
|
9
|
-
or as line or as block? whats more useful?
|
10
|
-
- how to code table of alternative chars/words and its probability?
|
11
|
-
- how to handle images (as image tags?)
|
12
|
-
- xmllint -\-htmlout -\-loaddtd jocr/doc/example.dtd o.xml
|
13
|
-
-->
|
14
|
-
<!ENTITY % default.attributes "x CDATA #REQUIRED
|
15
|
-
y CDATA #REQUIRED
|
16
|
-
dx CDATA #REQUIRED
|
17
|
-
dy CDATA #REQUIRED">
|
18
|
-
|
19
|
-
<!ELEMENT box EMPTY>
|
20
|
-
<!ATTLIST box %default.attributes;
|
21
|
-
value CDATA #REQUIRED;>
|
22
|
-
|
23
|
-
<!ELEMENT barcode EMPTY>
|
24
|
-
<!ATTLIST barcode %default.attributes;
|
25
|
-
value CDATA #REQUIRED;>
|
26
|
-
|
27
|
-
<!ELEMENT img EMPTY>
|
28
|
-
<!ATTLIST img %default.attributes;>
|
29
|
-
|
30
|
-
<!ELEMENT page (block*)>
|
31
|
-
<!ATTLIST page %default.attributes;>
|
32
|
-
|
33
|
-
<!ELEMENT block (line*)>
|
34
|
-
<!ATTLIST block %default.attributes;>
|
35
|
-
|
36
|
-
<!ELEMENT line ((word | space | punctuation-mark)*)>
|
37
|
-
<!ATTLIST line %default.attributes;>
|
38
|
-
|
39
|
-
<!ELEMENT word (character*)>
|
40
|
-
<!ATTLIST word %default.attributes;>
|
41
|
-
|
42
|
-
<!ELEMENT char EMPTY>
|
43
|
-
<!ATTLIST char %default.attributes;
|
44
|
-
value CDATA #REQUIRED;
|
45
|
-
(#CDATA)> <!-- is that correct? -->
|
46
|
-
|
47
|
-
<!ELEMENT space EMPTY>
|
48
|
-
<!ATTLIST space %default.attributes;
|
49
|
-
value CDATA #REQUIRED;>
|
50
|
-
|
51
|
-
<!ELEMENT punctuation-mark EMPTY>
|
52
|
-
<!ATTLIST punctuation-mark %default.attributes;
|
53
|
-
value CDATA #REQUIRED;>
|
@@ -1,21 +0,0 @@
|
|
1
|
-
<?xml version="1.0"?>
|
2
|
-
<!DOCTYPE gocr SYSTEM "example.dtd">
|
3
|
-
<!-- example file for example.dtd -->
|
4
|
-
<page>
|
5
|
-
<block x="123" y="11" dx="500" dy="800">
|
6
|
-
<line x="130" y="11" dx="480" dy="30">
|
7
|
-
<word x="130" y="11" dx="80" dy="30">
|
8
|
-
<character x="130" y="11" dx="80" dy="30" value="A"/>
|
9
|
-
<character ... />
|
10
|
-
...
|
11
|
-
</word>
|
12
|
-
<punctuation-mark .. />
|
13
|
-
<space .. />
|
14
|
-
<word ..>
|
15
|
-
...
|
16
|
-
</word>
|
17
|
-
</line>
|
18
|
-
<box ... />
|
19
|
-
...
|
20
|
-
</block>
|
21
|
-
</page>
|
@@ -1,67 +0,0 @@
|
|
1
|
-
Note: this info is related to example files, used to test gOCR. As of this
|
2
|
-
writing, these files are not available to non-developers. So, if you aren't
|
3
|
-
a developer, forget about this file.
|
4
|
-
|
5
|
-
EXAMPLE FILES
|
6
|
-
|
7
|
-
1. Scanning
|
8
|
-
The examples can be scanned from anything; when looking for something, try to
|
9
|
-
have in mind the kind of tests you are expecting to do: if you're testing
|
10
|
-
accents recognition, look for texts in portuguese, french, etc. (pretty obvious,
|
11
|
-
but keeping this in mind will help to have a wide range of files covering
|
12
|
-
different kinds of tests).
|
13
|
-
|
14
|
-
If you're not interested in testing DPIs, scan at 150 or 300dpi.
|
15
|
-
|
16
|
-
If you're not interested in testing the dust removal, cleaning, etc, functions,
|
17
|
-
do the best scan you can. Usually increasing brightness and contrast will
|
18
|
-
provide a sharper, cleaner image.
|
19
|
-
|
20
|
-
Save the image in a supported format: for example, pgm or jpg. If a compression
|
21
|
-
will result in a significant reduction of size, compress the image. BZIP2
|
22
|
-
usually is the best compressor around, but gzip is more popular in the unix
|
23
|
-
world. In the wintel world, people use ZIP, and usually will have to search for
|
24
|
-
an application capable of opening .gz or .bz2 (though WinZIP opens at least the
|
25
|
-
former).
|
26
|
-
|
27
|
-
2. Sorting
|
28
|
-
To help others to find the files they are looking for, the examples/ directory
|
29
|
-
is divided in several other directories, which may be subdivided. When
|
30
|
-
uploading a new example, look for the most suitable location. Depending on the
|
31
|
-
directory, you probably will name your file with interesting info: for example,
|
32
|
-
when uploading an image with all the characters of the foo font, the best thing
|
33
|
-
to do is to place it at examples/fonts/foo.jpg.
|
34
|
-
|
35
|
-
3. "Translation"
|
36
|
-
Along with the image file, upload a text file with the expected output. Be
|
37
|
-
careful with this file: it must resemble the original text as much as possible.
|
38
|
-
Don't add extra new lines (\n), keep hyphenated words, etc. Name this file with
|
39
|
-
the same name of the image file.
|
40
|
-
|
41
|
-
In the beginning of the text file, you should provide comments, to help
|
42
|
-
searches. Use the following sample:
|
43
|
-
|
44
|
-
# Comments
|
45
|
-
# DPI:
|
46
|
-
# Colors:
|
47
|
-
# Image size (colsXrows):
|
48
|
-
# Fonts:
|
49
|
-
# Font sizes:
|
50
|
-
# Layout form:
|
51
|
-
# Number of pictures:
|
52
|
-
# Language:
|
53
|
-
# Quality of scan:
|
54
|
-
# Non-ASCII characters:
|
55
|
-
# Extra:
|
56
|
-
|
57
|
-
Check existing examples to see what people have been doing.
|
58
|
-
|
59
|
-
Any lines that begin with # will be considered comments, so you may use several
|
60
|
-
lines for comments or add new fields. Though gOCR itself doesn't depend on, and
|
61
|
-
won't use, this file, it will be used by scripts.
|
62
|
-
|
63
|
-
4. Other sources (WEB)
|
64
|
-
|
65
|
-
- http://www.clerkweb.house.gov/elections/elections.htm (Nov2002)
|
66
|
-
PDF-files with lot of tables
|
67
|
-
|
data/src/gocr-0.48/doc/gocr.html
DELETED
@@ -1,578 +0,0 @@
|
|
1
|
-
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
2
|
-
|
3
|
-
<HTML>
|
4
|
-
<HEAD>
|
5
|
-
<TITLE>GOCR-documentation</TITLE>
|
6
|
-
<META NAME="description" CONTENT="GOCR-documentation">
|
7
|
-
<META NAME="keywords" CONTENT="ocr">
|
8
|
-
<META NAME="resource-type" CONTENT="document">
|
9
|
-
<META NAME="distribution" CONTENT="global">
|
10
|
-
|
11
|
-
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
|
12
|
-
<META NAME="Generator" CONTENT="Joerg">
|
13
|
-
</HEAD>
|
14
|
-
|
15
|
-
<BODY >
|
16
|
-
<H1 ALIGN="CENTER">GOCR-documentation</H1>
|
17
|
-
<P ALIGN="CENTER"><STRONG>Jörg Schulenburg</STRONG></P>
|
18
|
-
<P ALIGN="CENTER"><STRONG>Magdeburg, June 3, 2002</STRONG></P>
|
19
|
-
|
20
|
-
<H3>Abstract:</H3>
|
21
|
-
<DIV>
|
22
|
-
In this documentation I describe some ideas for my OCR-program.
|
23
|
-
It contains algorithms and examples and gives you
|
24
|
-
an impression of what the program can (or could) do.
|
25
|
-
</DIV>
|
26
|
-
<P>
|
27
|
-
|
28
|
-
|
29
|
-
<P>
|
30
|
-
<BR><HR>
|
31
|
-
<!--Table of Child-Links-->
|
32
|
-
<A NAME="CHILD_LINKS"></A>
|
33
|
-
|
34
|
-
<UL>
|
35
|
-
<LI><A HREF="#SEC1">Introduction</A>
|
36
|
-
<LI><A HREF="#SEC2">Segmentation of textual regions / Layout analysis</A>
|
37
|
-
<LI><A HREF="#SEC3">Line detection</A>
|
38
|
-
<LI><A HREF="#SEC4">Cluster detection</A>
|
39
|
-
<LI><A HREF="#SEC5">Engines</A>
|
40
|
-
<LI><A HREF="#SEC6">Remove pixels</A>
|
41
|
-
<LI><A HREF="#SEC7">Add pixels</A>
|
42
|
-
<LI><A HREF="#SEC8">Similarity analyzer</A>
|
43
|
-
<LI><A HREF="#SEC9">Overlapping characters</A>
|
44
|
-
<LI><A HREF="#SEC10">Black/White, Gray and Colors</A>
|
45
|
-
<LI><A HREF="#SEC11">Pictures on scanned pages</A>
|
46
|
-
<LI><A HREF="#SEC12">Tools</A>
|
47
|
-
<LI><A HREF="#SEC13">glossary</A>
|
48
|
-
<LI><A HREF="#SEC14">More information?</A>
|
49
|
-
<LI><A HREF="#SEC15">About this document</A>
|
50
|
-
</UL>
|
51
|
-
<!--End of Table of Child-Links-->
|
52
|
-
|
53
|
-
<H1><A NAME="SEC1"> Introduction</A>
|
54
|
-
</H1>
|
55
|
-
First I have to say that I am not an expert in pattern recognition
|
56
|
-
or similar things. My knowledge is based mostly on experiments with my
|
57
|
-
program.
|
58
|
-
Therefore do not worry about stupid algorithms I put in this document.
|
59
|
-
In this documentation I describe some ideas for my OCR-program.
|
60
|
-
The examples give you an impression of how the program handles
|
61
|
-
your images.
|
62
|
-
If you have comments regarding contents or spelling please
|
63
|
-
write to the author.
|
64
|
-
|
65
|
-
<H1><A NAME="SEC2">Segmentation of textual regions / Layout analysis</A></H1>
|
66
|
-
|
67
|
-
This is implemented as a recursive division in two parts.
|
68
|
-
|
69
|
-
<UL>
|
70
|
-
<LI>look for the thickest horizontal or vertical gap through the box</LI>
|
71
|
-
<LI>if the gap is less than five times longer than thick do not divide </LI>
|
72
|
-
<LI>do the same with the two new parts</LI>
|
73
|
-
</UL>
|
74
|
-
I know that this algorithm is not as good as you wish,
|
75
|
-
but I do not know a better one.
|
76
|
-
|
77
|
-
<P>
|
78
|
-
It would be very helpful to know about a function which is able to
|
79
|
-
decide whether the box represents a single text line or a more complex object.
|
80
|
-
|
81
|
-
<H1><A NAME="SEC3">Line detection</A></H1>
|
82
|
-
<P>
|
83
|
-
Line detection is very important for good recognition.
|
84
|
-
For example it is difficult to distinguish between lowercase letter <B>p</B>
|
85
|
-
and uppercase letter <B>P</B> without having a baseline (same total height).
|
86
|
-
The lowercase version of <B>p</B> has a descender (the lower end is below the
|
87
|
-
baseline) and therefore it is easy to distinguish from the uppercase version
|
88
|
-
if the baseline is known. The line detection is responsible for finding the
|
89
|
-
baseline of every text line.
|
90
|
-
|
91
|
-
<P>
|
92
|
-
Lines of characters are detected by looking for interline spaces.
|
93
|
-
These are characterized by a large number of non-black pixels in a
|
94
|
-
row. Image rotation (skewing) presents a problem, therefore the program
|
95
|
-
first looks only at the left half of the image. When a line is
|
96
|
-
found, the left half of the right side is scanned, because lines
|
97
|
-
are often short. The variation in height gives an indication of
|
98
|
-
the rotation angle. Using this angle, a second run detects lines
|
99
|
-
more accurately. Line detection may fail if there is dust on the
|
100
|
-
image.
|
101
|
-
|
102
|
-
<P>
|
103
|
-
In version v0.2.3 this behaviour is slightly changed.
|
104
|
-
To detect the rotation angle, the line through the most
|
105
|
-
characters is detected.
|
106
|
-
|
107
|
-
<H1><A NAME="SEC4">Cluster detection</A></H1>
|
108
|
-
|
109
|
-
A cluster is a group of pixels which are connected with each other.
|
110
|
-
The simplest way to detect a cluster is to look for a pixel.
|
111
|
-
If you find one, look to the neighbouring pixels. This can be done recursively.
|
112
|
-
|
113
|
-
<P>
|
114
|
-
This method needs a lot of stack space if a cluster is very large,
|
115
|
-
and can cause problems with the memory.
|
116
|
-
|
117
|
-
<P>
|
118
|
-
Do you remember the algorithm for leaving a maze?
|
119
|
-
Go along the right (or left) wall. This seems to be a good approach
|
120
|
-
for detecting clusters without recursion.
|
121
|
-
The following picture shows a trace of the maze algorithm.
|
122
|
-
|
123
|
-
<P>
|
124
|
-
<TABLE WIDTH="680">
|
125
|
-
<TR><TD>
|
126
|
-
<PRE>
|
127
|
-
first 35 steps next 36 steps
|
128
|
-
..@@@@@..@@@@<.. ..v<<<<..v<<<@.. * = starting point
|
129
|
-
..@@@@@@@@@.@^<. ..>>v@^<<<@.@@@. >^<v = go right,up,left,down
|
130
|
-
....@@@@@...@@^. ....v@@@@...@@@. @ = black pixel
|
131
|
-
....@@@@....@@^. ....v@@@....@@@.
|
132
|
-
....@@@.....@@^. ....v@@.....@@@.
|
133
|
-
....@@@.....@@^. ....v@@.....@@@.
|
134
|
-
...@@@@.....@@^. ...v<@@.....@@@.
|
135
|
-
...@@@......@@^. ...v@@......@@@.
|
136
|
-
...@@@......@@^. ...v@@......@@@.
|
137
|
-
...@@@.....@@@^. ...v@@.....@@@@.
|
138
|
-
...@@@.....@@>^. ...v@@.....@@@@.
|
139
|
-
...@@@.....@@^.. ...v@@.....@@@..
|
140
|
-
..@@@@.....@@^.. ..v<@@.....@@@..
|
141
|
-
..@@@@....@@@^.. ..v@@@....@@@@..
|
142
|
-
*>>>>>>>>>>>>^<< @@@@@@@@@@@@@@@@
|
143
|
-
</PRE></TD></TR>
|
144
|
-
</TABLE>
|
145
|
-
|
146
|
-
<P>
|
147
|
-
The minimum and maximum coordinates can be used to create a box around the
|
148
|
-
cluster. But does this algorithm work with diagonally connected pixels?
|
149
|
-
|
150
|
-
<H1><A NAME="SEC5">Engines</A></H1>
|
151
|
-
|
152
|
-
GOCR is able to work with different recognition engines.
|
153
|
-
Since version 0.37 engines have to return a probability value together
|
154
|
-
with the recognized character or a table of values to a table of characters.
|
155
|
-
If the probability value is 100, the engine is 100% sure to have found the
|
156
|
-
right character otherwise the value is less. This gives GOCR the possibility
|
157
|
-
to compare results of different engines or in case of a not recognized character
|
158
|
-
to inform the user or another
|
159
|
-
application (spell checker) which characters probably could be there.
|
160
|
-
|
161
|
-
<H2>Base-Engine</H2>
|
162
|
-
The base engine (src/ocrX.c) is the original engine used in the first implementation
|
163
|
-
of GOCR by Jörg. The idea was to get a fast and acceptable result
|
164
|
-
without learning theoretical background. Later it should be replaced or completed
|
165
|
-
by a better engine.
|
166
|
-
The base engine is a rule based engine.
|
167
|
-
The engine was written without theoretical
|
168
|
-
background and is improved by trial and error, but it is still far from
|
169
|
-
perfect. The algorithm is very tolerant to size and form of characters
|
170
|
-
(omnifont).
|
171
|
-
How does the engine identify a character? For the explanation look at the
|
172
|
-
following pattern.
|
173
|
-
|
174
|
-
<P>
|
175
|
-
<TABLE WIDTH="680">
|
176
|
-
<TR><TD>
|
177
|
-
<PRE>
|
178
|
-
vvvv vv- white regions
|
179
|
-
......@@...... <- crossing one line
|
180
|
-
......@@......
|
181
|
-
.....@@@@.....
|
182
|
-
.....@@@@.....
|
183
|
-
.....@@@@.....
|
184
|
-
....@..@@@.... <- white hole / crossing two lines
|
185
|
-
....@..@@@.... <- crossing two lines
|
186
|
-
....@..@@@....
|
187
|
-
...@....@@@...
|
188
|
-
...@....@@@...
|
189
|
-
...@....@@@...
|
190
|
-
..@@@@@@@@@@.. <- horizontal line near center
|
191
|
-
..@......@@@..
|
192
|
-
..@......@@@..
|
193
|
-
.@........@@@. v- increasing width of pattern
|
194
|
-
.@........@@@. v
|
195
|
-
.@........@@@. v
|
196
|
-
@@@......@@@@@
|
197
|
-
^^^-- gap
|
198
|
-
</PRE></TD></TR>
|
199
|
-
</TABLE>
|
200
|
-
|
201
|
-
<P>
|
202
|
-
In the future the program
|
203
|
-
should detect edges, vertices, gaps, angles and so on.
|
204
|
-
This is called feature extraction (as far as I know).
|
205
|
-
With such data the engine could make a cluster analysis.
|
206
|
-
But this is a difficult task, if the scanned image is noisy.
|
207
|
-
|
208
|
-
<H2>Database-Engine</H2>
|
209
|
-
The database engine (src/database.c) was the second engine added to GOCR.
|
210
|
-
It was primarily written to give users a simple tool to recognize
|
211
|
-
special language-specific characters. The program generates a list
|
212
|
-
(text file db.lst of image filenames and character codes)
|
213
|
-
and image samples (pnm-files) in a database path (./db/).
|
214
|
-
The database can be created by hand or external programs or by GOCR itself
|
215
|
-
using option (-m 130). In the last case GOCR prompts the user
|
216
|
-
for not recognized characters. If he enters the character the pattern
|
217
|
-
is saved in the database path as pnm-file and its file name is added
|
218
|
-
to the database list (db.lst) together with the text string the pattern
|
219
|
-
should be replaced by.
|
220
|
-
For recognition GOCR first loads the database into memory (option -m 2).
|
221
|
-
The main algorithm compares not recognized characters with stored images
|
222
|
-
and calculates a distance value. If the distance value is small enough,
|
223
|
-
the character is treated as recognized.
|
224
|
-
|
225
|
-
<H1><A NAME="SEC6">Remove pixels</A></H1>
|
226
|
-
|
227
|
-
The following picture shows an <I>n</I> which has additional pixels at the
|
228
|
-
bottom. Therefore it can not be detected as <I>n</I>. What can be done?
|
229
|
-
|
230
|
-
<UL>
|
231
|
-
<LI>classify horizontal (<TT>=</TT>) and vertical (<TT>I</TT>) pixels by
|
232
|
-
comparing the distance between the next vertical and next horizontal white
|
233
|
-
pixels (.)
|
234
|
-
</LI>
|
235
|
-
<LI>measure mean thickness of vertical and horizontal clusters
|
236
|
-
</LI>
|
237
|
-
<LI>erase unusually thin horizontal pixels at the bottom line
|
238
|
-
</LI>
|
239
|
-
</UL>
|
240
|
-
|
241
|
-
<P>
|
242
|
-
<TABLE WIDTH="680">
|
243
|
-
<TR><TD>
|
244
|
-
<PRE>
|
245
|
-
..@@@@@..@@@@@.. ..==III..===II.. dx=16 dy=15
|
246
|
-
..@@@@@@@@@.@@@. ..==III====.III. thickness 2 to 3
|
247
|
-
....@@@@@...@@@. ....III==...III.
|
248
|
-
....@@@@....@@@. ....III=....III.
|
249
|
-
....@@@.....@@@. ....III.....III.
|
250
|
-
....@@@.....@@@. ....III.....III.
|
251
|
-
...@@@@.....@@@. ...IIII.....III.
|
252
|
-
...@@@......@@@. ...III......III.
|
253
|
-
...@@@......@@@. ...III......III.
|
254
|
-
...@@@.....@@@@. ...III.....IIII.
|
255
|
-
...@@@.....@@@@. ...III.....IIII.
|
256
|
-
...@@@.....@@@.. ...III.....III..
|
257
|
-
..@@@@.....@@@.. ..IIII.....III..
|
258
|
-
..@@@@....@@@@.. ..IIII....IIII..
|
259
|
-
@@@@@@@@@@@@@@@@ ================
|
260
|
-
^^^
|
261
|
-
this causes the problem
|
262
|
-
</PRE></TD></TR>
|
263
|
-
</TABLE>
|
264
|
-
|
265
|
-
<P>
|
266
|
-
A better way is to find serifs (horizontal lines glued on the lower end
|
267
|
-
of vertical lines) which touch together (v0.2.5).
|
268
|
-
|
269
|
-
<P>
|
270
|
-
The next picture shows blind pixels which are caused by dust on the paper.
|
271
|
-
The upper right dots are not connected with the rest of the character.
|
272
|
-
This can be detected via fill-algorithms. Currently the program
|
273
|
-
assumes that dots near the upper end of a character are ``i''-dots
|
274
|
-
or diaereses (umlaut dots).
|
275
|
-
|
276
|
-
<P>
|
277
|
-
<TABLE WIDTH="680">
|
278
|
-
<TR><TD>
|
279
|
-
<PRE>
|
280
|
-
..........................O... ..........................O...
|
281
|
-
..........................O... ..........................O...
|
282
|
-
.............................. ..............................
|
283
|
-
.............................. ..............................
|
284
|
-
..........@@@.......@@@@...... ..........@@@.......@@@@......
|
285
|
-
..@@@@..@@@@@@@...@@@@@@@..... ..@@@@..@@@@@@@...@@@@@@@.....
|
286
|
-
@@@@@@@@@@@@@@@@.@@@@@@@@@.... @@@@@@@@@@@@@@@@.@@@@@@@@@....
|
287
|
-
..@@@@@@....@@@@@@.....@@@@... ..@@@@@@....@@@@@@.....@@@@...
|
288
|
-
..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
|
289
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
290
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
291
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
292
|
-
..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
|
293
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
294
|
-
..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
|
295
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
296
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
297
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
298
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
299
|
-
..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
|
300
|
-
..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
|
301
|
-
..@@@@......@@@@@......@@@@@.. ..@@@@......@@@@@......@@@@@..
|
302
|
-
@@@@@@@@..@@@@@@@@@..@@@@@@@@@ @@@@@@@@..@@@@@@@@@..@@@@@@@@@
|
303
|
-
</PRE></TD></TR>
|
304
|
-
</TABLE>
|
305
|
-
|
306
|
-
<H1><A NAME="SEC7">
|
307
|
-
Add pixels</A>
|
308
|
-
</H1>
|
309
|
-
The following picture shows an <I>m</I>. The legs are only barely connected.
|
310
|
-
How do we handle this?
|
311
|
-
|
312
|
-
<UL>
|
313
|
-
<LI>if the engine has failed, a filter is switched on and the engine
|
314
|
-
starts over
|
315
|
-
</LI>
|
316
|
-
<LI>the 2x2 filter sets pixels to (<I>O</I>) near barely connected pixels
|
317
|
-
</LI>
|
318
|
-
</UL>
|
319
|
-
|
320
|
-
<P>
|
321
|
-
<TABLE WIDTH="680">
|
322
|
-
<TR><TD>
|
323
|
-
<PRE>
|
324
|
-
vv vv
|
325
|
-
@@@.@@@..@@@... @@@.@@@..@@@...
|
326
|
-
.@@.@@@@.@@@@..< .@@O@@@@O@@@@.. filter: .@ => O@ @. => @O
|
327
|
-
.@@@..@@@..@@..< .@@@..@@@..@@.. @. => @. .@ => .@
|
328
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
329
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
330
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
331
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
332
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
333
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
334
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
335
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
336
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
337
|
-
.@@@..@@@..@@@. .@@@..@@@..@@@.
|
338
|
-
@@@@@.@@@@.@@@@ @@@@@.@@@@.@@@@
|
339
|
-
</PRE></TD></TR>
|
340
|
-
</TABLE>
|
341
|
-
|
342
|
-
<H1><A NAME="SEC8">
|
343
|
-
Similarity analyzer</A>
|
344
|
-
</H1>
|
345
|
-
Some characters are a little bit noisy. These characters can be identified by
|
346
|
-
comparison with other, already recognized characters. This can be done
|
347
|
-
via a good distance function. May be
|
348
|
-
the distance function in the actual version of GOCR is not very good.
|
349
|
-
Feel free to send me your ideas, but be sure it does not waste my time.
|
350
|
-
|
351
|
-
<H1><A NAME="SEC9">
|
352
|
-
Overlapping characters</A>
|
353
|
-
</H1>
|
354
|
-
The following picture shows an overlapping <I>ru</I>.
|
355
|
-
How do we handle this?
|
356
|
-
|
357
|
-
<UL>
|
358
|
-
<LI>look for 3 weak connections (sum over y is small, start in the middle)
|
359
|
-
</LI>
|
360
|
-
<LI>test if the right and left part can be detected by the engine
|
361
|
-
</LI>
|
362
|
-
<LI>correction of surrounding box
|
363
|
-
</LI>
|
364
|
-
</UL>
|
365
|
-
|
366
|
-
<P>
|
367
|
-
<TABLE WIDTH="680">
|
368
|
-
<TR><TD>
|
369
|
-
<PRE>
|
370
|
-
....@@...@@@@@@@@@@....@@@@@@@.. ....@@...@@@@@@@@@@....@@@@@@@..
|
371
|
-
..@@@@..@@@@@..@@@@......@@@@@.. ..@@@@..@@@@@..@@@@......@@@@@..
|
372
|
-
@@@@@@@@@@@@@.,.@@@.......@@@@.. @@@@@@@@@@@@@...@@@.......@@@@..
|
373
|
-
..@@@@@@..@@@...@@@.......@@@@.. ..@@@@@@..@@@...@@@.......@@@@..
|
374
|
-
...@@@@.......,.@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
|
375
|
-
...@@@@.........@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
|
376
|
-
...@@@@.......,.@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
|
377
|
-
...@@@@.........@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
|
378
|
-
...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
379
|
-
...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
380
|
-
...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
381
|
-
...@@@..........@@@.......@@@@.. ...@@@..........@@@.......@@@@..
|
382
|
-
...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
383
|
-
...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
384
|
-
...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
|
385
|
-
...@@@..........@@@@@...@@@@@@@. ...@@@..........@@@@@...@@@@@@@.
|
386
|
-
..@@@@@.......,..@@@@@@@@@.@@@@@ ..@@@@@..........@@@@@@@@@.@@@@@
|
387
|
-
@@@@@@@@@.........@@@@@@@..@@@.. @@@@@@@@@.........@@@@@@@..@@@..
|
388
|
-
..............,....@@@.......... ...................@@@..........
|
389
|
-
^^^
|
390
|
-
213 weak vertical lines
|
391
|
-
</PRE></TD></TR>
|
392
|
-
</TABLE>
|
393
|
-
|
394
|
-
<P>
|
395
|
-
Of course the situation is more difficult with slanted characters.
|
396
|
-
|
397
|
-
<P>
|
398
|
-
The following example shows, how to deal with larger clusters.
|
399
|
-
To get a fast program a first test should select the possible positions of
|
400
|
-
division. That can be done by following upper and lower bows to a crease or a break. Then try to break on all detected creases, starting at the most
|
401
|
-
important one (not implemented yet v0.2.4).
|
402
|
-
|
403
|
-
<P>
|
404
|
-
<TABLE WIDTH="766">
|
405
|
-
<TR><TD>
|
406
|
-
<PRE>
|
407
|
-
>>>>vvv<<<<< >>vv<<<< >>>vvv<<<<
|
408
|
-
......@@@@@@@..................@@.........@@@@@@@..........@@@@@@@.....
|
409
|
-
....@@@@@@@@@@@...............@@@.......@@@@@@@@@@@......@@@@@@@@@@@...
|
410
|
-
...@@@@@@@@@@@@@.............@@@@......@@@@@@@@@@@@@....@@@@@@@@@@@@@..
|
411
|
-
..@@@@.......@@@@...........@@@@@.....@@@@.......@@@@..@@@@.......@@@@.
|
412
|
-
..@@@........@@@@..........@@@@@@@....@@@........@@@@@@@@@........@@@@.
|
413
|
-
.@@@@..........@@.........@@@@@@@@...@@@@..........@@@@@@@.........@@@@
|
414
|
-
.@@@.....................@@@@.@@@@...@@@..............@@...........@@@@
|
415
|
-
.@@@....................@@@@@.@@@@...@@@...........................@@@@
|
416
|
-
@@@...@@@@@@@...........@@@@..@@@...@@@...@@@@@@...................@@@.
|
417
|
-
@@@@.@@@@@@@@@@........@@@@...@@@@..@@@@.@@@@@@@@@@...............@@@@.
|
418
|
-
@@@@@@@@@@@@@@@.......@@@@....@@@@..@@@@@@@@@@@@@@@...............@@@..
|
419
|
-
@@@@@@@.....@@@@@.....@@@.....@@@@..@@@@@@......@@@@@............@@@@..
|
420
|
-
@@@@.........@@@@...@@@@......@@@@..@@@@@........@@@@...........@@@....
|
421
|
-
@@@@..........@@@@.@@@@.......@@@@..@@@@..........@@@..........@@@@....
|
422
|
-
@@@@..........@@@@@@@@@.......@@@@.@@@@@..........@@@.........@@@@.....
|
423
|
-
@@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@.......@@@@......
|
424
|
-
@@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@......@@@........
|
425
|
-
.@@@..........@@@@@@@@@@@@@@@@@@@@@@.@@@..........@@@@....@@@@@........
|
426
|
-
.@@@@........@@@@.............@@@@...@@@@........@@@@....@@@@..........
|
427
|
-
..@@@@.......@@@@.............@@@@....@@@@.......@@@@...@@@@...........
|
428
|
-
..@@@@@....@@@@@..............@@@@.....@@@@....@@@@@...@@@@@@..........
|
429
|
-
....@@@@@@@@@@@...............@@@@......@@@@@@@@@@@...@@@@@@@@@@@@@@@@@
|
430
|
-
.....@@@@@@@@@................@@@@........@@@@@@@@....@@@@@@@@@@@@@@@@@
|
431
|
-
........@@@@...................@@..........@@@@@........@@@@@@@..@.@@@.
|
432
|
-
>>>>^ ^<<>>^ ^<<<<< >>>^<<< ^^ ^
|
433
|
-
|
434
|
-
>,< show the path of the detection algorithm
|
435
|
-
</PRE></TD></TR>
|
436
|
-
</TABLE>
|
437
|
-
|
438
|
-
<P>
|
439
|
-
The latest version of GOCR may use different algorithms.
|
440
|
-
You have to look at the sources to learn more.
|
441
|
-
|
442
|
-
<H1><A NAME="SEC10">
|
443
|
-
Black/White, Gray and Colors</A>
|
444
|
-
</H1>
|
445
|
-
For simplicity colored images are converted to gray internally.
|
446
|
-
That means a red text on green background will not be detected.
|
447
|
-
You should use your own filter for this purpose.
|
448
|
-
|
449
|
-
<P>
|
450
|
-
If the original image is gray, a critical value is calculated to
|
451
|
-
extract characters from the background. This can fail, if images are
|
452
|
-
on the scanned page or the scan is bad (dark edges or borders).
|
453
|
-
It is difficult to overcome this problem because graylevels are mostly
|
454
|
-
restricted to the 8 bit limit (16 bit would help to overcome this problem).
|
455
|
-
|
456
|
-
<P>
|
457
|
-
Black/White images are internally converted to gray with two levels (0 and
|
458
|
-
255).
|
459
|
-
|
460
|
-
<P>
|
461
|
-
The lowest 4 bits are not used, because they are used by internal functions
|
462
|
-
(this can be changed in future).
|
463
|
-
|
464
|
-
<P>
|
465
|
-
After calculation of the threshold value (otsu.c) the brightness of
|
466
|
-
every pixel is recalculated to a new internal threshold value
|
467
|
-
of 160 (128+32).
|
468
|
-
This is a bit above the middle of the 8 bit range. The idea is to
|
469
|
-
make life easier for the other routines. Pixels which do not clearly
|
470
|
-
belong to the white or black ones get a value near the threshold value.
|
471
|
-
Some routines can use this extra bit of information to ignore outliers.
|
472
|
-
The second point is that this is necessary for using the lowest four bits
|
473
|
-
without destroying image information.
|
474
|
-
|
475
|
-
<H1><A NAME="SEC11">Pictures on scanned pages</A>
|
476
|
-
</H1>
|
477
|
-
At first all objects on the scanned page are detected.
|
478
|
-
Objects are clusters of black pixels.
|
479
|
-
Pictures are detected if they are larger than 4 times the mean size of
|
480
|
-
all objects. This rule is very simple and can fail some times.
|
481
|
-
But it works fast and mostly the result is ok.
|
482
|
-
|
483
|
-
<H1><A NAME="SEC12">Tools</A></H1>
|
484
|
-
|
485
|
-
<P>
|
486
|
-
<DL COMPACT>
|
487
|
-
<DT>pbmclean:</DT>
|
488
|
-
<DD>This program is written by Angus Duggan and Jef Poskanzer.
|
489
|
-
It cleans up ``snow'' on bitmap images.
|
490
|
-
</DD>
|
491
|
-
<DT>pnmtools:</DT>
|
492
|
-
<DD>These tools are used to convert different image formats to
|
493
|
-
easy readable PNM (PBM,PGM,PPM) format.
|
494
|
-
GOCR uses the popen-routine to call these programs if the
|
495
|
-
suffix of the filename matches to a list in pnm.c.
|
496
|
-
This will fail if pnmtools are missing.
|
497
|
-
</DD>
|
498
|
-
</DL>
|
499
|
-
|
500
|
-
<H1><A NAME="SEC12b">related projects (to learn from)</A></H1>
|
501
|
-
|
502
|
-
<P>
|
503
|
-
<DL COMPACT>
|
504
|
-
<DT>unpaper:</DT> <!-- Dec05 JS -->
|
505
|
-
<DD> <a href="http://unpaper.berlios.de/">unpaper</a> -
|
506
|
-
post-processing scanned and photocopied book pages,
|
507
|
-
written by Jens Gulden 2005, GPL
|
508
|
-
</DD>
|
509
|
-
</DL>
|
510
|
-
|
511
|
-
<H1><A NAME="SEC13">glossary</A> </H1> <DL COMPACT>
|
512
|
-
<DT>font series:</DT> <DD>bold, condensed</DD>
|
513
|
-
<DT>font shape: </DT> <DD>normal, italic, slanted, sc... </DD>
|
514
|
-
<DT>points:</DT>
|
515
|
-
<DD>length unit used for font size, 1/72 inch,
|
516
|
-
but I do not know its exact relation to the font size (height?
|
517
|
-
totalheight? width? 10pt and 300dpi results in 40 pixel high font?)
|
518
|
-
</DD>
|
519
|
-
<DT>sans serif:</DT>
|
520
|
-
<DD>font without the (often thin) lines on the ends
|
521
|
-
of the character
|
522
|
-
</DD>
|
523
|
-
<DT>deskewing:</DT>
|
524
|
-
<DD>compensation of (slightly) rotated text
|
525
|
-
</DD>
|
526
|
-
</DL>
|
527
|
-
|
528
|
-
<H1><A NAME="SEC14"> More information?</A> </H1>
|
529
|
-
<DL COMPACT>
|
530
|
-
<DT>·</DT>
|
531
|
-
<DD>see "/usr/share/doc/package/tetex/texmf/.../fntguide.dvi"
|
532
|
-
in the documentation of the tetex package
|
533
|
-
|
534
|
-
</DD>
|
535
|
-
<DT>·</DT>
|
536
|
-
<DD>the fonts-HOWTO file is helpful too
|
537
|
-
("www.faqs.org/faqs/fonts-faq/")
|
538
|
-
|
539
|
-
</DD>
|
540
|
-
<DT>RTF:</DT>
|
541
|
-
<DD> RichTextFormat - does someone have a good documentation?
|
542
|
-
</DD>
|
543
|
-
</DL>
|
544
|
-
|
545
|
-
<H1><A NAME="SEC15"> About this document</A> </H1>
|
546
|
-
This document was originally written in LaTeX.
|
547
|
-
In May 2002 Joerg converted it to HTML. The reason is that
|
548
|
-
you can now read it directly and you do not need to have LaTeX and
|
549
|
-
Ghostscript installed on your computer to read it.
|
550
|
-
As a side effect you do not need tetex package to build the gocr.rpm-package.
|
551
|
-
A good viewer to read this document is lynx, links or w3m.
|
552
|
-
|
553
|
-
<BR>
|
554
|
-
<HR>
|
555
|
-
<ADDRESS> jNOschulen-at-gSmPAMx.de (remove NO+S+PAM) </ADDRESS>
|
556
|
-
</BODY>
|
557
|
-
</HTML>
|
558
|
-
<!---
|
559
|
-
%
|
560
|
-
% -----------------------------------------------------------------
|
561
|
-
% # v
|
562
|
-
% # v mark connected points via fifo-stack
|
563
|
-
% ### *<< and 3bit direction code per pixel,
|
564
|
-
% # # ^ ^ should be better for overlapping letters
|
565
|
-
% ### ^<<
|
566
|
-
% --------------------- point connections ----------------------
|
567
|
-
% searching next nearest point
|
568
|
-
% /----\
|
569
|
-
% | |
|
570
|
-
% ^ | ## |
|
571
|
-
% | | ## |
|
572
|
-
% | | | |
|
573
|
-
% | \--/ |
|
574
|
-
% \------/
|
575
|
-
%-------------------------------------------------------------
|
576
|
-
%
|
577
|
-
|
578
|
-
--->
|