isbn 2.0.4 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (288) hide show
  1. data/{README → README.md} +5 -11
  2. data/Rakefile +20 -14
  3. data/isbn.gemspec +23 -0
  4. data/lib/isbn.rb +2 -0
  5. data/test/isbn_spec.rb +1 -1
  6. metadata +29 -316
  7. data/VERSION +0 -1
  8. data/src/gocr-0.48/.cvsignore +0 -6
  9. data/src/gocr-0.48/AUTHORS +0 -7
  10. data/src/gocr-0.48/BUGS +0 -55
  11. data/src/gocr-0.48/CREDITS +0 -17
  12. data/src/gocr-0.48/HISTORY +0 -243
  13. data/src/gocr-0.48/INSTALL +0 -83
  14. data/src/gocr-0.48/Makefile +0 -193
  15. data/src/gocr-0.48/Makefile.in +0 -193
  16. data/src/gocr-0.48/README +0 -165
  17. data/src/gocr-0.48/READMEde.txt +0 -80
  18. data/src/gocr-0.48/REMARK.txt +0 -18
  19. data/src/gocr-0.48/REVIEW +0 -538
  20. data/src/gocr-0.48/TODO +0 -65
  21. data/src/gocr-0.48/bin/.cvsignore +0 -2
  22. data/src/gocr-0.48/bin/create_db +0 -38
  23. data/src/gocr-0.48/bin/gocr.tcl +0 -527
  24. data/src/gocr-0.48/bin/gocr_chk.sh +0 -44
  25. data/src/gocr-0.48/configure +0 -4689
  26. data/src/gocr-0.48/configure.in +0 -71
  27. data/src/gocr-0.48/doc/.#Makefile.1.6 +0 -39
  28. data/src/gocr-0.48/doc/.cvsignore +0 -2
  29. data/src/gocr-0.48/doc/Makefile +0 -39
  30. data/src/gocr-0.48/doc/Makefile.in +0 -39
  31. data/src/gocr-0.48/doc/example.dtd +0 -53
  32. data/src/gocr-0.48/doc/example.xml +0 -21
  33. data/src/gocr-0.48/doc/examples.txt +0 -67
  34. data/src/gocr-0.48/doc/gocr.html +0 -578
  35. data/src/gocr-0.48/doc/unicode.txt +0 -57
  36. data/src/gocr-0.48/examples/.#Makefile.1.22 +0 -166
  37. data/src/gocr-0.48/examples/4x6.png +0 -0
  38. data/src/gocr-0.48/examples/4x6.txt +0 -2
  39. data/src/gocr-0.48/examples/5x7.png +0 -0
  40. data/src/gocr-0.48/examples/5x7.png.txt +0 -2
  41. data/src/gocr-0.48/examples/5x8.png +0 -0
  42. data/src/gocr-0.48/examples/5x8.png.txt +0 -2
  43. data/src/gocr-0.48/examples/Makefile +0 -166
  44. data/src/gocr-0.48/examples/color.fig +0 -20
  45. data/src/gocr-0.48/examples/ex.fig +0 -16
  46. data/src/gocr-0.48/examples/font.tex +0 -22
  47. data/src/gocr-0.48/examples/font1.tex +0 -46
  48. data/src/gocr-0.48/examples/font2.fig +0 -27
  49. data/src/gocr-0.48/examples/font_nw.tex +0 -24
  50. data/src/gocr-0.48/examples/handwrt1.jpg +0 -0
  51. data/src/gocr-0.48/examples/handwrt1.txt +0 -10
  52. data/src/gocr-0.48/examples/inverse.fig +0 -20
  53. data/src/gocr-0.48/examples/matrix.jpg +0 -0
  54. data/src/gocr-0.48/examples/ocr-a-subset.png +0 -0
  55. data/src/gocr-0.48/examples/ocr-a-subset.png.txt +0 -4
  56. data/src/gocr-0.48/examples/ocr-a.png +0 -0
  57. data/src/gocr-0.48/examples/ocr-a.txt +0 -6
  58. data/src/gocr-0.48/examples/ocr-b.png +0 -0
  59. data/src/gocr-0.48/examples/ocr-b.png.txt +0 -4
  60. data/src/gocr-0.48/examples/polish.tex +0 -28
  61. data/src/gocr-0.48/examples/rotate45.fig +0 -14
  62. data/src/gocr-0.48/examples/score +0 -36
  63. data/src/gocr-0.48/examples/text.tex +0 -28
  64. data/src/gocr-0.48/gpl.html +0 -537
  65. data/src/gocr-0.48/include/.cvsignore +0 -2
  66. data/src/gocr-0.48/include/config.h +0 -36
  67. data/src/gocr-0.48/include/config.h.in +0 -36
  68. data/src/gocr-0.48/include/version.h +0 -2
  69. data/src/gocr-0.48/install-sh +0 -3
  70. data/src/gocr-0.48/make.bat +0 -57
  71. data/src/gocr-0.48/man/.cvsignore +0 -2
  72. data/src/gocr-0.48/man/Makefile +0 -29
  73. data/src/gocr-0.48/man/Makefile.in +0 -29
  74. data/src/gocr-0.48/man/man1/gocr.1 +0 -166
  75. data/src/gocr-0.48/src/.cvsignore +0 -4
  76. data/src/gocr-0.48/src/Makefile +0 -132
  77. data/src/gocr-0.48/src/Makefile.in +0 -132
  78. data/src/gocr-0.48/src/amiga.h +0 -31
  79. data/src/gocr-0.48/src/barcode.c +0 -846
  80. data/src/gocr-0.48/src/barcode.c.orig +0 -593
  81. data/src/gocr-0.48/src/barcode.h +0 -11
  82. data/src/gocr-0.48/src/box.c +0 -372
  83. data/src/gocr-0.48/src/database.c +0 -462
  84. data/src/gocr-0.48/src/detect.c +0 -943
  85. data/src/gocr-0.48/src/gocr.c +0 -373
  86. data/src/gocr-0.48/src/gocr.h +0 -288
  87. data/src/gocr-0.48/src/jconv.c +0 -168
  88. data/src/gocr-0.48/src/job.c +0 -84
  89. data/src/gocr-0.48/src/lines.c +0 -350
  90. data/src/gocr-0.48/src/list.c +0 -334
  91. data/src/gocr-0.48/src/list.h +0 -90
  92. data/src/gocr-0.48/src/ocr0.c +0 -6756
  93. data/src/gocr-0.48/src/ocr0.h +0 -63
  94. data/src/gocr-0.48/src/ocr0n.c +0 -1475
  95. data/src/gocr-0.48/src/ocr1.c +0 -85
  96. data/src/gocr-0.48/src/ocr1.h +0 -3
  97. data/src/gocr-0.48/src/otsu.c +0 -289
  98. data/src/gocr-0.48/src/otsu.h +0 -23
  99. data/src/gocr-0.48/src/output.c +0 -289
  100. data/src/gocr-0.48/src/output.h +0 -37
  101. data/src/gocr-0.48/src/pcx.c +0 -153
  102. data/src/gocr-0.48/src/pcx.h +0 -9
  103. data/src/gocr-0.48/src/pgm2asc.c +0 -2893
  104. data/src/gocr-0.48/src/pgm2asc.h +0 -105
  105. data/src/gocr-0.48/src/pixel.c +0 -537
  106. data/src/gocr-0.48/src/pnm.c +0 -533
  107. data/src/gocr-0.48/src/pnm.h +0 -35
  108. data/src/gocr-0.48/src/progress.c +0 -87
  109. data/src/gocr-0.48/src/progress.h +0 -42
  110. data/src/gocr-0.48/src/remove.c +0 -703
  111. data/src/gocr-0.48/src/tga.c +0 -87
  112. data/src/gocr-0.48/src/tga.h +0 -6
  113. data/src/gocr-0.48/src/unicode.c +0 -1314
  114. data/src/gocr-0.48/src/unicode.h +0 -1257
  115. data/src/jpeg-7/Makefile.am +0 -133
  116. data/src/jpeg-7/Makefile.in +0 -1089
  117. data/src/jpeg-7/README +0 -322
  118. data/src/jpeg-7/aclocal.m4 +0 -8990
  119. data/src/jpeg-7/ansi2knr.1 +0 -36
  120. data/src/jpeg-7/ansi2knr.c +0 -739
  121. data/src/jpeg-7/cderror.h +0 -132
  122. data/src/jpeg-7/cdjpeg.c +0 -181
  123. data/src/jpeg-7/cdjpeg.h +0 -187
  124. data/src/jpeg-7/change.log +0 -270
  125. data/src/jpeg-7/cjpeg.1 +0 -325
  126. data/src/jpeg-7/cjpeg.c +0 -616
  127. data/src/jpeg-7/ckconfig.c +0 -402
  128. data/src/jpeg-7/coderules.txt +0 -118
  129. data/src/jpeg-7/config.guess +0 -1561
  130. data/src/jpeg-7/config.sub +0 -1686
  131. data/src/jpeg-7/configure +0 -17139
  132. data/src/jpeg-7/configure.ac +0 -317
  133. data/src/jpeg-7/depcomp +0 -630
  134. data/src/jpeg-7/djpeg.1 +0 -251
  135. data/src/jpeg-7/djpeg.c +0 -617
  136. data/src/jpeg-7/example.c +0 -433
  137. data/src/jpeg-7/filelist.txt +0 -215
  138. data/src/jpeg-7/install-sh +0 -520
  139. data/src/jpeg-7/install.txt +0 -1097
  140. data/src/jpeg-7/jaricom.c +0 -148
  141. data/src/jpeg-7/jcapimin.c +0 -282
  142. data/src/jpeg-7/jcapistd.c +0 -161
  143. data/src/jpeg-7/jcarith.c +0 -921
  144. data/src/jpeg-7/jccoefct.c +0 -453
  145. data/src/jpeg-7/jccolor.c +0 -459
  146. data/src/jpeg-7/jcdctmgr.c +0 -482
  147. data/src/jpeg-7/jchuff.c +0 -1612
  148. data/src/jpeg-7/jcinit.c +0 -65
  149. data/src/jpeg-7/jcmainct.c +0 -293
  150. data/src/jpeg-7/jcmarker.c +0 -667
  151. data/src/jpeg-7/jcmaster.c +0 -770
  152. data/src/jpeg-7/jcomapi.c +0 -106
  153. data/src/jpeg-7/jconfig.bcc +0 -48
  154. data/src/jpeg-7/jconfig.cfg +0 -45
  155. data/src/jpeg-7/jconfig.dj +0 -38
  156. data/src/jpeg-7/jconfig.mac +0 -43
  157. data/src/jpeg-7/jconfig.manx +0 -43
  158. data/src/jpeg-7/jconfig.mc6 +0 -52
  159. data/src/jpeg-7/jconfig.sas +0 -43
  160. data/src/jpeg-7/jconfig.st +0 -42
  161. data/src/jpeg-7/jconfig.txt +0 -155
  162. data/src/jpeg-7/jconfig.vc +0 -45
  163. data/src/jpeg-7/jconfig.vms +0 -37
  164. data/src/jpeg-7/jconfig.wat +0 -38
  165. data/src/jpeg-7/jcparam.c +0 -632
  166. data/src/jpeg-7/jcprepct.c +0 -358
  167. data/src/jpeg-7/jcsample.c +0 -545
  168. data/src/jpeg-7/jctrans.c +0 -381
  169. data/src/jpeg-7/jdapimin.c +0 -396
  170. data/src/jpeg-7/jdapistd.c +0 -275
  171. data/src/jpeg-7/jdarith.c +0 -762
  172. data/src/jpeg-7/jdatadst.c +0 -151
  173. data/src/jpeg-7/jdatasrc.c +0 -212
  174. data/src/jpeg-7/jdcoefct.c +0 -736
  175. data/src/jpeg-7/jdcolor.c +0 -396
  176. data/src/jpeg-7/jdct.h +0 -393
  177. data/src/jpeg-7/jddctmgr.c +0 -382
  178. data/src/jpeg-7/jdhuff.c +0 -1309
  179. data/src/jpeg-7/jdinput.c +0 -384
  180. data/src/jpeg-7/jdmainct.c +0 -512
  181. data/src/jpeg-7/jdmarker.c +0 -1360
  182. data/src/jpeg-7/jdmaster.c +0 -663
  183. data/src/jpeg-7/jdmerge.c +0 -400
  184. data/src/jpeg-7/jdpostct.c +0 -290
  185. data/src/jpeg-7/jdsample.c +0 -361
  186. data/src/jpeg-7/jdtrans.c +0 -136
  187. data/src/jpeg-7/jerror.c +0 -252
  188. data/src/jpeg-7/jerror.h +0 -304
  189. data/src/jpeg-7/jfdctflt.c +0 -174
  190. data/src/jpeg-7/jfdctfst.c +0 -230
  191. data/src/jpeg-7/jfdctint.c +0 -4348
  192. data/src/jpeg-7/jidctflt.c +0 -242
  193. data/src/jpeg-7/jidctfst.c +0 -368
  194. data/src/jpeg-7/jidctint.c +0 -5137
  195. data/src/jpeg-7/jinclude.h +0 -91
  196. data/src/jpeg-7/jmemansi.c +0 -167
  197. data/src/jpeg-7/jmemdos.c +0 -638
  198. data/src/jpeg-7/jmemdosa.asm +0 -379
  199. data/src/jpeg-7/jmemmac.c +0 -289
  200. data/src/jpeg-7/jmemmgr.c +0 -1118
  201. data/src/jpeg-7/jmemname.c +0 -276
  202. data/src/jpeg-7/jmemnobs.c +0 -109
  203. data/src/jpeg-7/jmemsys.h +0 -198
  204. data/src/jpeg-7/jmorecfg.h +0 -369
  205. data/src/jpeg-7/jpegint.h +0 -395
  206. data/src/jpeg-7/jpeglib.h +0 -1135
  207. data/src/jpeg-7/jpegtran.1 +0 -272
  208. data/src/jpeg-7/jpegtran.c +0 -546
  209. data/src/jpeg-7/jquant1.c +0 -856
  210. data/src/jpeg-7/jquant2.c +0 -1310
  211. data/src/jpeg-7/jutils.c +0 -179
  212. data/src/jpeg-7/jversion.h +0 -14
  213. data/src/jpeg-7/libjpeg.map +0 -4
  214. data/src/jpeg-7/libjpeg.txt +0 -3067
  215. data/src/jpeg-7/ltmain.sh +0 -8406
  216. data/src/jpeg-7/makcjpeg.st +0 -36
  217. data/src/jpeg-7/makdjpeg.st +0 -36
  218. data/src/jpeg-7/makeadsw.vc6 +0 -77
  219. data/src/jpeg-7/makeasln.vc9 +0 -33
  220. data/src/jpeg-7/makecdep.vc6 +0 -82
  221. data/src/jpeg-7/makecdsp.vc6 +0 -130
  222. data/src/jpeg-7/makecmak.vc6 +0 -159
  223. data/src/jpeg-7/makecvcp.vc9 +0 -186
  224. data/src/jpeg-7/makeddep.vc6 +0 -82
  225. data/src/jpeg-7/makeddsp.vc6 +0 -130
  226. data/src/jpeg-7/makedmak.vc6 +0 -159
  227. data/src/jpeg-7/makedvcp.vc9 +0 -186
  228. data/src/jpeg-7/makefile.ansi +0 -220
  229. data/src/jpeg-7/makefile.bcc +0 -291
  230. data/src/jpeg-7/makefile.dj +0 -226
  231. data/src/jpeg-7/makefile.manx +0 -220
  232. data/src/jpeg-7/makefile.mc6 +0 -255
  233. data/src/jpeg-7/makefile.mms +0 -224
  234. data/src/jpeg-7/makefile.sas +0 -258
  235. data/src/jpeg-7/makefile.unix +0 -234
  236. data/src/jpeg-7/makefile.vc +0 -217
  237. data/src/jpeg-7/makefile.vms +0 -142
  238. data/src/jpeg-7/makefile.wat +0 -239
  239. data/src/jpeg-7/makejdep.vc6 +0 -423
  240. data/src/jpeg-7/makejdsp.vc6 +0 -285
  241. data/src/jpeg-7/makejdsw.vc6 +0 -29
  242. data/src/jpeg-7/makejmak.vc6 +0 -425
  243. data/src/jpeg-7/makejsln.vc9 +0 -17
  244. data/src/jpeg-7/makejvcp.vc9 +0 -328
  245. data/src/jpeg-7/makeproj.mac +0 -213
  246. data/src/jpeg-7/makerdep.vc6 +0 -6
  247. data/src/jpeg-7/makerdsp.vc6 +0 -78
  248. data/src/jpeg-7/makermak.vc6 +0 -110
  249. data/src/jpeg-7/makervcp.vc9 +0 -133
  250. data/src/jpeg-7/maketdep.vc6 +0 -43
  251. data/src/jpeg-7/maketdsp.vc6 +0 -122
  252. data/src/jpeg-7/maketmak.vc6 +0 -131
  253. data/src/jpeg-7/maketvcp.vc9 +0 -178
  254. data/src/jpeg-7/makewdep.vc6 +0 -6
  255. data/src/jpeg-7/makewdsp.vc6 +0 -78
  256. data/src/jpeg-7/makewmak.vc6 +0 -110
  257. data/src/jpeg-7/makewvcp.vc9 +0 -133
  258. data/src/jpeg-7/makljpeg.st +0 -68
  259. data/src/jpeg-7/maktjpeg.st +0 -30
  260. data/src/jpeg-7/makvms.opt +0 -4
  261. data/src/jpeg-7/missing +0 -376
  262. data/src/jpeg-7/rdbmp.c +0 -439
  263. data/src/jpeg-7/rdcolmap.c +0 -253
  264. data/src/jpeg-7/rdgif.c +0 -38
  265. data/src/jpeg-7/rdjpgcom.1 +0 -63
  266. data/src/jpeg-7/rdjpgcom.c +0 -515
  267. data/src/jpeg-7/rdppm.c +0 -459
  268. data/src/jpeg-7/rdrle.c +0 -387
  269. data/src/jpeg-7/rdswitch.c +0 -365
  270. data/src/jpeg-7/rdtarga.c +0 -500
  271. data/src/jpeg-7/structure.txt +0 -945
  272. data/src/jpeg-7/testimg.bmp +0 -0
  273. data/src/jpeg-7/testimg.jpg +0 -0
  274. data/src/jpeg-7/testimg.ppm +0 -4
  275. data/src/jpeg-7/testimgp.jpg +0 -0
  276. data/src/jpeg-7/testorig.jpg +0 -0
  277. data/src/jpeg-7/testprog.jpg +0 -0
  278. data/src/jpeg-7/transupp.c +0 -1533
  279. data/src/jpeg-7/transupp.h +0 -205
  280. data/src/jpeg-7/usage.txt +0 -605
  281. data/src/jpeg-7/wizard.txt +0 -211
  282. data/src/jpeg-7/wrbmp.c +0 -442
  283. data/src/jpeg-7/wrgif.c +0 -399
  284. data/src/jpeg-7/wrjpgcom.1 +0 -103
  285. data/src/jpeg-7/wrjpgcom.c +0 -583
  286. data/src/jpeg-7/wrppm.c +0 -269
  287. data/src/jpeg-7/wrrle.c +0 -305
  288. data/src/jpeg-7/wrtarga.c +0 -253
@@ -1,71 +0,0 @@
1
- # see /usr/share/info/standards.info,autoconf.info (autoconf 2.57)
2
- dnl Process this file with autoconf to produce a configure script.
3
- dnl obsolete: AC_INIT(src/pgm2asc.c)
4
- AC_INIT(gocr,0.48,,)
5
- AC_PREREQ(2.50)
6
- AC_CONFIG_HEADERS([include/config.h])
7
- dnl AC_EXEEXT is obsolete now
8
-
9
- dnl Checks for programs.
10
- AC_PROG_CC
11
- AC_PROG_INSTALL
12
- AC_PROG_MAKE_SET
13
- dnl only needed for libPgm2asc.a, not for the rpm/ebuild
14
- dnl AC_PROG_RANLIB
15
- dnl AC_CHECK_PROG(AR,ar,ar)
16
- dnl needed for developpers to make examples, not for the rpm/ebuild
17
- dnl AC_CHECK_PROG(FIG2DEV,fig2dev,fig2dev)
18
-
19
- dnl Check for optional debug mode
20
- dnl debug makes program slow, but is very useful for developers
21
- dnl ToDo: how to check that c-flags are available?
22
- AC_ARG_WITH(debug,
23
- [ --with-debug switching on debugging (more verbose output)],
24
- [ CPPFLAGS="-Wall -g -fexceptions -DDO_DEBUG=1 $CPPFLAGS" ])
25
- if test "$with_debug"; then echo "debugging enabled"; fi
26
-
27
- dnl Check for optional netpbm PACKAGE: --with-netpbm=no == --without-netpbm
28
- dnl LDFLAGS+=-R$withval/lib compiles the search path into the file ???
29
- AC_ARG_WITH(netpbm,
30
- [ --with-netpbm=PATH enter the PATH to netpbm package],
31
- [ if test "$withval" != "no"; then
32
- LDFLAGS="-L$withval/lib $LDFLAGS";\
33
- CPPFLAGS="-I$withval/include $CPPFLAGS";\
34
- fi ])
35
- if test -n "$with_netpbm"; then echo "option: with_netpbm $with_netpbm"; fi
36
-
37
- dnl Checks for libraries.
38
- if test "$with_netpbm" != "no"; then
39
- # netpbm-10.26 + SuSE-10.0: netpbm needs mathlib -lm
40
- # netpbm-10 : libnetpbm + pam.h (+ p[bgpn]m.h, libp[bgpn]m.so as links)
41
- # netpbm-9 : libpnm + pnm.h (+ p[bgp]m.h)
42
- # pnm_readpaminit (netpbm-10)
43
- # pnm_readpnminit (netpbm-9,10)
44
- # ToDo: how to check that -lm is needed?
45
- LDFLAGS="-lm $LDFLAGS"
46
- AC_SEARCH_LIBS(pnm_readpnminit,[netpbm pnm],[check_netpbm_h="pam.h pnm.h"],
47
- [ echo " * * * try option --with-netpbm=PATH"])
48
- fi
49
-
50
- dnl Checks for header files.
51
- AC_HEADER_STDC
52
- AC_CHECK_HEADERS([unistd.h wchar.h ${check_netpbm_h}])
53
-
54
- dnl Checks for typedefs, structures, and compiler characteristics.
55
- AC_C_CONST
56
- dnl AC_CHECK_TYPE(wchar_t,unsigned)
57
-
58
- dnl Checks for library functions.
59
- dnl this macro produces a warning: AC_TRY_RUN called without default ...
60
- dnl The message can be ignored as long as you don't configure gOCR for
61
- dnl cross-compiling.
62
- AC_FUNC_SETVBUF_REVERSED
63
- AC_CHECK_FUNCS(wcschr wcsdup gettimeofday popen)
64
-
65
- dnl Checks for system services
66
-
67
- dnl obsolete: AC_OUTPUT(Makefile src/Makefile doc/Makefile man/Makefile)
68
- dnl the light weight version of package comes without src/api
69
- AC_CONFIG_FILES([Makefile src/Makefile doc/Makefile man/Makefile])
70
- dnl AC_CONFIG_COMMANDS([default],,)
71
- AC_OUTPUT
@@ -1,39 +0,0 @@
1
- #
2
- # Makefile for ./doc path, used by configure
3
- #
4
-
5
- # these two lines are for cross-compiling, not tested
6
- #srcdir = .
7
- #VPATH = .
8
-
9
- # changed to html, tex is not used anymore
10
- #LATEX=@LATEX@
11
- #DVIPS=@DVIPS@
12
-
13
- OCRDOC=ocr
14
- # add other source file to documentation here
15
- # SRC=$(OCRDOC).tex
16
-
17
- .PHONY : all clean proper install uninstall
18
- default: all
19
-
20
- all: # do nothing!
21
-
22
- #$(OCRDOC).ps: $(OCRDOC).dvi
23
- # $(DVIPS) -o $(OCRDOC).ps $?
24
- #
25
- #$(OCRDOC).dvi: $(SRC)
26
- # $(LATEX) $(OCRDOC).tex
27
-
28
- install: all
29
- echo "Copy gocr.html to your document path."
30
-
31
- uninstall:
32
- echo "Remove gocr.html from your document path."
33
-
34
- clean:
35
- -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
-
37
- proper: clean
38
- -rm -f $(OCRDOC).ps
39
-
@@ -1,2 +0,0 @@
1
- .cvsignore
2
- Makefile
@@ -1,39 +0,0 @@
1
- #
2
- # Makefile for ./doc path, used by configure
3
- #
4
-
5
- # these two lines are for cross-compiling, not tested
6
- #srcdir = .
7
- #VPATH = .
8
-
9
- # changed to html, tex is not used anymore
10
- #LATEX=@LATEX@
11
- #DVIPS=@DVIPS@
12
-
13
- OCRDOC=ocr
14
- # add other source file to documentation here
15
- # SRC=$(OCRDOC).tex
16
-
17
- .PHONY : all clean proper install uninstall
18
- default: all
19
-
20
- all: # do nothing!
21
-
22
- #$(OCRDOC).ps: $(OCRDOC).dvi
23
- # $(DVIPS) -o $(OCRDOC).ps $?
24
- #
25
- #$(OCRDOC).dvi: $(SRC)
26
- # $(LATEX) $(OCRDOC).tex
27
-
28
- install: all
29
- echo "Copy gocr.html to your document path."
30
-
31
- uninstall:
32
- echo "Remove gocr.html from your document path."
33
-
34
- clean:
35
- -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
-
37
- proper: clean
38
- -rm -f $(OCRDOC).ps
39
-
@@ -1,39 +0,0 @@
1
- #
2
- # Makefile for ./doc path, used by configure
3
- #
4
-
5
- # these two lines are for cross-compiling, not tested
6
- #srcdir = @srcdir@
7
- #VPATH = @srcdir@
8
-
9
- # changed to html, tex is not used anymore
10
- #LATEX=@LATEX@
11
- #DVIPS=@DVIPS@
12
-
13
- OCRDOC=ocr
14
- # add other source file to documentation here
15
- # SRC=$(OCRDOC).tex
16
-
17
- .PHONY : all clean proper install uninstall
18
- default: all
19
-
20
- all: # do nothing!
21
-
22
- #$(OCRDOC).ps: $(OCRDOC).dvi
23
- # $(DVIPS) -o $(OCRDOC).ps $?
24
- #
25
- #$(OCRDOC).dvi: $(SRC)
26
- # $(LATEX) $(OCRDOC).tex
27
-
28
- install: all
29
- echo "Copy gocr.html to your document path."
30
-
31
- uninstall:
32
- echo "Remove gocr.html from your document path."
33
-
34
- clean:
35
- -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
-
37
- proper: clean
38
- -rm -f $(OCRDOC).ps
39
-
@@ -1,53 +0,0 @@
1
- <?xml version="1.0"?>
2
- <!--
3
- first draft by Volker Simonis, reviewed by Joerg Schulenburg
4
- It's not ready for use!
5
- ToDo:
6
- - lynx/links/w3c should show xml like a html file
7
- value as <character ...>CharText</character>
8
- or <word><character ...></character><...>WordText</word>
9
- or as line or as block? whats more useful?
10
- - how to code table of alternative chars/words and its probability?
11
- - how to handle images (as image tags?)
12
- - xmllint -\-htmlout -\-loaddtd jocr/doc/example.dtd o.xml
13
- -->
14
- <!ENTITY % default.attributes "x CDATA #REQUIRED
15
- y CDATA #REQUIRED
16
- dx CDATA #REQUIRED
17
- dy CDATA #REQUIRED">
18
-
19
- <!ELEMENT box EMPTY>
20
- <!ATTLIST box %default.attributes;
21
- value CDATA #REQUIRED;>
22
-
23
- <!ELEMENT barcode EMPTY>
24
- <!ATTLIST barcode %default.attributes;
25
- value CDATA #REQUIRED;>
26
-
27
- <!ELEMENT img EMPTY>
28
- <!ATTLIST img %default.attributes;>
29
-
30
- <!ELEMENT page (block*)>
31
- <!ATTLIST page %default.attributes;>
32
-
33
- <!ELEMENT block (line*)>
34
- <!ATTLIST block %default.attributes;>
35
-
36
- <!ELEMENT line ((word | space | punctuation-mark)*)>
37
- <!ATTLIST line %default.attributes;>
38
-
39
- <!ELEMENT word (character*)>
40
- <!ATTLIST word %default.attributes;>
41
-
42
- <!ELEMENT char EMPTY>
43
- <!ATTLIST char %default.attributes;
44
- value CDATA #REQUIRED;
45
- (#CDATA)> <!-- is that correct? -->
46
-
47
- <!ELEMENT space EMPTY>
48
- <!ATTLIST space %default.attributes;
49
- value CDATA #REQUIRED;>
50
-
51
- <!ELEMENT punctuation-mark EMPTY>
52
- <!ATTLIST punctuation-mark %default.attributes;
53
- value CDATA #REQUIRED;>
@@ -1,21 +0,0 @@
1
- <?xml version="1.0"?>
2
- <!DOCTYPE gocr SYSTEM "example.dtd">
3
- <!-- example file for example.dtd -->
4
- <page>
5
- <block x="123" y="11" dx="500" dy="800">
6
- <line x="130" y="11" dx="480" dy="30">
7
- <word x="130" y="11" dx="80" dy="30">
8
- <character x="130" y="11" dx="80" dy="30" value="A"/>
9
- <character ... />
10
- ...
11
- </word>
12
- <punctuation-mark .. />
13
- <space .. />
14
- <word ..>
15
- ...
16
- </word>
17
- </line>
18
- <box ... />
19
- ...
20
- </block>
21
- </page>
@@ -1,67 +0,0 @@
1
- Note: this info is related to example files, used to test gOCR. As of this
2
- writing, these files are not available to non-developers. So, if you aren't
3
- a developer, forget about this file.
4
-
5
- EXAMPLE FILES
6
-
7
- 1. Scanning
8
- The examples can be scanned from anything; when looking for something, try to
9
- have in mind the kind of tests you are expecting to do: if you're testing
10
- accents recognition, look for texts in portuguese, french, etc. (pretty obvious,
11
- but keeping this in mind will help to have a large gamma of files covering
12
- different kinds of tests).
13
-
14
- If you're not interested in testing DPIs, scan at 150 or 300dpi.
15
-
16
- If you're not interested in testing the dust removal, cleaning, etc, functions,
17
- do the best scan you can. Usually increasing brightness and contrast will
18
- provide a sharper, cleaner image.
19
-
20
- Save the image in a supported format: for example, pgm or jpg. If a compression
21
- will result in a significant reduction of size, compress the image. BZIP2
22
- usually is the best compressor around, but gzip is more popular in the unix
23
- world. In the wintel world, people use ZIP, and usually will have to search for
24
- an application capable of opening .gz or .bz2 (though WinZIP opens at least the
25
- former).
26
-
27
- 2. Sorting
28
- To help others to find the files they are looking for, the examples/ directory
29
- is divided in several other directories, which may be subdivided. When
30
- uploading a new example, look for the most suitable location. Depending on the
31
- directory, you probably will name your file with interesting info: for example,
32
- when uploading an image with all the characters of the foo font, the best thing
33
- to do is to place it at examples/fonts/foo.jpg.
34
-
35
- 3. "Translation"
36
- Along with the image file, upload a text file with the expected output. Be
37
- careful with this file: it must resemble the original text as much as possible.
38
- Don't add extra new lines (\n), keep hyphenated words, etc. Name this file with
39
- the same name of the image file.
40
-
41
- In the beginning of the text file, you should provide comments, to help
42
- searches. Use the following sample:
43
-
44
- # Comments
45
- # DPI:
46
- # Colors:
47
- # Image size (colsXrows):
48
- # Fonts:
49
- # Font sizes:
50
- # Layout form:
51
- # Number of pictures:
52
- # Language:
53
- # Quality of scan:
54
- # Non-ASCII characters:
55
- # Extra:
56
-
57
- Check existing examples to see what people have been doing.
58
-
59
- Any lines that begin with # will be considered comments, so you may use several
60
- lines for comments or add new fields. Though gOCR itself doesn't depend on, and
61
- won't use, this file, it will be used by scripts.
62
-
63
- 4. Other sources (WEB)
64
-
65
- - http://www.clerkweb.house.gov/elections/elections.htm (Nov2002)
66
- PDF-files with lot of tables
67
-
@@ -1,578 +0,0 @@
1
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
2
-
3
- <HTML>
4
- <HEAD>
5
- <TITLE>GOCR-documentation</TITLE>
6
- <META NAME="description" CONTENT="GOCR-documentation">
7
- <META NAME="keywords" CONTENT="ocr">
8
- <META NAME="resource-type" CONTENT="document">
9
- <META NAME="distribution" CONTENT="global">
10
-
11
- <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
12
- <META NAME="Generator" CONTENT="Joerg">
13
- </HEAD>
14
-
15
- <BODY >
16
- <H1 ALIGN="CENTER">GOCR-documentation</H1>
17
- <P ALIGN="CENTER"><STRONG>J&ouml;rg Schulenburg</STRONG></P>
18
- <P ALIGN="CENTER"><STRONG>Magdeburg, June 3, 2002</STRONG></P>
19
-
20
- <H3>Abstract:</H3>
21
- <DIV>
22
- In this documentation I describe some ideas for my OCR-program.
23
- It contains algorithms and examples and gives you
24
- an impression of what the program can (or could) do.
25
- </DIV>
26
- <P>
27
-
28
-
29
- <P>
30
- <BR><HR>
31
- <!--Table of Child-Links-->
32
- <A NAME="CHILD_LINKS"></A>
33
-
34
- <UL>
35
- <LI><A HREF="#SEC1">Introduction</A>
36
- <LI><A HREF="#SEC2">Segmentation of textual regions / Layout analysis</A>
37
- <LI><A HREF="#SEC3">Line detection</A>
38
- <LI><A HREF="#SEC4">Cluster detection</A>
39
- <LI><A HREF="#SEC5">Engines</A>
40
- <LI><A HREF="#SEC6">Remove pixels</A>
41
- <LI><A HREF="#SEC7">Add pixels</A>
42
- <LI><A HREF="#SEC8">Similarity analyzer</A>
43
- <LI><A HREF="#SEC9">Overlapping characters</A>
44
- <LI><A HREF="#SEC10">Black/White, Gray and Colors</A>
45
- <LI><A HREF="#SEC11">Pictures on scanned pages</A>
46
- <LI><A HREF="#SEC12">Tools</A>
47
- <LI><A HREF="#SEC13">glossary</A>
48
- <LI><A HREF="#SEC14">More information?</A>
49
- <LI><A HREF="#SEC15">About this document</A>
50
- </UL>
51
- <!--End of Table of Child-Links-->
52
-
53
- <H1><A NAME="SEC1"> Introduction</A>
54
- </H1>
55
- First I have to say that I am not an expert in pattern recognition
56
- or similar things. My knowledge is based mostly on experiments with my
57
- program.
58
- Therefore do not worry about stupid algorithms I put in this document.
59
- In this documentation I describe some ideas for my OCR-program.
60
- The examples give you an impression of how the program handles
61
- your images.
62
- If you have comments regarding contents or spelling please
63
- write to the author.
64
-
65
- <H1><A NAME="SEC2">Segmentation of textual regions / Layout analysis</A></H1>
66
-
67
- This is implemented as a recursive division in two parts.
68
-
69
- <UL>
70
- <LI>look for the thickest horizontal or vertical gap through the box</LI>
71
- <LI>if the gap is less than five times longer than thick do not divide </LI>
72
- <LI>do the same with the two new parts</LI>
73
- </UL>
74
- I know that this algorithm is not as good as you wish,
75
- but I do not know a better one.
76
-
77
- <P>
78
- It would be very helpful to know about a function which is able to
79
- decide whether the box represents a single text line or a more complex object.
80
-
81
- <H1><A NAME="SEC3">Line detection</A></H1>
82
- <P>
83
- Line detection is very important for good recognition.
84
- For example it is difficult to distinguish between lowercase letter <B>p</B>
85
- and uppercase letter <B>P</B> without having a baseline (same total height).
86
- The lowercase version of <B>p</B> has a depth (the lower end is below the
87
- baseline) and therefore it is easy to distinguish from the uppercase version
88
- if the baseline is known. The line detection is responsible for finding the
89
- baseline of every text line.
90
-
91
- <P>
92
- Lines of characters are detected by looking for interline spaces.
93
- These are characterized by a large number of non-black pixels in a
94
- row. Image rotation (skewing) presents a problem, therefore the program
95
- first looks only at the left half of the image. When a line is
96
- found, the left half of the right side is scanned, because lines
97
- are often short. The variation in height gives an indication of
98
- the rotation angle. Using this angle, a second run detects lines
99
- more accurately. Line detection may fail if there is dust on the
100
- image.
101
-
102
- <P>
103
- In version v0.2.3 this behaviour is slightly changed.
104
- To detect the rotation angle, the line through the most
105
- characters is detected.
106
-
107
- <H1><A NAME="SEC4">Cluster detection</A></H1>
108
-
109
- A cluster is a group of pixels which are connected with each other.
110
- The simplest way to detect a cluster is to look for a pixel.
111
- If you find one, look to the neighbouring pixels. This can be done recursively.
112
-
113
- <P>
114
- This method needs a lot of stack space if a cluster is very large,
115
- and can cause problems with the memory.
116
-
117
- <P>
118
- Do you remember the algorithm for leaving a maze?
119
- Go along the right (or left) wall. This seems to be a good approach
120
- for detecting clusters without recursion.
121
- The following picture shows a trace of the maze algorithm.
122
-
123
- <P>
124
- <TABLE WIDTH="680">
125
- <TR><TD>
126
- <PRE>
127
- first 35 steps next 36 steps
128
- ..@@@@@..@@@@&lt;.. ..v&lt;&lt;&lt;&lt;..v&lt;&lt;&lt;@.. * = starting point
129
- ..@@@@@@@@@.@^&lt;. ..&gt;&gt;v@^&lt;&lt;&lt;@.@@@. &gt;^&lt;v = go right,up,left,down
130
- ....@@@@@...@@^. ....v@@@@...@@@. @ = black pixel
131
- ....@@@@....@@^. ....v@@@....@@@.
132
- ....@@@.....@@^. ....v@@.....@@@.
133
- ....@@@.....@@^. ....v@@.....@@@.
134
- ...@@@@.....@@^. ...v&lt;@@.....@@@.
135
- ...@@@......@@^. ...v@@......@@@.
136
- ...@@@......@@^. ...v@@......@@@.
137
- ...@@@.....@@@^. ...v@@.....@@@@.
138
- ...@@@.....@@&gt;^. ...v@@.....@@@@.
139
- ...@@@.....@@^.. ...v@@.....@@@..
140
- ..@@@@.....@@^.. ..v&lt;@@.....@@@..
141
- ..@@@@....@@@^.. ..v@@@....@@@@..
142
- *&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;^&lt;&lt; @@@@@@@@@@@@@@@@
143
- </PRE></TD></TR>
144
- </TABLE>
145
-
146
- <P>
147
- The minimum and maximum coordinates can be used to create a box around the
148
- cluster. But does this algorithm work with diagonally connected pixels?
149
-
150
- <H1><A NAME="SEC5">Engines</A></H1>
151
-
152
- GOCR is able to work with different recognition engines.
153
- Since version 0.37 engines have to return a probability value together
154
- with the recognized character or a table of values to a table of characters.
155
- If the probability value is 100, the engine is 100% sure to have found the
156
- right character otherwise the value is less. This gives GOCR the possibility
157
- to compare results of different engines or in case of a not recognized character
158
- to inform the user or another
159
- application (spell checker) which characters probably could be there.
160
-
161
- <H2>Base-Engine</H2>
162
- The base engine (src/ocrX.c) is the original engine used in the first implementation
163
- of GOCR by J&ouml;rg. The idea was to get a fast and acceptable result
164
- without learning theoretical background. Later it should be replaced or completed
165
- by a better engine.
166
- The base engine is a rule based engine.
167
- The engine was written without theoretical
168
- background and is improved by trial and error, but it is still far from
169
- perfect. The algorithm is very tolerant to size and form of characters
170
- (omnifont).
171
- How does the engine identify a character? For the explanation look at the
172
- following pattern.
173
-
174
- <P>
175
- <TABLE WIDTH="680">
176
- <TR><TD>
177
- <PRE>
178
- vvvv vv- white regions
179
- ......@@...... &lt;- crossing one line
180
- ......@@......
181
- .....@@@@.....
182
- .....@@@@.....
183
- .....@@@@.....
184
- ....@..@@@.... &lt;- white hole / crossing two lines
185
- ....@..@@@.... &lt;- crossing two lines
186
- ....@..@@@....
187
- ...@....@@@...
188
- ...@....@@@...
189
- ...@....@@@...
190
- ..@@@@@@@@@@.. &lt;- horizontal line near center
191
- ..@......@@@..
192
- ..@......@@@..
193
- .@........@@@. v- increasing width of pattern
194
- .@........@@@. v
195
- .@........@@@. v
196
- @@@......@@@@@
197
- ^^^-- gap
198
- </PRE></TD></TR>
199
- </TABLE>
200
-
201
- <P>
202
- In the future the program
203
- should detect edges, vertices, gaps, angles and so on.
204
- This is called feature extraction (as far as I know).
205
- With such data the engine could make a cluster analysis.
206
- But this is a difficult task, if the scanned image is noisy.
207
-
208
- <H2>Database-Engine</H2>
209
- The database engine (src/database.c) was the second engine added to GOCR.
210
- It was primarily written to give users a simple tool to recognize
211
- special language-specific characters. The program generates a list
212
- (text file db.lst of image filenames and character codes)
213
- and image samples (pnm-files) in a database path (./db/).
214
- The database can be created by hand or external programs or by GOCR itself
215
- using option (-m 130). In the last case GOCR prompts the user
216
- for not recognized characters. If he enters the character the pattern
217
- is saved in the database path as pnm-file and its file name is added
218
- to the database list (db.lst) together with the text string the pattern
219
- should be replaced by.
220
- For recognition GOCR first loads the database into memory (option -m 2).
221
- The main algorithm compares not recognized characters with stored images
222
- and calculates a distance value. If the distance value is small enough,
223
- the character is treated as recognized.
224
-
225
- <H1><A NAME="SEC6">Remove pixels</A></H1>
226
-
227
- The following picture shows an <I>n</I> which has additional pixels at the
228
- bottom. Therefore it can not be detected as <I>n</I>. What can be done?
229
-
230
- <UL>
231
- <LI>classify horizontal (<TT>=</TT>) and vertical (<TT>I</TT>) pixels by
232
- comparing the distance between the next vertical and next horizontal white
233
- pixels (.)
234
- </LI>
235
- <LI>measure mean thickness of vertical and horizontal clusters
236
- </LI>
237
- <LI>erase unusually thin horizontal pixels at the bottom line
238
- </LI>
239
- </UL>
240
-
241
- <P>
242
- <TABLE WIDTH="680">
243
- <TR><TD>
244
- <PRE>
245
- ..@@@@@..@@@@@.. ..==III..===II.. dx=16 dy=15
246
- ..@@@@@@@@@.@@@. ..==III====.III. thickness 2 to 3
247
- ....@@@@@...@@@. ....III==...III.
248
- ....@@@@....@@@. ....III=....III.
249
- ....@@@.....@@@. ....III.....III.
250
- ....@@@.....@@@. ....III.....III.
251
- ...@@@@.....@@@. ...IIII.....III.
252
- ...@@@......@@@. ...III......III.
253
- ...@@@......@@@. ...III......III.
254
- ...@@@.....@@@@. ...III.....IIII.
255
- ...@@@.....@@@@. ...III.....IIII.
256
- ...@@@.....@@@.. ...III.....III..
257
- ..@@@@.....@@@.. ..IIII.....III..
258
- ..@@@@....@@@@.. ..IIII....IIII..
259
- @@@@@@@@@@@@@@@@ ================
260
- ^^^
261
- this causes the problem
262
- </PRE></TD></TR>
263
- </TABLE>
264
-
265
- <P>
266
- A better way is to find serifs (horizontal lines glued on the lower end
267
- of vertical lines) which touch together (v0.2.5).
268
-
269
- <P>
270
- The next picture shows blind pixels which are caused by dust on the paper.
271
- The upper right dots are not connected with the rest of the character.
272
- This can be detected via fill-algorithms. Currently the program
273
- assumes that dots near the upper end of a character are ``i''-dots
274
- or diaereses (umlaut dots).
275
-
276
- <P>
277
- <TABLE WIDTH="680">
278
- <TR><TD>
279
- <PRE>
280
- ..........................O... ..........................O...
281
- ..........................O... ..........................O...
282
- .............................. ..............................
283
- .............................. ..............................
284
- ..........@@@.......@@@@...... ..........@@@.......@@@@......
285
- ..@@@@..@@@@@@@...@@@@@@@..... ..@@@@..@@@@@@@...@@@@@@@.....
286
- @@@@@@@@@@@@@@@@.@@@@@@@@@.... @@@@@@@@@@@@@@@@.@@@@@@@@@....
287
- ..@@@@@@....@@@@@@.....@@@@... ..@@@@@@....@@@@@@.....@@@@...
288
- ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
289
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
290
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
291
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
292
- ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
293
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
294
- ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
295
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
296
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
297
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
298
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
299
- ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
300
- ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
301
- ..@@@@......@@@@@......@@@@@.. ..@@@@......@@@@@......@@@@@..
302
- @@@@@@@@..@@@@@@@@@..@@@@@@@@@ @@@@@@@@..@@@@@@@@@..@@@@@@@@@
303
- </PRE></TD></TR>
304
- </TABLE>
305
-
306
- <H1><A NAME="SEC7">
307
- Add pixels</A>
308
- </H1>
309
- The following picture shows an <I>m</I>. The legs are only barely connected.
310
- How do we handle this?
311
-
312
- <UL>
313
- <LI>if the engine has failed, a filter is switched on and the engine
314
- starts over
315
- </LI>
316
- <LI>the 2x2 filter sets pixels to (<I>O</I>) near barely connected pixels
317
- </LI>
318
- </UL>
319
-
320
- <P>
321
- <TABLE WIDTH="680">
322
- <TR><TD>
323
- <PRE>
324
- vv vv
325
- @@@.@@@..@@@... @@@.@@@..@@@...
326
- .@@.@@@@.@@@@..&lt; .@@O@@@@O@@@@.. filter: .@ =&gt; O@ @. =&gt; @O
327
- .@@@..@@@..@@..&lt; .@@@..@@@..@@.. @. =&gt; @. .@ =&gt; .@
328
- .@@@..@@@..@@@. .@@@..@@@..@@@.
329
- .@@@..@@@..@@@. .@@@..@@@..@@@.
330
- .@@@..@@@..@@@. .@@@..@@@..@@@.
331
- .@@@..@@@..@@@. .@@@..@@@..@@@.
332
- .@@@..@@@..@@@. .@@@..@@@..@@@.
333
- .@@@..@@@..@@@. .@@@..@@@..@@@.
334
- .@@@..@@@..@@@. .@@@..@@@..@@@.
335
- .@@@..@@@..@@@. .@@@..@@@..@@@.
336
- .@@@..@@@..@@@. .@@@..@@@..@@@.
337
- .@@@..@@@..@@@. .@@@..@@@..@@@.
338
- @@@@@.@@@@.@@@@ @@@@@.@@@@.@@@@
339
- </PRE></TD></TR>
340
- </TABLE>
341
-
342
- <H1><A NAME="SEC8">
343
- Similarity analyzer</A>
344
- </H1>
345
- Some characters are a little bit noisy. These characters can be identified by
346
- comparison with other, already recognized characters. This can be done
347
- via a good distance function. Maybe
348
- the distance function in the current version of GOCR is not very good.
349
- Feel free to send me your ideas, but be sure it does not waste my time.
350
-
351
- <H1><A NAME="SEC9">
352
- Overlapping characters</A>
353
- </H1>
354
- The following picture shows an overlapping <I>ru</I>.
355
- How do we handle this?
356
-
357
- <UL>
358
- <LI>look for 3 weak connections (sum over y is small, start in the middle)
359
- </LI>
360
- <LI>test if the right and left part can be detected by the engine
361
- </LI>
362
- <LI>correction of surrounding box
363
- </LI>
364
- </UL>
365
-
366
- <P>
367
- <TABLE WIDTH="680">
368
- <TR><TD>
369
- <PRE>
370
- ....@@...@@@@@@@@@@....@@@@@@@.. ....@@...@@@@@@@@@@....@@@@@@@..
371
- ..@@@@..@@@@@..@@@@......@@@@@.. ..@@@@..@@@@@..@@@@......@@@@@..
372
- @@@@@@@@@@@@@.,.@@@.......@@@@.. @@@@@@@@@@@@@...@@@.......@@@@..
373
- ..@@@@@@..@@@...@@@.......@@@@.. ..@@@@@@..@@@...@@@.......@@@@..
374
- ...@@@@.......,.@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
375
- ...@@@@.........@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
376
- ...@@@@.......,.@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
377
- ...@@@@.........@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
378
- ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
379
- ...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
380
- ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
381
- ...@@@..........@@@.......@@@@.. ...@@@..........@@@.......@@@@..
382
- ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
383
- ...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
384
- ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
385
- ...@@@..........@@@@@...@@@@@@@. ...@@@..........@@@@@...@@@@@@@.
386
- ..@@@@@.......,..@@@@@@@@@.@@@@@ ..@@@@@..........@@@@@@@@@.@@@@@
387
- @@@@@@@@@.........@@@@@@@..@@@.. @@@@@@@@@.........@@@@@@@..@@@..
388
- ..............,....@@@.......... ...................@@@..........
389
- ^^^
390
- 213 weak vertical lines
391
- </PRE></TD></TR>
392
- </TABLE>
393
-
394
- <P>
395
- Of course the situation is more difficult with slanted characters.
396
-
397
- <P>
398
- The following example shows, how to deal with larger clusters.
399
- To get a fast program a first test should select the possible positions of
400
- division. That can be done by following upper and lower bows to a crease or a break. Then try to break on all detected creases, start at most
401
- important one (not implemented yet v0.2.4).
402
-
403
- <P>
404
- <TABLE WIDTH="766">
405
- <TR><TD>
406
- <PRE>
407
- &gt;&gt;&gt;&gt;vvv&lt;&lt;&lt;&lt;&lt; &gt;&gt;vv&lt;&lt;&lt;&lt; &gt;&gt;&gt;vvv&lt;&lt;&lt;&lt;
408
- ......@@@@@@@..................@@.........@@@@@@@..........@@@@@@@.....
409
- ....@@@@@@@@@@@...............@@@.......@@@@@@@@@@@......@@@@@@@@@@@...
410
- ...@@@@@@@@@@@@@.............@@@@......@@@@@@@@@@@@@....@@@@@@@@@@@@@..
411
- ..@@@@.......@@@@...........@@@@@.....@@@@.......@@@@..@@@@.......@@@@.
412
- ..@@@........@@@@..........@@@@@@@....@@@........@@@@@@@@@........@@@@.
413
- .@@@@..........@@.........@@@@@@@@...@@@@..........@@@@@@@.........@@@@
414
- .@@@.....................@@@@.@@@@...@@@..............@@...........@@@@
415
- .@@@....................@@@@@.@@@@...@@@...........................@@@@
416
- @@@...@@@@@@@...........@@@@..@@@...@@@...@@@@@@...................@@@.
417
- @@@@.@@@@@@@@@@........@@@@...@@@@..@@@@.@@@@@@@@@@...............@@@@.
418
- @@@@@@@@@@@@@@@.......@@@@....@@@@..@@@@@@@@@@@@@@@...............@@@..
419
- @@@@@@@.....@@@@@.....@@@.....@@@@..@@@@@@......@@@@@............@@@@..
420
- @@@@.........@@@@...@@@@......@@@@..@@@@@........@@@@...........@@@....
421
- @@@@..........@@@@.@@@@.......@@@@..@@@@..........@@@..........@@@@....
422
- @@@@..........@@@@@@@@@.......@@@@.@@@@@..........@@@.........@@@@.....
423
- @@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@.......@@@@......
424
- @@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@......@@@........
425
- .@@@..........@@@@@@@@@@@@@@@@@@@@@@.@@@..........@@@@....@@@@@........
426
- .@@@@........@@@@.............@@@@...@@@@........@@@@....@@@@..........
427
- ..@@@@.......@@@@.............@@@@....@@@@.......@@@@...@@@@...........
428
- ..@@@@@....@@@@@..............@@@@.....@@@@....@@@@@...@@@@@@..........
429
- ....@@@@@@@@@@@...............@@@@......@@@@@@@@@@@...@@@@@@@@@@@@@@@@@
430
- .....@@@@@@@@@................@@@@........@@@@@@@@....@@@@@@@@@@@@@@@@@
431
- ........@@@@...................@@..........@@@@@........@@@@@@@..@.@@@.
432
- &gt;&gt;&gt;&gt;^ ^&lt;&lt;&gt;&gt;^ ^&lt;&lt;&lt;&lt;&lt; &gt;&gt;&gt;^&lt;&lt;&lt; ^^ ^
433
-
434
- &gt;,&lt; show the path of the detection algorithm
435
- </PRE></TD></TR>
436
- </TABLE>
437
-
438
- <P>
439
- The latest version of GOCR may use different algorithms.
440
- You have to look at the sources to learn more.
441
-
442
- <H1><A NAME="SEC10">
443
- Black/White, Gray and Colors</A>
444
- </H1>
445
- For simplicity colored images are converted to gray internally.
446
- That means a red text on green background will not be detected.
447
- You should use your own filter for this purpose.
448
-
449
- <P>
450
- If the original image is gray, a critical value is calculated to
451
- extract characters from the background. This can fail, if images are
452
- on the scanned page or the scan is bad (dark edges or borders).
453
- It is difficult to overcome this problem because graylevels are mostly
454
- restricted to the 8 bit limit (16 bit would help to overcome this problem).
455
-
456
- <P>
457
- Black/White images are internally converted to gray with two levels (0 and
458
- 255).
459
-
460
- <P>
461
- The lowest 4 bits are not used, because they are used by internal functions
462
- (this can be changed in future).
463
-
464
- <P>
465
- After calculation of the threshold value (otsu.c) the brightness of
466
- every pixel is recalculated to a new internal threshold value
467
- of 160 (128+32).
468
- This is a bit above the middle of the 8 bit range. The idea is to
469
- make life easier for the other routines. Pixels which do not clearly
470
- belong to the white or black ones get a value near the threshold value.
471
- Some routines can use this bit of extra information to ignore outliers.
472
- The second point is that this is necessary for using the lowest four bits
473
- without destroying image information.
474
-
475
- <H1><A NAME="SEC11">Pictures on scanned pages</A>
476
- </H1>
477
- At first all objects on the scanned page are detected.
478
- Objects are clusters of black pixels.
479
- Pictures are detected if they are larger than 4 times the mean size of
480
- all objects. This rule is very simple and can fail some times.
481
- But it works fast and mostly the result is ok.
482
-
483
- <H1><A NAME="SEC12">Tools</A></H1>
484
-
485
- <P>
486
- <DL COMPACT>
487
- <DT>pbmclean:</DT>
488
- <DD>This program is written by Angus Duggan and Jef Poskanzer.
489
- It cleans up ``snow'' on bitmap images.
490
- </DD>
491
- <DT>pnmtools:</DT>
492
- <DD>These tools are used to convert different image-formats to
493
- easy readable PNM (PBM,PGM,PPM) format.
494
- GOCR uses the popen-routine to call these programs if the
495
- suffix of the filename matches to a list in pnm.c.
496
- This will fail if pnmtools are missing.
497
- </DD>
498
- </DL>
499
-
500
- <H1><A NAME="SEC12b">related projects (to learn from)</A></H1>
501
-
502
- <P>
503
- <DL COMPACT>
504
- <DT>unpaper:</DT> <!-- Dec05 JS -->
505
- <DD> <a href="http://unpaper.berlios.de/">unpaper</a> -
506
- post-processing scanned and photocopied book pages,
507
- written by Jens Gulden 2005, GPL
508
- </DD>
509
- </DL>
510
-
511
- <H1><A NAME="SEC13">glossary</A> </H1> <DL COMPACT>
512
- <DT>font series:</DT> <DD>bold, condensed</DD>
513
- <DT>font shape: </DT> <DD>normal, italic, slanted, sc... </DD>
514
- <DT>points:</DT>
515
- <DD>length unit used for font size, 1/72 inch,
516
- but I do not know its exact relation to the font size (height?
517
- totalheight? width? 10pt and 300dpi results in 40 pixel high font?)
518
- </DD>
519
- <DT>sans serif:</DT>
520
- <DD>font without the (often thin) lines on the ends
521
- of the character
522
- </DD>
523
- <DT>deskewing:</DT>
524
- <DD>compensation of (slightly) rotated text
525
- </DD>
526
- </DL>
527
-
528
- <H1><A NAME="SEC14"> More information?</A> </H1>
529
- <DL COMPACT>
530
- <DT>&middot;</DT>
531
- <DD>see "/usr/share/doc/package/tetex/texmf/.../fntguide.dvi"
532
- in the documentation of the tetex package
533
-
534
- </DD>
535
- <DT>&middot;</DT>
536
- <DD>the fonts-HOWTO file is helpful too
537
- ("www.faqs.org/faqs/fonts-faq/")
538
-
539
- </DD>
540
- <DT>RTF:</DT>
541
- <DD> RichTextFormat - does someone have a good documentation?
542
- </DD>
543
- </DL>
544
-
545
- <H1><A NAME="SEC15"> About this document</A> </H1>
546
- This document was originally written in LaTeX.
547
- In May 2002 Joerg converted it to HTML. The reason is that
548
- you can read it now directly and you do not need to have LaTeX and
549
- Ghostscript installed on your computer to read it.
550
- As a side effect you do not need tetex package to build the gocr.rpm-package.
551
- A good viewer to read this document is lynx, links or w3m.
552
-
553
- <BR>
554
- <HR>
555
- <ADDRESS> jNOschulen-at-gSmPAMx.de (remove NO+S+PAM) </ADDRESS>
556
- </BODY>
557
- </HTML>
558
- <!---
559
- %
560
- % -----------------------------------------------------------------
561
- % # v
562
- % # v mark connected points via fifo-stack
563
- % ### *<< and 3bit direction code per pixel,
564
- % # # ^ ^ should be better for overlapping letters
565
- % ### ^<<
566
- % --------------------- point connections ----------------------
567
- % searching next nearest point
568
- % /----\
569
- % | |
570
- % ^ | ## |
571
- % | | ## |
572
- % | | | |
573
- % | \--/ |
574
- % \------/
575
- %-------------------------------------------------------------
576
- %
577
-
578
- --->