isbn 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (291) hide show
  1. data/.gitignore +4 -0
  2. data/README +9 -0
  3. data/Rakefile +13 -0
  4. data/VERSION +1 -0
  5. data/isbn.gemspec +329 -0
  6. data/lib/isbn.rb +90 -0
  7. data/src/gocr-0.48/.cvsignore +6 -0
  8. data/src/gocr-0.48/AUTHORS +7 -0
  9. data/src/gocr-0.48/BUGS +55 -0
  10. data/src/gocr-0.48/CREDITS +17 -0
  11. data/src/gocr-0.48/HISTORY +243 -0
  12. data/src/gocr-0.48/INSTALL +83 -0
  13. data/src/gocr-0.48/Makefile +193 -0
  14. data/src/gocr-0.48/Makefile.in +193 -0
  15. data/src/gocr-0.48/README +165 -0
  16. data/src/gocr-0.48/READMEde.txt +80 -0
  17. data/src/gocr-0.48/REMARK.txt +18 -0
  18. data/src/gocr-0.48/REVIEW +538 -0
  19. data/src/gocr-0.48/TODO +65 -0
  20. data/src/gocr-0.48/bin/.cvsignore +2 -0
  21. data/src/gocr-0.48/bin/create_db +38 -0
  22. data/src/gocr-0.48/bin/gocr.tcl +527 -0
  23. data/src/gocr-0.48/bin/gocr_chk.sh +44 -0
  24. data/src/gocr-0.48/configure +4689 -0
  25. data/src/gocr-0.48/configure.in +71 -0
  26. data/src/gocr-0.48/doc/.#Makefile.1.6 +39 -0
  27. data/src/gocr-0.48/doc/.cvsignore +2 -0
  28. data/src/gocr-0.48/doc/Makefile +39 -0
  29. data/src/gocr-0.48/doc/Makefile.in +39 -0
  30. data/src/gocr-0.48/doc/example.dtd +53 -0
  31. data/src/gocr-0.48/doc/example.xml +21 -0
  32. data/src/gocr-0.48/doc/examples.txt +67 -0
  33. data/src/gocr-0.48/doc/gocr.html +578 -0
  34. data/src/gocr-0.48/doc/unicode.txt +57 -0
  35. data/src/gocr-0.48/examples/.#Makefile.1.22 +166 -0
  36. data/src/gocr-0.48/examples/4x6.png +0 -0
  37. data/src/gocr-0.48/examples/4x6.txt +2 -0
  38. data/src/gocr-0.48/examples/5x7.png +0 -0
  39. data/src/gocr-0.48/examples/5x7.png.txt +2 -0
  40. data/src/gocr-0.48/examples/5x8.png +0 -0
  41. data/src/gocr-0.48/examples/5x8.png.txt +2 -0
  42. data/src/gocr-0.48/examples/Makefile +166 -0
  43. data/src/gocr-0.48/examples/color.fig +20 -0
  44. data/src/gocr-0.48/examples/ex.fig +16 -0
  45. data/src/gocr-0.48/examples/font.tex +22 -0
  46. data/src/gocr-0.48/examples/font1.tex +46 -0
  47. data/src/gocr-0.48/examples/font2.fig +27 -0
  48. data/src/gocr-0.48/examples/font_nw.tex +24 -0
  49. data/src/gocr-0.48/examples/handwrt1.jpg +0 -0
  50. data/src/gocr-0.48/examples/handwrt1.txt +10 -0
  51. data/src/gocr-0.48/examples/inverse.fig +20 -0
  52. data/src/gocr-0.48/examples/matrix.jpg +0 -0
  53. data/src/gocr-0.48/examples/ocr-a-subset.png +0 -0
  54. data/src/gocr-0.48/examples/ocr-a-subset.png.txt +4 -0
  55. data/src/gocr-0.48/examples/ocr-a.png +0 -0
  56. data/src/gocr-0.48/examples/ocr-a.txt +6 -0
  57. data/src/gocr-0.48/examples/ocr-b.png +0 -0
  58. data/src/gocr-0.48/examples/ocr-b.png.txt +4 -0
  59. data/src/gocr-0.48/examples/polish.tex +28 -0
  60. data/src/gocr-0.48/examples/rotate45.fig +14 -0
  61. data/src/gocr-0.48/examples/score +36 -0
  62. data/src/gocr-0.48/examples/text.tex +28 -0
  63. data/src/gocr-0.48/gocr.spec +143 -0
  64. data/src/gocr-0.48/gpl.html +537 -0
  65. data/src/gocr-0.48/include/.cvsignore +2 -0
  66. data/src/gocr-0.48/include/config.h +36 -0
  67. data/src/gocr-0.48/include/config.h.in +36 -0
  68. data/src/gocr-0.48/include/version.h +2 -0
  69. data/src/gocr-0.48/install-sh +3 -0
  70. data/src/gocr-0.48/make.bat +57 -0
  71. data/src/gocr-0.48/man/.cvsignore +2 -0
  72. data/src/gocr-0.48/man/Makefile +29 -0
  73. data/src/gocr-0.48/man/Makefile.in +29 -0
  74. data/src/gocr-0.48/man/man1/gocr.1 +166 -0
  75. data/src/gocr-0.48/src/.cvsignore +4 -0
  76. data/src/gocr-0.48/src/Makefile +132 -0
  77. data/src/gocr-0.48/src/Makefile.in +132 -0
  78. data/src/gocr-0.48/src/amiga.h +31 -0
  79. data/src/gocr-0.48/src/barcode.c +846 -0
  80. data/src/gocr-0.48/src/barcode.c.orig +593 -0
  81. data/src/gocr-0.48/src/barcode.h +11 -0
  82. data/src/gocr-0.48/src/box.c +372 -0
  83. data/src/gocr-0.48/src/database.c +462 -0
  84. data/src/gocr-0.48/src/detect.c +943 -0
  85. data/src/gocr-0.48/src/gocr.c +373 -0
  86. data/src/gocr-0.48/src/gocr.h +288 -0
  87. data/src/gocr-0.48/src/jconv.c +168 -0
  88. data/src/gocr-0.48/src/job.c +84 -0
  89. data/src/gocr-0.48/src/lines.c +350 -0
  90. data/src/gocr-0.48/src/list.c +334 -0
  91. data/src/gocr-0.48/src/list.h +90 -0
  92. data/src/gocr-0.48/src/ocr0.c +6756 -0
  93. data/src/gocr-0.48/src/ocr0.h +63 -0
  94. data/src/gocr-0.48/src/ocr0n.c +1475 -0
  95. data/src/gocr-0.48/src/ocr1.c +85 -0
  96. data/src/gocr-0.48/src/ocr1.h +3 -0
  97. data/src/gocr-0.48/src/otsu.c +289 -0
  98. data/src/gocr-0.48/src/otsu.h +23 -0
  99. data/src/gocr-0.48/src/output.c +289 -0
  100. data/src/gocr-0.48/src/output.h +37 -0
  101. data/src/gocr-0.48/src/pcx.c +153 -0
  102. data/src/gocr-0.48/src/pcx.h +9 -0
  103. data/src/gocr-0.48/src/pgm2asc.c +2893 -0
  104. data/src/gocr-0.48/src/pgm2asc.h +105 -0
  105. data/src/gocr-0.48/src/pixel.c +537 -0
  106. data/src/gocr-0.48/src/pnm.c +533 -0
  107. data/src/gocr-0.48/src/pnm.h +35 -0
  108. data/src/gocr-0.48/src/progress.c +87 -0
  109. data/src/gocr-0.48/src/progress.h +42 -0
  110. data/src/gocr-0.48/src/remove.c +703 -0
  111. data/src/gocr-0.48/src/tga.c +87 -0
  112. data/src/gocr-0.48/src/tga.h +6 -0
  113. data/src/gocr-0.48/src/unicode.c +1314 -0
  114. data/src/gocr-0.48/src/unicode.h +1257 -0
  115. data/src/jpeg-7/Makefile.am +133 -0
  116. data/src/jpeg-7/Makefile.in +1089 -0
  117. data/src/jpeg-7/README +322 -0
  118. data/src/jpeg-7/aclocal.m4 +8990 -0
  119. data/src/jpeg-7/ansi2knr.1 +36 -0
  120. data/src/jpeg-7/ansi2knr.c +739 -0
  121. data/src/jpeg-7/cderror.h +132 -0
  122. data/src/jpeg-7/cdjpeg.c +181 -0
  123. data/src/jpeg-7/cdjpeg.h +187 -0
  124. data/src/jpeg-7/change.log +270 -0
  125. data/src/jpeg-7/cjpeg.1 +325 -0
  126. data/src/jpeg-7/cjpeg.c +616 -0
  127. data/src/jpeg-7/ckconfig.c +402 -0
  128. data/src/jpeg-7/coderules.txt +118 -0
  129. data/src/jpeg-7/config.guess +1561 -0
  130. data/src/jpeg-7/config.sub +1686 -0
  131. data/src/jpeg-7/configure +17139 -0
  132. data/src/jpeg-7/configure.ac +317 -0
  133. data/src/jpeg-7/depcomp +630 -0
  134. data/src/jpeg-7/djpeg.1 +251 -0
  135. data/src/jpeg-7/djpeg.c +617 -0
  136. data/src/jpeg-7/example.c +433 -0
  137. data/src/jpeg-7/filelist.txt +215 -0
  138. data/src/jpeg-7/install-sh +520 -0
  139. data/src/jpeg-7/install.txt +1097 -0
  140. data/src/jpeg-7/jaricom.c +148 -0
  141. data/src/jpeg-7/jcapimin.c +282 -0
  142. data/src/jpeg-7/jcapistd.c +161 -0
  143. data/src/jpeg-7/jcarith.c +921 -0
  144. data/src/jpeg-7/jccoefct.c +453 -0
  145. data/src/jpeg-7/jccolor.c +459 -0
  146. data/src/jpeg-7/jcdctmgr.c +482 -0
  147. data/src/jpeg-7/jchuff.c +1612 -0
  148. data/src/jpeg-7/jcinit.c +65 -0
  149. data/src/jpeg-7/jcmainct.c +293 -0
  150. data/src/jpeg-7/jcmarker.c +667 -0
  151. data/src/jpeg-7/jcmaster.c +770 -0
  152. data/src/jpeg-7/jcomapi.c +106 -0
  153. data/src/jpeg-7/jconfig.bcc +48 -0
  154. data/src/jpeg-7/jconfig.cfg +45 -0
  155. data/src/jpeg-7/jconfig.dj +38 -0
  156. data/src/jpeg-7/jconfig.mac +43 -0
  157. data/src/jpeg-7/jconfig.manx +43 -0
  158. data/src/jpeg-7/jconfig.mc6 +52 -0
  159. data/src/jpeg-7/jconfig.sas +43 -0
  160. data/src/jpeg-7/jconfig.st +42 -0
  161. data/src/jpeg-7/jconfig.txt +155 -0
  162. data/src/jpeg-7/jconfig.vc +45 -0
  163. data/src/jpeg-7/jconfig.vms +37 -0
  164. data/src/jpeg-7/jconfig.wat +38 -0
  165. data/src/jpeg-7/jcparam.c +632 -0
  166. data/src/jpeg-7/jcprepct.c +358 -0
  167. data/src/jpeg-7/jcsample.c +545 -0
  168. data/src/jpeg-7/jctrans.c +381 -0
  169. data/src/jpeg-7/jdapimin.c +396 -0
  170. data/src/jpeg-7/jdapistd.c +275 -0
  171. data/src/jpeg-7/jdarith.c +762 -0
  172. data/src/jpeg-7/jdatadst.c +151 -0
  173. data/src/jpeg-7/jdatasrc.c +212 -0
  174. data/src/jpeg-7/jdcoefct.c +736 -0
  175. data/src/jpeg-7/jdcolor.c +396 -0
  176. data/src/jpeg-7/jdct.h +393 -0
  177. data/src/jpeg-7/jddctmgr.c +382 -0
  178. data/src/jpeg-7/jdhuff.c +1309 -0
  179. data/src/jpeg-7/jdinput.c +384 -0
  180. data/src/jpeg-7/jdmainct.c +512 -0
  181. data/src/jpeg-7/jdmarker.c +1360 -0
  182. data/src/jpeg-7/jdmaster.c +663 -0
  183. data/src/jpeg-7/jdmerge.c +400 -0
  184. data/src/jpeg-7/jdpostct.c +290 -0
  185. data/src/jpeg-7/jdsample.c +361 -0
  186. data/src/jpeg-7/jdtrans.c +136 -0
  187. data/src/jpeg-7/jerror.c +252 -0
  188. data/src/jpeg-7/jerror.h +304 -0
  189. data/src/jpeg-7/jfdctflt.c +174 -0
  190. data/src/jpeg-7/jfdctfst.c +230 -0
  191. data/src/jpeg-7/jfdctint.c +4348 -0
  192. data/src/jpeg-7/jidctflt.c +242 -0
  193. data/src/jpeg-7/jidctfst.c +368 -0
  194. data/src/jpeg-7/jidctint.c +5137 -0
  195. data/src/jpeg-7/jinclude.h +91 -0
  196. data/src/jpeg-7/jmemansi.c +167 -0
  197. data/src/jpeg-7/jmemdos.c +638 -0
  198. data/src/jpeg-7/jmemdosa.asm +379 -0
  199. data/src/jpeg-7/jmemmac.c +289 -0
  200. data/src/jpeg-7/jmemmgr.c +1118 -0
  201. data/src/jpeg-7/jmemname.c +276 -0
  202. data/src/jpeg-7/jmemnobs.c +109 -0
  203. data/src/jpeg-7/jmemsys.h +198 -0
  204. data/src/jpeg-7/jmorecfg.h +369 -0
  205. data/src/jpeg-7/jpegint.h +395 -0
  206. data/src/jpeg-7/jpeglib.h +1135 -0
  207. data/src/jpeg-7/jpegtran.1 +272 -0
  208. data/src/jpeg-7/jpegtran.c +546 -0
  209. data/src/jpeg-7/jquant1.c +856 -0
  210. data/src/jpeg-7/jquant2.c +1310 -0
  211. data/src/jpeg-7/jutils.c +179 -0
  212. data/src/jpeg-7/jversion.h +14 -0
  213. data/src/jpeg-7/libjpeg.map +4 -0
  214. data/src/jpeg-7/libjpeg.txt +3067 -0
  215. data/src/jpeg-7/ltmain.sh +8406 -0
  216. data/src/jpeg-7/makcjpeg.st +36 -0
  217. data/src/jpeg-7/makdjpeg.st +36 -0
  218. data/src/jpeg-7/makeadsw.vc6 +77 -0
  219. data/src/jpeg-7/makeasln.vc9 +33 -0
  220. data/src/jpeg-7/makecdep.vc6 +82 -0
  221. data/src/jpeg-7/makecdsp.vc6 +130 -0
  222. data/src/jpeg-7/makecmak.vc6 +159 -0
  223. data/src/jpeg-7/makecvcp.vc9 +186 -0
  224. data/src/jpeg-7/makeddep.vc6 +82 -0
  225. data/src/jpeg-7/makeddsp.vc6 +130 -0
  226. data/src/jpeg-7/makedmak.vc6 +159 -0
  227. data/src/jpeg-7/makedvcp.vc9 +186 -0
  228. data/src/jpeg-7/makefile.ansi +220 -0
  229. data/src/jpeg-7/makefile.bcc +291 -0
  230. data/src/jpeg-7/makefile.dj +226 -0
  231. data/src/jpeg-7/makefile.manx +220 -0
  232. data/src/jpeg-7/makefile.mc6 +255 -0
  233. data/src/jpeg-7/makefile.mms +224 -0
  234. data/src/jpeg-7/makefile.sas +258 -0
  235. data/src/jpeg-7/makefile.unix +234 -0
  236. data/src/jpeg-7/makefile.vc +217 -0
  237. data/src/jpeg-7/makefile.vms +142 -0
  238. data/src/jpeg-7/makefile.wat +239 -0
  239. data/src/jpeg-7/makejdep.vc6 +423 -0
  240. data/src/jpeg-7/makejdsp.vc6 +285 -0
  241. data/src/jpeg-7/makejdsw.vc6 +29 -0
  242. data/src/jpeg-7/makejmak.vc6 +425 -0
  243. data/src/jpeg-7/makejsln.vc9 +17 -0
  244. data/src/jpeg-7/makejvcp.vc9 +328 -0
  245. data/src/jpeg-7/makeproj.mac +213 -0
  246. data/src/jpeg-7/makerdep.vc6 +6 -0
  247. data/src/jpeg-7/makerdsp.vc6 +78 -0
  248. data/src/jpeg-7/makermak.vc6 +110 -0
  249. data/src/jpeg-7/makervcp.vc9 +133 -0
  250. data/src/jpeg-7/maketdep.vc6 +43 -0
  251. data/src/jpeg-7/maketdsp.vc6 +122 -0
  252. data/src/jpeg-7/maketmak.vc6 +131 -0
  253. data/src/jpeg-7/maketvcp.vc9 +178 -0
  254. data/src/jpeg-7/makewdep.vc6 +6 -0
  255. data/src/jpeg-7/makewdsp.vc6 +78 -0
  256. data/src/jpeg-7/makewmak.vc6 +110 -0
  257. data/src/jpeg-7/makewvcp.vc9 +133 -0
  258. data/src/jpeg-7/makljpeg.st +68 -0
  259. data/src/jpeg-7/maktjpeg.st +30 -0
  260. data/src/jpeg-7/makvms.opt +4 -0
  261. data/src/jpeg-7/missing +376 -0
  262. data/src/jpeg-7/rdbmp.c +439 -0
  263. data/src/jpeg-7/rdcolmap.c +253 -0
  264. data/src/jpeg-7/rdgif.c +38 -0
  265. data/src/jpeg-7/rdjpgcom.1 +63 -0
  266. data/src/jpeg-7/rdjpgcom.c +515 -0
  267. data/src/jpeg-7/rdppm.c +459 -0
  268. data/src/jpeg-7/rdrle.c +387 -0
  269. data/src/jpeg-7/rdswitch.c +365 -0
  270. data/src/jpeg-7/rdtarga.c +500 -0
  271. data/src/jpeg-7/structure.txt +945 -0
  272. data/src/jpeg-7/testimg.bmp +0 -0
  273. data/src/jpeg-7/testimg.jpg +0 -0
  274. data/src/jpeg-7/testimg.ppm +4 -0
  275. data/src/jpeg-7/testimgp.jpg +0 -0
  276. data/src/jpeg-7/testorig.jpg +0 -0
  277. data/src/jpeg-7/testprog.jpg +0 -0
  278. data/src/jpeg-7/transupp.c +1533 -0
  279. data/src/jpeg-7/transupp.h +205 -0
  280. data/src/jpeg-7/usage.txt +605 -0
  281. data/src/jpeg-7/wizard.txt +211 -0
  282. data/src/jpeg-7/wrbmp.c +442 -0
  283. data/src/jpeg-7/wrgif.c +399 -0
  284. data/src/jpeg-7/wrjpgcom.1 +103 -0
  285. data/src/jpeg-7/wrjpgcom.c +583 -0
  286. data/src/jpeg-7/wrppm.c +269 -0
  287. data/src/jpeg-7/wrrle.c +305 -0
  288. data/src/jpeg-7/wrtarga.c +253 -0
  289. data/test/isbn_test.rb +7 -0
  290. data/test/test_helper.rb +7 -0
  291. metadata +345 -0
@@ -0,0 +1,71 @@
1
+ # see /usr/share/info/standards.info,autoconf.info (autoconf 2.57)
2
+ dnl Process this file with autoconf to produce a configure script.
3
+ dnl obsolete: AC_INIT(src/pgm2asc.c)
4
+ AC_INIT(gocr,0.48,,)
5
+ AC_PREREQ(2.50)
6
+ AC_CONFIG_HEADERS([include/config.h])
7
+ dnl AC_EXEEXT is obsolete now
8
+
9
+ dnl Checks for programs.
10
+ AC_PROG_CC
11
+ AC_PROG_INSTALL
12
+ AC_PROG_MAKE_SET
13
+ dnl only needed for libPgm2asc.a, not for the rpm/ebuild
14
+ dnl AC_PROG_RANLIB
15
+ dnl AC_CHECK_PROG(AR,ar,ar)
16
+ dnl needed for developers to make examples, not for the rpm/ebuild
17
+ dnl AC_CHECK_PROG(FIG2DEV,fig2dev,fig2dev)
18
+
19
+ dnl Check for optional debug mode
20
+ dnl debug makes the program slow, but is very useful for developers
21
+ dnl ToDo: how to check that c-flags are available?
22
+ AC_ARG_WITH(debug,
23
+ [ --with-debug switching on debugging (more verbose output)],
24
+ [ CPPFLAGS="-Wall -g -fexceptions -DDO_DEBUG=1 $CPPFLAGS" ])
25
+ if test "$with_debug"; then echo "debugging enabled"; fi
26
+
27
+ dnl Check for optional netpbm PACKAGE: --with-netpbm=no == --without-netpbm
28
+ dnl LDFLAGS+=-R$withval/lib compiles the search path into the file ???
29
+ AC_ARG_WITH(netpbm,
30
+ [ --with-netpbm=PATH enter the PATH to netpbm package],
31
+ [ if test "$withval" != "no"; then
32
+ LDFLAGS="-L$withval/lib $LDFLAGS";\
33
+ CPPFLAGS="-I$withval/include $CPPFLAGS";\
34
+ fi ])
35
+ if test -n "$with_netpbm"; then echo "option: with_netpbm $with_netpbm"; fi
36
+
37
+ dnl Checks for libraries.
38
+ if test "$with_netpbm" != "no"; then
39
+ # netpbm-10.26 + SuSE-10.0: netpbm needs mathlib -lm
40
+ # netpbm-10 : libnetpbm + pam.h (+ p[bgpn]m.h, libp[bgpn]m.so as links)
41
+ # netpbm-9 : libpnm + pnm.h (+ p[bgp]m.h)
42
+ # pnm_readpaminit (netpbm-10)
43
+ # pnm_readpnminit (netpbm-9,10)
44
+ # ToDo: how to check that -lm is needed?
45
+ LDFLAGS="-lm $LDFLAGS"
46
+ AC_SEARCH_LIBS(pnm_readpnminit,[netpbm pnm],[check_netpbm_h="pam.h pnm.h"],
47
+ [ echo " * * * try option --with-netpbm=PATH"])
48
+ fi
49
+
50
+ dnl Checks for header files.
51
+ AC_HEADER_STDC
52
+ AC_CHECK_HEADERS([unistd.h wchar.h ${check_netpbm_h}])
53
+
54
+ dnl Checks for typedefs, structures, and compiler characteristics.
55
+ AC_C_CONST
56
+ dnl AC_CHECK_TYPE(wchar_t,unsigned)
57
+
58
+ dnl Checks for library functions.
59
+ dnl this macro produces a warning: AC_TRY_RUN called without default ...
60
+ dnl The message can be ignored as long as you don't configure gOCR for
61
+ dnl cross-compiling.
62
+ AC_FUNC_SETVBUF_REVERSED
63
+ AC_CHECK_FUNCS(wcschr wcsdup gettimeofday popen)
64
+
65
+ dnl Checks for system services
66
+
67
+ dnl obsolete: AC_OUTPUT(Makefile src/Makefile doc/Makefile man/Makefile)
68
+ dnl the light weight version of package comes without src/api
69
+ AC_CONFIG_FILES([Makefile src/Makefile doc/Makefile man/Makefile])
70
+ dnl AC_CONFIG_COMMANDS([default],,)
71
+ AC_OUTPUT
@@ -0,0 +1,39 @@
1
+ #
2
+ # Makefile for ./doc path, used by configure
3
+ #
4
+
5
+ # these two lines are for cross-compiling, not tested
6
+ #srcdir = .
7
+ #VPATH = .
8
+
9
+ # changed to html, tex is not used anymore
10
+ #LATEX=@LATEX@
11
+ #DVIPS=@DVIPS@
12
+
13
+ OCRDOC=ocr
14
+ # add other source file to documentation here
15
+ # SRC=$(OCRDOC).tex
16
+
17
+ .PHONY : all clean proper install uninstall
18
+ default: all
19
+
20
+ all: # do nothing!
21
+
22
+ #$(OCRDOC).ps: $(OCRDOC).dvi
23
+ # $(DVIPS) -o $(OCRDOC).ps $?
24
+ #
25
+ #$(OCRDOC).dvi: $(SRC)
26
+ # $(LATEX) $(OCRDOC).tex
27
+
28
+ install: all
29
+ echo "Copy gocr.html to your document path."
30
+
31
+ uninstall:
32
+ echo "Remove gocr.html from your document path."
33
+
34
+ clean:
35
+ -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
+
37
+ proper: clean
38
+ -rm -f $(OCRDOC).ps
39
+
@@ -0,0 +1,2 @@
1
+ .cvsignore
2
+ Makefile
@@ -0,0 +1,39 @@
1
+ #
2
+ # Makefile for ./doc path, used by configure
3
+ #
4
+
5
+ # these two lines are for cross-compiling, not tested
6
+ #srcdir = .
7
+ #VPATH = .
8
+
9
+ # changed to html, tex is not used anymore
10
+ #LATEX=@LATEX@
11
+ #DVIPS=@DVIPS@
12
+
13
+ OCRDOC=ocr
14
+ # add other source file to documentation here
15
+ # SRC=$(OCRDOC).tex
16
+
17
+ .PHONY : all clean proper install uninstall
18
+ default: all
19
+
20
+ all: # do nothing!
21
+
22
+ #$(OCRDOC).ps: $(OCRDOC).dvi
23
+ # $(DVIPS) -o $(OCRDOC).ps $?
24
+ #
25
+ #$(OCRDOC).dvi: $(SRC)
26
+ # $(LATEX) $(OCRDOC).tex
27
+
28
+ install: all
29
+ echo "Copy gocr.html to your document path."
30
+
31
+ uninstall:
32
+ echo "Remove gocr.html from your document path."
33
+
34
+ clean:
35
+ -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
+
37
+ proper: clean
38
+ -rm -f $(OCRDOC).ps
39
+
@@ -0,0 +1,39 @@
1
+ #
2
+ # Makefile for ./doc path, used by configure
3
+ #
4
+
5
+ # these two lines are for cross-compiling, not tested
6
+ #srcdir = @srcdir@
7
+ #VPATH = @srcdir@
8
+
9
+ # changed to html, tex is not used anymore
10
+ #LATEX=@LATEX@
11
+ #DVIPS=@DVIPS@
12
+
13
+ OCRDOC=ocr
14
+ # add other source file to documentation here
15
+ # SRC=$(OCRDOC).tex
16
+
17
+ .PHONY : all clean proper install uninstall
18
+ default: all
19
+
20
+ all: # do nothing!
21
+
22
+ #$(OCRDOC).ps: $(OCRDOC).dvi
23
+ # $(DVIPS) -o $(OCRDOC).ps $?
24
+ #
25
+ #$(OCRDOC).dvi: $(SRC)
26
+ # $(LATEX) $(OCRDOC).tex
27
+
28
+ install: all
29
+ echo "Copy gocr.html to your document path."
30
+
31
+ uninstall:
32
+ echo "Remove gocr.html from your document path."
33
+
34
+ clean:
35
+ -rm -f *.bak *~ $(OCRDOC).{dvi,log,aux}
36
+
37
+ proper: clean
38
+ -rm -f $(OCRDOC).ps
39
+
@@ -0,0 +1,53 @@
1
+ <?xml version="1.0"?>
2
+ <!--
3
+ first draft by Volker Simonis, reviewed by Joerg Schulenburg
4
+ It's not ready for use!
5
+ ToDo:
6
+ - lynx/links/w3c should show xml like a html file
7
+ value as <character ...>CharText</character>
8
+ or <word><character ...></character><...>WordText</word>
9
+ or as line or as block? whats more useful?
10
+ - how to code table of alternative chars/words and its probability?
11
+ - how to handle images (as image tags?)
12
+ - xmllint -\-htmlout -\-loaddtd jocr/doc/example.dtd o.xml
13
+ -->
14
+ <!ENTITY % default.attributes "x CDATA #REQUIRED
15
+ y CDATA #REQUIRED
16
+ dx CDATA #REQUIRED
17
+ dy CDATA #REQUIRED">
18
+
19
+ <!ELEMENT box EMPTY>
20
+ <!ATTLIST box %default.attributes;
21
+ value CDATA #REQUIRED;>
22
+
23
+ <!ELEMENT barcode EMPTY>
24
+ <!ATTLIST barcode %default.attributes;
25
+ value CDATA #REQUIRED;>
26
+
27
+ <!ELEMENT img EMPTY>
28
+ <!ATTLIST img %default.attributes;>
29
+
30
+ <!ELEMENT page (block*)>
31
+ <!ATTLIST page %default.attributes;>
32
+
33
+ <!ELEMENT block (line*)>
34
+ <!ATTLIST block %default.attributes;>
35
+
36
+ <!ELEMENT line ((word | space | punctuation-mark)*)>
37
+ <!ATTLIST line %default.attributes;>
38
+
39
+ <!ELEMENT word (character*)>
40
+ <!ATTLIST word %default.attributes;>
41
+
42
+ <!ELEMENT char EMPTY>
43
+ <!ATTLIST char %default.attributes;
44
+ value CDATA #REQUIRED;
45
+ (#CDATA)> <!-- is that correct? -->
46
+
47
+ <!ELEMENT space EMPTY>
48
+ <!ATTLIST space %default.attributes;
49
+ value CDATA #REQUIRED;>
50
+
51
+ <!ELEMENT punctuation-mark EMPTY>
52
+ <!ATTLIST punctuation-mark %default.attributes;
53
+ value CDATA #REQUIRED;>
@@ -0,0 +1,21 @@
1
+ <?xml version="1.0"?>
2
+ <!DOCTYPE gocr SYSTEM "example.dtd">
3
+ <!-- example file for example.dtd -->
4
+ <page>
5
+ <block x="123" y="11" dx="500" dy="800">
6
+ <line x="130" y="11" dx="480" dy="30">
7
+ <word x="130" y="11" dx="80" dy="30">
8
+ <character x="130" y="11" dx="80" dy="30" value="A"/>
9
+ <character ... />
10
+ ...
11
+ </word>
12
+ <punctuation-mark .. />
13
+ <space .. />
14
+ <word ..>
15
+ ...
16
+ </word>
17
+ </line>
18
+ <box ... />
19
+ ...
20
+ </block>
21
+ </page>
@@ -0,0 +1,67 @@
1
+ Note: this info is related to example files, used to test gOCR. As of this
2
+ writing, these files are not available to non-developers. So, if you aren't
3
+ a developer, forget about this file.
4
+
5
+ EXAMPLE FILES
6
+
7
+ 1. Scanning
8
+ The examples can be scanned from anything; when looking for something, try to
9
+ have in mind the kind of tests you are expecting to do: if you're testing
10
+ accents recognition, look for texts in portuguese, french, etc. (pretty obvious,
11
+ but keeping this in mind will help to have a wide range of files covering
12
+ different kinds of tests).
13
+
14
+ If you're not interested in testing DPIs, scan at 150 or 300dpi.
15
+
16
+ If you're not interested in testing the dust removal, cleaning, etc, functions,
17
+ do the best scan you can. Usually increasing brightness and contrast will
18
+ provide a sharper, cleaner image.
19
+
20
+ Save the image in a supported format: for example, pgm or jpg. If a compression
21
+ will result in a significant reduction of size, compress the image. BZIP2
22
+ usually is the best compressor around, but gzip is more popular in the unix
23
+ world. In the wintel world, people use ZIP, and usually will have to search for
24
+ an application capable of opening .gz or .bz2 (though WinZIP opens at least the
25
+ former).
26
+
27
+ 2. Sorting
28
+ To help others to find the files they are looking for, the examples/ directory
29
+ is divided in several other directories, which may be subdivided. When
30
+ uploading a new example, look for the most suitable location. Depending on the
31
+ directory, you probably will name your file with interesting info: for example,
32
+ when uploading an image with all the characters of the foo font, the best thing
33
+ to do is to place it at examples/fonts/foo.jpg.
34
+
35
+ 3. "Translation"
36
+ Along with the image file, upload a text file with the expected output. Be
37
+ careful with this file: it must resemble the original text as much as possible.
38
+ Don't add extra new lines (\n), keep hyphenated words, etc. Name this file with
39
+ the same name of the image file.
40
+
41
+ In the beginning of the text file, you should provide comments, to help
42
+ searches. Use the following sample:
43
+
44
+ # Comments
45
+ # DPI:
46
+ # Colors:
47
+ # Image size (colsXrows):
48
+ # Fonts:
49
+ # Font sizes:
50
+ # Layout form:
51
+ # Number of pictures:
52
+ # Language:
53
+ # Quality of scan:
54
+ # Non-ASCII characters:
55
+ # Extra:
56
+
57
+ Check existing examples to see what people have been doing.
58
+
59
+ Any lines that begin with # will be considered comments, so you may use several
60
+ lines for comments or add new fields. Though gOCR itself doesn't depend on, and
61
+ won't use, this file, it will be used by scripts.
62
+
63
+ 4. Other sources (WEB)
64
+
65
+ - http://www.clerkweb.house.gov/elections/elections.htm (Nov2002)
66
+ PDF-files with lot of tables
67
+
@@ -0,0 +1,578 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
2
+
3
+ <HTML>
4
+ <HEAD>
5
+ <TITLE>GOCR-documentation</TITLE>
6
+ <META NAME="description" CONTENT="GOCR-documentation">
7
+ <META NAME="keywords" CONTENT="ocr">
8
+ <META NAME="resource-type" CONTENT="document">
9
+ <META NAME="distribution" CONTENT="global">
10
+
11
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
12
+ <META NAME="Generator" CONTENT="Joerg">
13
+ </HEAD>
14
+
15
+ <BODY >
16
+ <H1 ALIGN="CENTER">GOCR-documentation</H1>
17
+ <P ALIGN="CENTER"><STRONG>J&ouml;rg Schulenburg</STRONG></P>
18
+ <P ALIGN="CENTER"><STRONG>Magdeburg, June 3, 2002</STRONG></P>
19
+
20
+ <H3>Abstract:</H3>
21
+ <DIV>
22
+ In this documentation I describe some ideas for my OCR-program.
23
+ It contains algorithms and examples and gives you
24
+ an impression of what the program can (or could) do.
25
+ </DIV>
26
+ <P>
27
+
28
+
29
+ <P>
30
+ <BR><HR>
31
+ <!--Table of Child-Links-->
32
+ <A NAME="CHILD_LINKS"></A>
33
+
34
+ <UL>
35
+ <LI><A HREF="#SEC1">Introduction</A>
36
+ <LI><A HREF="#SEC2">Segmentation of textual regions / Layout analysis</A>
37
+ <LI><A HREF="#SEC3">Line detection</A>
38
+ <LI><A HREF="#SEC4">Cluster detection</A>
39
+ <LI><A HREF="#SEC5">Engines</A>
40
+ <LI><A HREF="#SEC6">Remove pixels</A>
41
+ <LI><A HREF="#SEC7">Add pixels</A>
42
+ <LI><A HREF="#SEC8">Similarity analyzer</A>
43
+ <LI><A HREF="#SEC9">Overlapping characters</A>
44
+ <LI><A HREF="#SEC10">Black/White, Gray and Colors</A>
45
+ <LI><A HREF="#SEC11">Pictures on scanned pages</A>
46
+ <LI><A HREF="#SEC12">Tools</A>
47
+ <LI><A HREF="#SEC13">glossary</A>
48
+ <LI><A HREF="#SEC14">More information?</A>
49
+ <LI><A HREF="#SEC15">About this document</A>
50
+ </UL>
51
+ <!--End of Table of Child-Links-->
52
+
53
+ <H1><A NAME="SEC1"> Introduction</A>
54
+ </H1>
55
+ First I have to say that I am not an expert in pattern recognition
56
+ or similar things. My knowledge is based mostly on experiments with my
57
+ program.
58
+ Therefore do not worry about stupid algorithms I put in this document.
59
+ In this documentation I describe some ideas for my OCR-program.
60
+ The examples give you an impression of how the program handles
61
+ your images.
62
+ If you have comments regarding contents or spelling please
63
+ write to the author.
64
+
65
+ <H1><A NAME="SEC2">Segmentation of textual regions / Layout analysis</A></H1>
66
+
67
+ This is implemented as a recursive division in two parts.
68
+
69
+ <UL>
70
+ <LI>look for the thickest horizontal or vertical gap through the box</LI>
71
+ <LI>if the gap is less than five times longer than thick do not divide </LI>
72
+ <LI>do the same with the two new parts</LI>
73
+ </UL>
74
+ I know that this algorithm is not as good as you wish,
75
+ but I do not know a better one.
76
+
77
+ <P>
78
+ It would be very helpful to know about a function which is able to
79
+ decide whether the box represents a single text line or a more complex object.
80
+
81
+ <H1><A NAME="SEC3">Line detection</A></H1>
82
+ <P>
83
+ Line detection is very important for good recognition.
84
+ For example it is difficult to distinguish between lowercase letter <B>p</B>
85
+ and uppercase letter <B>P</B> without having a baseline (same total height).
86
+ The lowercase version of <B>p</B> has a depth (the lower end is below the
87
+ baseline) and therefore it is easy to distinguish from the uppercase version
88
+ if the baseline is known. The line detection is responsible for finding the
89
+ baseline of every text line.
90
+
91
+ <P>
92
+ Lines of characters are detected by looking for interline spaces.
93
+ These are characterized by a large number of non-black pixels in a
94
+ row. Image rotation (skewing) presents a problem, therefore the program
95
+ first looks only at the left half of the image. When a line is
96
+ found, the left half of the right side is scanned, because lines
97
+ are often short. The variation in height gives an indication of
98
+ the rotation angle. Using this angle, a second run detects lines
99
+ more accurately. Line detection may fail if there is dust on the
100
+ image.
101
+
102
+ <P>
103
+ In version v0.2.3 this behaviour is slightly changed.
104
+ To detect the rotation angle, the line through the most
105
+ characters is detected.
106
+
107
+ <H1><A NAME="SEC4">Cluster detection</A></H1>
108
+
109
+ A cluster is a group of pixels which are connected with each other.
110
+ The simplest way to detect a cluster is to look for a pixel.
111
+ If you find one, look to the neighbouring pixels. This can be done recursively.
112
+
113
+ <P>
114
+ This method needs a lot of stack space if a cluster is very large,
115
+ and can cause problems with the memory.
116
+
117
+ <P>
118
+ Do you remember the algorithm for leaving a maze?
119
+ Go along the right (or left) wall. This seems to be a good approach
120
+ for detecting clusters without recursion.
121
+ The following picture shows a trace of the maze algorithm.
122
+
123
+ <P>
124
+ <TABLE WIDTH="680">
125
+ <TR><TD>
126
+ <PRE>
127
+ first 35 steps next 36 steps
128
+ ..@@@@@..@@@@&lt;.. ..v&lt;&lt;&lt;&lt;..v&lt;&lt;&lt;@.. * = starting point
129
+ ..@@@@@@@@@.@^&lt;. ..&gt;&gt;v@^&lt;&lt;&lt;@.@@@. &gt;^&lt;v = go right,up,left,down
130
+ ....@@@@@...@@^. ....v@@@@...@@@. @ = black pixel
131
+ ....@@@@....@@^. ....v@@@....@@@.
132
+ ....@@@.....@@^. ....v@@.....@@@.
133
+ ....@@@.....@@^. ....v@@.....@@@.
134
+ ...@@@@.....@@^. ...v&lt;@@.....@@@.
135
+ ...@@@......@@^. ...v@@......@@@.
136
+ ...@@@......@@^. ...v@@......@@@.
137
+ ...@@@.....@@@^. ...v@@.....@@@@.
138
+ ...@@@.....@@&gt;^. ...v@@.....@@@@.
139
+ ...@@@.....@@^.. ...v@@.....@@@..
140
+ ..@@@@.....@@^.. ..v&lt;@@.....@@@..
141
+ ..@@@@....@@@^.. ..v@@@....@@@@..
142
+ *&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;^&lt;&lt; @@@@@@@@@@@@@@@@
143
+ </PRE></TD></TR>
144
+ </TABLE>
145
+
146
+ <P>
147
+ The minimum and maximum coordinates can be used to create a box around the
148
+ cluster. But does this algorithm work with diagonally connected pixels?
149
+
150
+ <H1><A NAME="SEC5">Engines</A></H1>
151
+
152
+ GOCR is able to work with different recognition engines.
153
+ Since version 0.37 engines have to return a probability value together
154
+ with the recognized character or a table of values to a table of characters.
155
+ If the probability value is 100, the engine is 100% sure to have found the
156
+ right character otherwise the value is less. This gives GOCR the possibility
157
+ to compare results of different engines or in case of a not recognized character
158
+ to inform the user or another
159
+ application (spell checker) which characters probably could be there.
160
+
161
+ <H2>Base-Engine</H2>
162
+ The base engine (src/ocrX.c) is the original engine used in the first implementation
163
+ of GOCR by J&ouml;rg. The idea was to get a fast and acceptable result
164
+ without learning theoretical background. Later it should be replaced or completed
165
+ by a better engine.
166
+ The base engine is a rule based engine.
167
+ The engine was written without theoretical
168
+ background and is improved by trial and error but it is still far from
169
+ perfect. The algorithm is very tolerant to size and form of characters
170
+ (omnifont).
171
+ How does the engine identify a character? For the explanation look at the
172
+ following pattern.
173
+
174
+ <P>
175
+ <TABLE WIDTH="680">
176
+ <TR><TD>
177
+ <PRE>
178
+ vvvv vv- white regions
179
+ ......@@...... &lt;- crossing one line
180
+ ......@@......
181
+ .....@@@@.....
182
+ .....@@@@.....
183
+ .....@@@@.....
184
+ ....@..@@@.... &lt;- white hole / crossing two lines
185
+ ....@..@@@.... &lt;- crossing two lines
186
+ ....@..@@@....
187
+ ...@....@@@...
188
+ ...@....@@@...
189
+ ...@....@@@...
190
+ ..@@@@@@@@@@.. &lt;- horizontal line near center
191
+ ..@......@@@..
192
+ ..@......@@@..
193
+ .@........@@@. v- increasing width of pattern
194
+ .@........@@@. v
195
+ .@........@@@. v
196
+ @@@......@@@@@
197
+ ^^^-- gap
198
+ </PRE></TD></TR>
199
+ </TABLE>
200
+
201
+ <P>
202
+ In the future the program
203
+ should detect edges, vertices, gaps, angles and so on.
204
+ This is called feature extraction (as far as I know).
205
+ With such data the engine could make a cluster analysis.
206
+ But this is a difficult task, if the scanned image is noisy.
207
+
208
+ <H2>Database-Engine</H2>
209
+ The database engine (src/database.c) was the second engine added to GOCR.
210
+ It was primarily written to give users a simple tool to recognize
211
+ special language-specific characters. The program generates a list
212
+ (text file db.lst of image filenames and character codes)
213
+ and image samples (pnm-files) in a database path (./db/).
214
+ The database can be created by hand or external programs or by GOCR itself
215
+ using option (-m 130). In the last case GOCR prompts the user
216
+ for not recognized characters. If he enters the character the pattern
217
+ is saved in the database path as pnm-file and its file name is added
218
+ to the database list (db.lst) together with the text string the pattern
219
+ should be replaced by.
220
+ For recognition GOCR first loads the database into memory (option -m 2).
221
+ The main algorithm compares not recognized characters with stored images
222
+ and calculates a distance value. If the distance value is small enough,
223
+ the character is treated as recognized.
224
+
225
+ <H1><A NAME="SEC6">Remove pixels</A></H1>
226
+
227
+ The following picture shows an <I>n</I> which has additional pixels at the
228
+ bottom. Therefore it can not be detected as <I>n</I>. What can be done?
229
+
230
+ <UL>
231
+ <LI>classify horizontal (<TT>=</TT>) and vertical (<TT>I</TT>) pixels by
232
+ comparing the distance between the next vertical and next horizontal white
233
+ pixels (.)
234
+ </LI>
235
+ <LI>measure mean thickness of vertical and horizontal clusters
236
+ </LI>
237
+ <LI>erase unusually thin horizontal pixels at the bottom line
238
+ </LI>
239
+ </UL>
240
+
241
+ <P>
242
+ <TABLE WIDTH="680">
243
+ <TR><TD>
244
+ <PRE>
245
+ ..@@@@@..@@@@@.. ..==III..===II.. dx=16 dy=15
246
+ ..@@@@@@@@@.@@@. ..==III====.III. thickness 2 to 3
247
+ ....@@@@@...@@@. ....III==...III.
248
+ ....@@@@....@@@. ....III=....III.
249
+ ....@@@.....@@@. ....III.....III.
250
+ ....@@@.....@@@. ....III.....III.
251
+ ...@@@@.....@@@. ...IIII.....III.
252
+ ...@@@......@@@. ...III......III.
253
+ ...@@@......@@@. ...III......III.
254
+ ...@@@.....@@@@. ...III.....IIII.
255
+ ...@@@.....@@@@. ...III.....IIII.
256
+ ...@@@.....@@@.. ...III.....III..
257
+ ..@@@@.....@@@.. ..IIII.....III..
258
+ ..@@@@....@@@@.. ..IIII....IIII..
259
+ @@@@@@@@@@@@@@@@ ================
260
+ ^^^
261
+ this causes the problem
262
+ </PRE></TD></TR>
263
+ </TABLE>
264
+
265
+ <P>
266
+ A better way is to find serifs (horizontal lines glued on the lower end
267
+ of vertical lines) which touch together (v0.2.5).
268
+
269
+ <P>
270
+ The next picture shows blind pixels which are caused by dust on the paper.
271
+ The upper right dots are not connected with the rest of the character.
272
+ This can be detected via fill-algorithms. Currently the program
273
+ assumes that dots near the upper end of a character are ``i''-dots
274
+ or diaereses (umlaut dots).
275
+
276
+ <P>
277
+ <TABLE WIDTH="680">
278
+ <TR><TD>
279
+ <PRE>
280
+ ..........................O... ..........................O...
281
+ ..........................O... ..........................O...
282
+ .............................. ..............................
283
+ .............................. ..............................
284
+ ..........@@@.......@@@@...... ..........@@@.......@@@@......
285
+ ..@@@@..@@@@@@@...@@@@@@@..... ..@@@@..@@@@@@@...@@@@@@@.....
286
+ @@@@@@@@@@@@@@@@.@@@@@@@@@.... @@@@@@@@@@@@@@@@.@@@@@@@@@....
287
+ ..@@@@@@....@@@@@@.....@@@@... ..@@@@@@....@@@@@@.....@@@@...
288
+ ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
289
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
290
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
291
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
292
+ ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
293
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
294
+ ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
295
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
296
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
297
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
298
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
299
+ ..@@@@.......@@@.......@@@@... ..@@@@.......@@@.......@@@@...
300
+ ..@@@@.......@@@@......@@@@... ..@@@@.......@@@@......@@@@...
301
+ ..@@@@......@@@@@......@@@@@.. ..@@@@......@@@@@......@@@@@..
302
+ @@@@@@@@..@@@@@@@@@..@@@@@@@@@ @@@@@@@@..@@@@@@@@@..@@@@@@@@@
303
+ </PRE></TD></TR>
304
+ </TABLE>
305
+
306
+ <H1><A NAME="SEC7">
307
+ Add pixels</A>
308
+ </H1>
309
+ The following picture shows an <I>m</I>. The legs are only barely connected.
310
+ How do we handle this?
311
+
312
+ <UL>
313
+ <LI>if the engine has failed, a filter is switched on and the engine
314
+ starts over
315
+ </LI>
316
+ <LI>the 2x2 filter sets pixels to (<I>O</I>) near barely connected pixels
317
+ </LI>
318
+ </UL>
319
+
320
+ <P>
321
+ <TABLE WIDTH="680">
322
+ <TR><TD>
323
+ <PRE>
324
+ vv vv
325
+ @@@.@@@..@@@... @@@.@@@..@@@...
326
+ .@@.@@@@.@@@@..&lt; .@@O@@@@O@@@@.. filter: .@ =&gt; O@ @. =&gt; @O
327
+ .@@@..@@@..@@..&lt; .@@@..@@@..@@.. @. =&gt; @. .@ =&gt; .@
328
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
329
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
330
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
331
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
332
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
333
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
334
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
335
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
336
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
337
+ .@@@..@@@..@@@. .@@@..@@@..@@@.
338
+ @@@@@.@@@@.@@@@ @@@@@.@@@@.@@@@
339
+ </PRE></TD></TR>
340
+ </TABLE>
341
+
342
+ <H1><A NAME="SEC8">
343
+ Similarity analyzer</A>
344
+ </H1>
345
+ Some characters are a little bit noisy. These characters can be identified by
346
+ comparison with other, already recognized characters. This can be done
347
+ via a good distance function. Maybe
348
+ the distance function in the actual version of GOCR is not very good.
349
+ Feel free to send me your ideas, but be sure it does not waste my time.
350
+
351
+ <H1><A NAME="SEC9">
352
+ Overlapping characters</A>
353
+ </H1>
354
+ The following picture shows an overlapping <I>ru</I>.
355
+ How do we handle this?
356
+
357
+ <UL>
358
+ <LI>look for 3 weak connections (sum over y is small, start in the middle)
359
+ </LI>
360
+ <LI>test if the right and left part can be detected by the engine
361
+ </LI>
362
+ <LI>correction of surrounding box
363
+ </LI>
364
+ </UL>
365
+
366
+ <P>
367
+ <TABLE WIDTH="680">
368
+ <TR><TD>
369
+ <PRE>
370
+ ....@@...@@@@@@@@@@....@@@@@@@.. ....@@...@@@@@@@@@@....@@@@@@@..
371
+ ..@@@@..@@@@@..@@@@......@@@@@.. ..@@@@..@@@@@..@@@@......@@@@@..
372
+ @@@@@@@@@@@@@.,.@@@.......@@@@.. @@@@@@@@@@@@@...@@@.......@@@@..
373
+ ..@@@@@@..@@@...@@@.......@@@@.. ..@@@@@@..@@@...@@@.......@@@@..
374
+ ...@@@@.......,.@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
375
+ ...@@@@.........@@@@......@@@@.. ...@@@@.........@@@@......@@@@..
376
+ ...@@@@.......,.@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
377
+ ...@@@@.........@@@.......@@@@.. ...@@@@.........@@@.......@@@@..
378
+ ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
379
+ ...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
380
+ ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
381
+ ...@@@..........@@@.......@@@@.. ...@@@..........@@@.......@@@@..
382
+ ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
383
+ ...@@@..........@@@@......@@@@.. ...@@@..........@@@@......@@@@..
384
+ ...@@@........,.@@@@......@@@@.. ...@@@..........@@@@......@@@@..
385
+ ...@@@..........@@@@@...@@@@@@@. ...@@@..........@@@@@...@@@@@@@.
386
+ ..@@@@@.......,..@@@@@@@@@.@@@@@ ..@@@@@..........@@@@@@@@@.@@@@@
387
+ @@@@@@@@@.........@@@@@@@..@@@.. @@@@@@@@@.........@@@@@@@..@@@..
388
+ ..............,....@@@.......... ...................@@@..........
389
+ ^^^
390
+ 213 weak vertical lines
391
+ </PRE></TD></TR>
392
+ </TABLE>
393
+
394
+ <P>
395
+ Of course the situation is more difficult with slanted characters.
396
+
397
+ <P>
398
+ The following example shows, how to deal with larger clusters.
399
+ To get a fast program a first test should select the possible positions of
400
+ division. That can be done by following upper and lower bows to a crease or a break. Then try to break on all detected creases, starting with the most
401
+ important one (not implemented yet v0.2.4).
402
+
403
+ <P>
404
+ <TABLE WIDTH="766">
405
+ <TR><TD>
406
+ <PRE>
407
+ &gt;&gt;&gt;&gt;vvv&lt;&lt;&lt;&lt;&lt; &gt;&gt;vv&lt;&lt;&lt;&lt; &gt;&gt;&gt;vvv&lt;&lt;&lt;&lt;
408
+ ......@@@@@@@..................@@.........@@@@@@@..........@@@@@@@.....
409
+ ....@@@@@@@@@@@...............@@@.......@@@@@@@@@@@......@@@@@@@@@@@...
410
+ ...@@@@@@@@@@@@@.............@@@@......@@@@@@@@@@@@@....@@@@@@@@@@@@@..
411
+ ..@@@@.......@@@@...........@@@@@.....@@@@.......@@@@..@@@@.......@@@@.
412
+ ..@@@........@@@@..........@@@@@@@....@@@........@@@@@@@@@........@@@@.
413
+ .@@@@..........@@.........@@@@@@@@...@@@@..........@@@@@@@.........@@@@
414
+ .@@@.....................@@@@.@@@@...@@@..............@@...........@@@@
415
+ .@@@....................@@@@@.@@@@...@@@...........................@@@@
416
+ @@@...@@@@@@@...........@@@@..@@@...@@@...@@@@@@...................@@@.
417
+ @@@@.@@@@@@@@@@........@@@@...@@@@..@@@@.@@@@@@@@@@...............@@@@.
418
+ @@@@@@@@@@@@@@@.......@@@@....@@@@..@@@@@@@@@@@@@@@...............@@@..
419
+ @@@@@@@.....@@@@@.....@@@.....@@@@..@@@@@@......@@@@@............@@@@..
420
+ @@@@.........@@@@...@@@@......@@@@..@@@@@........@@@@...........@@@....
421
+ @@@@..........@@@@.@@@@.......@@@@..@@@@..........@@@..........@@@@....
422
+ @@@@..........@@@@@@@@@.......@@@@.@@@@@..........@@@.........@@@@.....
423
+ @@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@.......@@@@......
424
+ @@@@..........@@@@@@@@@@@@@@@@@@@@@@@@@@..........@@@@......@@@........
425
+ .@@@..........@@@@@@@@@@@@@@@@@@@@@@.@@@..........@@@@....@@@@@........
426
+ .@@@@........@@@@.............@@@@...@@@@........@@@@....@@@@..........
427
+ ..@@@@.......@@@@.............@@@@....@@@@.......@@@@...@@@@...........
428
+ ..@@@@@....@@@@@..............@@@@.....@@@@....@@@@@...@@@@@@..........
429
+ ....@@@@@@@@@@@...............@@@@......@@@@@@@@@@@...@@@@@@@@@@@@@@@@@
430
+ .....@@@@@@@@@................@@@@........@@@@@@@@....@@@@@@@@@@@@@@@@@
431
+ ........@@@@...................@@..........@@@@@........@@@@@@@..@.@@@.
432
+ &gt;&gt;&gt;&gt;^ ^&lt;&lt;&gt;&gt;^ ^&lt;&lt;&lt;&lt;&lt; &gt;&gt;&gt;^&lt;&lt;&lt; ^^ ^
433
+
434
+ &gt;,&lt; show the path of the detection algorithm
435
+ </PRE></TD></TR>
436
+ </TABLE>
437
+
438
+ <P>
439
+ The latest version of GOCR may use different algorithms.
440
+ You have to look at the sources to learn more.
441
+
442
+ <H1><A NAME="SEC10">
443
+ Black/White, Gray and Colors</A>
444
+ </H1>
445
+ For simplicity colored images are converted to gray internally.
446
+ That means a red text on green background will not be detected.
447
+ You should use your own filter for this purpose.
448
+
449
+ <P>
450
+ If the original image is gray, a critical value is calculated to
451
+ extract characters from the background. This can fail, if images are
452
+ on the scanned page or the scan is bad (dark edges or borders).
453
+ It is difficult to overcome this problem because graylevels are mostly
454
+ restricted to the 8 bit limit (16 bit would help to overcome this problem).
455
+
456
+ <P>
457
+ Black/White images are internally converted to gray with two levels (0 and
458
+ 255).
459
+
460
+ <P>
461
+ The lowest 4 bits are not used, because they are used by internal functions
462
+ (this can be changed in future).
463
+
464
+ <P>
465
+ After calculation of the threshold value (otsu.c) the brightness of
466
+ every pixel is recalculated to a new internal threshold value
467
+ of 160 (128+32).
468
+ This is a bit above the middle of the 8 bit range. The idea is to
469
+ make life easier for the other routines. Pixels which do not clearly
470
+ belong to the white or black ones get a value near the threshold value.
471
+ Some routines can use this bit of extra information to ignore outliers.
472
+ The second point is that this is necessary for using the lowest four bits
473
+ without destroying image information.
474
+
475
+ <H1><A NAME="SEC11">Pictures on scanned pages</A>
476
+ </H1>
477
+ At first all objects on the scanned page are detected.
478
+ Objects are clusters of black pixels.
479
+ Pictures are detected if they are larger than 4 times the mean size of
480
+ all objects. This rule is very simple and can fail sometimes.
481
+ But it works fast and mostly the result is ok.
482
+
483
+ <H1><A NAME="SEC12">Tools</A></H1>
484
+
485
+ <P>
486
+ <DL COMPACT>
487
+ <DT>pbmclean:</DT>
488
+ <DD>This program is written by Angus Duggan and Jef Poskanzer.
489
+ It cleans up ``snow'' on bitmap images.
490
+ </DD>
491
+ <DT>pnmtools:</DT>
492
+ <DD>These tools are used to convert different image-formats to
493
+ easy readable PNM (PBM,PGM,PPM) format.
494
+ GOCR uses the popen-routine to call these programs if the
495
+ suffix of the filename matches to a list in pnm.c.
496
+ This will fail if pnmtools are missing.
497
+ </DD>
498
+ </DL>
499
+
500
+ <H1><A NAME="SEC12b">related projects (to learn from)</A></H1>
501
+
502
+ <P>
503
+ <DL COMPACT>
504
+ <DT>unpaper:</DT> <!-- Dec05 JS -->
505
+ <DD> <a href="http://unpaper.berlios.de/">unpaper</a> -
506
+ post-processing scanned and photocopied book pages,
507
+ written by Jens Gulden 2005, GPL
508
+ </DD>
509
+ </DL>
510
+
511
+ <H1><A NAME="SEC13">glossary</A> </H1> <DL COMPACT>
512
+ <DT>font series:</DT> <DD>bold, condensed</DD>
513
+ <DT>font shape: </DT> <DD>normal, italic, slanted, sc... </DD>
514
+ <DT>points:</DT>
515
+ <DD>length unit used for font size, 1/72 inch,
516
+ but I do not know its exact relation to the font size (height?
517
+ totalheight? width? 10pt and 300dpi results in 40 pixel high font?)
518
+ </DD>
519
+ <DT>sans serif:</DT>
520
+ <DD>font without the (often thin) lines on the ends
521
+ of the character
522
+ </DD>
523
+ <DT>deskewing:</DT>
524
+ <DD>compensation of (slightly) rotated text
525
+ </DD>
526
+ </DL>
527
+
528
+ <H1><A NAME="SEC14"> More information?</A> </H1>
529
+ <DL COMPACT>
530
+ <DT>&middot;</DT>
531
+ <DD>see "/usr/share/doc/package/tetex/texmf/.../fntguide.dvi"
532
+ in the documentation of the tetex package
533
+
534
+ </DD>
535
+ <DT>&middot;</DT>
536
+ <DD>the fonts-HOWTO file is helpful too
537
+ ("www.faqs.org/faqs/fonts-faq/")
538
+
539
+ </DD>
540
+ <DT>RTF:</DT>
541
+ <DD> RichTextFormat - does someone have a good documentation?
542
+ </DD>
543
+ </DL>
544
+
545
+ <H1><A NAME="SEC15"> About this document</A> </H1>
546
+ This Document was originally written in LaTeX.
547
+ In May 2002 Joerg has converted it to HTML. The reason is, that
548
+ you can read it now directly and you do not need to have LaTeX and
549
+ Ghostscript installed on your computer to read it.
550
+ As a side effect you do not need tetex package to build the gocr.rpm-package.
551
+ A good viewer to read this document is lynx, links or w3m.
552
+
553
+ <BR>
554
+ <HR>
555
+ <ADDRESS> jNOschulen-at-gSmPAMx.de (remove NO+S+PAM) </ADDRESS>
556
+ </BODY>
557
+ </HTML>
558
+ <!---
559
+ %
560
+ % -----------------------------------------------------------------
561
+ % # v
562
+ % # v mark connected points via fifo-stack
563
+ % ### *<< and 3bit direction code per pixel,
564
+ % # # ^ ^ should be better for overlapping letters
565
+ % ### ^<<
566
+ % --------------------- point connections ----------------------
567
+ % searching next nearest point
568
+ % /----\
569
+ % | |
570
+ % ^ | ## |
571
+ % | | ## |
572
+ % | | | |
573
+ % | \--/ |
574
+ % \------/
575
+ %-------------------------------------------------------------
576
+ %
577
+
578
+ --->