tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,2834 @@
1
+ /******************************************************************************
2
+ ** Filename: cluster.c
3
+ ** Purpose: Routines for clustering points in N-D space
4
+ ** Author: Dan Johnson
5
+ ** History: 5/29/89, DSJ, Created.
6
+ **
7
+ ** (c) Copyright Hewlett-Packard Company, 1988.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ ******************************************************************************/
18
+ #include "oldheap.h"
19
+ #include "const.h"
20
+ #include "cluster.h"
21
+ #include "emalloc.h"
22
+ #include "tprintf.h"
23
+ #include "danerror.h"
24
+ #include "freelist.h"
25
+ #include <math.h>
26
+
27
+ #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
28
+ #define FTABLE_X 10 // Size of FTable.
29
+ #define FTABLE_Y 100 // Size of FTable.
30
+
31
+ // Table of values approximating the cumulative F-distribution for a confidence of 1%.
32
+ double FTable[FTABLE_Y][FTABLE_X] = {
33
+ {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
34
+ {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
35
+ {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
36
+ {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
37
+ {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
38
+ {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
39
+ {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
40
+ {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
41
+ {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
42
+ {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
43
+ { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
44
+ { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
45
+ { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
46
+ { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
47
+ { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
48
+ { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
49
+ { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
50
+ { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
51
+ { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
52
+ { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
53
+ { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
54
+ { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
55
+ { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
56
+ { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
57
+ { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
58
+ { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
59
+ { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
60
+ { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
61
+ { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
62
+ { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
63
+ { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
64
+ { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
65
+ { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
66
+ { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
67
+ { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
68
+ { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
69
+ { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
70
+ { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
71
+ { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
72
+ { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
73
+ { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
74
+ { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
75
+ { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
76
+ { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
77
+ { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
78
+ { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
79
+ { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
80
+ { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
81
+ { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
82
+ { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
83
+ { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
84
+ { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
85
+ { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
86
+ { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
87
+ { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
88
+ { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
89
+ { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
90
+ { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
91
+ { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
92
+ { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
93
+ { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
94
+ { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
95
+ { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
96
+ { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
97
+ { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
98
+ { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
99
+ { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
100
+ { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
101
+ { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
102
+ { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
103
+ { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
104
+ { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
105
+ { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
106
+ { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
107
+ { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
108
+ { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
109
+ { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
110
+ { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
111
+ { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
112
+ { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
113
+ { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
114
+ { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
115
+ { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
116
+ { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
117
+ { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
118
+ { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
119
+ { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
120
+ { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
121
+ { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
122
+ { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
123
+ { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
124
+ { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
125
+ { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
126
+ { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
127
+ { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
128
+ { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
129
+ { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
130
+ { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
131
+ { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
132
+ { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
133
+ };
134
+
135
+ /* define the variance which will be used as a minimum variance for any
136
+ dimension of any feature. Since most features are calculated from numbers
137
+ with a precision no better than 1 in 128, the variance should never be
138
+ less than the square of this number for parameters whose range is 1. */
139
+ #define MINVARIANCE 0.0001
140
+
141
+ /* define the absolute minimum number of samples which must be present in
142
+ order to accurately test hypotheses about underlying probability
143
+ distributions. Define separately the minimum samples that are needed
144
+ before a statistical analysis is attempted; this number should be
145
+ equal to MINSAMPLES but can be set to a lower number for early testing
146
+ when very few samples are available. */
147
+ #define MINBUCKETS 5
148
+ #define MINSAMPLESPERBUCKET 5
149
+ #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
150
+ #define MINSAMPLESNEEDED 1
151
+
152
+ /* define the size of the table which maps normalized samples to
153
+ histogram buckets. Also define the number of standard deviations
154
+ in a normal distribution which are considered to be significant.
155
+ The mapping table will be defined in such a way that it covers
156
+ the specified number of standard deviations on either side of
157
+ the mean. BUCKETTABLESIZE should always be even. */
158
+ #define BUCKETTABLESIZE 1024
159
+ #define NORMALEXTENT 3.0
160
+
161
+ typedef struct
162
+ {
163
+ CLUSTER *Cluster;
164
+ CLUSTER *Neighbor;
165
+ }
166
+
167
+
168
+ TEMPCLUSTER;
169
+
170
+ typedef struct
171
+ {
172
+ FLOAT32 AvgVariance;
173
+ FLOAT32 *CoVariance;
174
+ FLOAT32 *Min; // largest negative distance from the mean
175
+ FLOAT32 *Max; // largest positive distance from the mean
176
+ }
177
+
178
+
179
+ STATISTICS;
180
+
181
+ typedef struct
182
+ {
183
+ DISTRIBUTION Distribution; // distribution being tested for
184
+ uinT32 SampleCount; // # of samples in histogram
185
+ FLOAT64 Confidence; // confidence level of test
186
+ FLOAT64 ChiSquared; // test threshold
187
+ uinT16 NumberOfBuckets; // number of cells in histogram
188
+ uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
189
+ uinT32 *Count; // frequency of occurence histogram
190
+ FLOAT32 *ExpectedCount; // expected histogram
191
+ }
192
+
193
+
194
+ BUCKETS;
195
+
196
+ typedef struct
197
+ {
198
+ uinT16 DegreesOfFreedom;
199
+ FLOAT64 Alpha;
200
+ FLOAT64 ChiSquared;
201
+ }
202
+
203
+
204
+ CHISTRUCT;
205
+
206
+ typedef FLOAT64 (*DENSITYFUNC) (inT32);
207
+ typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
208
+
209
+ #define Odd(N) ((N)%2)
210
+ #define Mirror(N,R) ((R) - (N) - 1)
211
+ #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
212
+
213
+ //--------------Global Data Definitions and Declarations----------------------
214
+ /* the following variables are declared as global so that routines which
215
+ are called from the kd-tree walker can get to them. */
216
+ static HEAP *Heap;
217
+ static TEMPCLUSTER *TempCluster;
218
+ static KDTREE *Tree;
219
+ static inT32 CurrentTemp;
220
+
221
+ /* the following variables describe a discrete normal distribution
222
+ which is used by NormalDensity() and NormalBucket(). The
223
+ constant NORMALEXTENT determines how many standard
224
+ deviations of the distribution are mapped onto the fixed
225
+ discrete range of x. x=0 is mapped to -NORMALEXTENT standard
226
+ deviations and x=BUCKETTABLESIZE is mapped to
227
+ +NORMALEXTENT standard deviations. */
228
+ #define SqrtOf2Pi 2.506628275
229
+ static FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
230
+ static FLOAT64 NormalVariance =
231
+ (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
232
+ static FLOAT64 NormalMagnitude =
233
+ (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
234
+ static FLOAT64 NormalMean = BUCKETTABLESIZE / 2;
235
+
236
+ // keep a list of histogram buckets to minimize recomputing them
237
+ static LIST OldBuckets[] = { NIL, NIL, NIL };
238
+
239
+ /* define lookup tables used to compute the number of histogram buckets
240
+ that should be used for a given number of samples. */
241
+ #define LOOKUPTABLESIZE 8
242
+ #define MAXBUCKETS 39
243
+ #define MAXDEGREESOFFREEDOM MAXBUCKETS
244
+
245
+ static uinT32 CountTable[LOOKUPTABLESIZE] = {
246
+ MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
247
+ };
248
+ static uinT16 BucketsTable[LOOKUPTABLESIZE] = {
249
+ MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
250
+ };
251
+
252
+ /*-------------------------------------------------------------------------
253
+ Private Function Prototypes
254
+ --------------------------------------------------------------------------*/
255
+ void CreateClusterTree(CLUSTERER *Clusterer);
256
+
257
+ void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level);
258
+
259
+ CLUSTER *FindNearestNeighbor(KDTREE *Tree,
260
+ CLUSTER *Cluster,
261
+ FLOAT32 *Distance);
262
+
263
+ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
264
+
265
+ inT32 MergeClusters (inT16 N,
266
+ register PARAM_DESC ParamDesc[],
267
+ register inT32 n1,
268
+ register inT32 n2,
269
+ register FLOAT32 m[],
270
+ register FLOAT32 m1[], register FLOAT32 m2[]);
271
+
272
+ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
273
+
274
+ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
275
+ CLUSTERCONFIG *Config,
276
+ CLUSTER *Cluster);
277
+
278
+ PROTOTYPE *MakeDegenerateProto(uinT16 N,
279
+ CLUSTER *Cluster,
280
+ STATISTICS *Statistics,
281
+ PROTOSTYLE Style,
282
+ inT32 MinSamples);
283
+
284
+ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
285
+ CLUSTERCONFIG *Config,
286
+ CLUSTER *Cluster,
287
+ STATISTICS *Statistics);
288
+
289
+ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
290
+ CLUSTER *Cluster,
291
+ STATISTICS *Statistics,
292
+ BUCKETS *Buckets);
293
+
294
+ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
295
+ CLUSTER *Cluster,
296
+ STATISTICS *Statistics,
297
+ BUCKETS *Buckets);
298
+
299
+ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
300
+ CLUSTER *Cluster,
301
+ STATISTICS *Statistics,
302
+ BUCKETS *NormalBuckets,
303
+ FLOAT64 Confidence);
304
+
305
+ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
306
+
307
+ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
308
+
309
+ STATISTICS *ComputeStatistics (inT16 N,
310
+ PARAM_DESC ParamDesc[], CLUSTER * Cluster);
311
+
312
+ PROTOTYPE *NewSphericalProto(uinT16 N,
313
+ CLUSTER *Cluster,
314
+ STATISTICS *Statistics);
315
+
316
+ PROTOTYPE *NewEllipticalProto(inT16 N,
317
+ CLUSTER *Cluster,
318
+ STATISTICS *Statistics);
319
+
320
+ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
321
+
322
+ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
323
+
324
+ BOOL8 Independent (PARAM_DESC ParamDesc[],
325
+ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
326
+
327
+ BUCKETS *GetBuckets(DISTRIBUTION Distribution,
328
+ uinT32 SampleCount,
329
+ FLOAT64 Confidence);
330
+
331
+ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
332
+ uinT32 SampleCount,
333
+ FLOAT64 Confidence);
334
+
335
+ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
336
+
337
+ FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
338
+
339
+ FLOAT64 NormalDensity(inT32 x);
340
+
341
+ FLOAT64 UniformDensity(inT32 x);
342
+
343
+ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
344
+
345
+ void FillBuckets(BUCKETS *Buckets,
346
+ CLUSTER *Cluster,
347
+ uinT16 Dim,
348
+ PARAM_DESC *ParamDesc,
349
+ FLOAT32 Mean,
350
+ FLOAT32 StdDev);
351
+
352
+ uinT16 NormalBucket(PARAM_DESC *ParamDesc,
353
+ FLOAT32 x,
354
+ FLOAT32 Mean,
355
+ FLOAT32 StdDev);
356
+
357
+ uinT16 UniformBucket(PARAM_DESC *ParamDesc,
358
+ FLOAT32 x,
359
+ FLOAT32 Mean,
360
+ FLOAT32 StdDev);
361
+
362
+ BOOL8 DistributionOK(BUCKETS *Buckets);
363
+
364
+ void FreeStatistics(STATISTICS *Statistics);
365
+
366
+ void FreeBuckets(BUCKETS *Buckets);
367
+
368
+ void FreeCluster(CLUSTER *Cluster);
369
+
370
+ uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
371
+
372
+ int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
373
+ void *arg2); //uinT16 *DesiredNumberOfBuckets);
374
+
375
+ int ListEntryMatch(void *arg1, void *arg2);
376
+
377
+ void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
378
+
379
+ void InitBuckets(BUCKETS *Buckets);
380
+
381
+ int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
382
+ void *arg2); //CHISTRUCT *SearchKey);
383
+
384
+ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
385
+
386
+ FLOAT64 Solve(SOLVEFUNC Function,
387
+ void *FunctionParams,
388
+ FLOAT64 InitialGuess,
389
+ FLOAT64 Accuracy);
390
+
391
+ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
392
+
393
+ BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
394
+ CLUSTER *Cluster,
395
+ FLOAT32 MaxIllegal);
396
+
397
+ double InvertMatrix(const float* input, int size, float* inv);
398
+
399
+ //--------------------------Public Code--------------------------------------
400
+ /** MakeClusterer **********************************************************
401
+ Parameters: SampleSize number of dimensions in feature space
402
+ ParamDesc description of each dimension
403
+ Globals: None
404
+ Operation: This routine creates a new clusterer data structure,
405
+ initializes it, and returns a pointer to it.
406
+ Return: pointer to the new clusterer data structure
407
+ Exceptions: None
408
+ History: 5/29/89, DSJ, Created.
409
+ ****************************************************************************/
410
+ CLUSTERER *
411
+ MakeClusterer (inT16 SampleSize, PARAM_DESC ParamDesc[]) {
412
+ CLUSTERER *Clusterer;
413
+ int i;
414
+
415
+ // allocate main clusterer data structure and init simple fields
416
+ Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
417
+ Clusterer->SampleSize = SampleSize;
418
+ Clusterer->NumberOfSamples = 0;
419
+ Clusterer->NumChar = 0;
420
+
421
+ // init fields which will not be used initially
422
+ Clusterer->Root = NULL;
423
+ Clusterer->ProtoList = NIL;
424
+
425
+ // maintain a copy of param descriptors in the clusterer data structure
426
+ Clusterer->ParamDesc =
427
+ (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
428
+ for (i = 0; i < SampleSize; i++) {
429
+ Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
430
+ Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
431
+ Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
432
+ Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
433
+ Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
434
+ Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
435
+ Clusterer->ParamDesc[i].MidRange =
436
+ (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
437
+ }
438
+
439
+ // allocate a kd tree to hold the samples
440
+ Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
441
+
442
+ // execute hook for monitoring clustering operation
443
+ // (*ClustererCreationHook)( Clusterer );
444
+
445
+ return (Clusterer);
446
+ } // MakeClusterer
447
+
448
+
449
+ /** MakeSample ***********************************************************
450
+ Parameters: Clusterer clusterer data structure to add sample to
451
+ Feature feature to be added to clusterer
452
+ CharID unique ident. of char that sample came from
453
+ Globals: None
454
+ Operation: This routine creates a new sample data structure to hold
455
+ the specified feature. This sample is added to the clusterer
456
+ data structure (so that it knows which samples are to be
457
+ clustered later), and a pointer to the sample is returned to
458
+ the caller.
459
+ Return: Pointer to the new sample data structure
460
+ Exceptions: ALREADYCLUSTERED MakeSample can't be called after
461
+ ClusterSamples has been called
462
+ History: 5/29/89, DSJ, Created.
463
+ *****************************************************************************/
464
+ SAMPLE *
465
+ MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
466
+ SAMPLE *Sample;
467
+ int i;
468
+
469
+ // see if the samples have already been clustered - if so trap an error
470
+ if (Clusterer->Root != NULL)
471
+ DoError (ALREADYCLUSTERED,
472
+ "Can't add samples after they have been clustered");
473
+
474
+ // allocate the new sample and initialize it
475
+ Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
476
+ (Clusterer->SampleSize -
477
+ 1) * sizeof (FLOAT32));
478
+ Sample->Clustered = FALSE;
479
+ Sample->Prototype = FALSE;
480
+ Sample->SampleCount = 1;
481
+ Sample->Left = NULL;
482
+ Sample->Right = NULL;
483
+ Sample->CharID = CharID;
484
+
485
+ for (i = 0; i < Clusterer->SampleSize; i++)
486
+ Sample->Mean[i] = Feature[i];
487
+
488
+ // add the sample to the KD tree - keep track of the total # of samples
489
+ Clusterer->NumberOfSamples++;
490
+ KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
491
+ if (CharID >= Clusterer->NumChar)
492
+ Clusterer->NumChar = CharID + 1;
493
+
494
+ // execute hook for monitoring clustering operation
495
+ // (*SampleCreationHook)( Sample );
496
+
497
+ return (Sample);
498
+ } // MakeSample
499
+
500
+
501
+ /** ClusterSamples ***********************************************************
502
+ Parameters: Clusterer data struct containing samples to be clustered
503
+ Config parameters which control clustering process
504
+ Globals: None
505
+ Operation: This routine first checks to see if the samples in this
506
+ clusterer have already been clustered before; if so, it does
507
+ not bother to recreate the cluster tree. It simply recomputes
508
+ the prototypes based on the new Config info.
509
+ If the samples have not been clustered before, the
510
+ samples in the KD tree are formed into a cluster tree and then
511
+ the prototypes are computed from the cluster tree.
512
+ In either case this routine returns a pointer to a
513
+ list of prototypes that best represent the samples given
514
+ the constraints specified in Config.
515
+ Return: Pointer to a list of prototypes
516
+ Exceptions: None
517
+ History: 5/29/89, DSJ, Created.
518
+ *******************************************************************************/
519
+ LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
520
+ //only create cluster tree if samples have never been clustered before
521
+ if (Clusterer->Root == NULL)
522
+ CreateClusterTree(Clusterer);
523
+
524
+ //deallocate the old prototype list if one exists
525
+ FreeProtoList (&Clusterer->ProtoList);
526
+ Clusterer->ProtoList = NIL;
527
+
528
+ //compute prototypes starting at the root node in the tree
529
+ ComputePrototypes(Clusterer, Config);
530
+ return (Clusterer->ProtoList);
531
+ } // ClusterSamples
532
+
533
+
534
+ /** FreeClusterer *************************************************************
535
+ Parameters: Clusterer pointer to data structure to be freed
536
+ Globals: None
537
+ Operation: This routine frees all of the memory allocated to the
538
+ specified data structure. It will not, however, free
539
+ the memory used by the prototype list. The pointers to
540
+ the clusters for each prototype in the list will be set
541
+ to NULL to indicate that the cluster data structures no
542
+ longer exist. Any sample lists that have been obtained
543
+ via calls to GetSamples are no longer valid.
544
+ Return: None
545
+ Exceptions: None
546
+ History: 6/6/89, DSJ, Created.
547
+ *******************************************************************************/
548
+ void FreeClusterer(CLUSTERER *Clusterer) {
549
+ if (Clusterer != NULL) {
550
+ memfree (Clusterer->ParamDesc);
551
+ if (Clusterer->KDTree != NULL)
552
+ FreeKDTree (Clusterer->KDTree);
553
+ if (Clusterer->Root != NULL)
554
+ FreeCluster (Clusterer->Root);
555
+ iterate (Clusterer->ProtoList) {
556
+ ((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
557
+ }
558
+ memfree(Clusterer);
559
+ }
560
+ } // FreeClusterer
561
+
562
+
563
+ /** FreeProtoList ************************************************************
564
+ Parameters: ProtoList pointer to list of prototypes to be freed
565
+ Globals: None
566
+ Operation: This routine frees all of the memory allocated to the
567
+ specified list of prototypes. The clusters which are
568
+ pointed to by the prototypes are not freed.
569
+ Return: None
570
+ Exceptions: None
571
+ History: 6/6/89, DSJ, Created.
572
+ *****************************************************************************/
573
+ void FreeProtoList(LIST *ProtoList) {
574
+ destroy_nodes(*ProtoList, FreePrototype);
575
+ } // FreeProtoList
576
+
577
+
578
+ /** FreePrototype ************************************************************
579
+ Parameters: Prototype prototype data structure to be deallocated
580
+ Globals: None
581
+ Operation: This routine deallocates the memory consumed by the specified
582
+ prototype and modifies the corresponding cluster so that it
583
+ is no longer marked as a prototype. The cluster is NOT
584
+ deallocated by this routine.
585
+ Return: None
586
+ Exceptions: None
587
+ History: 5/30/89, DSJ, Created.
588
+ *******************************************************************************/
589
+ void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
590
+ PROTOTYPE *Prototype = (PROTOTYPE *) arg;
591
+
592
+ // unmark the corresponding cluster (if there is one
593
+ if (Prototype->Cluster != NULL)
594
+ Prototype->Cluster->Prototype = FALSE;
595
+
596
+ // deallocate the prototype statistics and then the prototype itself
597
+ if (Prototype->Distrib != NULL)
598
+ memfree (Prototype->Distrib);
599
+ if (Prototype->Mean != NULL)
600
+ memfree (Prototype->Mean);
601
+ if (Prototype->Style != spherical) {
602
+ if (Prototype->Variance.Elliptical != NULL)
603
+ memfree (Prototype->Variance.Elliptical);
604
+ if (Prototype->Magnitude.Elliptical != NULL)
605
+ memfree (Prototype->Magnitude.Elliptical);
606
+ if (Prototype->Weight.Elliptical != NULL)
607
+ memfree (Prototype->Weight.Elliptical);
608
+ }
609
+ memfree(Prototype);
610
+ } // FreePrototype
611
+
612
+
613
+ /** NextSample ************************************************************
614
+ Parameters: SearchState ptr to list containing clusters to be searched
615
+ Globals: None
616
+ Operation: This routine is used to find all of the samples which
617
+ belong to a cluster. It starts by removing the top
618
+ cluster on the cluster list (SearchState). If this cluster is
619
+ a leaf it is returned. Otherwise, the right subcluster
620
+ is pushed on the list and we continue the search in the
621
+ left subcluster. This continues until a leaf is found.
622
+ If all samples have been found, NULL is returned.
623
+ InitSampleSearch() must be called
624
+ before NextSample() to initialize the search.
625
+ Return: Pointer to the next leaf cluster (sample) or NULL.
626
+ Exceptions: None
627
+ History: 6/16/89, DSJ, Created.
628
+ ****************************************************************************/
629
+ CLUSTER *NextSample(LIST *SearchState) {
630
+ CLUSTER *Cluster;
631
+
632
+ if (*SearchState == NIL)
633
+ return (NULL);
634
+ Cluster = (CLUSTER *) first_node (*SearchState);
635
+ *SearchState = pop (*SearchState);
636
+ while (TRUE) {
637
+ if (Cluster->Left == NULL)
638
+ return (Cluster);
639
+ *SearchState = push (*SearchState, Cluster->Right);
640
+ Cluster = Cluster->Left;
641
+ }
642
+ } // NextSample
643
+
644
+
645
+ /** Mean ***********************************************************
646
+ Parameters: Proto prototype to return mean of
647
+ Dimension dimension whose mean is to be returned
648
+ Globals: none
649
+ Operation: This routine returns the mean of the specified
650
+ prototype in the indicated dimension.
651
+ Return: Mean of Prototype in Dimension
652
+ Exceptions: none
653
+ History: 7/6/89, DSJ, Created.
654
+ *********************************************************************/
655
+ FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
656
+ return (Proto->Mean[Dimension]);
657
+ } // Mean
658
+
659
+
660
+ /** StandardDeviation *************************************************
661
+ Parameters: Proto prototype to return standard deviation of
662
+ Dimension dimension whose stddev is to be returned
663
+ Globals: none
664
+ Operation: This routine returns the standard deviation of the
665
+ prototype in the indicated dimension.
666
+ Return: Standard deviation of Prototype in Dimension
667
+ Exceptions: none
668
+ History: 7/6/89, DSJ, Created.
669
+ **********************************************************************/
670
+ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
671
+ switch (Proto->Style) {
672
+ case spherical:
673
+ return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
674
+ case elliptical:
675
+ return ((FLOAT32)
676
+ sqrt ((double) Proto->Variance.Elliptical[Dimension]));
677
+ case mixed:
678
+ switch (Proto->Distrib[Dimension]) {
679
+ case normal:
680
+ return ((FLOAT32)
681
+ sqrt ((double) Proto->Variance.Elliptical[Dimension]));
682
+ case uniform:
683
+ case D_random:
684
+ return (Proto->Variance.Elliptical[Dimension]);
685
+ }
686
+ }
687
+ return 0.0f;
688
+ } // StandardDeviation
689
+
690
+
691
+ /*---------------------------------------------------------------------------
692
+ Private Code
693
+ ----------------------------------------------------------------------------*/
694
+ /** CreateClusterTree *******************************************************
695
+ Parameters: Clusterer data structure holdings samples to be clustered
696
+ Globals: Tree kd-tree holding samples
697
+ TempCluster array of temporary clusters
698
+ CurrentTemp index of next temp cluster to be used
699
+ Heap heap used to hold temp clusters - "best" on top
700
+ Operation: This routine performs a bottoms-up clustering on the samples
701
+ held in the kd-tree of the Clusterer data structure. The
702
+ result is a cluster tree. Each node in the tree represents
703
+ a cluster which conceptually contains a subset of the samples.
704
+ More precisely, the cluster contains all of the samples which
705
+ are contained in its two sub-clusters. The leaves of the
706
+ tree are the individual samples themselves; they have no
707
+ sub-clusters. The root node of the tree conceptually contains
708
+ all of the samples.
709
+ Return: None (the Clusterer data structure is changed)
710
+ Exceptions: None
711
+ History: 5/29/89, DSJ, Created.
712
+ ******************************************************************************/
713
+ void CreateClusterTree(CLUSTERER *Clusterer) {
714
+ HEAPENTRY HeapEntry;
715
+ TEMPCLUSTER *PotentialCluster;
716
+
717
+ // save the kd-tree in a global variable so kd-tree walker can get at it
718
+ Tree = Clusterer->KDTree;
719
+
720
+ // allocate memory to to hold all of the "potential" clusters
721
+ TempCluster = (TEMPCLUSTER *)
722
+ Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER));
723
+ CurrentTemp = 0;
724
+
725
+ // each sample and its nearest neighbor form a "potential" cluster
726
+ // save these in a heap with the "best" potential clusters on top
727
+ Heap = MakeHeap (Clusterer->NumberOfSamples);
728
+ KDWalk (Tree, (void_proc) MakePotentialClusters);
729
+
730
+ // form potential clusters into actual clusters - always do "best" first
731
+ while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) {
732
+ PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data);
733
+
734
+ // if main cluster of potential cluster is already in another cluster
735
+ // then we don't need to worry about it
736
+ if (PotentialCluster->Cluster->Clustered) {
737
+ continue;
738
+ }
739
+
740
+ // if main cluster is not yet clustered, but its nearest neighbor is
741
+ // then we must find a new nearest neighbor
742
+ else if (PotentialCluster->Neighbor->Clustered) {
743
+ PotentialCluster->Neighbor =
744
+ FindNearestNeighbor (Tree, PotentialCluster->Cluster,
745
+ &(HeapEntry.Key));
746
+ if (PotentialCluster->Neighbor != NULL) {
747
+ HeapStore(Heap, &HeapEntry);
748
+ }
749
+ }
750
+
751
+ // if neither cluster is already clustered, form permanent cluster
752
+ else {
753
+ PotentialCluster->Cluster =
754
+ MakeNewCluster(Clusterer, PotentialCluster);
755
+ PotentialCluster->Neighbor =
756
+ FindNearestNeighbor (Tree, PotentialCluster->Cluster,
757
+ &(HeapEntry.Key));
758
+ if (PotentialCluster->Neighbor != NULL) {
759
+ HeapStore(Heap, &HeapEntry);
760
+ }
761
+ }
762
+ }
763
+
764
+ // the root node in the cluster tree is now the only node in the kd-tree
765
+ Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree);
766
+
767
+ // free up the memory used by the K-D tree, heap, and temp clusters
768
+ FreeKDTree(Tree);
769
+ Clusterer->KDTree = NULL;
770
+ FreeHeap(Heap);
771
+ memfree(TempCluster);
772
+ } // CreateClusterTree
773
+
774
+
775
+ /** MakePotentialClusters **************************************************
776
+ Parameters: Cluster current cluster being visited in kd-tree walk
777
+ Order order in which cluster is being visited
778
+ Level level of this cluster in the kd-tree
779
+ Globals: Tree kd-tree to be searched for neighbors
780
+ TempCluster array of temporary clusters
781
+ CurrentTemp index of next temp cluster to be used
782
+ Heap heap used to hold temp clusters - "best" on top
783
+ Operation: This routine is designed to be used in concert with the
784
+ KDWalk routine. It will create a potential cluster for
785
+ each sample in the kd-tree that is being walked. This
786
+ potential cluster will then be pushed on the heap.
787
+ Return: none
788
+ Exceptions: none
789
+ History: 5/29/89, DSJ, Created.
790
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct.
791
+ ******************************************************************************/
792
+ void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level) {
793
+ HEAPENTRY HeapEntry;
794
+
795
+ if ((Order == preorder) || (Order == leaf)) {
796
+ TempCluster[CurrentTemp].Cluster = Cluster;
797
+ HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]);
798
+ TempCluster[CurrentTemp].Neighbor =
799
+ FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster,
800
+ &(HeapEntry.Key));
801
+ if (TempCluster[CurrentTemp].Neighbor != NULL) {
802
+ HeapStore(Heap, &HeapEntry);
803
+ CurrentTemp++;
804
+ }
805
+ }
806
+ } // MakePotentialClusters
807
+
808
+
809
+ /** FindNearestNeighbor *********************************************************
810
+ Parameters: Tree kd-tree to search in for nearest neighbor
811
+ Cluster cluster whose nearest neighbor is to be found
812
+ Distance ptr to variable to report distance found
813
+ Globals: none
814
+ Operation: This routine searches the specified kd-tree for the nearest
815
+ neighbor of the specified cluster. It actually uses the
816
+ kd routines to find the 2 nearest neighbors since one of them
817
+ will be the original cluster. A pointer to the nearest
818
+ neighbor is returned, if it can be found, otherwise NULL is
819
+ returned. The distance between the 2 nodes is placed
820
+ in the specified variable.
821
+ Return: Pointer to the nearest neighbor of Cluster, or NULL
822
+ Exceptions: none
823
+ History: 5/29/89, DSJ, Created.
824
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct
825
+ ********************************************************************************/
826
+ CLUSTER *
827
+ FindNearestNeighbor (KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
828
+ #define MAXNEIGHBORS 2
829
+ #define MAXDISTANCE MAX_FLOAT32
830
+ {
831
+ CLUSTER *Neighbor[MAXNEIGHBORS];
832
+ FLOAT32 Dist[MAXNEIGHBORS];
833
+ inT32 NumberOfNeighbors;
834
+ inT32 i;
835
+ CLUSTER *BestNeighbor;
836
+
837
+ // find the 2 nearest neighbors of the cluster
838
+ NumberOfNeighbors = KDNearestNeighborSearch
839
+ (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist);
840
+
841
+ // search for the nearest neighbor that is not the cluster itself
842
+ *Distance = MAXDISTANCE;
843
+ BestNeighbor = NULL;
844
+ for (i = 0; i < NumberOfNeighbors; i++) {
845
+ if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
846
+ *Distance = Dist[i];
847
+ BestNeighbor = Neighbor[i];
848
+ }
849
+ }
850
+ return (BestNeighbor);
851
+ } // FindNearestNeighbor
852
+
853
+
854
+ /** MakeNewCluster *************************************************************
855
+ Parameters: Clusterer current clustering environment
856
+ TempCluster potential cluster to make permanent
857
+ Globals: none
858
+ Operation: This routine creates a new permanent cluster from the
859
+ clusters specified in TempCluster. The 2 clusters in
860
+ TempCluster are marked as "clustered" and deleted from
861
+ the kd-tree. The new cluster is then added to the kd-tree.
862
+ Return: Pointer to the new permanent cluster
863
+ Exceptions: none
864
+ History: 5/29/89, DSJ, Created.
865
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct
866
+ ********************************************************************************/
867
+ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
868
+ CLUSTER *Cluster;
869
+
870
+ // allocate the new cluster and initialize it
871
+ Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) +
872
+ (Clusterer->SampleSize -
873
+ 1) * sizeof (FLOAT32));
874
+ Cluster->Clustered = FALSE;
875
+ Cluster->Prototype = FALSE;
876
+ Cluster->Left = TempCluster->Cluster;
877
+ Cluster->Right = TempCluster->Neighbor;
878
+ Cluster->CharID = -1;
879
+
880
+ // mark the old clusters as "clustered" and delete them from the kd-tree
881
+ Cluster->Left->Clustered = TRUE;
882
+ Cluster->Right->Clustered = TRUE;
883
+ KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
884
+ KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
885
+
886
+ // compute the mean and sample count for the new cluster
887
+ Cluster->SampleCount =
888
+ MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc,
889
+ Cluster->Left->SampleCount, Cluster->Right->SampleCount,
890
+ Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
891
+
892
+ // add the new cluster to the KD tree
893
+ KDStore (Clusterer->KDTree, Cluster->Mean, Cluster);
894
+ return (Cluster);
895
+ } // MakeNewCluster
896
+
897
+
898
+ /** MergeClusters ************************************************************
899
+ Parameters: N # of dimensions (size of arrays)
900
+ ParamDesc array of dimension descriptions
901
+ n1, n2 number of samples in each old cluster
902
+ m array to hold mean of new cluster
903
+ m1, m2 arrays containing means of old clusters
904
+ Globals: None
905
+ Operation: This routine merges two clusters into one larger cluster.
906
+ To do this it computes the number of samples in the new
907
+ cluster and the mean of the new cluster. The ParamDesc
908
+ information is used to ensure that circular dimensions
909
+ are handled correctly.
910
+ Return: The number of samples in the new cluster.
911
+ Exceptions: None
912
+ History: 5/31/89, DSJ, Created.
913
+ *********************************************************************************/
914
+ inT32
915
+ MergeClusters (inT16 N,
916
+ register PARAM_DESC ParamDesc[],
917
+ register inT32 n1,
918
+ register inT32 n2,
919
+ register FLOAT32 m[],
920
+ register FLOAT32 m1[], register FLOAT32 m2[]) {
921
+ register inT32 i, n;
922
+
923
+ n = n1 + n2;
924
+ for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
925
+ if (ParamDesc->Circular) {
926
+ // if distance between means is greater than allowed
927
+ // reduce upper point by one "rotation" to compute mean
928
+ // then normalize the mean back into the accepted range
929
+ if ((*m2 - *m1) > ParamDesc->HalfRange) {
930
+ *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
931
+ if (*m < ParamDesc->Min)
932
+ *m += ParamDesc->Range;
933
+ }
934
+ else if ((*m1 - *m2) > ParamDesc->HalfRange) {
935
+ *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
936
+ if (*m < ParamDesc->Min)
937
+ *m += ParamDesc->Range;
938
+ }
939
+ else
940
+ *m = (n1 * *m1 + n2 * *m2) / n;
941
+ }
942
+ else
943
+ *m = (n1 * *m1 + n2 * *m2) / n;
944
+ }
945
+ return (n);
946
+ } // MergeClusters
947
+
948
+
949
+ /** ComputePrototypes *******************************************************
950
+ Parameters: Clusterer data structure holding cluster tree
951
+ Config parameters used to control prototype generation
952
+ Globals: None
953
+ Operation: This routine decides which clusters in the cluster tree
954
+ should be represented by prototypes, forms a list of these
955
+ prototypes, and places the list in the Clusterer data
956
+ structure.
957
+ Return: None
958
+ Exceptions: None
959
+ History: 5/30/89, DSJ, Created.
960
+ *******************************************************************************/
961
+ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
962
+ LIST ClusterStack = NIL;
963
+ CLUSTER *Cluster;
964
+ PROTOTYPE *Prototype;
965
+
966
+ // use a stack to keep track of clusters waiting to be processed
967
+ // initially the only cluster on the stack is the root cluster
968
+ if (Clusterer->Root != NULL)
969
+ ClusterStack = push (NIL, Clusterer->Root);
970
+
971
+ // loop until we have analyzed all clusters which are potential prototypes
972
+ while (ClusterStack != NIL) {
973
+ // remove the next cluster to be analyzed from the stack
974
+ // try to make a prototype from the cluster
975
+ // if successful, put it on the proto list, else split the cluster
976
+ Cluster = (CLUSTER *) first_node (ClusterStack);
977
+ ClusterStack = pop (ClusterStack);
978
+ Prototype = MakePrototype (Clusterer, Config, Cluster);
979
+ if (Prototype != NULL) {
980
+ Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
981
+ }
982
+ else {
983
+ ClusterStack = push (ClusterStack, Cluster->Right);
984
+ ClusterStack = push (ClusterStack, Cluster->Left);
985
+ }
986
+ }
987
+ } // ComputePrototypes
988
+
989
+
990
+ /** MakePrototype ***********************************************************
991
+ Parameters: Clusterer data structure holding cluster tree
992
+ Config parameters used to control prototype generation
993
+ Cluster cluster to be made into a prototype
994
+ Globals: None
995
+ Operation: This routine attempts to create a prototype from the
996
+ specified cluster that conforms to the distribution
997
+ specified in Config. If there are too few samples in the
998
+ cluster to perform a statistical analysis, then a prototype
999
+ is generated but labelled as insignificant. If the
1000
+ dimensions of the cluster are not independent, no prototype
1001
+ is generated and NULL is returned. If a prototype can be
1002
+ found that matches the desired distribution then a pointer
1003
+ to it is returned, otherwise NULL is returned.
1004
+ Return: Pointer to new prototype or NULL
1005
+ Exceptions: None
1006
+ History: 6/19/89, DSJ, Created.
1007
+ *******************************************************************************/
1008
+ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
1009
+ CLUSTERCONFIG *Config,
1010
+ CLUSTER *Cluster) {
1011
+ STATISTICS *Statistics;
1012
+ PROTOTYPE *Proto;
1013
+ BUCKETS *Buckets;
1014
+
1015
+ // filter out clusters which contain samples from the same character
1016
+ if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
1017
+ return (NULL);
1018
+
1019
+ // compute the covariance matrix and ranges for the cluster
1020
+ Statistics =
1021
+ ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1022
+
1023
+ // check for degenerate clusters which need not be analyzed further
1024
+ // note that the MinSamples test assumes that all clusters with multiple
1025
+ // character samples have been removed (as above)
1026
+ Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,
1027
+ Config->ProtoStyle,
1028
+ (inT32) (Config->MinSamples *
1029
+ Clusterer->NumChar));
1030
+ if (Proto != NULL) {
1031
+ FreeStatistics(Statistics);
1032
+ return (Proto);
1033
+ }
1034
+ // check to ensure that all dimensions are independent
1035
+ if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,
1036
+ Statistics->CoVariance, Config->Independence)) {
1037
+ FreeStatistics(Statistics);
1038
+ return (NULL);
1039
+ }
1040
+
1041
+ if (HOTELLING && Config->ProtoStyle == elliptical) {
1042
+ Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1043
+ if (Proto != NULL) {
1044
+ FreeStatistics(Statistics);
1045
+ return Proto;
1046
+ }
1047
+ }
1048
+
1049
+ // create a histogram data structure used to evaluate distributions
1050
+ Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);
1051
+
1052
+ // create a prototype based on the statistics and test it
1053
+ switch (Config->ProtoStyle) {
1054
+ case spherical:
1055
+ Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1056
+ break;
1057
+ case elliptical:
1058
+ Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1059
+ break;
1060
+ case mixed:
1061
+ Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1062
+ Config->Confidence);
1063
+ break;
1064
+ case automatic:
1065
+ Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1066
+ if (Proto != NULL)
1067
+ break;
1068
+ Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1069
+ if (Proto != NULL)
1070
+ break;
1071
+ Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1072
+ Config->Confidence);
1073
+ break;
1074
+ }
1075
+ FreeBuckets(Buckets);
1076
+ FreeStatistics(Statistics);
1077
+ return (Proto);
1078
+ } // MakePrototype
1079
+
1080
+
1081
+ /** MakeDegenerateProto ******************************************************
1082
+ Parameters: N number of dimensions
1083
+ Cluster cluster being analyzed
1084
+ Statistics statistical info about cluster
1085
+ Style type of prototype to be generated
1086
+ MinSamples minimum number of samples in a cluster
1087
+ Globals: None
1088
+ Operation: This routine checks for clusters which are degenerate and
1089
+ therefore cannot be analyzed in a statistically valid way.
1090
+ A cluster is defined as degenerate if it does not have at
1091
+ least MINSAMPLESNEEDED samples in it. If the cluster is
1092
+ found to be degenerate, a prototype of the specified style
1093
+ is generated and marked as insignificant. A cluster is
1094
+ also degenerate if it does not have at least MinSamples
1095
+ samples in it.
1096
+ If the cluster is not degenerate, NULL is returned.
1097
+ Return: Pointer to degenerate prototype or NULL.
1098
+ Exceptions: None
1099
+ History: 6/20/89, DSJ, Created.
1100
+ 7/12/89, DSJ, Changed name and added check for 0 stddev.
1101
+ 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).
1102
+ ********************************************************************************/
1103
+ PROTOTYPE *MakeDegenerateProto( //this was MinSample
1104
+ uinT16 N,
1105
+ CLUSTER *Cluster,
1106
+ STATISTICS *Statistics,
1107
+ PROTOSTYLE Style,
1108
+ inT32 MinSamples) {
1109
+ PROTOTYPE *Proto = NULL;
1110
+
1111
+ if (MinSamples < MINSAMPLESNEEDED)
1112
+ MinSamples = MINSAMPLESNEEDED;
1113
+
1114
+ if (Cluster->SampleCount < MinSamples) {
1115
+ switch (Style) {
1116
+ case spherical:
1117
+ Proto = NewSphericalProto (N, Cluster, Statistics);
1118
+ break;
1119
+ case elliptical:
1120
+ case automatic:
1121
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
1122
+ break;
1123
+ case mixed:
1124
+ Proto = NewMixedProto (N, Cluster, Statistics);
1125
+ break;
1126
+ }
1127
+ Proto->Significant = FALSE;
1128
+ }
1129
+ return (Proto);
1130
+ } // MakeDegenerateProto
1131
+
1132
+ /** TestEllipticalProto ****************************************************
1133
+ Parameters: Clusterer data struct containing samples being clustered
1134
+ Config provides the magic number of samples that make a good cluster
1135
+ Cluster cluster to be made into an elliptical prototype
1136
+ Statistics statistical info about cluster
1137
+ Globals: None
1138
+ Operation: This routine tests the specified cluster to see if **
1139
+ * there is a statistically significant difference between
1140
+ * the sub-clusters that would be made if the cluster were to
1141
+ * be split. If not, then a new prototype is formed and
1142
+ * returned to the caller. If there is, then NULL is returned
1143
+ * to the caller.
1144
+ Return: Pointer to new elliptical prototype or NULL.
1145
+ ****************************************************************************/
1146
+ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
1147
+ CLUSTERCONFIG *Config,
1148
+ CLUSTER *Cluster,
1149
+ STATISTICS *Statistics) {
1150
+ // Fraction of the number of samples used as a range around 1 within
1151
+ // which a cluster has the magic size that allows a boost to the
1152
+ // FTable by kFTableBoostMargin, thus allowing clusters near the
1153
+ // magic size (equal to the number of sample characters) to be more
1154
+ // likely to stay together.
1155
+ const double kMagicSampleMargin = 0.0625;
1156
+ const double kFTableBoostMargin = 2.0;
1157
+
1158
+ int N = Clusterer->SampleSize;
1159
+ CLUSTER* Left = Cluster->Left;
1160
+ CLUSTER* Right = Cluster->Right;
1161
+ if (Left == NULL || Right == NULL)
1162
+ return NULL;
1163
+ int TotalDims = Left->SampleCount + Right->SampleCount;
1164
+ if (TotalDims < N + 1 || TotalDims < 2)
1165
+ return NULL;
1166
+ const int kMatrixSize = N * N * sizeof(FLOAT32);
1167
+ FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1168
+ FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1169
+ FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1170
+ // Compute a new covariance matrix that only uses essential features.
1171
+ for (int i = 0; i < N; ++i) {
1172
+ int row_offset = i * N;
1173
+ if (!Clusterer->ParamDesc[i].NonEssential) {
1174
+ for (int j = 0; j < N; ++j) {
1175
+ if (!Clusterer->ParamDesc[j].NonEssential)
1176
+ Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1177
+ else
1178
+ Covariance[j + row_offset] = 0.0f;
1179
+ }
1180
+ } else {
1181
+ for (int j = 0; j < N; ++j) {
1182
+ if (i == j)
1183
+ Covariance[j + row_offset] = 1.0f;
1184
+ else
1185
+ Covariance[j + row_offset] = 0.0f;
1186
+ }
1187
+ }
1188
+ }
1189
+ double err = InvertMatrix(Covariance, N, Inverse);
1190
+ if (err > 1) {
1191
+ tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1192
+ }
1193
+ int EssentialN = 0;
1194
+ for (int dim = 0; dim < N; ++dim) {
1195
+ if (!Clusterer->ParamDesc[dim].NonEssential) {
1196
+ Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1197
+ ++EssentialN;
1198
+ } else {
1199
+ Delta[dim] = 0.0f;
1200
+ }
1201
+ }
1202
+ // Compute Hotelling's T-squared.
1203
+ double Tsq = 0.0;
1204
+ for (int x = 0; x < N; ++x) {
1205
+ double temp = 0.0;
1206
+ for (int y = 0; y < N; ++y) {
1207
+ temp += Inverse[y + N*x] * Delta[y];
1208
+ }
1209
+ Tsq += Delta[x] * temp;
1210
+ }
1211
+ memfree(Covariance);
1212
+ memfree(Inverse);
1213
+ memfree(Delta);
1214
+ // Changed this function to match the formula in
1215
+ // Statistical Methods in Medical Research p 473
1216
+ // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1217
+ // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1218
+ double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1219
+ int Fx = EssentialN;
1220
+ if (Fx > FTABLE_X)
1221
+ Fx = FTABLE_X;
1222
+ --Fx;
1223
+ int Fy = TotalDims - EssentialN - 1;
1224
+ if (Fy > FTABLE_Y)
1225
+ Fy = FTABLE_Y;
1226
+ --Fy;
1227
+ double FTarget = FTable[Fy][Fx];
1228
+ if (Config->MagicSamples > 0 &&
1229
+ TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1230
+ TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1231
+ // Give magic-sized clusters a magic FTable boost.
1232
+ FTarget += kFTableBoostMargin;
1233
+ }
1234
+ if (F < FTarget) {
1235
+ return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1236
+ }
1237
+ return NULL;
1238
+ }
1239
+
1240
+ /* MakeSphericalProto *******************************************************
1241
+ Parameters: Clusterer data struct containing samples being clustered
1242
+ Cluster cluster to be made into a spherical prototype
1243
+ Statistics statistical info about cluster
1244
+ Buckets histogram struct used to analyze distribution
1245
+ Globals: None
1246
+ Operation: This routine tests the specified cluster to see if it can
1247
+ be approximated by a spherical normal distribution. If it
1248
+ can be, then a new prototype is formed and returned to the
1249
+ caller. If it can't be, then NULL is returned to the caller.
1250
+ Return: Pointer to new spherical prototype or NULL.
1251
+ Exceptions: None
1252
+ History: 6/1/89, DSJ, Created.
1253
+ ******************************************************************************/
1254
+ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
1255
+ CLUSTER *Cluster,
1256
+ STATISTICS *Statistics,
1257
+ BUCKETS *Buckets) {
1258
+ PROTOTYPE *Proto = NULL;
1259
+ int i;
1260
+
1261
+ // check that each dimension is a normal distribution
1262
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1263
+ if (Clusterer->ParamDesc[i].NonEssential)
1264
+ continue;
1265
+
1266
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1267
+ Cluster->Mean[i],
1268
+ sqrt ((FLOAT64) (Statistics->AvgVariance)));
1269
+ if (!DistributionOK (Buckets))
1270
+ break;
1271
+ }
1272
+ // if all dimensions matched a normal distribution, make a proto
1273
+ if (i >= Clusterer->SampleSize)
1274
+ Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1275
+ return (Proto);
1276
+ } // MakeSphericalProto
1277
+
1278
+
1279
+ /** MakeEllipticalProto ****************************************************
1280
+ Parameters: Clusterer data struct containing samples being clustered
1281
+ Cluster cluster to be made into an elliptical prototype
1282
+ Statistics statistical info about cluster
1283
+ Buckets histogram struct used to analyze distribution
1284
+ Globals: None
1285
+ Operation: This routine tests the specified cluster to see if it can
1286
+ be approximated by an elliptical normal distribution. If it
1287
+ can be, then a new prototype is formed and returned to the
1288
+ caller. If it can't be, then NULL is returned to the caller.
1289
+ Return: Pointer to new elliptical prototype or NULL.
1290
+ Exceptions: None
1291
+ History: 6/12/89, DSJ, Created.
1292
+ ****************************************************************************/
1293
+ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
1294
+ CLUSTER *Cluster,
1295
+ STATISTICS *Statistics,
1296
+ BUCKETS *Buckets) {
1297
+ PROTOTYPE *Proto = NULL;
1298
+ int i;
1299
+
1300
+ // check that each dimension is a normal distribution
1301
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1302
+ if (Clusterer->ParamDesc[i].NonEssential)
1303
+ continue;
1304
+
1305
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1306
+ Cluster->Mean[i],
1307
+ sqrt ((FLOAT64) Statistics->
1308
+ CoVariance[i * (Clusterer->SampleSize + 1)]));
1309
+ if (!DistributionOK (Buckets))
1310
+ break;
1311
+ }
1312
+ // if all dimensions matched a normal distribution, make a proto
1313
+ if (i >= Clusterer->SampleSize)
1314
+ Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1315
+ return (Proto);
1316
+ } // MakeEllipticalProto
1317
+
1318
+
1319
+ /** MakeMixedProto ***********************************************************
1320
+ Parameters: Clusterer data struct containing samples being clustered
1321
+ Cluster cluster to be made into a prototype
1322
+ Statistics statistical info about cluster
1323
+ NormalBuckets histogram struct used to analyze distribution
1324
+ Confidence confidence level for alternate distributions
1325
+ Globals: None
1326
+ Operation: This routine tests each dimension of the specified cluster to
1327
+ see what distribution would best approximate that dimension.
1328
+ Each dimension is compared to the following distributions
1329
+ in order: normal, random, uniform. If each dimension can
1330
+ be represented by one of these distributions,
1331
+ then a new prototype is formed and returned to the
1332
+ caller. If it can't be, then NULL is returned to the caller.
1333
+ Return: Pointer to new mixed prototype or NULL.
1334
+ Exceptions: None
1335
+ History: 6/12/89, DSJ, Created.
1336
+ ********************************************************************************/
1337
+ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
1338
+ CLUSTER *Cluster,
1339
+ STATISTICS *Statistics,
1340
+ BUCKETS *NormalBuckets,
1341
+ FLOAT64 Confidence) {
1342
+ PROTOTYPE *Proto;
1343
+ int i;
1344
+ BUCKETS *UniformBuckets = NULL;
1345
+ BUCKETS *RandomBuckets = NULL;
1346
+
1347
+ // create a mixed proto to work on - initially assume all dimensions normal*/
1348
+ Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1349
+
1350
+ // find the proper distribution for each dimension
1351
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1352
+ if (Clusterer->ParamDesc[i].NonEssential)
1353
+ continue;
1354
+
1355
+ FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1356
+ Proto->Mean[i],
1357
+ sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1358
+ if (DistributionOK (NormalBuckets))
1359
+ continue;
1360
+
1361
+ if (RandomBuckets == NULL)
1362
+ RandomBuckets =
1363
+ GetBuckets (D_random, Cluster->SampleCount, Confidence);
1364
+ MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1365
+ FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1366
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
1367
+ if (DistributionOK (RandomBuckets))
1368
+ continue;
1369
+
1370
+ if (UniformBuckets == NULL)
1371
+ UniformBuckets =
1372
+ GetBuckets (uniform, Cluster->SampleCount, Confidence);
1373
+ MakeDimUniform(i, Proto, Statistics);
1374
+ FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1375
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
1376
+ if (DistributionOK (UniformBuckets))
1377
+ continue;
1378
+ break;
1379
+ }
1380
+ // if any dimension failed to match a distribution, discard the proto
1381
+ if (i < Clusterer->SampleSize) {
1382
+ FreePrototype(Proto);
1383
+ Proto = NULL;
1384
+ }
1385
+ if (UniformBuckets != NULL)
1386
+ FreeBuckets(UniformBuckets);
1387
+ if (RandomBuckets != NULL)
1388
+ FreeBuckets(RandomBuckets);
1389
+ return (Proto);
1390
+ } // MakeMixedProto
1391
+
1392
+
1393
+ /* MakeDimRandom *************************************************************
1394
+ Parameters: i index of dimension to be changed
1395
+ Proto prototype whose dimension is to be altered
1396
+ ParamDesc description of specified dimension
1397
+ Globals: None
1398
+ Operation: This routine alters the ith dimension of the specified
1399
+ mixed prototype to be D_random.
1400
+ Return: None
1401
+ Exceptions: None
1402
+ History: 6/20/89, DSJ, Created.
1403
+ ******************************************************************************/
1404
+ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1405
+ Proto->Distrib[i] = D_random;
1406
+ Proto->Mean[i] = ParamDesc->MidRange;
1407
+ Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1408
+
1409
+ // subtract out the previous magnitude of this dimension from the total
1410
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1411
+ Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1412
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1413
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1414
+
1415
+ // note that the proto Weight is irrelevant for D_random protos
1416
+ } // MakeDimRandom
1417
+
1418
+
1419
+ /** MakeDimUniform ***********************************************************
1420
+ Parameters: i index of dimension to be changed
1421
+ Proto prototype whose dimension is to be altered
1422
+ Statistics statistical info about prototype
1423
+ Globals: None
1424
+ Operation: This routine alters the ith dimension of the specified
1425
+ mixed prototype to be uniform.
1426
+ Return: None
1427
+ Exceptions: None
1428
+ History: 6/20/89, DSJ, Created.
1429
+ ******************************************************************************/
1430
+ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1431
+ Proto->Distrib[i] = uniform;
1432
+ Proto->Mean[i] = Proto->Cluster->Mean[i] +
1433
+ (Statistics->Min[i] + Statistics->Max[i]) / 2;
1434
+ Proto->Variance.Elliptical[i] =
1435
+ (Statistics->Max[i] - Statistics->Min[i]) / 2;
1436
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1437
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
1438
+
1439
+ // subtract out the previous magnitude of this dimension from the total
1440
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1441
+ Proto->Magnitude.Elliptical[i] =
1442
+ 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1443
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1444
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1445
+
1446
+ // note that the proto Weight is irrelevant for uniform protos
1447
+ } // MakeDimUniform
1448
+
1449
+
1450
+ /** ComputeStatistics *********************************************************
1451
+ Parameters: N number of dimensions
1452
+ ParamDesc array of dimension descriptions
1453
+ Cluster cluster whose stats are to be computed
1454
+ Globals: None
1455
+ Operation: This routine searches the cluster tree for all leaf nodes
1456
+ which are samples in the specified cluster. It computes
1457
+ a full covariance matrix for these samples as well as
1458
+ keeping track of the ranges (min and max) for each
1459
+ dimension. A special data structure is allocated to
1460
+ return this information to the caller. An incremental
1461
+ algorithm for computing statistics is not used because
1462
+ it will not work with circular dimensions.
1463
+ Return: Pointer to new data structure containing statistics
1464
+ Exceptions: None
1465
+ History: 6/2/89, DSJ, Created.
1466
+ *********************************************************************************/
1467
+ STATISTICS *
1468
+ ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
1469
+ STATISTICS *Statistics;
1470
+ int i, j;
1471
+ FLOAT32 *CoVariance;
1472
+ FLOAT32 *Distance;
1473
+ LIST SearchState;
1474
+ SAMPLE *Sample;
1475
+ uinT32 SampleCountAdjustedForBias;
1476
+
1477
+ // allocate memory to hold the statistics results
1478
+ Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1479
+ Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1480
+ Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1481
+ Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1482
+
1483
+ // allocate temporary memory to hold the sample to mean distances
1484
+ Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1485
+
1486
+ // initialize the statistics
1487
+ Statistics->AvgVariance = 1.0;
1488
+ CoVariance = Statistics->CoVariance;
1489
+ for (i = 0; i < N; i++) {
1490
+ Statistics->Min[i] = 0.0;
1491
+ Statistics->Max[i] = 0.0;
1492
+ for (j = 0; j < N; j++, CoVariance++)
1493
+ *CoVariance = 0;
1494
+ }
1495
+ // find each sample in the cluster and merge it into the statistics
1496
+ InitSampleSearch(SearchState, Cluster);
1497
+ while ((Sample = NextSample (&SearchState)) != NULL) {
1498
+ for (i = 0; i < N; i++) {
1499
+ Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1500
+ if (ParamDesc[i].Circular) {
1501
+ if (Distance[i] > ParamDesc[i].HalfRange)
1502
+ Distance[i] -= ParamDesc[i].Range;
1503
+ if (Distance[i] < -ParamDesc[i].HalfRange)
1504
+ Distance[i] += ParamDesc[i].Range;
1505
+ }
1506
+ if (Distance[i] < Statistics->Min[i])
1507
+ Statistics->Min[i] = Distance[i];
1508
+ if (Distance[i] > Statistics->Max[i])
1509
+ Statistics->Max[i] = Distance[i];
1510
+ }
1511
+ CoVariance = Statistics->CoVariance;
1512
+ for (i = 0; i < N; i++)
1513
+ for (j = 0; j < N; j++, CoVariance++)
1514
+ *CoVariance += Distance[i] * Distance[j];
1515
+ }
1516
+ // normalize the variances by the total number of samples
1517
+ // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1518
+ // also compute the geometic mean of the diagonal variances
1519
+ // ensure that clusters with only 1 sample are handled correctly
1520
+ if (Cluster->SampleCount > 1)
1521
+ SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1522
+ else
1523
+ SampleCountAdjustedForBias = 1;
1524
+ CoVariance = Statistics->CoVariance;
1525
+ for (i = 0; i < N; i++)
1526
+ for (j = 0; j < N; j++, CoVariance++) {
1527
+ *CoVariance /= SampleCountAdjustedForBias;
1528
+ if (j == i) {
1529
+ if (*CoVariance < MINVARIANCE)
1530
+ *CoVariance = MINVARIANCE;
1531
+ Statistics->AvgVariance *= *CoVariance;
1532
+ }
1533
+ }
1534
+ Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1535
+ 1.0 / N);
1536
+
1537
+ // release temporary memory and return
1538
+ memfree(Distance);
1539
+ return (Statistics);
1540
+ } // ComputeStatistics
1541
+
1542
+
1543
+ /** NewSpericalProto *********************************************************
1544
+ Parameters: N number of dimensions
1545
+ Cluster cluster to be made into a spherical prototype
1546
+ Statistics statistical info about samples in cluster
1547
+ Globals: None
1548
+ Operation: This routine creates a spherical prototype data structure to
1549
+ approximate the samples in the specified cluster.
1550
+ Spherical prototypes have a single variance which is
1551
+ common across all dimensions. All dimensions are normally
1552
+ distributed and independent.
1553
+ Return: Pointer to a new spherical prototype data structure
1554
+ Exceptions: None
1555
+ History: 6/19/89, DSJ, Created.
1556
+ ******************************************************************************/
1557
+ PROTOTYPE *NewSphericalProto(uinT16 N,
1558
+ CLUSTER *Cluster,
1559
+ STATISTICS *Statistics) {
1560
+ PROTOTYPE *Proto;
1561
+
1562
+ Proto = NewSimpleProto (N, Cluster);
1563
+
1564
+ Proto->Variance.Spherical = Statistics->AvgVariance;
1565
+ if (Proto->Variance.Spherical < MINVARIANCE)
1566
+ Proto->Variance.Spherical = MINVARIANCE;
1567
+
1568
+ Proto->Magnitude.Spherical =
1569
+ 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1570
+ Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1571
+ (double) N);
1572
+ Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1573
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1574
+
1575
+ return (Proto);
1576
+ } // NewSphericalProto
1577
+
1578
+
1579
+ /** NewEllipticalProto *******************************************************
1580
+ Parameters: N number of dimensions
1581
+ Cluster cluster to be made into an elliptical prototype
1582
+ Statistics statistical info about samples in cluster
1583
+ Globals: None
1584
+ Operation: This routine creates an elliptical prototype data structure to
1585
+ approximate the samples in the specified cluster.
1586
+ Elliptical prototypes have a variance for each dimension.
1587
+ All dimensions are normally distributed and independent.
1588
+ Return: Pointer to a new elliptical prototype data structure
1589
+ Exceptions: None
1590
+ History: 6/19/89, DSJ, Created.
1591
+ *******************************************************************************/
1592
+ PROTOTYPE *NewEllipticalProto(inT16 N,
1593
+ CLUSTER *Cluster,
1594
+ STATISTICS *Statistics) {
1595
+ PROTOTYPE *Proto;
1596
+ FLOAT32 *CoVariance;
1597
+ int i;
1598
+
1599
+ Proto = NewSimpleProto (N, Cluster);
1600
+ Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1601
+ Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1602
+ Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1603
+
1604
+ CoVariance = Statistics->CoVariance;
1605
+ Proto->TotalMagnitude = 1.0;
1606
+ for (i = 0; i < N; i++, CoVariance += N + 1) {
1607
+ Proto->Variance.Elliptical[i] = *CoVariance;
1608
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1609
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
1610
+
1611
+ Proto->Magnitude.Elliptical[i] =
1612
+ 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1613
+ Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1614
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1615
+ }
1616
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1617
+ Proto->Style = elliptical;
1618
+ return (Proto);
1619
+ } // NewEllipticalProto
1620
+
1621
+
1622
+ /** MewMixedProto ************************************************************
1623
+ Parameters: N number of dimensions
1624
+ Cluster cluster to be made into a mixed prototype
1625
+ Statistics statistical info about samples in cluster
1626
+ Globals: None
1627
+ Operation: This routine creates a mixed prototype data structure to
1628
+ approximate the samples in the specified cluster.
1629
+ Mixed prototypes can have different distributions for
1630
+ each dimension. All dimensions are independent. The
1631
+ structure is initially filled in as though it were an
1632
+ elliptical prototype. The actual distributions of the
1633
+ dimensions can be altered by other routines.
1634
+ Return: Pointer to a new mixed prototype data structure
1635
+ Exceptions: None
1636
+ History: 6/19/89, DSJ, Created.
1637
+ ********************************************************************************/
1638
+ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1639
+ PROTOTYPE *Proto;
1640
+ int i;
1641
+
1642
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
1643
+ Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1644
+
1645
+ for (i = 0; i < N; i++) {
1646
+ Proto->Distrib[i] = normal;
1647
+ }
1648
+ Proto->Style = mixed;
1649
+ return (Proto);
1650
+ } // NewMixedProto
1651
+
1652
+
1653
+ /** NewSimpleProto ***********************************************************
1654
+ Parameters: N number of dimensions
1655
+ Cluster cluster to be made into a prototype
1656
+ Globals: None
1657
+ Operation: This routine allocates memory to hold a simple prototype
1658
+ data structure, i.e. one without independent distributions
1659
+ and variances for each dimension.
1660
+ Return: Pointer to new simple prototype
1661
+ Exceptions: None
1662
+ History: 6/19/89, DSJ, Created.
1663
+ *******************************************************************************/
1664
+ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
1665
+ PROTOTYPE *Proto;
1666
+ int i;
1667
+
1668
+ Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1669
+ Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1670
+
1671
+ for (i = 0; i < N; i++)
1672
+ Proto->Mean[i] = Cluster->Mean[i];
1673
+ Proto->Distrib = NULL;
1674
+
1675
+ Proto->Significant = TRUE;
1676
+ Proto->Style = spherical;
1677
+ Proto->NumSamples = Cluster->SampleCount;
1678
+ Proto->Cluster = Cluster;
1679
+ Proto->Cluster->Prototype = TRUE;
1680
+ return (Proto);
1681
+ } // NewSimpleProto
1682
+
1683
+
1684
+ /** Independent ***************************************************************
1685
+ Parameters: ParamDesc descriptions of each feature space dimension
1686
+ N number of dimensions
1687
+ CoVariance ptr to a covariance matrix
1688
+ Independence max off-diagonal correlation coefficient
1689
+ Globals: None
1690
+ Operation: This routine returns TRUE if the specified covariance
1691
+ matrix indicates that all N dimensions are independent of
1692
+ one another. One dimension is judged to be independent of
1693
+ another when the magnitude of the corresponding correlation
1694
+ coefficient is
1695
+ less than the specified Independence factor. The
1696
+ correlation coefficient is calculated as: (see Duda and
1697
+ Hart, pg. 247)
1698
+ coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
1699
+ The covariance matrix is assumed to be symmetric (which
1700
+ should always be true).
1701
+ Return: TRUE if dimensions are independent, FALSE otherwise
1702
+ Exceptions: None
1703
+ History: 6/4/89, DSJ, Created.
1704
+ *******************************************************************************/
1705
+ BOOL8
1706
+ Independent (PARAM_DESC ParamDesc[],
1707
+ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
1708
+ int i, j;
1709
+ FLOAT32 *VARii; // points to ith on-diagonal element
1710
+ FLOAT32 *VARjj; // points to jth on-diagonal element
1711
+ FLOAT32 CorrelationCoeff;
1712
+
1713
+ VARii = CoVariance;
1714
+ for (i = 0; i < N; i++, VARii += N + 1) {
1715
+ if (ParamDesc[i].NonEssential)
1716
+ continue;
1717
+
1718
+ VARjj = VARii + N + 1;
1719
+ CoVariance = VARii + 1;
1720
+ for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1721
+ if (ParamDesc[j].NonEssential)
1722
+ continue;
1723
+
1724
+ if ((*VARii == 0.0) || (*VARjj == 0.0))
1725
+ CorrelationCoeff = 0.0;
1726
+ else
1727
+ CorrelationCoeff =
1728
+ sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1729
+ if (CorrelationCoeff > Independence)
1730
+ return (FALSE);
1731
+ }
1732
+ }
1733
+ return (TRUE);
1734
+ } // Independent
1735
+
1736
+
1737
+ /** GetBuckets **************************************************************
1738
+ Parameters: Distribution type of probability distribution to test for
1739
+ SampleCount number of samples that are available
1740
+ Confidence probability of a Type I error
1741
+ Globals: none
1742
+ Operation: This routine returns a histogram data structure which can
1743
+ be used by other routines to place samples into histogram
1744
+ buckets, and then apply a goodness of fit test to the
1745
+ histogram data to determine if the samples belong to the
1746
+ specified probability distribution. The routine keeps
1747
+ a list of bucket data structures which have already been
1748
+ created so that it minimizes the computation time needed
1749
+ to create a new bucket.
1750
+ Return: Bucket data structure
1751
+ Exceptions: none
1752
+ History: Thu Aug 3 12:58:10 1989, DSJ, Created.
1753
+ *****************************************************************************/
1754
+ BUCKETS *GetBuckets(DISTRIBUTION Distribution,
1755
+ uinT32 SampleCount,
1756
+ FLOAT64 Confidence) {
1757
+ uinT16 NumberOfBuckets;
1758
+ BUCKETS *Buckets;
1759
+
1760
+ // search for an old bucket structure with the same number of buckets
1761
+ NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1762
+ Buckets = (BUCKETS *) first_node (search (OldBuckets[(int) Distribution],
1763
+ &NumberOfBuckets, NumBucketsMatch));
1764
+
1765
+ // if a matching bucket structure is found, delete it from the list
1766
+ if (Buckets != NULL) {
1767
+ OldBuckets[(int) Distribution] =
1768
+ delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch);
1769
+ if (SampleCount != Buckets->SampleCount)
1770
+ AdjustBuckets(Buckets, SampleCount);
1771
+ if (Confidence != Buckets->Confidence) {
1772
+ Buckets->Confidence = Confidence;
1773
+ Buckets->ChiSquared = ComputeChiSquared
1774
+ (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets),
1775
+ Confidence);
1776
+ }
1777
+ InitBuckets(Buckets);
1778
+ }
1779
+ else // otherwise create a new structure
1780
+ Buckets = MakeBuckets (Distribution, SampleCount, Confidence);
1781
+ return (Buckets);
1782
+ } // GetBuckets
1783
+
1784
+
1785
+ /** Makebuckets *************************************************************
1786
+ Parameters: Distribution type of probability distribution to test for
1787
+ SampleCount number of samples that are available
1788
+ Confidence probability of a Type I error
1789
+ Globals: None
1790
+ Operation: This routine creates a histogram data structure which can
1791
+ be used by other routines to place samples into histogram
1792
+ buckets, and then apply a goodness of fit test to the
1793
+ histogram data to determine if the samples belong to the
1794
+ specified probability distribution. The buckets are
1795
+ allocated in such a way that the expected frequency of
1796
+ samples in each bucket is approximately the same. In
1797
+ order to make this possible, a mapping table is
1798
+ computed which maps "normalized" samples into the
1799
+ appropriate bucket.
1800
+ Return: Pointer to new histogram data structure
1801
+ Exceptions: None
1802
+ History: 6/4/89, DSJ, Created.
1803
+ *****************************************************************************/
1804
+ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
1805
+ uinT32 SampleCount,
1806
+ FLOAT64 Confidence) {
1807
+ static DENSITYFUNC DensityFunction[] =
1808
+ { NormalDensity, UniformDensity, UniformDensity };
1809
+ int i, j;
1810
+ BUCKETS *Buckets;
1811
+ FLOAT64 BucketProbability;
1812
+ FLOAT64 NextBucketBoundary;
1813
+ FLOAT64 Probability;
1814
+ FLOAT64 ProbabilityDelta;
1815
+ FLOAT64 LastProbDensity;
1816
+ FLOAT64 ProbDensity;
1817
+ uinT16 CurrentBucket;
1818
+ BOOL8 Symmetrical;
1819
+
1820
+ // allocate memory needed for data structure
1821
+ Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS));
1822
+ Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1823
+ Buckets->SampleCount = SampleCount;
1824
+ Buckets->Confidence = Confidence;
1825
+ Buckets->Count =
1826
+ (uinT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (uinT32));
1827
+ Buckets->ExpectedCount =
1828
+ (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32));
1829
+
1830
+ // initialize simple fields
1831
+ Buckets->Distribution = Distribution;
1832
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1833
+ Buckets->Count[i] = 0;
1834
+ Buckets->ExpectedCount[i] = 0.0;
1835
+ }
1836
+
1837
+ // all currently defined distributions are symmetrical
1838
+ Symmetrical = TRUE;
1839
+ Buckets->ChiSquared = ComputeChiSquared
1840
+ (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence);
1841
+
1842
+ if (Symmetrical) {
1843
+ // allocate buckets so that all have approx. equal probability
1844
+ BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1845
+
1846
+ // distribution is symmetric so fill in upper half then copy
1847
+ CurrentBucket = Buckets->NumberOfBuckets / 2;
1848
+ if (Odd (Buckets->NumberOfBuckets))
1849
+ NextBucketBoundary = BucketProbability / 2;
1850
+ else
1851
+ NextBucketBoundary = BucketProbability;
1852
+
1853
+ Probability = 0.0;
1854
+ LastProbDensity =
1855
+ (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1856
+ for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1857
+ ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1858
+ ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1859
+ Probability += ProbabilityDelta;
1860
+ if (Probability > NextBucketBoundary) {
1861
+ if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1862
+ CurrentBucket++;
1863
+ NextBucketBoundary += BucketProbability;
1864
+ }
1865
+ Buckets->Bucket[i] = CurrentBucket;
1866
+ Buckets->ExpectedCount[CurrentBucket] +=
1867
+ (FLOAT32) (ProbabilityDelta * SampleCount);
1868
+ LastProbDensity = ProbDensity;
1869
+ }
1870
+ // place any leftover probability into the last bucket
1871
+ Buckets->ExpectedCount[CurrentBucket] +=
1872
+ (FLOAT32) ((0.5 - Probability) * SampleCount);
1873
+
1874
+ // copy upper half of distribution to lower half
1875
+ for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1876
+ Buckets->Bucket[i] =
1877
+ Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets);
1878
+
1879
+ // copy upper half of expected counts to lower half
1880
+ for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1881
+ Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1882
+ }
1883
+ return (Buckets);
1884
+ } // MakeBuckets
1885
+
1886
+
1887
+ //---------------------------------------------------------------------------
1888
+ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
1889
+ /*
1890
+ ** Parameters:
1891
+ ** SampleCount number of samples to be tested
1892
+ ** Globals:
1893
+ ** CountTable lookup table for number of samples
1894
+ ** BucketsTable lookup table for necessary number of buckets
1895
+ ** Operation:
1896
+ ** This routine computes the optimum number of histogram
1897
+ ** buckets that should be used in a chi-squared goodness of
1898
+ ** fit test for the specified number of samples. The optimum
1899
+ ** number is computed based on Table 4.1 on pg. 147 of
1900
+ ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1901
+ ** Linear interpolation is used to interpolate between table
1902
+ ** values. The table is intended for a 0.05 level of
1903
+ ** significance (alpha). This routine assumes that it is
1904
+ ** equally valid for other alpha's, which may not be true.
1905
+ ** Return:
1906
+ ** Optimum number of histogram buckets
1907
+ ** Exceptions:
1908
+ ** None
1909
+ ** History:
1910
+ ** 6/5/89, DSJ, Created.
1911
+ */
1912
+ uinT8 Last, Next;
1913
+ FLOAT32 Slope;
1914
+
1915
+ if (SampleCount < CountTable[0])
1916
+ return (BucketsTable[0]);
1917
+
1918
+ for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1919
+ if (SampleCount <= CountTable[Next]) {
1920
+ Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) /
1921
+ (FLOAT32) (CountTable[Next] - CountTable[Last]);
1922
+ return ((uinT16) (BucketsTable[Last] +
1923
+ Slope * (SampleCount - CountTable[Last])));
1924
+ }
1925
+ }
1926
+ return (BucketsTable[Last]);
1927
+ } // OptimumNumberOfBuckets
1928
+
1929
+
1930
+ //---------------------------------------------------------------------------
1931
+ FLOAT64
1932
+ ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
1933
+ /*
1934
+ ** Parameters:
1935
+ ** DegreesOfFreedom determines shape of distribution
1936
+ ** Alpha probability of right tail
1937
+ ** Globals: none
1938
+ ** Operation:
1939
+ ** This routine computes the chi-squared value which will
1940
+ ** leave a cumulative probability of Alpha in the right tail
1941
+ ** of a chi-squared distribution with the specified number of
1942
+ ** degrees of freedom. Alpha must be between 0 and 1.
1943
+ ** DegreesOfFreedom must be even. The routine maintains an
1944
+ ** array of lists. Each list corresponds to a different
1945
+ ** number of degrees of freedom. Each entry in the list
1946
+ ** corresponds to a different alpha value and its corresponding
1947
+ ** chi-squared value. Therefore, once a particular chi-squared
1948
+ ** value is computed, it is stored in the list and never
1949
+ ** needs to be computed again.
1950
+ ** Return: Desired chi-squared value
1951
+ ** Exceptions: none
1952
+ ** History: 6/5/89, DSJ, Created.
1953
+ */
1954
+ #define CHIACCURACY 0.01
1955
+ #define MINALPHA (1e-200)
1956
+ {
1957
+ static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1958
+
1959
+ CHISTRUCT *OldChiSquared;
1960
+ CHISTRUCT SearchKey;
1961
+
1962
+ // limit the minimum alpha that can be used - if alpha is too small
1963
+ // it may not be possible to compute chi-squared.
1964
+ if (Alpha < MINALPHA)
1965
+ Alpha = MINALPHA;
1966
+ if (Alpha > 1.0)
1967
+ Alpha = 1.0;
1968
+ if (Odd (DegreesOfFreedom))
1969
+ DegreesOfFreedom++;
1970
+
1971
+ /* find the list of chi-squared values which have already been computed
1972
+ for the specified number of degrees of freedom. Search the list for
1973
+ the desired chi-squared. */
1974
+ SearchKey.Alpha = Alpha;
1975
+ OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1976
+ &SearchKey, AlphaMatch));
1977
+
1978
+ if (OldChiSquared == NULL) {
1979
+ OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1980
+ OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1981
+ (FLOAT64) DegreesOfFreedom,
1982
+ (FLOAT64) CHIACCURACY);
1983
+ ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1984
+ OldChiSquared);
1985
+ }
1986
+ else {
1987
+ // further optimization might move OldChiSquared to front of list
1988
+ }
1989
+
1990
+ return (OldChiSquared->ChiSquared);
1991
+
1992
+ } // ComputeChiSquared
1993
+
1994
+
1995
+ //---------------------------------------------------------------------------
1996
+ FLOAT64 NormalDensity(inT32 x) {
1997
+ /*
1998
+ ** Parameters:
1999
+ ** x number to compute the normal probability density for
2000
+ ** Globals:
2001
+ ** NormalMean mean of a discrete normal distribution
2002
+ ** NormalVariance variance of a discrete normal distribution
2003
+ ** NormalMagnitude magnitude of a discrete normal distribution
2004
+ ** Operation:
2005
+ ** This routine computes the probability density function
2006
+ ** of a discrete normal distribution defined by the global
2007
+ ** variables NormalMean, NormalVariance, and NormalMagnitude.
2008
+ ** Normal magnitude could, of course, be computed in terms of
2009
+ ** the normal variance but it is precomputed for efficiency.
2010
+ ** Return:
2011
+ ** The value of the normal distribution at x.
2012
+ ** Exceptions:
2013
+ ** None
2014
+ ** History:
2015
+ ** 6/4/89, DSJ, Created.
2016
+ */
2017
+ FLOAT64 Distance;
2018
+
2019
+ Distance = x - NormalMean;
2020
+ return (NormalMagnitude *
2021
+ exp (-0.5 * Distance * Distance / NormalVariance));
2022
+ } // NormalDensity
2023
+
2024
+
2025
+ //---------------------------------------------------------------------------
2026
+ FLOAT64 UniformDensity(inT32 x) {
2027
+ /*
2028
+ ** Parameters:
2029
+ ** x number to compute the uniform probability density for
2030
+ ** Globals:
2031
+ ** BUCKETTABLESIZE determines range of distribution
2032
+ ** Operation:
2033
+ ** This routine computes the probability density function
2034
+ ** of a uniform distribution at the specified point. The
2035
+ ** range of the distribution is from 0 to BUCKETTABLESIZE.
2036
+ ** Return:
2037
+ ** The value of the uniform distribution at x.
2038
+ ** Exceptions:
2039
+ ** None
2040
+ ** History:
2041
+ ** 6/5/89, DSJ, Created.
2042
+ */
2043
+ static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
2044
+
2045
+ if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
2046
+ return (UniformDistributionDensity);
2047
+ else
2048
+ return ((FLOAT64) 0.0);
2049
+ } // UniformDensity
2050
+
2051
+
2052
+ //---------------------------------------------------------------------------
2053
+ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
2054
+ /*
2055
+ ** Parameters:
2056
+ ** f1 value of function at x1
2057
+ ** f2 value of function at x2
2058
+ ** Dx x2 - x1 (should always be positive)
2059
+ ** Globals:
2060
+ ** None
2061
+ ** Operation:
2062
+ ** This routine computes a trapezoidal approximation to the
2063
+ ** integral of a function over a small delta in x.
2064
+ ** Return:
2065
+ ** Approximation of the integral of the function from x1 to x2.
2066
+ ** Exceptions:
2067
+ ** None
2068
+ ** History:
2069
+ ** 6/5/89, DSJ, Created.
2070
+ */
2071
+ return ((f1 + f2) * Dx / 2.0);
2072
+ } // Integral
2073
+
2074
+
2075
+ //---------------------------------------------------------------------------
2076
+ void FillBuckets(BUCKETS *Buckets,
2077
+ CLUSTER *Cluster,
2078
+ uinT16 Dim,
2079
+ PARAM_DESC *ParamDesc,
2080
+ FLOAT32 Mean,
2081
+ FLOAT32 StdDev) {
2082
+ /*
2083
+ ** Parameters:
2084
+ ** Buckets histogram buckets to count samples
2085
+ ** Cluster cluster whose samples are being analyzed
2086
+ ** Dim dimension of samples which is being analyzed
2087
+ ** ParamDesc description of the dimension
2088
+ ** Mean "mean" of the distribution
2089
+ ** StdDev "standard deviation" of the distribution
2090
+ ** Globals:
2091
+ ** None
2092
+ ** Operation:
2093
+ ** This routine counts the number of cluster samples which
2094
+ ** fall within the various histogram buckets in Buckets. Only
2095
+ ** one dimension of each sample is examined. The exact meaning
2096
+ ** of the Mean and StdDev parameters depends on the
2097
+ ** distribution which is being analyzed (this info is in the
2098
+ ** Buckets data structure). For normal distributions, Mean
2099
+ ** and StdDev have the expected meanings. For uniform and
2100
+ ** random distributions the Mean is the center point of the
2101
+ ** range and the StdDev is 1/2 the range. A dimension with
2102
+ ** zero standard deviation cannot be statistically analyzed.
2103
+ ** In this case, a pseudo-analysis is used.
2104
+ ** Return:
2105
+ ** None (the Buckets data structure is filled in)
2106
+ ** Exceptions:
2107
+ ** None
2108
+ ** History:
2109
+ ** 6/5/89, DSJ, Created.
2110
+ */
2111
+ uinT16 BucketID;
2112
+ int i;
2113
+ LIST SearchState;
2114
+ SAMPLE *Sample;
2115
+
2116
+ // initialize the histogram bucket counts to 0
2117
+ for (i = 0; i < Buckets->NumberOfBuckets; i++)
2118
+ Buckets->Count[i] = 0;
2119
+
2120
+ if (StdDev == 0.0) {
2121
+ /* if the standard deviation is zero, then we can't statistically
2122
+ analyze the cluster. Use a pseudo-analysis: samples exactly on
2123
+ the mean are distributed evenly across all buckets. Samples greater
2124
+ than the mean are placed in the last bucket; samples less than the
2125
+ mean are placed in the first bucket. */
2126
+
2127
+ InitSampleSearch(SearchState, Cluster);
2128
+ i = 0;
2129
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2130
+ if (Sample->Mean[Dim] > Mean)
2131
+ BucketID = Buckets->NumberOfBuckets - 1;
2132
+ else if (Sample->Mean[Dim] < Mean)
2133
+ BucketID = 0;
2134
+ else
2135
+ BucketID = i;
2136
+ Buckets->Count[BucketID] += 1;
2137
+ i++;
2138
+ if (i >= Buckets->NumberOfBuckets)
2139
+ i = 0;
2140
+ }
2141
+ }
2142
+ else {
2143
+ // search for all samples in the cluster and add to histogram buckets
2144
+ InitSampleSearch(SearchState, Cluster);
2145
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2146
+ switch (Buckets->Distribution) {
2147
+ case normal:
2148
+ BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2149
+ Mean, StdDev);
2150
+ break;
2151
+ case D_random:
2152
+ case uniform:
2153
+ BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2154
+ Mean, StdDev);
2155
+ break;
2156
+ default:
2157
+ BucketID = 0;
2158
+ }
2159
+ Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2160
+ }
2161
+ }
2162
+ } // FillBuckets
2163
+
2164
+
2165
+ //---------------------------------------------------------------------------*/
2166
+ uinT16 NormalBucket(PARAM_DESC *ParamDesc,
2167
+ FLOAT32 x,
2168
+ FLOAT32 Mean,
2169
+ FLOAT32 StdDev) {
2170
+ /*
2171
+ ** Parameters:
2172
+ ** ParamDesc used to identify circular dimensions
2173
+ ** x value to be normalized
2174
+ ** Mean mean of normal distribution
2175
+ ** StdDev standard deviation of normal distribution
2176
+ ** Globals:
2177
+ ** NormalMean mean of discrete normal distribution
2178
+ ** NormalStdDev standard deviation of discrete normal dist.
2179
+ ** BUCKETTABLESIZE range of the discrete distribution
2180
+ ** Operation:
2181
+ ** This routine determines which bucket x falls into in the
2182
+ ** discrete normal distribution defined by NormalMean
2183
+ ** and NormalStdDev. x values which exceed the range of
2184
+ ** the discrete distribution are clipped.
2185
+ ** Return:
2186
+ ** Bucket number into which x falls
2187
+ ** Exceptions:
2188
+ ** None
2189
+ ** History:
2190
+ ** 6/5/89, DSJ, Created.
2191
+ */
2192
+ FLOAT32 X;
2193
+
2194
+ // wraparound circular parameters if necessary
2195
+ if (ParamDesc->Circular) {
2196
+ if (x - Mean > ParamDesc->HalfRange)
2197
+ x -= ParamDesc->Range;
2198
+ else if (x - Mean < -ParamDesc->HalfRange)
2199
+ x += ParamDesc->Range;
2200
+ }
2201
+
2202
+ X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean;
2203
+ if (X < 0)
2204
+ return ((uinT16) 0);
2205
+ if (X > BUCKETTABLESIZE - 1)
2206
+ return ((uinT16) (BUCKETTABLESIZE - 1));
2207
+ return ((uinT16) floor ((FLOAT64) X));
2208
+ } // NormalBucket
2209
+
2210
+
2211
+ //---------------------------------------------------------------------------
2212
+ uinT16 UniformBucket(PARAM_DESC *ParamDesc,
2213
+ FLOAT32 x,
2214
+ FLOAT32 Mean,
2215
+ FLOAT32 StdDev) {
2216
+ /*
2217
+ ** Parameters:
2218
+ ** ParamDesc used to identify circular dimensions
2219
+ ** x value to be normalized
2220
+ ** Mean center of range of uniform distribution
2221
+ ** StdDev 1/2 the range of the uniform distribution
2222
+ ** Globals:
2223
+ ** BUCKETTABLESIZE range of the discrete distribution
2224
+ ** Operation:
2225
+ ** This routine determines which bucket x falls into in the
2226
+ ** discrete uniform distribution defined by
2227
+ ** BUCKETTABLESIZE. x values which exceed the range of
2228
+ ** the discrete distribution are clipped.
2229
+ ** Return:
2230
+ ** Bucket number into which x falls
2231
+ ** Exceptions:
2232
+ ** None
2233
+ ** History:
2234
+ ** 6/5/89, DSJ, Created.
2235
+ */
2236
+ FLOAT32 X;
2237
+
2238
+ // wraparound circular parameters if necessary
2239
+ if (ParamDesc->Circular) {
2240
+ if (x - Mean > ParamDesc->HalfRange)
2241
+ x -= ParamDesc->Range;
2242
+ else if (x - Mean < -ParamDesc->HalfRange)
2243
+ x += ParamDesc->Range;
2244
+ }
2245
+
2246
+ X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2247
+ if (X < 0)
2248
+ return ((uinT16) 0);
2249
+ if (X > BUCKETTABLESIZE - 1)
2250
+ return ((uinT16) (BUCKETTABLESIZE - 1));
2251
+ return ((uinT16) floor ((FLOAT64) X));
2252
+ } // UniformBucket
2253
+
2254
+
2255
+ //---------------------------------------------------------------------------
2256
+ BOOL8 DistributionOK(BUCKETS *Buckets) {
2257
+ /*
2258
+ ** Parameters:
2259
+ ** Buckets histogram data to perform chi-square test on
2260
+ ** Globals:
2261
+ ** None
2262
+ ** Operation:
2263
+ ** This routine performs a chi-square goodness of fit test
2264
+ ** on the histogram data in the Buckets data structure. TRUE
2265
+ ** is returned if the histogram matches the probability
2266
+ ** distribution which was specified when the Buckets
2267
+ ** structure was originally created. Otherwise FALSE is
2268
+ ** returned.
2269
+ ** Return:
2270
+ ** TRUE if samples match distribution, FALSE otherwise
2271
+ ** Exceptions:
2272
+ ** None
2273
+ ** History:
2274
+ ** 6/5/89, DSJ, Created.
2275
+ */
2276
+ FLOAT32 FrequencyDifference;
2277
+ FLOAT32 TotalDifference;
2278
+ int i;
2279
+
2280
+ // compute how well the histogram matches the expected histogram
2281
+ TotalDifference = 0.0;
2282
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2283
+ FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2284
+ TotalDifference += (FrequencyDifference * FrequencyDifference) /
2285
+ Buckets->ExpectedCount[i];
2286
+ }
2287
+
2288
+ // test to see if the difference is more than expected
2289
+ if (TotalDifference > Buckets->ChiSquared)
2290
+ return (FALSE);
2291
+ else
2292
+ return (TRUE);
2293
+ } // DistributionOK
2294
+
2295
+
2296
+ //---------------------------------------------------------------------------
2297
+ void FreeStatistics(STATISTICS *Statistics) {
2298
+ /*
2299
+ ** Parameters:
2300
+ ** Statistics pointer to data structure to be freed
2301
+ ** Globals:
2302
+ ** None
2303
+ ** Operation:
2304
+ ** This routine frees the memory used by the statistics
2305
+ ** data structure.
2306
+ ** Return:
2307
+ ** None
2308
+ ** Exceptions:
2309
+ ** None
2310
+ ** History:
2311
+ ** 6/5/89, DSJ, Created.
2312
+ */
2313
+ memfree (Statistics->CoVariance);
2314
+ memfree (Statistics->Min);
2315
+ memfree (Statistics->Max);
2316
+ memfree(Statistics);
2317
+ } // FreeStatistics
2318
+
2319
+
2320
+ //---------------------------------------------------------------------------
2321
+ void FreeBuckets(BUCKETS *Buckets) {
2322
+ /*
2323
+ ** Parameters:
2324
+ ** Buckets pointer to data structure to be freed
2325
+ ** Globals: none
2326
+ ** Operation:
2327
+ ** This routine places the specified histogram data structure
2328
+ ** at the front of a list of histograms so that it can be
2329
+ ** reused later if necessary. A separate list is maintained
2330
+ ** for each different type of distribution.
2331
+ ** Return: none
2332
+ ** Exceptions: none
2333
+ ** History: 6/5/89, DSJ, Created.
2334
+ */
2335
+ int Dist;
2336
+
2337
+ if (Buckets != NULL) {
2338
+ Dist = (int) Buckets->Distribution;
2339
+ OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets);
2340
+ }
2341
+
2342
+ } // FreeBuckets
2343
+
2344
+
2345
+ //---------------------------------------------------------------------------
2346
+ void FreeCluster(CLUSTER *Cluster) {
2347
+ /*
2348
+ ** Parameters:
2349
+ ** Cluster pointer to cluster to be freed
2350
+ ** Globals:
2351
+ ** None
2352
+ ** Operation:
2353
+ ** This routine frees the memory consumed by the specified
2354
+ ** cluster and all of its subclusters. This is done by
2355
+ ** recursive calls to FreeCluster().
2356
+ ** Return:
2357
+ ** None
2358
+ ** Exceptions:
2359
+ ** None
2360
+ ** History:
2361
+ ** 6/6/89, DSJ, Created.
2362
+ */
2363
+ if (Cluster != NULL) {
2364
+ FreeCluster (Cluster->Left);
2365
+ FreeCluster (Cluster->Right);
2366
+ memfree(Cluster);
2367
+ }
2368
+ } // FreeCluster
2369
+
2370
+
2371
+ //---------------------------------------------------------------------------
2372
+ uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2373
+ /*
2374
+ ** Parameters:
2375
+ ** Distribution distribution being tested for
2376
+ ** HistogramBuckets number of buckets in chi-square test
2377
+ ** Globals: none
2378
+ ** Operation:
2379
+ ** This routine computes the degrees of freedom that should
2380
+ ** be used in a chi-squared test with the specified number of
2381
+ ** histogram buckets. The result is always rounded up to
2382
+ ** the next even number so that the value of chi-squared can be
2383
+ ** computed more easily. This will cause the value of
2384
+ ** chi-squared to be higher than the optimum value, resulting
2385
+ ** in the chi-square test being more lenient than optimum.
2386
+ ** Return: The number of degrees of freedom for a chi-square test
2387
+ ** Exceptions: none
2388
+ ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2389
+ */
2390
+ static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2391
+
2392
+ uinT16 AdjustedNumBuckets;
2393
+
2394
+ AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2395
+ if (Odd (AdjustedNumBuckets))
2396
+ AdjustedNumBuckets++;
2397
+ return (AdjustedNumBuckets);
2398
+
2399
+ } // DegreesOfFreedom
2400
+
2401
+
2402
+ //---------------------------------------------------------------------------
2403
+ int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
2404
+ void *arg2) { //uinT16 *DesiredNumberOfBuckets)
2405
+ /*
2406
+ ** Parameters:
2407
+ ** Histogram current histogram being tested for a match
2408
+ ** DesiredNumberOfBuckets match key
2409
+ ** Globals: none
2410
+ ** Operation:
2411
+ ** This routine is used to search a list of histogram data
2412
+ ** structures to find one with the specified number of
2413
+ ** buckets. It is called by the list search routines.
2414
+ ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2415
+ ** Exceptions: none
2416
+ ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2417
+ */
2418
+ BUCKETS *Histogram = (BUCKETS *) arg1;
2419
+ uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2420
+
2421
+ return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2422
+
2423
+ } // NumBucketsMatch
2424
+
2425
+
2426
+ //---------------------------------------------------------------------------
2427
+ int ListEntryMatch(void *arg1, //ListNode
2428
+ void *arg2) { //Key
2429
+ /*
2430
+ ** Parameters: none
2431
+ ** Globals: none
2432
+ ** Operation:
2433
+ ** This routine is used to search a list for a list node
2434
+ ** whose contents match Key. It is called by the list
2435
+ ** delete_d routine.
2436
+ ** Return: TRUE if ListNode matches Key
2437
+ ** Exceptions: none
2438
+ ** History: Thu Aug 3 14:23:58 1989, DSJ, Created.
2439
+ */
2440
+ return (arg1 == arg2);
2441
+
2442
+ } // ListEntryMatch
2443
+
2444
+
2445
+ //---------------------------------------------------------------------------
2446
+ void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2447
+ /*
2448
+ ** Parameters:
2449
+ ** Buckets histogram data structure to adjust
2450
+ ** NewSampleCount new sample count to adjust to
2451
+ ** Globals: none
2452
+ ** Operation:
2453
+ ** This routine multiplies each ExpectedCount histogram entry
2454
+ ** by NewSampleCount/OldSampleCount so that the histogram
2455
+ ** is now adjusted to the new sample count.
2456
+ ** Return: none
2457
+ ** Exceptions: none
2458
+ ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2459
+ */
2460
+ int i;
2461
+ FLOAT64 AdjustFactor;
2462
+
2463
+ AdjustFactor = (((FLOAT64) NewSampleCount) /
2464
+ ((FLOAT64) Buckets->SampleCount));
2465
+
2466
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2467
+ Buckets->ExpectedCount[i] *= AdjustFactor;
2468
+ }
2469
+
2470
+ Buckets->SampleCount = NewSampleCount;
2471
+
2472
+ } // AdjustBuckets
2473
+
2474
+
2475
+ //---------------------------------------------------------------------------
2476
+ void InitBuckets(BUCKETS *Buckets) {
2477
+ /*
2478
+ ** Parameters:
2479
+ ** Buckets histogram data structure to init
2480
+ ** Globals: none
2481
+ ** Operation:
2482
+ ** This routine sets the bucket counts in the specified histogram
2483
+ ** to zero.
2484
+ ** Return: none
2485
+ ** Exceptions: none
2486
+ ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2487
+ */
2488
+ int i;
2489
+
2490
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2491
+ Buckets->Count[i] = 0;
2492
+ }
2493
+
2494
+ } // InitBuckets
2495
+
2496
+
2497
+ //---------------------------------------------------------------------------
2498
+ int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2499
+ void *arg2) { //CHISTRUCT *SearchKey)
2500
+ /*
2501
+ ** Parameters:
2502
+ ** ChiStruct chi-squared struct being tested for a match
2503
+ ** SearchKey chi-squared struct that is the search key
2504
+ ** Globals: none
2505
+ ** Operation:
2506
+ ** This routine is used to search a list of structures which
2507
+ ** hold pre-computed chi-squared values for a chi-squared
2508
+ ** value whose corresponding alpha field matches the alpha
2509
+ ** field of SearchKey.
2510
+ ** It is called by the list search routines.
2511
+ ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2512
+ ** Exceptions: none
2513
+ ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2514
+ */
2515
+ CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2516
+ CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2517
+
2518
+ return (ChiStruct->Alpha == SearchKey->Alpha);
2519
+
2520
+ } // AlphaMatch
2521
+
2522
+
2523
+ //---------------------------------------------------------------------------
2524
+ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2525
+ /*
2526
+ ** Parameters:
2527
+ ** DegreesOfFreedom degrees of freedom for new chi value
2528
+ ** Alpha confidence level for new chi value
2529
+ ** Globals: none
2530
+ ** Operation:
2531
+ ** This routine allocates a new data structure which is used
2532
+ ** to hold a chi-squared value along with its associated
2533
+ ** number of degrees of freedom and alpha value.
2534
+ ** Return: none
2535
+ ** Exceptions: none
2536
+ ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2537
+ */
2538
+ CHISTRUCT *NewChiStruct;
2539
+
2540
+ NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2541
+ NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2542
+ NewChiStruct->Alpha = Alpha;
2543
+ return (NewChiStruct);
2544
+
2545
+ } // NewChiStruct
2546
+
2547
+
2548
+ //---------------------------------------------------------------------------
2549
+ FLOAT64
2550
+ Solve (SOLVEFUNC Function,
2551
+ void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
2552
+ /*
2553
+ ** Parameters:
2554
+ ** Function function whose zero is to be found
2555
+ ** FunctionParams arbitrary data to pass to function
2556
+ ** InitialGuess point to start solution search at
2557
+ ** Accuracy maximum allowed error
2558
+ ** Globals: none
2559
+ ** Operation:
2560
+ ** This routine attempts to find an x value at which Function
2561
+ ** goes to zero (i.e. a root of the function ). It will only
2562
+ ** work correctly if a solution actually exists and there
2563
+ ** are no extrema between the solution and the InitialGuess.
2564
+ ** The algorithms used are extremely primitive.
2565
+ ** Return: Solution of function ( x for which f(x) = 0 ).
2566
+ ** Exceptions: none
2567
+ ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
2568
+ */
2569
+ #define INITIALDELTA 0.1
2570
+ #define DELTARATIO 0.1
2571
+ {
2572
+ FLOAT64 x;
2573
+ FLOAT64 f;
2574
+ FLOAT64 Slope;
2575
+ FLOAT64 Delta;
2576
+ FLOAT64 NewDelta;
2577
+ FLOAT64 xDelta;
2578
+ FLOAT64 LastPosX, LastNegX;
2579
+
2580
+ x = InitialGuess;
2581
+ Delta = INITIALDELTA;
2582
+ LastPosX = MAX_FLOAT32;
2583
+ LastNegX = -MAX_FLOAT32;
2584
+ f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2585
+ while (Abs (LastPosX - LastNegX) > Accuracy) {
2586
+ // keep track of outer bounds of current estimate
2587
+ if (f < 0)
2588
+ LastNegX = x;
2589
+ else
2590
+ LastPosX = x;
2591
+
2592
+ // compute the approx. slope of f(x) at the current point
2593
+ Slope =
2594
+ ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2595
+
2596
+ // compute the next solution guess */
2597
+ xDelta = f / Slope;
2598
+ x -= xDelta;
2599
+
2600
+ // reduce the delta used for computing slope to be a fraction of
2601
+ //the amount moved to get to the new guess
2602
+ NewDelta = Abs (xDelta) * DELTARATIO;
2603
+ if (NewDelta < Delta)
2604
+ Delta = NewDelta;
2605
+
2606
+ // compute the value of the function at the new guess
2607
+ f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2608
+ }
2609
+ return (x);
2610
+
2611
+ } // Solve
2612
+
2613
+
2614
+ //---------------------------------------------------------------------------
2615
+ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
2616
+ /*
2617
+ ** Parameters:
2618
+ ** ChiParams contains degrees of freedom and alpha
2619
+ ** x value of chi-squared to evaluate
2620
+ ** Globals: none
2621
+ ** Operation:
2622
+ ** This routine computes the area under a chi density curve
2623
+ ** from 0 to x, minus the desired area under the curve. The
2624
+ ** number of degrees of freedom of the chi curve is specified
2625
+ ** in the ChiParams structure. The desired area is also
2626
+ ** specified in the ChiParams structure as Alpha ( or 1 minus
2627
+ ** the desired area ). This routine is intended to be passed
2628
+ ** to the Solve() function to find the value of chi-squared
2629
+ ** which will yield a desired area under the right tail of
2630
+ ** the chi density curve. The function will only work for
2631
+ ** even degrees of freedom. The equations are based on
2632
+ ** integrating the chi density curve in parts to obtain
2633
+ ** a series that can be used to compute the area under the
2634
+ ** curve.
2635
+ ** Return: Error between actual and desired area under the chi curve.
2636
+ ** Exceptions: none
2637
+ ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2638
+ */
2639
+ int i, N;
2640
+ FLOAT64 SeriesTotal;
2641
+ FLOAT64 Denominator;
2642
+ FLOAT64 PowerOfx;
2643
+
2644
+ N = ChiParams->DegreesOfFreedom / 2 - 1;
2645
+ SeriesTotal = 1;
2646
+ Denominator = 1;
2647
+ PowerOfx = 1;
2648
+ for (i = 1; i <= N; i++) {
2649
+ Denominator *= 2 * i;
2650
+ PowerOfx *= x;
2651
+ SeriesTotal += PowerOfx / Denominator;
2652
+ }
2653
+ return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2654
+
2655
+ } // ChiArea
2656
+
2657
+
2658
+ //---------------------------------------------------------------------------
2659
+ BOOL8
2660
+ MultipleCharSamples (CLUSTERER * Clusterer,
2661
+ CLUSTER * Cluster, FLOAT32 MaxIllegal)
2662
+ /*
2663
+ ** Parameters:
2664
+ ** Clusterer data structure holding cluster tree
2665
+ ** Cluster cluster containing samples to be tested
2666
+ ** MaxIllegal max percentage of samples allowed to have
2667
+ ** more than 1 feature in the cluster
2668
+ ** Globals: none
2669
+ ** Operation:
2670
+ ** This routine looks at all samples in the specified cluster.
2671
+ ** It computes a running estimate of the percentage of the
2672
+ ** charaters which have more than 1 sample in the cluster.
2673
+ ** When this percentage exceeds MaxIllegal, TRUE is returned.
2674
+ ** Otherwise FALSE is returned. The CharID
2675
+ ** fields must contain integers which identify the training
2676
+ ** characters which were used to generate the sample. One
2677
+ ** integer is used for each sample. The NumChar field in
2678
+ ** the Clusterer must contain the number of characters in the
2679
+ ** training set. All CharID fields must be between 0 and
2680
+ ** NumChar-1. The main function of this routine is to help
2681
+ ** identify clusters which need to be split further, i.e. if
2682
+ ** numerous training characters have 2 or more features which are
2683
+ ** contained in the same cluster, then the cluster should be
2684
+ ** split.
2685
+ ** Return: TRUE if the cluster should be split, FALSE otherwise.
2686
+ ** Exceptions: none
2687
+ ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2688
+ ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2689
+ ** splitting illegal clusters.
2690
+ */
2691
+ #define ILLEGAL_CHAR 2
2692
+ {
2693
+ static BOOL8 *CharFlags = NULL;
2694
+ static inT32 NumFlags = 0;
2695
+ int i;
2696
+ LIST SearchState;
2697
+ SAMPLE *Sample;
2698
+ inT32 CharID;
2699
+ inT32 NumCharInCluster;
2700
+ inT32 NumIllegalInCluster;
2701
+ FLOAT32 PercentIllegal;
2702
+
2703
+ // initial estimate assumes that no illegal chars exist in the cluster
2704
+ NumCharInCluster = Cluster->SampleCount;
2705
+ NumIllegalInCluster = 0;
2706
+
2707
+ if (Clusterer->NumChar > NumFlags) {
2708
+ if (CharFlags != NULL)
2709
+ memfree(CharFlags);
2710
+ NumFlags = Clusterer->NumChar;
2711
+ CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2712
+ }
2713
+
2714
+ for (i = 0; i < NumFlags; i++)
2715
+ CharFlags[i] = FALSE;
2716
+
2717
+ // find each sample in the cluster and check if we have seen it before
2718
+ InitSampleSearch(SearchState, Cluster);
2719
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2720
+ CharID = Sample->CharID;
2721
+ if (CharFlags[CharID] == FALSE) {
2722
+ CharFlags[CharID] = TRUE;
2723
+ }
2724
+ else {
2725
+ if (CharFlags[CharID] == TRUE) {
2726
+ NumIllegalInCluster++;
2727
+ CharFlags[CharID] = ILLEGAL_CHAR;
2728
+ }
2729
+ NumCharInCluster--;
2730
+ PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2731
+ if (PercentIllegal > MaxIllegal)
2732
+ return (TRUE);
2733
+ }
2734
+ }
2735
+ return (FALSE);
2736
+
2737
+ } // MultipleCharSamples
2738
+
2739
+ // Compute the inverse of a matrix using LU decomposition with partial pivoting.
2740
+ // The return value is the sum of norms of the off-diagonal terms of the
2741
+ // product of a and inv. (A measure of the error.)
2742
+ double InvertMatrix(const float* input, int size, float* inv) {
2743
+ double** U; // The upper triangular array.
2744
+ double* Umem;
2745
+ double** U_inv; // The inverse of U.
2746
+ double* U_invmem;
2747
+ double** L; // The lower triangular array.
2748
+ double* Lmem;
2749
+
2750
+ // Allocate memory for the 2D arrays.
2751
+ ALLOC_2D_ARRAY(size, size, Umem, U, double);
2752
+ ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
2753
+ ALLOC_2D_ARRAY(size, size, Lmem, L, double);
2754
+
2755
+ // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2756
+ int row;
2757
+ int col;
2758
+ for (row = 0; row < size; row++) {
2759
+ for (col = 0; col < size; col++) {
2760
+ U[row][col] = input[row*size + col];
2761
+ L[row][col] = row == col ? 1.0 : 0.0;
2762
+ U_inv[row][col] = 0.0;
2763
+ }
2764
+ }
2765
+
2766
+ // Compute forward matrix by inversion by LU decomposition of input.
2767
+ for (col = 0; col < size; ++col) {
2768
+ // Find best pivot
2769
+ int best_row = 0;
2770
+ double best_pivot = -1.0;
2771
+ for (row = col; row < size; ++row) {
2772
+ if (Abs(U[row][col]) > best_pivot) {
2773
+ best_pivot = Abs(U[row][col]);
2774
+ best_row = row;
2775
+ }
2776
+ }
2777
+ // Exchange pivot rows.
2778
+ if (best_row != col) {
2779
+ for (int k = 0; k < size; ++k) {
2780
+ double tmp = U[best_row][k];
2781
+ U[best_row][k] = U[col][k];
2782
+ U[col][k] = tmp;
2783
+ tmp = L[best_row][k];
2784
+ L[best_row][k] = L[col][k];
2785
+ L[col][k] = tmp;
2786
+ }
2787
+ }
2788
+ // Now do the pivot itself.
2789
+ for (row = col + 1; row < size; ++row) {
2790
+ double ratio = -U[row][col] / U[col][col];
2791
+ for (int j = col; j < size; ++j) {
2792
+ U[row][j] += U[col][j] * ratio;
2793
+ }
2794
+ for (int k = 0; k < size; ++k) {
2795
+ L[row][k] += L[col][k] * ratio;
2796
+ }
2797
+ }
2798
+ }
2799
+ // Next invert U.
2800
+ for (col = 0; col < size; ++col) {
2801
+ U_inv[col][col] = 1.0 / U[col][col];
2802
+ for (row = col - 1; row >= 0; --row) {
2803
+ double total = 0.0;
2804
+ for (int k = col; k > row; --k) {
2805
+ total += U[row][k] * U_inv[k][col];
2806
+ }
2807
+ U_inv[row][col] = -total / U[row][row];
2808
+ }
2809
+ }
2810
+ // Now the answer is U_inv.L.
2811
+ for (row = 0; row < size; row++) {
2812
+ for (col = 0; col < size; col++) {
2813
+ double sum = 0.0;
2814
+ for (int k = row; k < size; ++k) {
2815
+ sum += U_inv[row][k] * L[k][col];
2816
+ }
2817
+ inv[row*size + col] = sum;
2818
+ }
2819
+ }
2820
+ // Check matrix product.
2821
+ double error_sum = 0.0;
2822
+ for (row = 0; row < size; row++) {
2823
+ for (col = 0; col < size; col++) {
2824
+ double sum = 0.0;
2825
+ for (int k = 0; k < size; ++k) {
2826
+ sum += input[row*size + k] * inv[k *size + col];
2827
+ }
2828
+ if (row != col) {
2829
+ error_sum += Abs(sum);
2830
+ }
2831
+ }
2832
+ }
2833
+ return error_sum;
2834
+ }