tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,2834 @@
1
+ /******************************************************************************
2
+ ** Filename: cluster.c
3
+ ** Purpose: Routines for clustering points in N-D space
4
+ ** Author: Dan Johnson
5
+ ** History: 5/29/89, DSJ, Created.
6
+ **
7
+ ** (c) Copyright Hewlett-Packard Company, 1988.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ ******************************************************************************/
18
+ #include "oldheap.h"
19
+ #include "const.h"
20
+ #include "cluster.h"
21
+ #include "emalloc.h"
22
+ #include "tprintf.h"
23
+ #include "danerror.h"
24
+ #include "freelist.h"
25
+ #include <math.h>
26
+
27
+ #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
28
+ #define FTABLE_X 10 // Size of FTable.
29
+ #define FTABLE_Y 100 // Size of FTable.
30
+
31
+ // Table of values approximating the cumulative F-distribution for a confidence of 1%.
32
+ double FTable[FTABLE_Y][FTABLE_X] = {
33
+ {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
34
+ {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
35
+ {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
36
+ {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
37
+ {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
38
+ {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
39
+ {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
40
+ {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
41
+ {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
42
+ {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
43
+ { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
44
+ { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
45
+ { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
46
+ { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
47
+ { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
48
+ { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
49
+ { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
50
+ { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
51
+ { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
52
+ { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
53
+ { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
54
+ { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
55
+ { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
56
+ { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
57
+ { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
58
+ { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
59
+ { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
60
+ { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
61
+ { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
62
+ { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
63
+ { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
64
+ { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
65
+ { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
66
+ { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
67
+ { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
68
+ { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
69
+ { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
70
+ { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
71
+ { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
72
+ { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
73
+ { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
74
+ { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
75
+ { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
76
+ { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
77
+ { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
78
+ { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
79
+ { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
80
+ { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
81
+ { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
82
+ { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
83
+ { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
84
+ { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
85
+ { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
86
+ { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
87
+ { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
88
+ { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
89
+ { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
90
+ { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
91
+ { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
92
+ { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
93
+ { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
94
+ { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
95
+ { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
96
+ { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
97
+ { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
98
+ { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
99
+ { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
100
+ { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
101
+ { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
102
+ { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
103
+ { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
104
+ { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
105
+ { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
106
+ { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
107
+ { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
108
+ { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
109
+ { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
110
+ { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
111
+ { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
112
+ { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
113
+ { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
114
+ { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
115
+ { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
116
+ { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
117
+ { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
118
+ { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
119
+ { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
120
+ { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
121
+ { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
122
+ { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
123
+ { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
124
+ { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
125
+ { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
126
+ { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
127
+ { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
128
+ { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
129
+ { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
130
+ { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
131
+ { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
132
+ { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
133
+ };
134
+
135
+ /* define the variance which will be used as a minimum variance for any
136
+ dimension of any feature. Since most features are calculated from numbers
137
+ with a precision no better than 1 in 128, the variance should never be
138
+ less than the square of this number for parameters whose range is 1. */
139
+ #define MINVARIANCE 0.0001
140
+
141
+ /* define the absolute minimum number of samples which must be present in
142
+ order to accurately test hypotheses about underlying probability
143
+ distributions. Define separately the minimum samples that are needed
144
+ before a statistical analysis is attempted; this number should be
145
+ equal to MINSAMPLES but can be set to a lower number for early testing
146
+ when very few samples are available. */
147
+ #define MINBUCKETS 5
148
+ #define MINSAMPLESPERBUCKET 5
149
+ #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
150
+ #define MINSAMPLESNEEDED 1
151
+
152
+ /* define the size of the table which maps normalized samples to
153
+ histogram buckets. Also define the number of standard deviations
154
+ in a normal distribution which are considered to be significant.
155
+ The mapping table will be defined in such a way that it covers
156
+ the specified number of standard deviations on either side of
157
+ the mean. BUCKETTABLESIZE should always be even. */
158
+ #define BUCKETTABLESIZE 1024
159
+ #define NORMALEXTENT 3.0
160
+
161
+ typedef struct
162
+ {
163
+ CLUSTER *Cluster;
164
+ CLUSTER *Neighbor;
165
+ }
166
+
167
+
168
+ TEMPCLUSTER;
169
+
170
+ typedef struct
171
+ {
172
+ FLOAT32 AvgVariance;
173
+ FLOAT32 *CoVariance;
174
+ FLOAT32 *Min; // largest negative distance from the mean
175
+ FLOAT32 *Max; // largest positive distance from the mean
176
+ }
177
+
178
+
179
+ STATISTICS;
180
+
181
+ typedef struct
182
+ {
183
+ DISTRIBUTION Distribution; // distribution being tested for
184
+ uinT32 SampleCount; // # of samples in histogram
185
+ FLOAT64 Confidence; // confidence level of test
186
+ FLOAT64 ChiSquared; // test threshold
187
+ uinT16 NumberOfBuckets; // number of cells in histogram
188
+ uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
189
+ uinT32 *Count; // frequency of occurrence histogram
190
+ FLOAT32 *ExpectedCount; // expected histogram
191
+ }
192
+
193
+
194
+ BUCKETS;
195
+
196
+ typedef struct
197
+ {
198
+ uinT16 DegreesOfFreedom;
199
+ FLOAT64 Alpha;
200
+ FLOAT64 ChiSquared;
201
+ }
202
+
203
+
204
+ CHISTRUCT;
205
+
206
+ typedef FLOAT64 (*DENSITYFUNC) (inT32);
207
+ typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
208
+
209
+ #define Odd(N) ((N)%2)
210
+ #define Mirror(N,R) ((R) - (N) - 1)
211
+ #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
212
+
213
+ //--------------Global Data Definitions and Declarations----------------------
214
+ /* the following variables are declared as global so that routines which
215
+ are called from the kd-tree walker can get to them. */
216
+ static HEAP *Heap;
217
+ static TEMPCLUSTER *TempCluster;
218
+ static KDTREE *Tree;
219
+ static inT32 CurrentTemp;
220
+
221
+ /* the following variables describe a discrete normal distribution
222
+ which is used by NormalDensity() and NormalBucket(). The
223
+ constant NORMALEXTENT determines how many standard
224
+ deviations of the distribution are mapped onto the fixed
225
+ discrete range of x. x=0 is mapped to -NORMALEXTENT standard
226
+ deviations and x=BUCKETTABLESIZE is mapped to
227
+ +NORMALEXTENT standard deviations. */
228
+ #define SqrtOf2Pi 2.506628275
229
+ static FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
230
+ static FLOAT64 NormalVariance =
231
+ (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
232
+ static FLOAT64 NormalMagnitude =
233
+ (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
234
+ static FLOAT64 NormalMean = BUCKETTABLESIZE / 2;
235
+
236
+ // keep a list of histogram buckets to minimize recomputing them
237
+ static LIST OldBuckets[] = { NIL, NIL, NIL };
238
+
239
+ /* define lookup tables used to compute the number of histogram buckets
240
+ that should be used for a given number of samples. */
241
+ #define LOOKUPTABLESIZE 8
242
+ #define MAXBUCKETS 39
243
+ #define MAXDEGREESOFFREEDOM MAXBUCKETS
244
+
245
+ static uinT32 CountTable[LOOKUPTABLESIZE] = {
246
+ MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
247
+ };
248
+ static uinT16 BucketsTable[LOOKUPTABLESIZE] = {
249
+ MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
250
+ };
251
+
252
+ /*-------------------------------------------------------------------------
253
+ Private Function Prototypes
254
+ --------------------------------------------------------------------------*/
255
+ void CreateClusterTree(CLUSTERER *Clusterer);
256
+
257
+ void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level);
258
+
259
+ CLUSTER *FindNearestNeighbor(KDTREE *Tree,
260
+ CLUSTER *Cluster,
261
+ FLOAT32 *Distance);
262
+
263
+ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
264
+
265
+ inT32 MergeClusters (inT16 N,
266
+ register PARAM_DESC ParamDesc[],
267
+ register inT32 n1,
268
+ register inT32 n2,
269
+ register FLOAT32 m[],
270
+ register FLOAT32 m1[], register FLOAT32 m2[]);
271
+
272
+ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
273
+
274
+ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
275
+ CLUSTERCONFIG *Config,
276
+ CLUSTER *Cluster);
277
+
278
+ PROTOTYPE *MakeDegenerateProto(uinT16 N,
279
+ CLUSTER *Cluster,
280
+ STATISTICS *Statistics,
281
+ PROTOSTYLE Style,
282
+ inT32 MinSamples);
283
+
284
+ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
285
+ CLUSTERCONFIG *Config,
286
+ CLUSTER *Cluster,
287
+ STATISTICS *Statistics);
288
+
289
+ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
290
+ CLUSTER *Cluster,
291
+ STATISTICS *Statistics,
292
+ BUCKETS *Buckets);
293
+
294
+ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
295
+ CLUSTER *Cluster,
296
+ STATISTICS *Statistics,
297
+ BUCKETS *Buckets);
298
+
299
+ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
300
+ CLUSTER *Cluster,
301
+ STATISTICS *Statistics,
302
+ BUCKETS *NormalBuckets,
303
+ FLOAT64 Confidence);
304
+
305
+ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
306
+
307
+ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
308
+
309
+ STATISTICS *ComputeStatistics (inT16 N,
310
+ PARAM_DESC ParamDesc[], CLUSTER * Cluster);
311
+
312
+ PROTOTYPE *NewSphericalProto(uinT16 N,
313
+ CLUSTER *Cluster,
314
+ STATISTICS *Statistics);
315
+
316
+ PROTOTYPE *NewEllipticalProto(inT16 N,
317
+ CLUSTER *Cluster,
318
+ STATISTICS *Statistics);
319
+
320
+ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
321
+
322
+ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
323
+
324
+ BOOL8 Independent (PARAM_DESC ParamDesc[],
325
+ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
326
+
327
+ BUCKETS *GetBuckets(DISTRIBUTION Distribution,
328
+ uinT32 SampleCount,
329
+ FLOAT64 Confidence);
330
+
331
+ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
332
+ uinT32 SampleCount,
333
+ FLOAT64 Confidence);
334
+
335
+ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
336
+
337
+ FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
338
+
339
+ FLOAT64 NormalDensity(inT32 x);
340
+
341
+ FLOAT64 UniformDensity(inT32 x);
342
+
343
+ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
344
+
345
+ void FillBuckets(BUCKETS *Buckets,
346
+ CLUSTER *Cluster,
347
+ uinT16 Dim,
348
+ PARAM_DESC *ParamDesc,
349
+ FLOAT32 Mean,
350
+ FLOAT32 StdDev);
351
+
352
+ uinT16 NormalBucket(PARAM_DESC *ParamDesc,
353
+ FLOAT32 x,
354
+ FLOAT32 Mean,
355
+ FLOAT32 StdDev);
356
+
357
+ uinT16 UniformBucket(PARAM_DESC *ParamDesc,
358
+ FLOAT32 x,
359
+ FLOAT32 Mean,
360
+ FLOAT32 StdDev);
361
+
362
+ BOOL8 DistributionOK(BUCKETS *Buckets);
363
+
364
+ void FreeStatistics(STATISTICS *Statistics);
365
+
366
+ void FreeBuckets(BUCKETS *Buckets);
367
+
368
+ void FreeCluster(CLUSTER *Cluster);
369
+
370
+ uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
371
+
372
+ int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
373
+ void *arg2); //uinT16 *DesiredNumberOfBuckets);
374
+
375
+ int ListEntryMatch(void *arg1, void *arg2);
376
+
377
+ void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
378
+
379
+ void InitBuckets(BUCKETS *Buckets);
380
+
381
+ int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
382
+ void *arg2); //CHISTRUCT *SearchKey);
383
+
384
+ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
385
+
386
+ FLOAT64 Solve(SOLVEFUNC Function,
387
+ void *FunctionParams,
388
+ FLOAT64 InitialGuess,
389
+ FLOAT64 Accuracy);
390
+
391
+ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
392
+
393
+ BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
394
+ CLUSTER *Cluster,
395
+ FLOAT32 MaxIllegal);
396
+
397
+ double InvertMatrix(const float* input, int size, float* inv);
398
+
399
+ //--------------------------Public Code--------------------------------------
400
+ /** MakeClusterer **********************************************************
401
+ Parameters: SampleSize number of dimensions in feature space
402
+ ParamDesc description of each dimension (SampleSize entries are read)
403
+ Globals: None
404
+ Operation: This routine creates a new clusterer data structure,
405
+ initializes it, and returns a pointer to it. The param descriptors are copied; Range, HalfRange and MidRange are derived from Min and Max.
406
+ Return: pointer to the new clusterer data structure (allocated with Emalloc)
407
+ Exceptions: None
408
+ History: 5/29/89, DSJ, Created.
409
+ ****************************************************************************/
410
+ CLUSTERER *
411
+ MakeClusterer (inT16 SampleSize, PARAM_DESC ParamDesc[]) {
412
+ CLUSTERER *Clusterer;
413
+ int i;
414
+
415
+ // allocate main clusterer data structure and init simple fields
416
+ Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
417
+ Clusterer->SampleSize = SampleSize;
418
+ Clusterer->NumberOfSamples = 0;
419
+ Clusterer->NumChar = 0;
420
+
421
+ // init fields which will not be used initially (no cluster tree root and no prototype list yet)
422
+ Clusterer->Root = NULL;
423
+ Clusterer->ProtoList = NIL;
424
+
425
+ // maintain a copy of param descriptors in the clusterer data structure; Range, HalfRange and MidRange are recomputed from Min/Max rather than copied
426
+ Clusterer->ParamDesc =
427
+ (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
428
+ for (i = 0; i < SampleSize; i++) {
429
+ Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
430
+ Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
431
+ Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
432
+ Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
433
+ Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
434
+ Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
435
+ Clusterer->ParamDesc[i].MidRange =
436
+ (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
437
+ }
438
+
439
+ // allocate a kd tree to hold the samples (built from the caller's ParamDesc, not from the copy made above)
440
+ Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
441
+
442
+ // execute hook for monitoring clustering operation (the hook call below is disabled)
443
+ // (*ClustererCreationHook)( Clusterer );
444
+
445
+ return (Clusterer);
446
+ } // MakeClusterer
447
+
448
+
449
+ /** MakeSample ***********************************************************
450
+ Parameters: Clusterer clusterer data structure to add sample to
451
+ Feature feature to be added to clusterer (Clusterer->SampleSize values are read)
452
+ CharID unique ident. of char that sample came from
453
+ Globals: None
454
+ Operation: This routine creates a new sample data structure to hold
455
+ the specified feature. This sample is added to the clusterer
456
+ data structure (so that it knows which samples are to be
457
+ clustered later), and a pointer to the sample is returned to
458
+ the caller. Clusterer->NumChar is raised to CharID + 1 whenever CharID is not below it.
459
+ Return: Pointer to the new sample data structure
460
+ Exceptions: ALREADYCLUSTERED MakeSample can't be called after
461
+ ClusterSamples has been called
462
+ History: 5/29/89, DSJ, Created.
463
+ *****************************************************************************/
464
+ SAMPLE *
465
+ MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
466
+ SAMPLE *Sample;
467
+ int i;
468
+
469
+ // see if the samples have already been clustered (Root is set) - if so trap an error
470
+ if (Clusterer->Root != NULL)
471
+ DoError (ALREADYCLUSTERED,
472
+ "Can't add samples after they have been clustered");
473
+
474
+ // allocate the new sample and initialize it; the extra (SampleSize - 1) floats presumably extend a one-element Mean array at the end of SAMPLE - TODO confirm in cluster.h
475
+ Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
476
+ (Clusterer->SampleSize -
477
+ 1) * sizeof (FLOAT32));
478
+ Sample->Clustered = FALSE;
479
+ Sample->Prototype = FALSE;
480
+ Sample->SampleCount = 1;
481
+ Sample->Left = NULL;
482
+ Sample->Right = NULL;
483
+ Sample->CharID = CharID;
484
+
485
+ for (i = 0; i < Clusterer->SampleSize; i++)
486
+ Sample->Mean[i] = Feature[i];
487
+
488
+ // add the sample to the KD tree - keep track of the total # of samples and of the highest CharID seen so far
489
+ Clusterer->NumberOfSamples++;
490
+ KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
491
+ if (CharID >= Clusterer->NumChar)
492
+ Clusterer->NumChar = CharID + 1;
493
+
494
+ // execute hook for monitoring clustering operation (the hook call below is disabled)
495
+ // (*SampleCreationHook)( Sample );
496
+
497
+ return (Sample);
498
+ } // MakeSample
499
+
500
+
501
+ /** ClusterSamples ***********************************************************
502
+ Parameters: Clusterer data struct containing samples to be clustered
503
+ Config parameters which control clustering process
504
+ Globals: None
505
+ Operation: This routine first checks to see if the samples in this
506
+ clusterer have already been clustered before; if so, it does
507
+ not bother to recreate the cluster tree. It simply recomputes
508
+ the prototypes based on the new Config info.
509
+ If the samples have not been clustered before, the
510
+ samples in the KD tree are formed into a cluster tree and then
511
+ the prototypes are computed from the cluster tree.
512
+ In either case this routine returns a pointer to a
513
+ list of prototypes that best represent the samples given
514
+ the constraints specified in Config.
515
+ Return: Pointer to a list of prototypes (the same list stored in Clusterer->ProtoList)
516
+ Exceptions: None
517
+ History: 5/29/89, DSJ, Created.
518
+ *******************************************************************************/
519
+ LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
520
+ //only create cluster tree if samples have never been clustered before (Root is still NULL)
521
+ if (Clusterer->Root == NULL)
522
+ CreateClusterTree(Clusterer);
523
+
524
+ //deallocate the old prototype list if one exists, then reset the list head to NIL
525
+ FreeProtoList (&Clusterer->ProtoList);
526
+ Clusterer->ProtoList = NIL;
527
+
528
+ //compute prototypes starting at the root node in the tree
529
+ ComputePrototypes(Clusterer, Config);
530
+ return (Clusterer->ProtoList);
531
+ } // ClusterSamples
532
+
533
+
534
+ /** FreeClusterer *************************************************************
535
+ Parameters: Clusterer pointer to data structure to be freed (NULL is ignored)
536
+ Globals: None
537
+ Operation: This routine frees all of the memory allocated to the
538
+ specified data structure. It will not, however, free
539
+ the memory used by the prototype list. The pointers to
540
+ the clusters for each prototype in the list will be set
541
+ to NULL to indicate that the cluster data structures no
542
+ longer exist. Any sample lists that have been obtained
543
+ via calls to GetSamples are no longer valid.
544
+ Return: None
545
+ Exceptions: None
546
+ History: 6/6/89, DSJ, Created.
547
+ *******************************************************************************/
548
+ void FreeClusterer(CLUSTERER *Clusterer) {
549
+ if (Clusterer != NULL) {
550
+ memfree (Clusterer->ParamDesc);
551
+ if (Clusterer->KDTree != NULL)
552
+ FreeKDTree (Clusterer->KDTree);
553
+ if (Clusterer->Root != NULL)
554
+ FreeCluster (Clusterer->Root);
555
+ iterate (Clusterer->ProtoList) { // clear each prototype's Cluster pointer; the prototypes themselves are kept
556
+ ((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
557
+ }
558
+ memfree(Clusterer);
559
+ }
560
+ } // FreeClusterer
561
+
562
+
563
+ /** FreeProtoList ************************************************************
564
+ Parameters: ProtoList pointer to list of prototypes to be freed
565
+ Globals: None
566
+ Operation: This routine frees all of the memory allocated to the
567
+ specified list of prototypes. The clusters which are
568
+ pointed to by the prototypes are not freed. NOTE(review): *ProtoList does not appear to be reset here (ClusterSamples sets it to NIL itself) - confirm destroy_nodes semantics.
569
+ Return: None
570
+ Exceptions: None
571
+ History: 6/6/89, DSJ, Created.
572
+ *****************************************************************************/
573
+ void FreeProtoList(LIST *ProtoList) {
574
+ destroy_nodes(*ProtoList, FreePrototype);
575
+ } // FreeProtoList
576
+
577
+
578
+ /** FreePrototype ************************************************************
579
+ Parameters: Prototype prototype data structure to be deallocated (passed as void * and cast back to PROTOTYPE *)
580
+ Globals: None
581
+ Operation: This routine deallocates the memory consumed by the specified
582
+ prototype and modifies the corresponding cluster so that it
583
+ is no longer marked as a prototype. The cluster is NOT
584
+ deallocated by this routine.
585
+ Return: None
586
+ Exceptions: None
587
+ History: 5/30/89, DSJ, Created.
588
+ *******************************************************************************/
589
+ void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
590
+ PROTOTYPE *Prototype = (PROTOTYPE *) arg;
591
+
592
+ // unmark the corresponding cluster (if there is one)
593
+ if (Prototype->Cluster != NULL)
594
+ Prototype->Cluster->Prototype = FALSE;
595
+
596
+ // deallocate the prototype statistics and then the prototype itself; the Elliptical arrays are freed only when Style is not spherical
597
+ if (Prototype->Distrib != NULL)
598
+ memfree (Prototype->Distrib);
599
+ if (Prototype->Mean != NULL)
600
+ memfree (Prototype->Mean);
601
+ if (Prototype->Style != spherical) {
602
+ if (Prototype->Variance.Elliptical != NULL)
603
+ memfree (Prototype->Variance.Elliptical);
604
+ if (Prototype->Magnitude.Elliptical != NULL)
605
+ memfree (Prototype->Magnitude.Elliptical);
606
+ if (Prototype->Weight.Elliptical != NULL)
607
+ memfree (Prototype->Weight.Elliptical);
608
+ }
609
+ memfree(Prototype);
610
+ } // FreePrototype
611
+
612
+
613
+ /** NextSample ************************************************************
614
+ Parameters: SearchState ptr to list containing clusters to be searched
615
+ Globals: None
616
+ Operation: This routine is used to find all of the samples which
617
+ belong to a cluster. It starts by removing the top
618
+ cluster on the cluster list (SearchState). If this cluster is
619
+ a leaf it is returned. Otherwise, the right subcluster
620
+ is pushed on the list and we continue the search in the
621
+ left subcluster. This continues until a leaf is found.
622
+ If all samples have been found, NULL is returned.
623
+ InitSampleSearch() must be called
624
+ before NextSample() to initialize the search.
625
+ Return: Pointer to the next leaf cluster (sample) or NULL.
626
+ Exceptions: None
627
+ History: 6/16/89, DSJ, Created.
628
+ ****************************************************************************/
629
+ CLUSTER *NextSample(LIST *SearchState) {
630
+ CLUSTER *Cluster;
631
+
632
+ if (*SearchState == NIL)
633
+ return (NULL);
634
+ Cluster = (CLUSTER *) first_node (*SearchState);
635
+ *SearchState = pop (*SearchState);
636
+ while (TRUE) {
637
+ if (Cluster->Left == NULL)
638
+ return (Cluster);
639
+ *SearchState = push (*SearchState, Cluster->Right);
640
+ Cluster = Cluster->Left;
641
+ }
642
+ } // NextSample
643
+
644
+
645
+ /** Mean ***********************************************************
646
+ Parameters: Proto prototype to return mean of
647
+ Dimension dimension whose mean is to be returned
648
+ Globals: none
649
+ Operation: This routine returns the mean of the specified
650
+ prototype in the indicated dimension.
651
+ Return: Mean of Prototype in Dimension
652
+ Exceptions: none
653
+ History: 7/6/89, DSJ, Created.
654
+ *********************************************************************/
655
+ FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
656
+ return (Proto->Mean[Dimension]);
657
+ } // Mean
658
+
659
+
660
+ /** StandardDeviation *************************************************
661
+ Parameters: Proto prototype to return standard deviation of
662
+ Dimension dimension whose stddev is to be returned
663
+ Globals: none
664
+ Operation: This routine returns the standard deviation of the
665
+ prototype in the indicated dimension.
666
+ Return: Standard deviation of Prototype in Dimension
667
+ Exceptions: none
668
+ History: 7/6/89, DSJ, Created.
669
+ **********************************************************************/
670
+ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
671
+ switch (Proto->Style) {
672
+ case spherical:
673
+ return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
674
+ case elliptical:
675
+ return ((FLOAT32)
676
+ sqrt ((double) Proto->Variance.Elliptical[Dimension]));
677
+ case mixed:
678
+ switch (Proto->Distrib[Dimension]) {
679
+ case normal:
680
+ return ((FLOAT32)
681
+ sqrt ((double) Proto->Variance.Elliptical[Dimension]));
682
+ case uniform:
683
+ case D_random:
684
+ return (Proto->Variance.Elliptical[Dimension]);
685
+ }
686
+ }
687
+ return 0.0f;
688
+ } // StandardDeviation
689
+
690
+
691
+ /*---------------------------------------------------------------------------
692
+ Private Code
693
+ ----------------------------------------------------------------------------*/
694
+ /** CreateClusterTree *******************************************************
695
+ Parameters: Clusterer data structure holdings samples to be clustered
696
+ Globals: Tree kd-tree holding samples
697
+ TempCluster array of temporary clusters
698
+ CurrentTemp index of next temp cluster to be used
699
+ Heap heap used to hold temp clusters - "best" on top
700
+ Operation: This routine performs a bottoms-up clustering on the samples
701
+ held in the kd-tree of the Clusterer data structure. The
702
+ result is a cluster tree. Each node in the tree represents
703
+ a cluster which conceptually contains a subset of the samples.
704
+ More precisely, the cluster contains all of the samples which
705
+ are contained in its two sub-clusters. The leaves of the
706
+ tree are the individual samples themselves; they have no
707
+ sub-clusters. The root node of the tree conceptually contains
708
+ all of the samples.
709
+ Return: None (the Clusterer data structure is changed)
710
+ Exceptions: None
711
+ History: 5/29/89, DSJ, Created.
712
+ ******************************************************************************/
713
+ void CreateClusterTree(CLUSTERER *Clusterer) {
714
+ HEAPENTRY HeapEntry;
715
+ TEMPCLUSTER *PotentialCluster;
716
+
717
+ // save the kd-tree in a global variable so kd-tree walker can get at it
718
+ Tree = Clusterer->KDTree;
719
+
720
+ // allocate memory to to hold all of the "potential" clusters
721
+ TempCluster = (TEMPCLUSTER *)
722
+ Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER));
723
+ CurrentTemp = 0;
724
+
725
+ // each sample and its nearest neighbor form a "potential" cluster
726
+ // save these in a heap with the "best" potential clusters on top
727
+ Heap = MakeHeap (Clusterer->NumberOfSamples);
728
+ KDWalk (Tree, (void_proc) MakePotentialClusters);
729
+
730
+ // form potential clusters into actual clusters - always do "best" first
731
+ while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) {
732
+ PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data);
733
+
734
+ // if main cluster of potential cluster is already in another cluster
735
+ // then we don't need to worry about it
736
+ if (PotentialCluster->Cluster->Clustered) {
737
+ continue;
738
+ }
739
+
740
+ // if main cluster is not yet clustered, but its nearest neighbor is
741
+ // then we must find a new nearest neighbor
742
+ else if (PotentialCluster->Neighbor->Clustered) {
743
+ PotentialCluster->Neighbor =
744
+ FindNearestNeighbor (Tree, PotentialCluster->Cluster,
745
+ &(HeapEntry.Key));
746
+ if (PotentialCluster->Neighbor != NULL) {
747
+ HeapStore(Heap, &HeapEntry);
748
+ }
749
+ }
750
+
751
+ // if neither cluster is already clustered, form permanent cluster
752
+ else {
753
+ PotentialCluster->Cluster =
754
+ MakeNewCluster(Clusterer, PotentialCluster);
755
+ PotentialCluster->Neighbor =
756
+ FindNearestNeighbor (Tree, PotentialCluster->Cluster,
757
+ &(HeapEntry.Key));
758
+ if (PotentialCluster->Neighbor != NULL) {
759
+ HeapStore(Heap, &HeapEntry);
760
+ }
761
+ }
762
+ }
763
+
764
+ // the root node in the cluster tree is now the only node in the kd-tree
765
+ Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree);
766
+
767
+ // free up the memory used by the K-D tree, heap, and temp clusters
768
+ FreeKDTree(Tree);
769
+ Clusterer->KDTree = NULL;
770
+ FreeHeap(Heap);
771
+ memfree(TempCluster);
772
+ } // CreateClusterTree
773
+
774
+
775
+ /** MakePotentialClusters **************************************************
776
+ Parameters: Cluster current cluster being visited in kd-tree walk
777
+ Order order in which cluster is being visited
778
+ Level level of this cluster in the kd-tree
779
+ Globals: Tree kd-tree to be searched for neighbors
780
+ TempCluster array of temporary clusters
781
+ CurrentTemp index of next temp cluster to be used
782
+ Heap heap used to hold temp clusters - "best" on top
783
+ Operation: This routine is designed to be used in concert with the
784
+ KDWalk routine. It will create a potential cluster for
785
+ each sample in the kd-tree that is being walked. This
786
+ potential cluster will then be pushed on the heap.
787
+ Return: none
788
+ Exceptions: none
789
+ History: 5/29/89, DSJ, Created.
790
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct.
791
+ ******************************************************************************/
792
+ void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, inT32 Level) {
793
+ HEAPENTRY HeapEntry;
794
+
795
+ if ((Order == preorder) || (Order == leaf)) {
796
+ TempCluster[CurrentTemp].Cluster = Cluster;
797
+ HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]);
798
+ TempCluster[CurrentTemp].Neighbor =
799
+ FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster,
800
+ &(HeapEntry.Key));
801
+ if (TempCluster[CurrentTemp].Neighbor != NULL) {
802
+ HeapStore(Heap, &HeapEntry);
803
+ CurrentTemp++;
804
+ }
805
+ }
806
+ } // MakePotentialClusters
807
+
808
+
809
+ /** FindNearestNeighbor *********************************************************
810
+ Parameters: Tree kd-tree to search in for nearest neighbor
811
+ Cluster cluster whose nearest neighbor is to be found
812
+ Distance ptr to variable to report distance found
813
+ Globals: none
814
+ Operation: This routine searches the specified kd-tree for the nearest
815
+ neighbor of the specified cluster. It actually uses the
816
+ kd routines to find the 2 nearest neighbors since one of them
817
+ will be the original cluster. A pointer to the nearest
818
+ neighbor is returned, if it can be found, otherwise NULL is
819
+ returned. The distance between the 2 nodes is placed
820
+ in the specified variable.
821
+ Return: Pointer to the nearest neighbor of Cluster, or NULL
822
+ Exceptions: none
823
+ History: 5/29/89, DSJ, Created.
824
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct
825
+ ********************************************************************************/
826
+ CLUSTER *
827
+ FindNearestNeighbor (KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
828
+ #define MAXNEIGHBORS 2
829
+ #define MAXDISTANCE MAX_FLOAT32
830
+ {
831
+ CLUSTER *Neighbor[MAXNEIGHBORS];
832
+ FLOAT32 Dist[MAXNEIGHBORS];
833
+ inT32 NumberOfNeighbors;
834
+ inT32 i;
835
+ CLUSTER *BestNeighbor;
836
+
837
+ // find the 2 nearest neighbors of the cluster
838
+ NumberOfNeighbors = KDNearestNeighborSearch
839
+ (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist);
840
+
841
+ // search for the nearest neighbor that is not the cluster itself
842
+ *Distance = MAXDISTANCE;
843
+ BestNeighbor = NULL;
844
+ for (i = 0; i < NumberOfNeighbors; i++) {
845
+ if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
846
+ *Distance = Dist[i];
847
+ BestNeighbor = Neighbor[i];
848
+ }
849
+ }
850
+ return (BestNeighbor);
851
+ } // FindNearestNeighbor
852
+
853
+
854
+ /** MakeNewCluster *************************************************************
855
+ Parameters: Clusterer current clustering environment
856
+ TempCluster potential cluster to make permanent
857
+ Globals: none
858
+ Operation: This routine creates a new permanent cluster from the
859
+ clusters specified in TempCluster. The 2 clusters in
860
+ TempCluster are marked as "clustered" and deleted from
861
+ the kd-tree. The new cluster is then added to the kd-tree.
862
+ Return: Pointer to the new permanent cluster
863
+ Exceptions: none
864
+ History: 5/29/89, DSJ, Created.
865
+ 7/13/89, DSJ, Removed visibility of kd-tree node data struct
866
+ ********************************************************************************/
867
+ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
868
+ CLUSTER *Cluster;
869
+
870
+ // allocate the new cluster and initialize it
871
+ Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) +
872
+ (Clusterer->SampleSize -
873
+ 1) * sizeof (FLOAT32));
874
+ Cluster->Clustered = FALSE;
875
+ Cluster->Prototype = FALSE;
876
+ Cluster->Left = TempCluster->Cluster;
877
+ Cluster->Right = TempCluster->Neighbor;
878
+ Cluster->CharID = -1;
879
+
880
+ // mark the old clusters as "clustered" and delete them from the kd-tree
881
+ Cluster->Left->Clustered = TRUE;
882
+ Cluster->Right->Clustered = TRUE;
883
+ KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
884
+ KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
885
+
886
+ // compute the mean and sample count for the new cluster
887
+ Cluster->SampleCount =
888
+ MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc,
889
+ Cluster->Left->SampleCount, Cluster->Right->SampleCount,
890
+ Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
891
+
892
+ // add the new cluster to the KD tree
893
+ KDStore (Clusterer->KDTree, Cluster->Mean, Cluster);
894
+ return (Cluster);
895
+ } // MakeNewCluster
896
+
897
+
898
+ /** MergeClusters ************************************************************
899
+ Parameters: N # of dimensions (size of arrays)
900
+ ParamDesc array of dimension descriptions
901
+ n1, n2 number of samples in each old cluster
902
+ m array to hold mean of new cluster
903
+ m1, m2 arrays containing means of old clusters
904
+ Globals: None
905
+ Operation: This routine merges two clusters into one larger cluster.
906
+ To do this it computes the number of samples in the new
907
+ cluster and the mean of the new cluster. The ParamDesc
908
+ information is used to ensure that circular dimensions
909
+ are handled correctly.
910
+ Return: The number of samples in the new cluster.
911
+ Exceptions: None
912
+ History: 5/31/89, DSJ, Created.
913
+ *********************************************************************************/
914
+ inT32
915
+ MergeClusters (inT16 N,
916
+ register PARAM_DESC ParamDesc[],
917
+ register inT32 n1,
918
+ register inT32 n2,
919
+ register FLOAT32 m[],
920
+ register FLOAT32 m1[], register FLOAT32 m2[]) {
921
+ register inT32 i, n;
922
+
923
+ n = n1 + n2;
924
+ for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
925
+ if (ParamDesc->Circular) {
926
+ // if distance between means is greater than allowed
927
+ // reduce upper point by one "rotation" to compute mean
928
+ // then normalize the mean back into the accepted range
929
+ if ((*m2 - *m1) > ParamDesc->HalfRange) {
930
+ *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
931
+ if (*m < ParamDesc->Min)
932
+ *m += ParamDesc->Range;
933
+ }
934
+ else if ((*m1 - *m2) > ParamDesc->HalfRange) {
935
+ *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
936
+ if (*m < ParamDesc->Min)
937
+ *m += ParamDesc->Range;
938
+ }
939
+ else
940
+ *m = (n1 * *m1 + n2 * *m2) / n;
941
+ }
942
+ else
943
+ *m = (n1 * *m1 + n2 * *m2) / n;
944
+ }
945
+ return (n);
946
+ } // MergeClusters
947
+
948
+
949
+ /** ComputePrototypes *******************************************************
950
+ Parameters: Clusterer data structure holding cluster tree
951
+ Config parameters used to control prototype generation
952
+ Globals: None
953
+ Operation: This routine decides which clusters in the cluster tree
954
+ should be represented by prototypes, forms a list of these
955
+ prototypes, and places the list in the Clusterer data
956
+ structure.
957
+ Return: None
958
+ Exceptions: None
959
+ History: 5/30/89, DSJ, Created.
960
+ *******************************************************************************/
961
+ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
962
+ LIST ClusterStack = NIL;
963
+ CLUSTER *Cluster;
964
+ PROTOTYPE *Prototype;
965
+
966
+ // use a stack to keep track of clusters waiting to be processed
967
+ // initially the only cluster on the stack is the root cluster
968
+ if (Clusterer->Root != NULL)
969
+ ClusterStack = push (NIL, Clusterer->Root);
970
+
971
+ // loop until we have analyzed all clusters which are potential prototypes
972
+ while (ClusterStack != NIL) {
973
+ // remove the next cluster to be analyzed from the stack
974
+ // try to make a prototype from the cluster
975
+ // if successful, put it on the proto list, else split the cluster
976
+ Cluster = (CLUSTER *) first_node (ClusterStack);
977
+ ClusterStack = pop (ClusterStack);
978
+ Prototype = MakePrototype (Clusterer, Config, Cluster);
979
+ if (Prototype != NULL) {
980
+ Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
981
+ }
982
+ else {
983
+ ClusterStack = push (ClusterStack, Cluster->Right);
984
+ ClusterStack = push (ClusterStack, Cluster->Left);
985
+ }
986
+ }
987
+ } // ComputePrototypes
988
+
989
+
990
+ /** MakePrototype ***********************************************************
991
+ Parameters: Clusterer data structure holding cluster tree
992
+ Config parameters used to control prototype generation
993
+ Cluster cluster to be made into a prototype
994
+ Globals: None
995
+ Operation: This routine attempts to create a prototype from the
996
+ specified cluster that conforms to the distribution
997
+ specified in Config. If there are too few samples in the
998
+ cluster to perform a statistical analysis, then a prototype
999
+ is generated but labelled as insignificant. If the
1000
+ dimensions of the cluster are not independent, no prototype
1001
+ is generated and NULL is returned. If a prototype can be
1002
+ found that matches the desired distribution then a pointer
1003
+ to it is returned, otherwise NULL is returned.
1004
+ Return: Pointer to new prototype or NULL
1005
+ Exceptions: None
1006
+ History: 6/19/89, DSJ, Created.
1007
+ *******************************************************************************/
1008
+ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
1009
+ CLUSTERCONFIG *Config,
1010
+ CLUSTER *Cluster) {
1011
+ STATISTICS *Statistics;
1012
+ PROTOTYPE *Proto;
1013
+ BUCKETS *Buckets;
1014
+
1015
+ // filter out clusters which contain samples from the same character
1016
+ if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
1017
+ return (NULL);
1018
+
1019
+ // compute the covariance matrix and ranges for the cluster
1020
+ Statistics =
1021
+ ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1022
+
1023
+ // check for degenerate clusters which need not be analyzed further
1024
+ // note that the MinSamples test assumes that all clusters with multiple
1025
+ // character samples have been removed (as above)
1026
+ Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,
1027
+ Config->ProtoStyle,
1028
+ (inT32) (Config->MinSamples *
1029
+ Clusterer->NumChar));
1030
+ if (Proto != NULL) {
1031
+ FreeStatistics(Statistics);
1032
+ return (Proto);
1033
+ }
1034
+ // check to ensure that all dimensions are independent
1035
+ if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,
1036
+ Statistics->CoVariance, Config->Independence)) {
1037
+ FreeStatistics(Statistics);
1038
+ return (NULL);
1039
+ }
1040
+
1041
+ if (HOTELLING && Config->ProtoStyle == elliptical) {
1042
+ Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1043
+ if (Proto != NULL) {
1044
+ FreeStatistics(Statistics);
1045
+ return Proto;
1046
+ }
1047
+ }
1048
+
1049
+ // create a histogram data structure used to evaluate distributions
1050
+ Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);
1051
+
1052
+ // create a prototype based on the statistics and test it
1053
+ switch (Config->ProtoStyle) {
1054
+ case spherical:
1055
+ Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1056
+ break;
1057
+ case elliptical:
1058
+ Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1059
+ break;
1060
+ case mixed:
1061
+ Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1062
+ Config->Confidence);
1063
+ break;
1064
+ case automatic:
1065
+ Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
1066
+ if (Proto != NULL)
1067
+ break;
1068
+ Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
1069
+ if (Proto != NULL)
1070
+ break;
1071
+ Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
1072
+ Config->Confidence);
1073
+ break;
1074
+ }
1075
+ FreeBuckets(Buckets);
1076
+ FreeStatistics(Statistics);
1077
+ return (Proto);
1078
+ } // MakePrototype
1079
+
1080
+
1081
+ /** MakeDegenerateProto ******************************************************
1082
+ Parameters: N number of dimensions
1083
+ Cluster cluster being analyzed
1084
+ Statistics statistical info about cluster
1085
+ Style type of prototype to be generated
1086
+ MinSamples minimum number of samples in a cluster
1087
+ Globals: None
1088
+ Operation: This routine checks for clusters which are degenerate and
1089
+ therefore cannot be analyzed in a statistically valid way.
1090
+ A cluster is defined as degenerate if it does not have at
1091
+ least MINSAMPLESNEEDED samples in it. If the cluster is
1092
+ found to be degenerate, a prototype of the specified style
1093
+ is generated and marked as insignificant. A cluster is
1094
+ also degenerate if it does not have at least MinSamples
1095
+ samples in it.
1096
+ If the cluster is not degenerate, NULL is returned.
1097
+ Return: Pointer to degenerate prototype or NULL.
1098
+ Exceptions: None
1099
+ History: 6/20/89, DSJ, Created.
1100
+ 7/12/89, DSJ, Changed name and added check for 0 stddev.
1101
+ 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).
1102
+ ********************************************************************************/
1103
+ PROTOTYPE *MakeDegenerateProto( //this was MinSample
1104
+ uinT16 N,
1105
+ CLUSTER *Cluster,
1106
+ STATISTICS *Statistics,
1107
+ PROTOSTYLE Style,
1108
+ inT32 MinSamples) {
1109
+ PROTOTYPE *Proto = NULL;
1110
+
1111
+ if (MinSamples < MINSAMPLESNEEDED)
1112
+ MinSamples = MINSAMPLESNEEDED;
1113
+
1114
+ if (Cluster->SampleCount < MinSamples) {
1115
+ switch (Style) {
1116
+ case spherical:
1117
+ Proto = NewSphericalProto (N, Cluster, Statistics);
1118
+ break;
1119
+ case elliptical:
1120
+ case automatic:
1121
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
1122
+ break;
1123
+ case mixed:
1124
+ Proto = NewMixedProto (N, Cluster, Statistics);
1125
+ break;
1126
+ }
1127
+ Proto->Significant = FALSE;
1128
+ }
1129
+ return (Proto);
1130
+ } // MakeDegenerateProto
1131
+
1132
+ /** TestEllipticalProto ****************************************************
1133
+ Parameters: Clusterer data struct containing samples being clustered
1134
+ Config provides the magic number of samples that make a good cluster
1135
+ Cluster cluster to be made into an elliptical prototype
1136
+ Statistics statistical info about cluster
1137
+ Globals: None
1138
+ Operation: This routine tests the specified cluster to see if **
1139
+ * there is a statistically significant difference between
1140
+ * the sub-clusters that would be made if the cluster were to
1141
+ * be split. If not, then a new prototype is formed and
1142
+ * returned to the caller. If there is, then NULL is returned
1143
+ * to the caller.
1144
+ Return: Pointer to new elliptical prototype or NULL.
1145
+ ****************************************************************************/
1146
+ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
1147
+ CLUSTERCONFIG *Config,
1148
+ CLUSTER *Cluster,
1149
+ STATISTICS *Statistics) {
1150
+ // Fraction of the number of samples used as a range around 1 within
1151
+ // which a cluster has the magic size that allows a boost to the
1152
+ // FTable by kFTableBoostMargin, thus allowing clusters near the
1153
+ // magic size (equal to the number of sample characters) to be more
1154
+ // likely to stay together.
1155
+ const double kMagicSampleMargin = 0.0625;
1156
+ const double kFTableBoostMargin = 2.0;
1157
+
1158
+ int N = Clusterer->SampleSize;
1159
+ CLUSTER* Left = Cluster->Left;
1160
+ CLUSTER* Right = Cluster->Right;
1161
+ if (Left == NULL || Right == NULL)
1162
+ return NULL;
1163
+ int TotalDims = Left->SampleCount + Right->SampleCount;
1164
+ if (TotalDims < N + 1 || TotalDims < 2)
1165
+ return NULL;
1166
+ const int kMatrixSize = N * N * sizeof(FLOAT32);
1167
+ FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1168
+ FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1169
+ FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1170
+ // Compute a new covariance matrix that only uses essential features.
1171
+ for (int i = 0; i < N; ++i) {
1172
+ int row_offset = i * N;
1173
+ if (!Clusterer->ParamDesc[i].NonEssential) {
1174
+ for (int j = 0; j < N; ++j) {
1175
+ if (!Clusterer->ParamDesc[j].NonEssential)
1176
+ Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1177
+ else
1178
+ Covariance[j + row_offset] = 0.0f;
1179
+ }
1180
+ } else {
1181
+ for (int j = 0; j < N; ++j) {
1182
+ if (i == j)
1183
+ Covariance[j + row_offset] = 1.0f;
1184
+ else
1185
+ Covariance[j + row_offset] = 0.0f;
1186
+ }
1187
+ }
1188
+ }
1189
+ double err = InvertMatrix(Covariance, N, Inverse);
1190
+ if (err > 1) {
1191
+ tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1192
+ }
1193
+ int EssentialN = 0;
1194
+ for (int dim = 0; dim < N; ++dim) {
1195
+ if (!Clusterer->ParamDesc[dim].NonEssential) {
1196
+ Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1197
+ ++EssentialN;
1198
+ } else {
1199
+ Delta[dim] = 0.0f;
1200
+ }
1201
+ }
1202
+ // Compute Hotelling's T-squared.
1203
+ double Tsq = 0.0;
1204
+ for (int x = 0; x < N; ++x) {
1205
+ double temp = 0.0;
1206
+ for (int y = 0; y < N; ++y) {
1207
+ temp += Inverse[y + N*x] * Delta[y];
1208
+ }
1209
+ Tsq += Delta[x] * temp;
1210
+ }
1211
+ memfree(Covariance);
1212
+ memfree(Inverse);
1213
+ memfree(Delta);
1214
+ // Changed this function to match the formula in
1215
+ // Statistical Methods in Medical Research p 473
1216
+ // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1217
+ // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1218
+ double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1219
+ int Fx = EssentialN;
1220
+ if (Fx > FTABLE_X)
1221
+ Fx = FTABLE_X;
1222
+ --Fx;
1223
+ int Fy = TotalDims - EssentialN - 1;
1224
+ if (Fy > FTABLE_Y)
1225
+ Fy = FTABLE_Y;
1226
+ --Fy;
1227
+ double FTarget = FTable[Fy][Fx];
1228
+ if (Config->MagicSamples > 0 &&
1229
+ TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1230
+ TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1231
+ // Give magic-sized clusters a magic FTable boost.
1232
+ FTarget += kFTableBoostMargin;
1233
+ }
1234
+ if (F < FTarget) {
1235
+ return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1236
+ }
1237
+ return NULL;
1238
+ }
1239
+
1240
+ /* MakeSphericalProto *******************************************************
1241
+ Parameters: Clusterer data struct containing samples being clustered
1242
+ Cluster cluster to be made into a spherical prototype
1243
+ Statistics statistical info about cluster
1244
+ Buckets histogram struct used to analyze distribution
1245
+ Globals: None
1246
+ Operation: This routine tests the specified cluster to see if it can
1247
+ be approximated by a spherical normal distribution. If it
1248
+ can be, then a new prototype is formed and returned to the
1249
+ caller. If it can't be, then NULL is returned to the caller.
1250
+ Return: Pointer to new spherical prototype or NULL.
1251
+ Exceptions: None
1252
+ History: 6/1/89, DSJ, Created.
1253
+ ******************************************************************************/
1254
+ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
1255
+ CLUSTER *Cluster,
1256
+ STATISTICS *Statistics,
1257
+ BUCKETS *Buckets) {
1258
+ PROTOTYPE *Proto = NULL;
1259
+ int i;
1260
+
1261
+ // check that each dimension is a normal distribution
1262
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1263
+ if (Clusterer->ParamDesc[i].NonEssential)
1264
+ continue;
1265
+
1266
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1267
+ Cluster->Mean[i],
1268
+ sqrt ((FLOAT64) (Statistics->AvgVariance)));
1269
+ if (!DistributionOK (Buckets))
1270
+ break;
1271
+ }
1272
+ // if all dimensions matched a normal distribution, make a proto
1273
+ if (i >= Clusterer->SampleSize)
1274
+ Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1275
+ return (Proto);
1276
+ } // MakeSphericalProto
1277
+
1278
+
1279
+ /** MakeEllipticalProto ****************************************************
1280
+ Parameters: Clusterer data struct containing samples being clustered
1281
+ Cluster cluster to be made into an elliptical prototype
1282
+ Statistics statistical info about cluster
1283
+ Buckets histogram struct used to analyze distribution
1284
+ Globals: None
1285
+ Operation: This routine tests the specified cluster to see if it can
1286
+ be approximated by an elliptical normal distribution. If it
1287
+ can be, then a new prototype is formed and returned to the
1288
+ caller. If it can't be, then NULL is returned to the caller.
1289
+ Return: Pointer to new elliptical prototype or NULL.
1290
+ Exceptions: None
1291
+ History: 6/12/89, DSJ, Created.
1292
+ ****************************************************************************/
1293
+ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
1294
+ CLUSTER *Cluster,
1295
+ STATISTICS *Statistics,
1296
+ BUCKETS *Buckets) {
1297
+ PROTOTYPE *Proto = NULL;
1298
+ int i;
1299
+
1300
+ // check that each dimension is a normal distribution
1301
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1302
+ if (Clusterer->ParamDesc[i].NonEssential)
1303
+ continue;
1304
+
1305
+ FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1306
+ Cluster->Mean[i],
1307
+ sqrt ((FLOAT64) Statistics->
1308
+ CoVariance[i * (Clusterer->SampleSize + 1)]));
1309
+ if (!DistributionOK (Buckets))
1310
+ break;
1311
+ }
1312
+ // if all dimensions matched a normal distribution, make a proto
1313
+ if (i >= Clusterer->SampleSize)
1314
+ Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1315
+ return (Proto);
1316
+ } // MakeEllipticalProto
1317
+
1318
+
1319
+ /** MakeMixedProto ***********************************************************
1320
+ Parameters: Clusterer data struct containing samples being clustered
1321
+ Cluster cluster to be made into a prototype
1322
+ Statistics statistical info about cluster
1323
+ NormalBuckets histogram struct used to analyze distribution
1324
+ Confidence confidence level for alternate distributions
1325
+ Globals: None
1326
+ Operation: This routine tests each dimension of the specified cluster to
1327
+ see what distribution would best approximate that dimension.
1328
+ Each dimension is compared to the following distributions
1329
+ in order: normal, random, uniform. If each dimension can
1330
+ be represented by one of these distributions,
1331
+ then a new prototype is formed and returned to the
1332
+ caller. If it can't be, then NULL is returned to the caller.
1333
+ Return: Pointer to new mixed prototype or NULL.
1334
+ Exceptions: None
1335
+ History: 6/12/89, DSJ, Created.
1336
+ ********************************************************************************/
1337
+ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
1338
+ CLUSTER *Cluster,
1339
+ STATISTICS *Statistics,
1340
+ BUCKETS *NormalBuckets,
1341
+ FLOAT64 Confidence) {
1342
+ PROTOTYPE *Proto;
1343
+ int i;
1344
+ BUCKETS *UniformBuckets = NULL;
1345
+ BUCKETS *RandomBuckets = NULL;
1346
+
1347
+ // create a mixed proto to work on - initially assume all dimensions normal*/
1348
+ Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1349
+
1350
+ // find the proper distribution for each dimension
1351
+ for (i = 0; i < Clusterer->SampleSize; i++) {
1352
+ if (Clusterer->ParamDesc[i].NonEssential)
1353
+ continue;
1354
+
1355
+ FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1356
+ Proto->Mean[i],
1357
+ sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1358
+ if (DistributionOK (NormalBuckets))
1359
+ continue;
1360
+
1361
+ if (RandomBuckets == NULL)
1362
+ RandomBuckets =
1363
+ GetBuckets (D_random, Cluster->SampleCount, Confidence);
1364
+ MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1365
+ FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1366
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
1367
+ if (DistributionOK (RandomBuckets))
1368
+ continue;
1369
+
1370
+ if (UniformBuckets == NULL)
1371
+ UniformBuckets =
1372
+ GetBuckets (uniform, Cluster->SampleCount, Confidence);
1373
+ MakeDimUniform(i, Proto, Statistics);
1374
+ FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1375
+ Proto->Mean[i], Proto->Variance.Elliptical[i]);
1376
+ if (DistributionOK (UniformBuckets))
1377
+ continue;
1378
+ break;
1379
+ }
1380
+ // if any dimension failed to match a distribution, discard the proto
1381
+ if (i < Clusterer->SampleSize) {
1382
+ FreePrototype(Proto);
1383
+ Proto = NULL;
1384
+ }
1385
+ if (UniformBuckets != NULL)
1386
+ FreeBuckets(UniformBuckets);
1387
+ if (RandomBuckets != NULL)
1388
+ FreeBuckets(RandomBuckets);
1389
+ return (Proto);
1390
+ } // MakeMixedProto
1391
+
1392
+
1393
+ /* MakeDimRandom *************************************************************
1394
+ Parameters: i index of dimension to be changed
1395
+ Proto prototype whose dimension is to be altered
1396
+ ParamDesc description of specified dimension
1397
+ Globals: None
1398
+ Operation: This routine alters the ith dimension of the specified
1399
+ mixed prototype to be D_random.
1400
+ Return: None
1401
+ Exceptions: None
1402
+ History: 6/20/89, DSJ, Created.
1403
+ ******************************************************************************/
1404
+ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1405
+ Proto->Distrib[i] = D_random;
1406
+ Proto->Mean[i] = ParamDesc->MidRange;
1407
+ Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1408
+
1409
+ // subtract out the previous magnitude of this dimension from the total
1410
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1411
+ Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1412
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1413
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1414
+
1415
+ // note that the proto Weight is irrelevant for D_random protos
1416
+ } // MakeDimRandom
1417
+
1418
+
1419
+ /** MakeDimUniform ***********************************************************
1420
+ Parameters: i index of dimension to be changed
1421
+ Proto prototype whose dimension is to be altered
1422
+ Statistics statistical info about prototype
1423
+ Globals: None
1424
+ Operation: This routine alters the ith dimension of the specified
1425
+ mixed prototype to be uniform.
1426
+ Return: None
1427
+ Exceptions: None
1428
+ History: 6/20/89, DSJ, Created.
1429
+ ******************************************************************************/
1430
+ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1431
+ Proto->Distrib[i] = uniform;
1432
+ Proto->Mean[i] = Proto->Cluster->Mean[i] +
1433
+ (Statistics->Min[i] + Statistics->Max[i]) / 2;
1434
+ Proto->Variance.Elliptical[i] =
1435
+ (Statistics->Max[i] - Statistics->Min[i]) / 2;
1436
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1437
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
1438
+
1439
+ // subtract out the previous magnitude of this dimension from the total
1440
+ Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1441
+ Proto->Magnitude.Elliptical[i] =
1442
+ 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1443
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1444
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1445
+
1446
+ // note that the proto Weight is irrelevant for uniform protos
1447
+ } // MakeDimUniform
1448
+
1449
+
1450
+ /** ComputeStatistics *********************************************************
1451
+ Parameters: N number of dimensions
1452
+ ParamDesc array of dimension descriptions
1453
+ Cluster cluster whose stats are to be computed
1454
+ Globals: None
1455
+ Operation: This routine searches the cluster tree for all leaf nodes
1456
+ which are samples in the specified cluster. It computes
1457
+ a full covariance matrix for these samples as well as
1458
+ keeping track of the ranges (min and max) for each
1459
+ dimension. A special data structure is allocated to
1460
+ return this information to the caller. An incremental
1461
+ algorithm for computing statistics is not used because
1462
+ it will not work with circular dimensions.
1463
+ Return: Pointer to new data structure containing statistics
1464
+ Exceptions: None
1465
+ History: 6/2/89, DSJ, Created.
1466
+ *********************************************************************************/
1467
+ STATISTICS *
1468
+ ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
1469
+ STATISTICS *Statistics;
1470
+ int i, j;
1471
+ FLOAT32 *CoVariance;
1472
+ FLOAT32 *Distance;
1473
+ LIST SearchState;
1474
+ SAMPLE *Sample;
1475
+ uinT32 SampleCountAdjustedForBias;
1476
+
1477
+ // allocate memory to hold the statistics results
1478
+ Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1479
+ Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1480
+ Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1481
+ Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1482
+
1483
+ // allocate temporary memory to hold the sample to mean distances
1484
+ Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1485
+
1486
+ // initialize the statistics
1487
+ Statistics->AvgVariance = 1.0;
1488
+ CoVariance = Statistics->CoVariance;
1489
+ for (i = 0; i < N; i++) {
1490
+ Statistics->Min[i] = 0.0;
1491
+ Statistics->Max[i] = 0.0;
1492
+ for (j = 0; j < N; j++, CoVariance++)
1493
+ *CoVariance = 0;
1494
+ }
1495
+ // find each sample in the cluster and merge it into the statistics
1496
+ InitSampleSearch(SearchState, Cluster);
1497
+ while ((Sample = NextSample (&SearchState)) != NULL) {
1498
+ for (i = 0; i < N; i++) {
1499
+ Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1500
+ if (ParamDesc[i].Circular) {
1501
+ if (Distance[i] > ParamDesc[i].HalfRange)
1502
+ Distance[i] -= ParamDesc[i].Range;
1503
+ if (Distance[i] < -ParamDesc[i].HalfRange)
1504
+ Distance[i] += ParamDesc[i].Range;
1505
+ }
1506
+ if (Distance[i] < Statistics->Min[i])
1507
+ Statistics->Min[i] = Distance[i];
1508
+ if (Distance[i] > Statistics->Max[i])
1509
+ Statistics->Max[i] = Distance[i];
1510
+ }
1511
+ CoVariance = Statistics->CoVariance;
1512
+ for (i = 0; i < N; i++)
1513
+ for (j = 0; j < N; j++, CoVariance++)
1514
+ *CoVariance += Distance[i] * Distance[j];
1515
+ }
1516
+ // normalize the variances by the total number of samples
1517
+ // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1518
+ // also compute the geometic mean of the diagonal variances
1519
+ // ensure that clusters with only 1 sample are handled correctly
1520
+ if (Cluster->SampleCount > 1)
1521
+ SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1522
+ else
1523
+ SampleCountAdjustedForBias = 1;
1524
+ CoVariance = Statistics->CoVariance;
1525
+ for (i = 0; i < N; i++)
1526
+ for (j = 0; j < N; j++, CoVariance++) {
1527
+ *CoVariance /= SampleCountAdjustedForBias;
1528
+ if (j == i) {
1529
+ if (*CoVariance < MINVARIANCE)
1530
+ *CoVariance = MINVARIANCE;
1531
+ Statistics->AvgVariance *= *CoVariance;
1532
+ }
1533
+ }
1534
+ Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1535
+ 1.0 / N);
1536
+
1537
+ // release temporary memory and return
1538
+ memfree(Distance);
1539
+ return (Statistics);
1540
+ } // ComputeStatistics
1541
+
1542
+
1543
+ /** NewSpericalProto *********************************************************
1544
+ Parameters: N number of dimensions
1545
+ Cluster cluster to be made into a spherical prototype
1546
+ Statistics statistical info about samples in cluster
1547
+ Globals: None
1548
+ Operation: This routine creates a spherical prototype data structure to
1549
+ approximate the samples in the specified cluster.
1550
+ Spherical prototypes have a single variance which is
1551
+ common across all dimensions. All dimensions are normally
1552
+ distributed and independent.
1553
+ Return: Pointer to a new spherical prototype data structure
1554
+ Exceptions: None
1555
+ History: 6/19/89, DSJ, Created.
1556
+ ******************************************************************************/
1557
+ PROTOTYPE *NewSphericalProto(uinT16 N,
1558
+ CLUSTER *Cluster,
1559
+ STATISTICS *Statistics) {
1560
+ PROTOTYPE *Proto;
1561
+
1562
+ Proto = NewSimpleProto (N, Cluster);
1563
+
1564
+ Proto->Variance.Spherical = Statistics->AvgVariance;
1565
+ if (Proto->Variance.Spherical < MINVARIANCE)
1566
+ Proto->Variance.Spherical = MINVARIANCE;
1567
+
1568
+ Proto->Magnitude.Spherical =
1569
+ 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1570
+ Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1571
+ (double) N);
1572
+ Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1573
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1574
+
1575
+ return (Proto);
1576
+ } // NewSphericalProto
1577
+
1578
+
1579
+ /** NewEllipticalProto *******************************************************
1580
+ Parameters: N number of dimensions
1581
+ Cluster cluster to be made into an elliptical prototype
1582
+ Statistics statistical info about samples in cluster
1583
+ Globals: None
1584
+ Operation: This routine creates an elliptical prototype data structure to
1585
+ approximate the samples in the specified cluster.
1586
+ Elliptical prototypes have a variance for each dimension.
1587
+ All dimensions are normally distributed and independent.
1588
+ Return: Pointer to a new elliptical prototype data structure
1589
+ Exceptions: None
1590
+ History: 6/19/89, DSJ, Created.
1591
+ *******************************************************************************/
1592
+ PROTOTYPE *NewEllipticalProto(inT16 N,
1593
+ CLUSTER *Cluster,
1594
+ STATISTICS *Statistics) {
1595
+ PROTOTYPE *Proto;
1596
+ FLOAT32 *CoVariance;
1597
+ int i;
1598
+
1599
+ Proto = NewSimpleProto (N, Cluster);
1600
+ Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1601
+ Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1602
+ Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1603
+
1604
+ CoVariance = Statistics->CoVariance;
1605
+ Proto->TotalMagnitude = 1.0;
1606
+ for (i = 0; i < N; i++, CoVariance += N + 1) {
1607
+ Proto->Variance.Elliptical[i] = *CoVariance;
1608
+ if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1609
+ Proto->Variance.Elliptical[i] = MINVARIANCE;
1610
+
1611
+ Proto->Magnitude.Elliptical[i] =
1612
+ 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1613
+ Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1614
+ Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1615
+ }
1616
+ Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1617
+ Proto->Style = elliptical;
1618
+ return (Proto);
1619
+ } // NewEllipticalProto
1620
+
1621
+
1622
+ /** MewMixedProto ************************************************************
1623
+ Parameters: N number of dimensions
1624
+ Cluster cluster to be made into a mixed prototype
1625
+ Statistics statistical info about samples in cluster
1626
+ Globals: None
1627
+ Operation: This routine creates a mixed prototype data structure to
1628
+ approximate the samples in the specified cluster.
1629
+ Mixed prototypes can have different distributions for
1630
+ each dimension. All dimensions are independent. The
1631
+ structure is initially filled in as though it were an
1632
+ elliptical prototype. The actual distributions of the
1633
+ dimensions can be altered by other routines.
1634
+ Return: Pointer to a new mixed prototype data structure
1635
+ Exceptions: None
1636
+ History: 6/19/89, DSJ, Created.
1637
+ ********************************************************************************/
1638
+ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1639
+ PROTOTYPE *Proto;
1640
+ int i;
1641
+
1642
+ Proto = NewEllipticalProto (N, Cluster, Statistics);
1643
+ Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1644
+
1645
+ for (i = 0; i < N; i++) {
1646
+ Proto->Distrib[i] = normal;
1647
+ }
1648
+ Proto->Style = mixed;
1649
+ return (Proto);
1650
+ } // NewMixedProto
1651
+
1652
+
1653
+ /** NewSimpleProto ***********************************************************
1654
+ Parameters: N number of dimensions
1655
+ Cluster cluster to be made into a prototype
1656
+ Globals: None
1657
+ Operation: This routine allocates memory to hold a simple prototype
1658
+ data structure, i.e. one without independent distributions
1659
+ and variances for each dimension.
1660
+ Return: Pointer to new simple prototype
1661
+ Exceptions: None
1662
+ History: 6/19/89, DSJ, Created.
1663
+ *******************************************************************************/
1664
+ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
1665
+ PROTOTYPE *Proto;
1666
+ int i;
1667
+
1668
+ Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1669
+ Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1670
+
1671
+ for (i = 0; i < N; i++)
1672
+ Proto->Mean[i] = Cluster->Mean[i];
1673
+ Proto->Distrib = NULL;
1674
+
1675
+ Proto->Significant = TRUE;
1676
+ Proto->Style = spherical;
1677
+ Proto->NumSamples = Cluster->SampleCount;
1678
+ Proto->Cluster = Cluster;
1679
+ Proto->Cluster->Prototype = TRUE;
1680
+ return (Proto);
1681
+ } // NewSimpleProto
1682
+
1683
+
1684
+ /** Independent ***************************************************************
1685
+ Parameters: ParamDesc descriptions of each feature space dimension
1686
+ N number of dimensions
1687
+ CoVariance ptr to a covariance matrix
1688
+ Independence max off-diagonal correlation coefficient
1689
+ Globals: None
1690
+ Operation: This routine returns TRUE if the specified covariance
1691
+ matrix indicates that all N dimensions are independent of
1692
+ one another. One dimension is judged to be independent of
1693
+ another when the magnitude of the corresponding correlation
1694
+ coefficient is
1695
+ less than the specified Independence factor. The
1696
+ correlation coefficient is calculated as: (see Duda and
1697
+ Hart, pg. 247)
1698
+ coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
1699
+ The covariance matrix is assumed to be symmetric (which
1700
+ should always be true).
1701
+ Return: TRUE if dimensions are independent, FALSE otherwise
1702
+ Exceptions: None
1703
+ History: 6/4/89, DSJ, Created.
1704
+ *******************************************************************************/
1705
+ BOOL8
1706
+ Independent (PARAM_DESC ParamDesc[],
1707
+ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
1708
+ int i, j;
1709
+ FLOAT32 *VARii; // points to ith on-diagonal element
1710
+ FLOAT32 *VARjj; // points to jth on-diagonal element
1711
+ FLOAT32 CorrelationCoeff;
1712
+
1713
+ VARii = CoVariance;
1714
+ for (i = 0; i < N; i++, VARii += N + 1) {
1715
+ if (ParamDesc[i].NonEssential)
1716
+ continue;
1717
+
1718
+ VARjj = VARii + N + 1;
1719
+ CoVariance = VARii + 1;
1720
+ for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1721
+ if (ParamDesc[j].NonEssential)
1722
+ continue;
1723
+
1724
+ if ((*VARii == 0.0) || (*VARjj == 0.0))
1725
+ CorrelationCoeff = 0.0;
1726
+ else
1727
+ CorrelationCoeff =
1728
+ sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1729
+ if (CorrelationCoeff > Independence)
1730
+ return (FALSE);
1731
+ }
1732
+ }
1733
+ return (TRUE);
1734
+ } // Independent
1735
+
1736
+
1737
+ /** GetBuckets **************************************************************
1738
+ Parameters: Distribution type of probability distribution to test for
1739
+ SampleCount number of samples that are available
1740
+ Confidence probability of a Type I error
1741
+ Globals: none
1742
+ Operation: This routine returns a histogram data structure which can
1743
+ be used by other routines to place samples into histogram
1744
+ buckets, and then apply a goodness of fit test to the
1745
+ histogram data to determine if the samples belong to the
1746
+ specified probability distribution. The routine keeps
1747
+ a list of bucket data structures which have already been
1748
+ created so that it minimizes the computation time needed
1749
+ to create a new bucket.
1750
+ Return: Bucket data structure
1751
+ Exceptions: none
1752
+ History: Thu Aug 3 12:58:10 1989, DSJ, Created.
1753
+ *****************************************************************************/
1754
+ BUCKETS *GetBuckets(DISTRIBUTION Distribution,
1755
+ uinT32 SampleCount,
1756
+ FLOAT64 Confidence) {
1757
+ uinT16 NumberOfBuckets;
1758
+ BUCKETS *Buckets;
1759
+
1760
+ // search for an old bucket structure with the same number of buckets
1761
+ NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1762
+ Buckets = (BUCKETS *) first_node (search (OldBuckets[(int) Distribution],
1763
+ &NumberOfBuckets, NumBucketsMatch));
1764
+
1765
+ // if a matching bucket structure is found, delete it from the list
1766
+ if (Buckets != NULL) {
1767
+ OldBuckets[(int) Distribution] =
1768
+ delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch);
1769
+ if (SampleCount != Buckets->SampleCount)
1770
+ AdjustBuckets(Buckets, SampleCount);
1771
+ if (Confidence != Buckets->Confidence) {
1772
+ Buckets->Confidence = Confidence;
1773
+ Buckets->ChiSquared = ComputeChiSquared
1774
+ (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets),
1775
+ Confidence);
1776
+ }
1777
+ InitBuckets(Buckets);
1778
+ }
1779
+ else // otherwise create a new structure
1780
+ Buckets = MakeBuckets (Distribution, SampleCount, Confidence);
1781
+ return (Buckets);
1782
+ } // GetBuckets
1783
+
1784
+
1785
+ /** Makebuckets *************************************************************
1786
+ Parameters: Distribution type of probability distribution to test for
1787
+ SampleCount number of samples that are available
1788
+ Confidence probability of a Type I error
1789
+ Globals: None
1790
+ Operation: This routine creates a histogram data structure which can
1791
+ be used by other routines to place samples into histogram
1792
+ buckets, and then apply a goodness of fit test to the
1793
+ histogram data to determine if the samples belong to the
1794
+ specified probability distribution. The buckets are
1795
+ allocated in such a way that the expected frequency of
1796
+ samples in each bucket is approximately the same. In
1797
+ order to make this possible, a mapping table is
1798
+ computed which maps "normalized" samples into the
1799
+ appropriate bucket.
1800
+ Return: Pointer to new histogram data structure
1801
+ Exceptions: None
1802
+ History: 6/4/89, DSJ, Created.
1803
+ *****************************************************************************/
1804
+ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
1805
+ uinT32 SampleCount,
1806
+ FLOAT64 Confidence) {
1807
+ static DENSITYFUNC DensityFunction[] =
1808
+ { NormalDensity, UniformDensity, UniformDensity };
1809
+ int i, j;
1810
+ BUCKETS *Buckets;
1811
+ FLOAT64 BucketProbability;
1812
+ FLOAT64 NextBucketBoundary;
1813
+ FLOAT64 Probability;
1814
+ FLOAT64 ProbabilityDelta;
1815
+ FLOAT64 LastProbDensity;
1816
+ FLOAT64 ProbDensity;
1817
+ uinT16 CurrentBucket;
1818
+ BOOL8 Symmetrical;
1819
+
1820
+ // allocate memory needed for data structure
1821
+ Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS));
1822
+ Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
1823
+ Buckets->SampleCount = SampleCount;
1824
+ Buckets->Confidence = Confidence;
1825
+ Buckets->Count =
1826
+ (uinT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (uinT32));
1827
+ Buckets->ExpectedCount =
1828
+ (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32));
1829
+
1830
+ // initialize simple fields
1831
+ Buckets->Distribution = Distribution;
1832
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1833
+ Buckets->Count[i] = 0;
1834
+ Buckets->ExpectedCount[i] = 0.0;
1835
+ }
1836
+
1837
+ // all currently defined distributions are symmetrical
1838
+ Symmetrical = TRUE;
1839
+ Buckets->ChiSquared = ComputeChiSquared
1840
+ (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence);
1841
+
1842
+ if (Symmetrical) {
1843
+ // allocate buckets so that all have approx. equal probability
1844
+ BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1845
+
1846
+ // distribution is symmetric so fill in upper half then copy
1847
+ CurrentBucket = Buckets->NumberOfBuckets / 2;
1848
+ if (Odd (Buckets->NumberOfBuckets))
1849
+ NextBucketBoundary = BucketProbability / 2;
1850
+ else
1851
+ NextBucketBoundary = BucketProbability;
1852
+
1853
+ Probability = 0.0;
1854
+ LastProbDensity =
1855
+ (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1856
+ for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1857
+ ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1858
+ ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1859
+ Probability += ProbabilityDelta;
1860
+ if (Probability > NextBucketBoundary) {
1861
+ if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1862
+ CurrentBucket++;
1863
+ NextBucketBoundary += BucketProbability;
1864
+ }
1865
+ Buckets->Bucket[i] = CurrentBucket;
1866
+ Buckets->ExpectedCount[CurrentBucket] +=
1867
+ (FLOAT32) (ProbabilityDelta * SampleCount);
1868
+ LastProbDensity = ProbDensity;
1869
+ }
1870
+ // place any leftover probability into the last bucket
1871
+ Buckets->ExpectedCount[CurrentBucket] +=
1872
+ (FLOAT32) ((0.5 - Probability) * SampleCount);
1873
+
1874
+ // copy upper half of distribution to lower half
1875
+ for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1876
+ Buckets->Bucket[i] =
1877
+ Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets);
1878
+
1879
+ // copy upper half of expected counts to lower half
1880
+ for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1881
+ Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1882
+ }
1883
+ return (Buckets);
1884
+ } // MakeBuckets
1885
+
1886
+
1887
+ //---------------------------------------------------------------------------
1888
+ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
1889
+ /*
1890
+ ** Parameters:
1891
+ ** SampleCount number of samples to be tested
1892
+ ** Globals:
1893
+ ** CountTable lookup table for number of samples
1894
+ ** BucketsTable lookup table for necessary number of buckets
1895
+ ** Operation:
1896
+ ** This routine computes the optimum number of histogram
1897
+ ** buckets that should be used in a chi-squared goodness of
1898
+ ** fit test for the specified number of samples. The optimum
1899
+ ** number is computed based on Table 4.1 on pg. 147 of
1900
+ ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1901
+ ** Linear interpolation is used to interpolate between table
1902
+ ** values. The table is intended for a 0.05 level of
1903
+ ** significance (alpha). This routine assumes that it is
1904
+ ** equally valid for other alpha's, which may not be true.
1905
+ ** Return:
1906
+ ** Optimum number of histogram buckets
1907
+ ** Exceptions:
1908
+ ** None
1909
+ ** History:
1910
+ ** 6/5/89, DSJ, Created.
1911
+ */
1912
+ uinT8 Last, Next;
1913
+ FLOAT32 Slope;
1914
+
1915
+ if (SampleCount < CountTable[0])
1916
+ return (BucketsTable[0]);
1917
+
1918
+ for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1919
+ if (SampleCount <= CountTable[Next]) {
1920
+ Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) /
1921
+ (FLOAT32) (CountTable[Next] - CountTable[Last]);
1922
+ return ((uinT16) (BucketsTable[Last] +
1923
+ Slope * (SampleCount - CountTable[Last])));
1924
+ }
1925
+ }
1926
+ return (BucketsTable[Last]);
1927
+ } // OptimumNumberOfBuckets
1928
+
1929
+
1930
+ //---------------------------------------------------------------------------
1931
+ FLOAT64
1932
+ ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
1933
+ /*
1934
+ ** Parameters:
1935
+ ** DegreesOfFreedom determines shape of distribution
1936
+ ** Alpha probability of right tail
1937
+ ** Globals: none
1938
+ ** Operation:
1939
+ ** This routine computes the chi-squared value which will
1940
+ ** leave a cumulative probability of Alpha in the right tail
1941
+ ** of a chi-squared distribution with the specified number of
1942
+ ** degrees of freedom. Alpha must be between 0 and 1.
1943
+ ** DegreesOfFreedom must be even. The routine maintains an
1944
+ ** array of lists. Each list corresponds to a different
1945
+ ** number of degrees of freedom. Each entry in the list
1946
+ ** corresponds to a different alpha value and its corresponding
1947
+ ** chi-squared value. Therefore, once a particular chi-squared
1948
+ ** value is computed, it is stored in the list and never
1949
+ ** needs to be computed again.
1950
+ ** Return: Desired chi-squared value
1951
+ ** Exceptions: none
1952
+ ** History: 6/5/89, DSJ, Created.
1953
+ */
1954
+ #define CHIACCURACY 0.01
1955
+ #define MINALPHA (1e-200)
1956
+ {
1957
+ static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1958
+
1959
+ CHISTRUCT *OldChiSquared;
1960
+ CHISTRUCT SearchKey;
1961
+
1962
+ // limit the minimum alpha that can be used - if alpha is too small
1963
+ // it may not be possible to compute chi-squared.
1964
+ if (Alpha < MINALPHA)
1965
+ Alpha = MINALPHA;
1966
+ if (Alpha > 1.0)
1967
+ Alpha = 1.0;
1968
+ if (Odd (DegreesOfFreedom))
1969
+ DegreesOfFreedom++;
1970
+
1971
+ /* find the list of chi-squared values which have already been computed
1972
+ for the specified number of degrees of freedom. Search the list for
1973
+ the desired chi-squared. */
1974
+ SearchKey.Alpha = Alpha;
1975
+ OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1976
+ &SearchKey, AlphaMatch));
1977
+
1978
+ if (OldChiSquared == NULL) {
1979
+ OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1980
+ OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1981
+ (FLOAT64) DegreesOfFreedom,
1982
+ (FLOAT64) CHIACCURACY);
1983
+ ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1984
+ OldChiSquared);
1985
+ }
1986
+ else {
1987
+ // further optimization might move OldChiSquared to front of list
1988
+ }
1989
+
1990
+ return (OldChiSquared->ChiSquared);
1991
+
1992
+ } // ComputeChiSquared
1993
+
1994
+
1995
+ //---------------------------------------------------------------------------
1996
+ FLOAT64 NormalDensity(inT32 x) {
1997
+ /*
1998
+ ** Parameters:
1999
+ ** x number to compute the normal probability density for
2000
+ ** Globals:
2001
+ ** NormalMean mean of a discrete normal distribution
2002
+ ** NormalVariance variance of a discrete normal distribution
2003
+ ** NormalMagnitude magnitude of a discrete normal distribution
2004
+ ** Operation:
2005
+ ** This routine computes the probability density function
2006
+ ** of a discrete normal distribution defined by the global
2007
+ ** variables NormalMean, NormalVariance, and NormalMagnitude.
2008
+ ** Normal magnitude could, of course, be computed in terms of
2009
+ ** the normal variance but it is precomputed for efficiency.
2010
+ ** Return:
2011
+ ** The value of the normal distribution at x.
2012
+ ** Exceptions:
2013
+ ** None
2014
+ ** History:
2015
+ ** 6/4/89, DSJ, Created.
2016
+ */
2017
+ FLOAT64 Distance;
2018
+
2019
+ Distance = x - NormalMean;
2020
+ return (NormalMagnitude *
2021
+ exp (-0.5 * Distance * Distance / NormalVariance));
2022
+ } // NormalDensity
2023
+
2024
+
2025
+ //---------------------------------------------------------------------------
2026
+ FLOAT64 UniformDensity(inT32 x) {
2027
+ /*
2028
+ ** Parameters:
2029
+ ** x number to compute the uniform probability density for
2030
+ ** Globals:
2031
+ ** BUCKETTABLESIZE determines range of distribution
2032
+ ** Operation:
2033
+ ** This routine computes the probability density function
2034
+ ** of a uniform distribution at the specified point. The
2035
+ ** range of the distribution is from 0 to BUCKETTABLESIZE.
2036
+ ** Return:
2037
+ ** The value of the uniform distribution at x.
2038
+ ** Exceptions:
2039
+ ** None
2040
+ ** History:
2041
+ ** 6/5/89, DSJ, Created.
2042
+ */
2043
+ static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
2044
+
2045
+ if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
2046
+ return (UniformDistributionDensity);
2047
+ else
2048
+ return ((FLOAT64) 0.0);
2049
+ } // UniformDensity
2050
+
2051
+
2052
+ //---------------------------------------------------------------------------
2053
+ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
2054
+ /*
2055
+ ** Parameters:
2056
+ ** f1 value of function at x1
2057
+ ** f2 value of function at x2
2058
+ ** Dx x2 - x1 (should always be positive)
2059
+ ** Globals:
2060
+ ** None
2061
+ ** Operation:
2062
+ ** This routine computes a trapezoidal approximation to the
2063
+ ** integral of a function over a small delta in x.
2064
+ ** Return:
2065
+ ** Approximation of the integral of the function from x1 to x2.
2066
+ ** Exceptions:
2067
+ ** None
2068
+ ** History:
2069
+ ** 6/5/89, DSJ, Created.
2070
+ */
2071
+ return ((f1 + f2) * Dx / 2.0);
2072
+ } // Integral
2073
+
2074
+
2075
+ //---------------------------------------------------------------------------
2076
+ void FillBuckets(BUCKETS *Buckets,
2077
+ CLUSTER *Cluster,
2078
+ uinT16 Dim,
2079
+ PARAM_DESC *ParamDesc,
2080
+ FLOAT32 Mean,
2081
+ FLOAT32 StdDev) {
2082
+ /*
2083
+ ** Parameters:
2084
+ ** Buckets histogram buckets to count samples
2085
+ ** Cluster cluster whose samples are being analyzed
2086
+ ** Dim dimension of samples which is being analyzed
2087
+ ** ParamDesc description of the dimension
2088
+ ** Mean "mean" of the distribution
2089
+ ** StdDev "standard deviation" of the distribution
2090
+ ** Globals:
2091
+ ** None
2092
+ ** Operation:
2093
+ ** This routine counts the number of cluster samples which
2094
+ ** fall within the various histogram buckets in Buckets. Only
2095
+ ** one dimension of each sample is examined. The exact meaning
2096
+ ** of the Mean and StdDev parameters depends on the
2097
+ ** distribution which is being analyzed (this info is in the
2098
+ ** Buckets data structure). For normal distributions, Mean
2099
+ ** and StdDev have the expected meanings. For uniform and
2100
+ ** random distributions the Mean is the center point of the
2101
+ ** range and the StdDev is 1/2 the range. A dimension with
2102
+ ** zero standard deviation cannot be statistically analyzed.
2103
+ ** In this case, a pseudo-analysis is used.
2104
+ ** Return:
2105
+ ** None (the Buckets data structure is filled in)
2106
+ ** Exceptions:
2107
+ ** None
2108
+ ** History:
2109
+ ** 6/5/89, DSJ, Created.
2110
+ */
2111
+ uinT16 BucketID;
2112
+ int i;
2113
+ LIST SearchState;
2114
+ SAMPLE *Sample;
2115
+
2116
+ // initialize the histogram bucket counts to 0
2117
+ for (i = 0; i < Buckets->NumberOfBuckets; i++)
2118
+ Buckets->Count[i] = 0;
2119
+
2120
+ if (StdDev == 0.0) {
2121
+ /* if the standard deviation is zero, then we can't statistically
2122
+ analyze the cluster. Use a pseudo-analysis: samples exactly on
2123
+ the mean are distributed evenly across all buckets. Samples greater
2124
+ than the mean are placed in the last bucket; samples less than the
2125
+ mean are placed in the first bucket. */
2126
+
2127
+ InitSampleSearch(SearchState, Cluster);
2128
+ i = 0;
2129
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2130
+ if (Sample->Mean[Dim] > Mean)
2131
+ BucketID = Buckets->NumberOfBuckets - 1;
2132
+ else if (Sample->Mean[Dim] < Mean)
2133
+ BucketID = 0;
2134
+ else
2135
+ BucketID = i;
2136
+ Buckets->Count[BucketID] += 1;
2137
+ i++;
2138
+ if (i >= Buckets->NumberOfBuckets)
2139
+ i = 0;
2140
+ }
2141
+ }
2142
+ else {
2143
+ // search for all samples in the cluster and add to histogram buckets
2144
+ InitSampleSearch(SearchState, Cluster);
2145
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2146
+ switch (Buckets->Distribution) {
2147
+ case normal:
2148
+ BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2149
+ Mean, StdDev);
2150
+ break;
2151
+ case D_random:
2152
+ case uniform:
2153
+ BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2154
+ Mean, StdDev);
2155
+ break;
2156
+ default:
2157
+ BucketID = 0;
2158
+ }
2159
+ Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2160
+ }
2161
+ }
2162
+ } // FillBuckets
2163
+
2164
+
2165
+ //---------------------------------------------------------------------------*/
2166
+ uinT16 NormalBucket(PARAM_DESC *ParamDesc,
2167
+ FLOAT32 x,
2168
+ FLOAT32 Mean,
2169
+ FLOAT32 StdDev) {
2170
+ /*
2171
+ ** Parameters:
2172
+ ** ParamDesc used to identify circular dimensions
2173
+ ** x value to be normalized
2174
+ ** Mean mean of normal distribution
2175
+ ** StdDev standard deviation of normal distribution
2176
+ ** Globals:
2177
+ ** NormalMean mean of discrete normal distribution
2178
+ ** NormalStdDev standard deviation of discrete normal dist.
2179
+ ** BUCKETTABLESIZE range of the discrete distribution
2180
+ ** Operation:
2181
+ ** This routine determines which bucket x falls into in the
2182
+ ** discrete normal distribution defined by NormalMean
2183
+ ** and NormalStdDev. x values which exceed the range of
2184
+ ** the discrete distribution are clipped.
2185
+ ** Return:
2186
+ ** Bucket number into which x falls
2187
+ ** Exceptions:
2188
+ ** None
2189
+ ** History:
2190
+ ** 6/5/89, DSJ, Created.
2191
+ */
2192
+ FLOAT32 X;
2193
+
2194
+ // wraparound circular parameters if necessary
2195
+ if (ParamDesc->Circular) {
2196
+ if (x - Mean > ParamDesc->HalfRange)
2197
+ x -= ParamDesc->Range;
2198
+ else if (x - Mean < -ParamDesc->HalfRange)
2199
+ x += ParamDesc->Range;
2200
+ }
2201
+
2202
+ X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean;
2203
+ if (X < 0)
2204
+ return ((uinT16) 0);
2205
+ if (X > BUCKETTABLESIZE - 1)
2206
+ return ((uinT16) (BUCKETTABLESIZE - 1));
2207
+ return ((uinT16) floor ((FLOAT64) X));
2208
+ } // NormalBucket
2209
+
2210
+
2211
+ //---------------------------------------------------------------------------
2212
+ uinT16 UniformBucket(PARAM_DESC *ParamDesc,
2213
+ FLOAT32 x,
2214
+ FLOAT32 Mean,
2215
+ FLOAT32 StdDev) {
2216
+ /*
2217
+ ** Parameters:
2218
+ ** ParamDesc used to identify circular dimensions
2219
+ ** x value to be normalized
2220
+ ** Mean center of range of uniform distribution
2221
+ ** StdDev 1/2 the range of the uniform distribution
2222
+ ** Globals:
2223
+ ** BUCKETTABLESIZE range of the discrete distribution
2224
+ ** Operation:
2225
+ ** This routine determines which bucket x falls into in the
2226
+ ** discrete uniform distribution defined by
2227
+ ** BUCKETTABLESIZE. x values which exceed the range of
2228
+ ** the discrete distribution are clipped.
2229
+ ** Return:
2230
+ ** Bucket number into which x falls
2231
+ ** Exceptions:
2232
+ ** None
2233
+ ** History:
2234
+ ** 6/5/89, DSJ, Created.
2235
+ */
2236
+ FLOAT32 X;
2237
+
2238
+ // wraparound circular parameters if necessary
2239
+ if (ParamDesc->Circular) {
2240
+ if (x - Mean > ParamDesc->HalfRange)
2241
+ x -= ParamDesc->Range;
2242
+ else if (x - Mean < -ParamDesc->HalfRange)
2243
+ x += ParamDesc->Range;
2244
+ }
2245
+
2246
+ X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2247
+ if (X < 0)
2248
+ return ((uinT16) 0);
2249
+ if (X > BUCKETTABLESIZE - 1)
2250
+ return ((uinT16) (BUCKETTABLESIZE - 1));
2251
+ return ((uinT16) floor ((FLOAT64) X));
2252
+ } // UniformBucket
2253
+
2254
+
2255
+ //---------------------------------------------------------------------------
2256
+ BOOL8 DistributionOK(BUCKETS *Buckets) {
2257
+ /*
2258
+ ** Parameters:
2259
+ ** Buckets histogram data to perform chi-square test on
2260
+ ** Globals:
2261
+ ** None
2262
+ ** Operation:
2263
+ ** This routine performs a chi-square goodness of fit test
2264
+ ** on the histogram data in the Buckets data structure. TRUE
2265
+ ** is returned if the histogram matches the probability
2266
+ ** distribution which was specified when the Buckets
2267
+ ** structure was originally created. Otherwise FALSE is
2268
+ ** returned.
2269
+ ** Return:
2270
+ ** TRUE if samples match distribution, FALSE otherwise
2271
+ ** Exceptions:
2272
+ ** None
2273
+ ** History:
2274
+ ** 6/5/89, DSJ, Created.
2275
+ */
2276
+ FLOAT32 FrequencyDifference;
2277
+ FLOAT32 TotalDifference;
2278
+ int i;
2279
+
2280
+ // compute how well the histogram matches the expected histogram
2281
+ TotalDifference = 0.0;
2282
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2283
+ FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2284
+ TotalDifference += (FrequencyDifference * FrequencyDifference) /
2285
+ Buckets->ExpectedCount[i];
2286
+ }
2287
+
2288
+ // test to see if the difference is more than expected
2289
+ if (TotalDifference > Buckets->ChiSquared)
2290
+ return (FALSE);
2291
+ else
2292
+ return (TRUE);
2293
+ } // DistributionOK
2294
+
2295
+
2296
+ //---------------------------------------------------------------------------
2297
+ void FreeStatistics(STATISTICS *Statistics) {
2298
+ /*
2299
+ ** Parameters:
2300
+ ** Statistics pointer to data structure to be freed
2301
+ ** Globals:
2302
+ ** None
2303
+ ** Operation:
2304
+ ** This routine frees the memory used by the statistics
2305
+ ** data structure.
2306
+ ** Return:
2307
+ ** None
2308
+ ** Exceptions:
2309
+ ** None
2310
+ ** History:
2311
+ ** 6/5/89, DSJ, Created.
2312
+ */
2313
+ memfree (Statistics->CoVariance);
2314
+ memfree (Statistics->Min);
2315
+ memfree (Statistics->Max);
2316
+ memfree(Statistics);
2317
+ } // FreeStatistics
2318
+
2319
+
2320
+ //---------------------------------------------------------------------------
2321
+ void FreeBuckets(BUCKETS *Buckets) {
2322
+ /*
2323
+ ** Parameters:
2324
+ ** Buckets pointer to data structure to be freed
2325
+ ** Globals: none
2326
+ ** Operation:
2327
+ ** This routine places the specified histogram data structure
2328
+ ** at the front of a list of histograms so that it can be
2329
+ ** reused later if necessary. A separate list is maintained
2330
+ ** for each different type of distribution.
2331
+ ** Return: none
2332
+ ** Exceptions: none
2333
+ ** History: 6/5/89, DSJ, Created.
2334
+ */
2335
+ int Dist;
2336
+
2337
+ if (Buckets != NULL) {
2338
+ Dist = (int) Buckets->Distribution;
2339
+ OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets);
2340
+ }
2341
+
2342
+ } // FreeBuckets
2343
+
2344
+
2345
+ //---------------------------------------------------------------------------
2346
+ void FreeCluster(CLUSTER *Cluster) {
2347
+ /*
2348
+ ** Parameters:
2349
+ ** Cluster pointer to cluster to be freed
2350
+ ** Globals:
2351
+ ** None
2352
+ ** Operation:
2353
+ ** This routine frees the memory consumed by the specified
2354
+ ** cluster and all of its subclusters. This is done by
2355
+ ** recursive calls to FreeCluster().
2356
+ ** Return:
2357
+ ** None
2358
+ ** Exceptions:
2359
+ ** None
2360
+ ** History:
2361
+ ** 6/6/89, DSJ, Created.
2362
+ */
2363
+ if (Cluster != NULL) {
2364
+ FreeCluster (Cluster->Left);
2365
+ FreeCluster (Cluster->Right);
2366
+ memfree(Cluster);
2367
+ }
2368
+ } // FreeCluster
2369
+
2370
+
2371
+ //---------------------------------------------------------------------------
2372
+ uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2373
+ /*
2374
+ ** Parameters:
2375
+ ** Distribution distribution being tested for
2376
+ ** HistogramBuckets number of buckets in chi-square test
2377
+ ** Globals: none
2378
+ ** Operation:
2379
+ ** This routine computes the degrees of freedom that should
2380
+ ** be used in a chi-squared test with the specified number of
2381
+ ** histogram buckets. The result is always rounded up to
2382
+ ** the next even number so that the value of chi-squared can be
2383
+ ** computed more easily. This will cause the value of
2384
+ ** chi-squared to be higher than the optimum value, resulting
2385
+ ** in the chi-square test being more lenient than optimum.
2386
+ ** Return: The number of degrees of freedom for a chi-square test
2387
+ ** Exceptions: none
2388
+ ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2389
+ */
2390
+ static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2391
+
2392
+ uinT16 AdjustedNumBuckets;
2393
+
2394
+ AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2395
+ if (Odd (AdjustedNumBuckets))
2396
+ AdjustedNumBuckets++;
2397
+ return (AdjustedNumBuckets);
2398
+
2399
+ } // DegreesOfFreedom
2400
+
2401
+
2402
+ //---------------------------------------------------------------------------
2403
+ int NumBucketsMatch(void *arg1, //BUCKETS *Histogram,
2404
+ void *arg2) { //uinT16 *DesiredNumberOfBuckets)
2405
+ /*
2406
+ ** Parameters:
2407
+ ** Histogram current histogram being tested for a match
2408
+ ** DesiredNumberOfBuckets match key
2409
+ ** Globals: none
2410
+ ** Operation:
2411
+ ** This routine is used to search a list of histogram data
2412
+ ** structures to find one with the specified number of
2413
+ ** buckets. It is called by the list search routines.
2414
+ ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2415
+ ** Exceptions: none
2416
+ ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2417
+ */
2418
+ BUCKETS *Histogram = (BUCKETS *) arg1;
2419
+ uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2420
+
2421
+ return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2422
+
2423
+ } // NumBucketsMatch
2424
+
2425
+
2426
+ //---------------------------------------------------------------------------
2427
int ListEntryMatch(void *arg1,    //ListNode
                   void *arg2) {  //Key
/*
 **	Parameters:
 **		arg1	list node contents
 **		arg2	key to look for
 **	Operation:
 **		Identity predicate: the node matches only when it is the
 **		very same pointer as the key.  Nothing is dereferenced.
 **	Return:
 **		1 if arg1 and arg2 are the same pointer, 0 otherwise
 */
  if (arg1 == arg2)
    return 1;
  return 0;
}                                // ListEntryMatch
2443
+
2444
+
2445
+ //---------------------------------------------------------------------------
2446
+ void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2447
+ /*
2448
+ ** Parameters:
2449
+ ** Buckets histogram data structure to adjust
2450
+ ** NewSampleCount new sample count to adjust to
2451
+ ** Globals: none
2452
+ ** Operation:
2453
+ ** This routine multiplies each ExpectedCount histogram entry
2454
+ ** by NewSampleCount/OldSampleCount so that the histogram
2455
+ ** is now adjusted to the new sample count.
2456
+ ** Return: none
2457
+ ** Exceptions: none
2458
+ ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2459
+ */
2460
+ int i;
2461
+ FLOAT64 AdjustFactor;
2462
+
2463
+ AdjustFactor = (((FLOAT64) NewSampleCount) /
2464
+ ((FLOAT64) Buckets->SampleCount));
2465
+
2466
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2467
+ Buckets->ExpectedCount[i] *= AdjustFactor;
2468
+ }
2469
+
2470
+ Buckets->SampleCount = NewSampleCount;
2471
+
2472
+ } // AdjustBuckets
2473
+
2474
+
2475
+ //---------------------------------------------------------------------------
2476
+ void InitBuckets(BUCKETS *Buckets) {
2477
+ /*
2478
+ ** Parameters:
2479
+ ** Buckets histogram data structure to init
2480
+ ** Globals: none
2481
+ ** Operation:
2482
+ ** This routine sets the bucket counts in the specified histogram
2483
+ ** to zero.
2484
+ ** Return: none
2485
+ ** Exceptions: none
2486
+ ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2487
+ */
2488
+ int i;
2489
+
2490
+ for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2491
+ Buckets->Count[i] = 0;
2492
+ }
2493
+
2494
+ } // InitBuckets
2495
+
2496
+
2497
+ //---------------------------------------------------------------------------
2498
+ int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2499
+ void *arg2) { //CHISTRUCT *SearchKey)
2500
+ /*
2501
+ ** Parameters:
2502
+ ** ChiStruct chi-squared struct being tested for a match
2503
+ ** SearchKey chi-squared struct that is the search key
2504
+ ** Globals: none
2505
+ ** Operation:
2506
+ ** This routine is used to search a list of structures which
2507
+ ** hold pre-computed chi-squared values for a chi-squared
2508
+ ** value whose corresponding alpha field matches the alpha
2509
+ ** field of SearchKey.
2510
+ ** It is called by the list search routines.
2511
+ ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2512
+ ** Exceptions: none
2513
+ ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2514
+ */
2515
+ CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2516
+ CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2517
+
2518
+ return (ChiStruct->Alpha == SearchKey->Alpha);
2519
+
2520
+ } // AlphaMatch
2521
+
2522
+
2523
+ //---------------------------------------------------------------------------
2524
+ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2525
+ /*
2526
+ ** Parameters:
2527
+ ** DegreesOfFreedom degrees of freedom for new chi value
2528
+ ** Alpha confidence level for new chi value
2529
+ ** Globals: none
2530
+ ** Operation:
2531
+ ** This routine allocates a new data structure which is used
2532
+ ** to hold a chi-squared value along with its associated
2533
+ ** number of degrees of freedom and alpha value.
2534
+ ** Return: none
2535
+ ** Exceptions: none
2536
+ ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2537
+ */
2538
+ CHISTRUCT *NewChiStruct;
2539
+
2540
+ NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2541
+ NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2542
+ NewChiStruct->Alpha = Alpha;
2543
+ return (NewChiStruct);
2544
+
2545
+ } // NewChiStruct
2546
+
2547
+
2548
+ //---------------------------------------------------------------------------
2549
+ FLOAT64
2550
+ Solve (SOLVEFUNC Function,
2551
+ void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
2552
+ /*
2553
+ ** Parameters:
2554
+ ** Function function whose zero is to be found
2555
+ ** FunctionParams arbitrary data to pass to function
2556
+ ** InitialGuess point to start solution search at
2557
+ ** Accuracy maximum allowed error
2558
+ ** Globals: none
2559
+ ** Operation:
2560
+ ** This routine attempts to find an x value at which Function
2561
+ ** goes to zero (i.e. a root of the function ). It will only
2562
+ ** work correctly if a solution actually exists and there
2563
+ ** are no extrema between the solution and the InitialGuess.
2564
+ ** The algorithms used are extremely primitive.
2565
+ ** Return: Solution of function ( x for which f(x) = 0 ).
2566
+ ** Exceptions: none
2567
+ ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
2568
+ */
2569
+ #define INITIALDELTA 0.1
2570
+ #define DELTARATIO 0.1
2571
+ {
2572
+ FLOAT64 x;
2573
+ FLOAT64 f;
2574
+ FLOAT64 Slope;
2575
+ FLOAT64 Delta;
2576
+ FLOAT64 NewDelta;
2577
+ FLOAT64 xDelta;
2578
+ FLOAT64 LastPosX, LastNegX;
2579
+
2580
+ x = InitialGuess;
2581
+ Delta = INITIALDELTA;
2582
+ LastPosX = MAX_FLOAT32;
2583
+ LastNegX = -MAX_FLOAT32;
2584
+ f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2585
+ while (Abs (LastPosX - LastNegX) > Accuracy) {
2586
+ // keep track of outer bounds of current estimate
2587
+ if (f < 0)
2588
+ LastNegX = x;
2589
+ else
2590
+ LastPosX = x;
2591
+
2592
+ // compute the approx. slope of f(x) at the current point
2593
+ Slope =
2594
+ ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2595
+
2596
+ // compute the next solution guess */
2597
+ xDelta = f / Slope;
2598
+ x -= xDelta;
2599
+
2600
+ // reduce the delta used for computing slope to be a fraction of
2601
+ //the amount moved to get to the new guess
2602
+ NewDelta = Abs (xDelta) * DELTARATIO;
2603
+ if (NewDelta < Delta)
2604
+ Delta = NewDelta;
2605
+
2606
+ // compute the value of the function at the new guess
2607
+ f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2608
+ }
2609
+ return (x);
2610
+
2611
+ } // Solve
2612
+
2613
+
2614
+ //---------------------------------------------------------------------------
2615
+ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
2616
+ /*
2617
+ ** Parameters:
2618
+ ** ChiParams contains degrees of freedom and alpha
2619
+ ** x value of chi-squared to evaluate
2620
+ ** Globals: none
2621
+ ** Operation:
2622
+ ** This routine computes the area under a chi density curve
2623
+ ** from 0 to x, minus the desired area under the curve. The
2624
+ ** number of degrees of freedom of the chi curve is specified
2625
+ ** in the ChiParams structure. The desired area is also
2626
+ ** specified in the ChiParams structure as Alpha ( or 1 minus
2627
+ ** the desired area ). This routine is intended to be passed
2628
+ ** to the Solve() function to find the value of chi-squared
2629
+ ** which will yield a desired area under the right tail of
2630
+ ** the chi density curve. The function will only work for
2631
+ ** even degrees of freedom. The equations are based on
2632
+ ** integrating the chi density curve in parts to obtain
2633
+ ** a series that can be used to compute the area under the
2634
+ ** curve.
2635
+ ** Return: Error between actual and desired area under the chi curve.
2636
+ ** Exceptions: none
2637
+ ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2638
+ */
2639
+ int i, N;
2640
+ FLOAT64 SeriesTotal;
2641
+ FLOAT64 Denominator;
2642
+ FLOAT64 PowerOfx;
2643
+
2644
+ N = ChiParams->DegreesOfFreedom / 2 - 1;
2645
+ SeriesTotal = 1;
2646
+ Denominator = 1;
2647
+ PowerOfx = 1;
2648
+ for (i = 1; i <= N; i++) {
2649
+ Denominator *= 2 * i;
2650
+ PowerOfx *= x;
2651
+ SeriesTotal += PowerOfx / Denominator;
2652
+ }
2653
+ return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2654
+
2655
+ } // ChiArea
2656
+
2657
+
2658
+ //---------------------------------------------------------------------------
2659
+ BOOL8
2660
+ MultipleCharSamples (CLUSTERER * Clusterer,
2661
+ CLUSTER * Cluster, FLOAT32 MaxIllegal)
2662
+ /*
2663
+ ** Parameters:
2664
+ ** Clusterer data structure holding cluster tree
2665
+ ** Cluster cluster containing samples to be tested
2666
+ ** MaxIllegal max percentage of samples allowed to have
2667
+ ** more than 1 feature in the cluster
2668
+ ** Globals: none
2669
+ ** Operation:
2670
+ ** This routine looks at all samples in the specified cluster.
2671
+ ** It computes a running estimate of the percentage of the
2672
+ ** charaters which have more than 1 sample in the cluster.
2673
+ ** When this percentage exceeds MaxIllegal, TRUE is returned.
2674
+ ** Otherwise FALSE is returned. The CharID
2675
+ ** fields must contain integers which identify the training
2676
+ ** characters which were used to generate the sample. One
2677
+ ** integer is used for each sample. The NumChar field in
2678
+ ** the Clusterer must contain the number of characters in the
2679
+ ** training set. All CharID fields must be between 0 and
2680
+ ** NumChar-1. The main function of this routine is to help
2681
+ ** identify clusters which need to be split further, i.e. if
2682
+ ** numerous training characters have 2 or more features which are
2683
+ ** contained in the same cluster, then the cluster should be
2684
+ ** split.
2685
+ ** Return: TRUE if the cluster should be split, FALSE otherwise.
2686
+ ** Exceptions: none
2687
+ ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2688
+ ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2689
+ ** splitting illegal clusters.
2690
+ */
2691
+ #define ILLEGAL_CHAR 2
2692
+ {
2693
+ static BOOL8 *CharFlags = NULL;
2694
+ static inT32 NumFlags = 0;
2695
+ int i;
2696
+ LIST SearchState;
2697
+ SAMPLE *Sample;
2698
+ inT32 CharID;
2699
+ inT32 NumCharInCluster;
2700
+ inT32 NumIllegalInCluster;
2701
+ FLOAT32 PercentIllegal;
2702
+
2703
+ // initial estimate assumes that no illegal chars exist in the cluster
2704
+ NumCharInCluster = Cluster->SampleCount;
2705
+ NumIllegalInCluster = 0;
2706
+
2707
+ if (Clusterer->NumChar > NumFlags) {
2708
+ if (CharFlags != NULL)
2709
+ memfree(CharFlags);
2710
+ NumFlags = Clusterer->NumChar;
2711
+ CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2712
+ }
2713
+
2714
+ for (i = 0; i < NumFlags; i++)
2715
+ CharFlags[i] = FALSE;
2716
+
2717
+ // find each sample in the cluster and check if we have seen it before
2718
+ InitSampleSearch(SearchState, Cluster);
2719
+ while ((Sample = NextSample (&SearchState)) != NULL) {
2720
+ CharID = Sample->CharID;
2721
+ if (CharFlags[CharID] == FALSE) {
2722
+ CharFlags[CharID] = TRUE;
2723
+ }
2724
+ else {
2725
+ if (CharFlags[CharID] == TRUE) {
2726
+ NumIllegalInCluster++;
2727
+ CharFlags[CharID] = ILLEGAL_CHAR;
2728
+ }
2729
+ NumCharInCluster--;
2730
+ PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2731
+ if (PercentIllegal > MaxIllegal)
2732
+ return (TRUE);
2733
+ }
2734
+ }
2735
+ return (FALSE);
2736
+
2737
+ } // MultipleCharSamples
2738
+
2739
+ // Compute the inverse of a matrix using LU decomposition with partial pivoting.
2740
+ // The return value is the sum of norms of the off-diagonal terms of the
2741
+ // product of a and inv. (A measure of the error.)
2742
+ double InvertMatrix(const float* input, int size, float* inv) {
2743
+ double** U; // The upper triangular array.
2744
+ double* Umem;
2745
+ double** U_inv; // The inverse of U.
2746
+ double* U_invmem;
2747
+ double** L; // The lower triangular array.
2748
+ double* Lmem;
2749
+
2750
+ // Allocate memory for the 2D arrays.
2751
+ ALLOC_2D_ARRAY(size, size, Umem, U, double);
2752
+ ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
2753
+ ALLOC_2D_ARRAY(size, size, Lmem, L, double);
2754
+
2755
+ // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2756
+ int row;
2757
+ int col;
2758
+ for (row = 0; row < size; row++) {
2759
+ for (col = 0; col < size; col++) {
2760
+ U[row][col] = input[row*size + col];
2761
+ L[row][col] = row == col ? 1.0 : 0.0;
2762
+ U_inv[row][col] = 0.0;
2763
+ }
2764
+ }
2765
+
2766
+ // Compute forward matrix by inversion by LU decomposition of input.
2767
+ for (col = 0; col < size; ++col) {
2768
+ // Find best pivot
2769
+ int best_row = 0;
2770
+ double best_pivot = -1.0;
2771
+ for (row = col; row < size; ++row) {
2772
+ if (Abs(U[row][col]) > best_pivot) {
2773
+ best_pivot = Abs(U[row][col]);
2774
+ best_row = row;
2775
+ }
2776
+ }
2777
+ // Exchange pivot rows.
2778
+ if (best_row != col) {
2779
+ for (int k = 0; k < size; ++k) {
2780
+ double tmp = U[best_row][k];
2781
+ U[best_row][k] = U[col][k];
2782
+ U[col][k] = tmp;
2783
+ tmp = L[best_row][k];
2784
+ L[best_row][k] = L[col][k];
2785
+ L[col][k] = tmp;
2786
+ }
2787
+ }
2788
+ // Now do the pivot itself.
2789
+ for (row = col + 1; row < size; ++row) {
2790
+ double ratio = -U[row][col] / U[col][col];
2791
+ for (int j = col; j < size; ++j) {
2792
+ U[row][j] += U[col][j] * ratio;
2793
+ }
2794
+ for (int k = 0; k < size; ++k) {
2795
+ L[row][k] += L[col][k] * ratio;
2796
+ }
2797
+ }
2798
+ }
2799
+ // Next invert U.
2800
+ for (col = 0; col < size; ++col) {
2801
+ U_inv[col][col] = 1.0 / U[col][col];
2802
+ for (row = col - 1; row >= 0; --row) {
2803
+ double total = 0.0;
2804
+ for (int k = col; k > row; --k) {
2805
+ total += U[row][k] * U_inv[k][col];
2806
+ }
2807
+ U_inv[row][col] = -total / U[row][row];
2808
+ }
2809
+ }
2810
+ // Now the answer is U_inv.L.
2811
+ for (row = 0; row < size; row++) {
2812
+ for (col = 0; col < size; col++) {
2813
+ double sum = 0.0;
2814
+ for (int k = row; k < size; ++k) {
2815
+ sum += U_inv[row][k] * L[k][col];
2816
+ }
2817
+ inv[row*size + col] = sum;
2818
+ }
2819
+ }
2820
+ // Check matrix product.
2821
+ double error_sum = 0.0;
2822
+ for (row = 0; row < size; row++) {
2823
+ for (col = 0; col < size; col++) {
2824
+ double sum = 0.0;
2825
+ for (int k = 0; k < size; ++k) {
2826
+ sum += input[row*size + k] * inv[k *size + col];
2827
+ }
2828
+ if (row != col) {
2829
+ error_sum += Abs(sum);
2830
+ }
2831
+ }
2832
+ }
2833
+ return error_sum;
2834
+ }