grammar_cop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,16 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+ void print_disjunct_counts(Sentence sent);
14
+ void print_sentence(FILE *fp, Sentence sent, int w);
15
+ void print_expression_sizes(Sentence sent);
16
+ void compute_chosen_words(Sentence sent, Linkage linkage);
@@ -0,0 +1,1864 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ #include "api.h"
15
+ #include "disjunct-utils.h"
16
+
17
+ #define CONTABSZ 8192 /* number of buckets in a connector hash table; hash_S() masks with CONTABSZ-1, so this must stay a power of two */
18
+ typedef Connector * connector_table; /* one bucket head; prune() uses an array of CONTABSZ of these, chained through Connector.tableNext */
19
+
20
+ /*
21
+ typedef struct disjunct_dup_table_s disjunct_dup_table;
22
+ struct disjunct_dup_table_s
23
+ {
24
+ int dup_table_size;
25
+ Disjunct ** dup_table;
26
+ };
27
+ */
28
+
29
+ /* the indication in a word field that this connector cannot
30
+ * be used -- is obsolete.
31
+ */
32
+ #define BAD_WORD (MAX_SENTENCE+1)
33
+
34
+ typedef struct c_list_s C_list; /* node of a singly linked list of connector references */
35
+ struct c_list_s
36
+ {
37
+ Connector * c; /* the connector this node refers to */
38
+ int shallow; /* NOTE(review): flag whose meaning is not shown in this part of the file -- confirm where it is set */
39
+ C_list * next; /* following node in the list */
40
+ };
41
+
42
+ typedef struct power_table_s power_table; /* hash tables of C_list chains, with MAX_SENTENCE slots each for a left and a right side */
43
+ struct power_table_s
44
+ {
45
+ int power_table_size; /* NOTE(review): presumably the number of slots actually in use -- confirm against the code that fills it */
46
+ int l_table_size[MAX_SENTENCE]; /* the sizes of the hash tables */
47
+ int r_table_size[MAX_SENTENCE]; /* same as l_table_size, for r_table */
48
+ C_list ** l_table[MAX_SENTENCE]; /* one array of C_list chain heads per slot; presumably indexed by word position -- TODO confirm */
49
+ C_list ** r_table[MAX_SENTENCE]; /* right-hand counterpart of l_table */
50
+ };
51
+
52
+ typedef struct cms_struct Cms; /* one entry of a multiset of names, kept on a singly linked chain */
53
+ struct cms_struct
54
+ {
55
+ Cms * next; /* following entry on the same chain */
56
+ const char * name; /* the name being counted; presumably a connector name -- confirm against the users of this struct */
57
+ int count; /* the number of times this is in the multiset */
58
+ };
59
+
60
+ #define CMS_SIZE (2<<10) /* 2<<10 == 2048 bucket heads */
61
+ typedef struct multiset_table_s multiset_table; /* fixed-size table of Cms chains */
62
+ struct multiset_table_s
63
+ {
64
+ Cms * cms_table[CMS_SIZE]; /* bucket heads, each the start of a Cms chain linked through Cms.next */
65
+ };
66
+
67
+ typedef struct prune_context_s prune_context; /* state bundle for the power-pruning code that appears later in the file */
68
+ struct prune_context_s
69
+ {
70
+ int null_links; /* NOTE(review): presumably whether null links are allowed -- confirm where it is set */
71
+ char ** deletable; /* two-level char table; presumably indexed by word pair -- TODO confirm */
72
+ char ** effective_dist; /* two-level char table; presumably indexed by word pair -- TODO confirm */
73
+ int power_cost; /* NOTE(review): looks like a work counter -- confirm */
74
+ int power_prune_mode; /* either GENTLE or RUTHLESS */
75
+ int N_changed; /* counts the number of changes
76
+ of c->word fields in a pass */
77
+
78
+ power_table *pt; /* the hash tables this context operates on */
79
+ Sentence sent; /* the sentence being pruned */
80
+ };
81
+
82
+ /*
83
+
84
+ The algorithms in this file prune disjuncts from the disjunct list
85
+ of the sentence that can be eliminated by simple checks. The first
86
+ check works as follows:
87
+
88
+ A series of passes are made through the sentence, alternating
89
+ left-to-right and right-to-left. Consider the left-to-right pass (the
90
+ other is symmetric). A set S of connectors is maintained (initialized
91
+ to be empty). Now the disjuncts of the current word are processed.
92
+ If a given disjunct's left pointing connectors have the property that
93
+ at least one of them has no connector in S to which it can be matched,
94
+ then that disjunct is deleted. Now the set S is augmented by the right
95
+ connectors of the remaining disjuncts of that word. This completes
96
+ one word. The process continues through the words from left to right.
97
+ Alternate passes are made until no disjunct is deleted.
98
+
99
+ It worries me a little that if there are some really huge disjuncts lists,
100
+ then this process will probably do nothing. (This fear turns out to be
101
+ unfounded.)
102
+
103
+ Notes: Power pruning will not work if applied before generating the
104
+ "and" disjuncts. This is because certain of its tricks don't work.
105
+ Think about this, and finish this note later....
106
+ Also, currently I use the standard connector match procedure instead
107
+ of the pruning one, since I know power pruning will not be used before
108
+ and generation. Replace this to allow power pruning to work before
109
+ generating and disjuncts.
110
+
111
+ Currently it seems that normal pruning, power pruning, and generation,
112
+ pruning, and power pruning (after "and" generation) and parsing take
113
+ about the same amount of time. This is why doing power pruning before
114
+ "and" generation might be a very good idea.
115
+
116
+ New idea: Suppose all the disjuncts of a word have a connector of type
117
+ c pointing to the right. And further, suppose that there is exactly one
118
+ word to its right containing that type of connector pointing to the left.
119
+ Then all the other disjuncts on the latter word can be deleted.
120
+ (This situation is created by the processing of "either...or", and by
121
+ the extra disjuncts added to a "," neighboring a conjunction.)
122
+
123
+ */
124
+
125
+ /**
126
+ * Return the connector-table bucket index for connector c: the value of
127
+ * connector_hash(c) masked down to the range [0, CONTABSZ). Per the original author's note, the hash only looks at the leading upper case letters of the connector string, and the label fields. This ensures that if two
128
+ * strings match (formally), then they must hash to the same place.
129
+ */
130
+ static inline int hash_S(Connector * c)
131
+ {
132
+ int h = connector_hash(c); /* connector_hash() is defined outside this part of the file */
133
+ return (h & (CONTABSZ-1)); /* keep only the low bits; correct as a bucket index because CONTABSZ is a power of two */
134
+ }
135
+
136
+ /**
137
+ * This is almost identical to match(). Its reason for existence
138
+ * is the rather subtle fact that "and" can transform an "Ss"
139
+ * connector into "Sp". This means that in order for pruning to
140
+ * work, we must allow an "Ss" connector on a word to match an "Sp" connector
141
+ * on a word to its right. This is what this version of match allows.
142
+ * We assume that a is on a word to the left of b. Returns TRUE when a and b pass every check below for distance dist, FALSE otherwise.
143
+ */
144
+ int prune_match(int dist, Connector *a, Connector *b)
145
+ {
146
+ const char *s, *t;
147
+ int x, y;
148
+
149
+ if (a->label != b->label) return FALSE; /* labels must be equal */
150
+
151
+ x = hash_S(a);
152
+ y = hash_S(b);
153
+ if (x != y) return FALSE; /* different hash bucket: cheap early rejection */
154
+
155
+ s = a->string;
156
+ t = b->string;
157
+
158
+ while(s < a->prune_string || t < b->prune_string) /* walk both strings until each pointer has reached its connector's prune_string */
159
+ {
160
+ if (*s != *t) return FALSE; /* any difference in this leading part is a mismatch */
161
+ s++;
162
+ t++;
163
+ }
164
+
165
+ /* printf("PM: a=%4s b=%4s ap=%d bp=%d a->ll=%d b->ll=%d dist=%d\n",
166
+ s, t, x, y, a->length_limit, b->length_limit, dist); */
167
+ if (dist > a->length_limit || dist > b->length_limit) return FALSE; /* dist exceeds the length limit of at least one connector */
168
+
169
+ x = a->priority; /* x and y are reused: from here on they hold priorities, not hash values */
170
+ y = b->priority;
171
+
172
+ if ((x == THIN_priority) && (y == THIN_priority))
173
+ {
174
+ #if defined(PLURALIZATION)
175
+ /*
176
+ if ((*(a->string)=='S') && ((*s=='s') || (*s=='p')) && (*t=='p')) {
177
+ return TRUE;
178
+ }
179
+ */
180
+ /*
181
+ The above is a kludge to stop pruning from killing off disjuncts
182
+ which (because of pluralization in and) might become valid later.
183
+ Recall that "and" converts a singular subject into a plural one.
184
+ The (*s=='p') part is so that "he and I are good" doesn't get killed off.
185
+ The above hack is subsumed by the following one:
186
+ */
187
+ if ((*(a->string)=='S') && ((*s=='s') || (*s=='p')) &&
188
+ ((*t=='p') || (*t=='s')) &&
189
+ ((s-1 == a->string) || ((s-2 == a->string) && (*(s-1) == 'I')))){
190
+ return TRUE;
191
+ }
192
+ /*
193
+ This change is to accommodate "nor". In particular we need to
194
+ prevent "neither John nor I likes dogs" from being killed off.
195
+ We want to allow this to apply to "are neither a dog nor a cat here"
196
+ and "is neither a dog nor a cat here". This uses the "SI" connector.
197
+ The third line above ensures that the connector is either "S" or "SI".
198
+ */
199
+ #endif
200
+ while ((*s != '\0') && (*t != '\0')) /* compare the remaining characters until either string ends */
201
+ {
202
+ if ((*s == '*') || (*t == '*') ||
203
+ ((*s == *t) && (*s != '^')))
204
+ {
205
+ /* this last case here is rather obscure. It prevents
206
+ '^' from matching '^'.....Is this necessary?
207
+ ......yes, I think it is. */
208
+ s++;
209
+ t++;
210
+ }
211
+ else
212
+ return FALSE;
213
+ }
214
+ return TRUE; /* one string ran out with no mismatch found */
215
+ }
216
+ else if ((x == UP_priority) && (y == DOWN_priority))
217
+ {
218
+ while ((*s!='\0') && (*t!='\0'))
219
+ {
220
+ if ((*s == *t) || (*s == '*') || (*t == '^'))
221
+ {
222
+ /* that '^' should match on the DOWN_priority
223
+ node is subtle, but correct */
224
+ s++;
225
+ t++;
226
+ }
227
+ else
228
+ return FALSE;
229
+ }
230
+ return TRUE;
231
+ }
232
+ else if ((y == UP_priority) && (x == DOWN_priority)) /* mirror image of the previous case with the roles of s and t exchanged */
233
+ {
234
+ while ((*s!='\0') && (*t!='\0'))
235
+ {
236
+ if ((*s == *t) || (*t == '*') || (*s == '^'))
237
+ {
238
+ s++;
239
+ t++;
240
+ }
241
+ else
242
+ return FALSE;
243
+ }
244
+ return TRUE;
245
+ }
246
+ else
247
+ return FALSE; /* every other priority combination never matches */
248
+ }
249
+
250
+ static void zero_connector_table(connector_table *ct) /* Empty the table: zero-fill all CONTABSZ bucket heads of ct */
251
+ {
252
+ memset(ct, 0, sizeof(Connector *) * CONTABSZ); /* relies on all-bits-zero being the NULL pointer, which the chain walks in this file test for */
253
+ }
254
+
255
+ /**
256
+ * This function puts connector c into the connector table ct
257
+ * if one like it isn't already there. "Like it" means equal string, label and priority. A new entry is pushed onto the front of its bucket's chain through c->tableNext.
258
+ */
259
+ static void insert_connector(connector_table *ct, Connector * c)
260
+ {
261
+ int h;
262
+ Connector * e;
263
+
264
+ h = hash_S(c); /* bucket index for c */
265
+
266
+ for (e = ct[h]; e != NULL; e = e->tableNext) /* scan the bucket's chain for an equivalent connector */
267
+ {
268
+ if ((strcmp(c->string, e->string) == 0) &&
269
+ (c->label == e->label) &&
270
+ (c->priority == e->priority))
271
+ return; /* equivalent entry already present: leave the table unchanged */
272
+ }
273
+ c->tableNext = ct[h]; /* overwrites whatever c->tableNext held before */
274
+ ct[h] = c;
275
+ }
276
+
277
+ void prune(Sentence sent) /* Delete every disjunct of sent that has a connector with no possible partner, alternating left-to-right and right-to-left passes until a pass deletes nothing */
278
+ {
279
+ Connector *e, *f;
280
+ int w;
281
+ int N_deleted;
282
+ Connector *ct[CONTABSZ]; /* hash table of connectors from the surviving disjuncts already visited in the current pass */
283
+ Disjunct fake_head, *d, *d1; /* fake_head is a dummy list head so the first disjunct of a word can be unlinked like any other */
284
+
285
+ /* XXX why is this here ?? */
286
+ count_set_effective_distance(sent);
287
+
288
+ N_deleted = 1; /* a lie to make it always do at least 2 passes */
289
+ while(1)
290
+ {
291
+ /* Left-to-right pass */
292
+ zero_connector_table(ct);
293
+
294
+ /* For every word */
295
+ for (w = 0; w < sent->length; w++)
296
+ {
297
+ d = &fake_head;
298
+ d->next = sent->word[w].d;
299
+
300
+ /* For every disjunct of word */
301
+ while ((d1 = d->next))
302
+ {
303
+ e = d1->left;
304
+
305
+ /* For every left clause of this disjunct */
306
+ while (e)
307
+ {
308
+ int h = hash_S(e);
309
+ for (f = ct[h]; f != NULL; f = f->tableNext)
310
+ {
311
+ if (prune_match(0, f, e)) break; /* f comes from an earlier word, so it is the left-hand argument */
312
+ }
313
+ if (!f) break; /* If f null, not a single match was found */
314
+ e = e->next;
315
+ }
316
+
317
+ /* We know this disjunct is dead since no match
318
+ * can be found on a required clause. */
319
+ if (e)
320
+ {
321
+ N_deleted ++;
322
+ free_connectors(d1->left);
323
+ free_connectors(d1->right);
324
+ d->next = d1->next; /* unlink d1; d stays put so the successor is examined next */
325
+ xfree(d1, sizeof(Disjunct));
326
+ }
327
+ else
328
+ {
329
+ /* Store surviving disjunct in hash table */
330
+ for (e = d1->right; e != NULL; e = e->next)
331
+ {
332
+ insert_connector(ct, e);
333
+ }
334
+ d = d1; /* move on to next disjunct*/
335
+ }
336
+ }
337
+ sent->word[w].d = fake_head.next; /* write the possibly changed list head back to the word */
338
+ }
339
+
340
+ if (2 < verbosity)
341
+ {
342
+ printf("l->r pass removed %d\n", N_deleted); /* on the first iteration this figure includes the initial 1 set above */
343
+ print_disjunct_counts(sent);
344
+ }
345
+
346
+ /* We did nothing (and this is not the 1st pass) */
347
+ if (N_deleted == 0) break;
348
+
349
+ /* Right-to-left pass */
350
+ zero_connector_table(ct);
351
+ N_deleted = 0;
352
+
353
+ /* For every word */
354
+ for (w = sent->length-1; w >= 0; w--)
355
+ {
356
+ d = &fake_head;
357
+ d->next = sent->word[w].d;
358
+
359
+ while ((d1 = d->next))
360
+ {
361
+ e = d1->right;
362
+
363
+ while (e)
364
+ {
365
+ int h = hash_S(e);
366
+ for (f = ct[h]; f != NULL; f = f->tableNext)
367
+ {
368
+ if (prune_match(0, e, f)) break; /* argument order flipped: e is now the left-hand connector */
369
+ }
370
+ if (!f) break; /* If f null, not a single match was found */
371
+ e = e->next;
372
+ }
373
+
374
+ /* We know this disjunct is dead since it can't match
375
+ * to the right*/
376
+ if(e)
377
+ {
378
+ N_deleted ++;
379
+ free_connectors(d1->left);
380
+ free_connectors(d1->right);
381
+ d->next = d1->next;
382
+ xfree(d1, sizeof(Disjunct));
383
+ }
384
+ else
385
+ {
386
+ /* Store surviving disjunct in hash table */
387
+ for (e = d1->left; e != NULL; e = e->next)
388
+ {
389
+ insert_connector(ct, e);
390
+ }
391
+ d = d1; /* move on to next disjunct*/
392
+ }
393
+ }
394
+ sent->word[w].d = fake_head.next; /* write the list head back once per word, after the disjunct loop, exactly as the left-to-right pass does */
395
+ }
396
+
397
+ if (verbosity > 2)
398
+ {
399
+ printf("r->l pass removed %d\n", N_deleted);
400
+ print_disjunct_counts(sent);
401
+ }
402
+
403
+ /* We made no change on this pass */
404
+ if (N_deleted == 0) break;
405
+ N_deleted = 0;
406
+ }
407
+ }
408
+
409
+ /*
410
+ The second algorithm eliminates disjuncts that are dominated by
411
+ another. It works by hashing them all, and checking for domination.
412
+ */
413
+
414
+ #if FALSE
415
+ /* ============================================================x */
416
+
417
+ /*
418
+ Consider the idea of deleting a disjunct if it is dominated (in terms of
419
+ what it can match) by some other disjunct on the same word. This has
420
+ been implemented below. There are three problems with it:
421
+
422
+ (1) It is almost never the case that any disjuncts are eliminated.
423
+ (The code below works correctly with fat links, but because
424
+ all of the fat connectors on a fat disjunct have the same matching
425
+ string, the only time a disjunct will die is if it is the same
426
+ as another one. This is captured by the simplistic version below.)
427
+
428
+ (2) connector_matches_alam may not be exactly correct. I don't
429
+ think it does the fat link matches properly. (See the comment
430
+ in and.c for more information about matching fat links.) This is
431
+ irrelevant because of (1).
432
+
433
+ (3) The linkage that is eliminated by this, might just be the one that
434
+ passes post-processing, as the following example shows.
435
+ This is pretty silly, and should probably be changed.
436
+
437
+ > telling John how our program works would be stupid
438
+ Accepted (2 linkages, 1 with no P.P. violations)
439
+ Linkage 1, cost vector = (0, 0, 7)
440
+
441
+ +------------------G-----------------+
442
+ +-----R-----+----CL----+ |
443
+ +---O---+ | +---D--+---S---+ +--I-+-AI-+
444
+ | | | | | | | | |
445
+ telling.g John how our program.n works would be stupid
446
+
447
+ ///// CLg <---CLg---> CL telling.g
448
+ (g) telling.g G <---G-----> G would
449
+ (g) (d) telling.g R <---R-----> R how
450
+ (g) (d) telling.g O <---O-----> O John
451
+ (g) (d) how CLe <---CLe---> CL program.n
452
+ (g) (d) (e) our D <---Ds----> Ds program.n
453
+ (g) (d) (e) program.n Ss <---Ss----> Ss works
454
+ (g) would I <---Ix----> Ix be
455
+ (g) be AI <---AIi---> AIi stupid
456
+
457
+ (press return for another)
458
+ >
459
+ Linkage 2 (bad), cost vector = (0, 0, 7)
460
+
461
+ +------------------G-----------------+
462
+ +-----R-----+----CL----+ |
463
+ +---O---+ | +---D--+---S---+ +--I-+-AI-+
464
+ | | | | | | | | |
465
+ telling.g John how our program.n works would be stupid
466
+
467
+ ///// CLg <---CLg---> CL telling.g
468
+ (g) telling.g G <---G-----> G would
469
+ (g) (d) telling.g R <---R-----> R how
470
+ (g) (d) telling.g O <---O-----> O John
471
+ (g) (d) how CLe <---CLe---> CL program.n
472
+ (g) (d) (e) our D <---Ds----> Ds program.n
473
+ (g) (d) (e) program.n Ss <---Ss----> Ss works
474
+ (g) would I <---Ix----> Ix be
475
+ (g) be AI <---AI----> AI stupid
476
+
477
+ P.P. violations:
478
+ Special subject rule violated
479
+ */
480
+
481
+ /**
482
+ * hash function that takes a string and a seed value i
483
+ */
484
+ static int string_hash(disjunct_dup_table *dt, const char * s, int i)
485
+ {
486
+ for(;*s != '\0';s++) i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)];
487
+ return (i & (dt->dup_table_size-1));
488
+ }
489
+
490
+ /**
491
+ * This returns true if the connector a matches everything that b
492
+ * matches, and possibly more. (alam=at least as much)
493
+ *
494
+ * TRUE for equal connectors.
495
+ * remains TRUE if multi-match added to the first.
496
+ * remains TRUE if subsrcripts deleted from the first.
497
+ */
498
+ int connector_matches_alam(Connector * a, Connector * b)
499
+ {
500
+ char * s, * t, *u;
501
+ if (((!a->multi) && b->multi) ||
502
+ (a->label != b->label) ||
503
+ (a->priority != b->priority)) return FALSE;
504
+ s = a->string;
505
+ t = b->string;
506
+
507
+ /* isupper -- connectors cannot be UTF8 at this time */
508
+ while(isupper(*s) || isupper(*t))
509
+ {
510
+ if (*s == *t) {
511
+ s++;
512
+ t++;
513
+ } else return FALSE;
514
+ }
515
+ if (a->priority == DOWN_priority) {
516
+ u = s;
517
+ s = t;
518
+ t = u;
519
+ }
520
+ while((*s != '\0') && (*t != '\0')) {
521
+ if ((*s == *t) || (*s == '*') || (*t == '^')) {
522
+ s++;
523
+ t++;
524
+ } else return FALSE;
525
+ }
526
+ while ((*s != '\0') && (*s == '*')) s++;
527
+ return (*s == '\0');
528
+ }
529
+
530
+
531
+ /**
532
+ * This hash function that takes a connector and a seed value i.
533
+ * It only looks at the leading upper case letters of
534
+ * the string, and the label. This ensures that if two connectors
535
+ * match, then they must hash to the same place.
536
+ */
537
+ static int conn_hash(Connector * c, int i)
538
+ {
539
+ int nb;
540
+ const char * s;
541
+ s = c->string;
542
+
543
+ i = i + (i<<1) + randtable[(c->label + i) & (RTSIZE-1)];
544
+ nb = is_utf8_upper(s);
545
+ while(nb)
546
+ {
547
+ i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)];
548
+ s += nb;
549
+ nb = is_utf8_upper(s);
550
+ }
551
+ return i;
552
+ }
553
+
554
+ static inline int pconnector_hash(disjunct_dup_table *dt, Connector * c, int i)
555
+ {
556
+ i = conn_hash(c, i);
557
+ return (i & (ct->dup_table_size-1));
558
+ }
559
+
560
+ /**
561
+ * This is a hash function for disjuncts
562
+ */
563
+ static int hash_disjunct(disjunct_dup_table *dt, Disjunct * d)
564
+ {
565
+ int i;
566
+ Connector *e;
567
+ i = 0;
568
+ for (e = d->left ; e != NULL; e = e->next)
569
+ {
570
+ i = pconnector_hash(dt, e, i);
571
+ }
572
+ for (e = d->right ; e != NULL; e = e->next)
573
+ {
574
+ i = pconnector_hash(dt, e, i);
575
+ }
576
+ return string_hash(dt, d->string, i);
577
+ }
578
+
579
+ /**
580
+ * Returns TRUE if disjunct d1 can match anything that d2 can
581
+ * if this happens, it constitutes a proof that there is absolutely
582
+ * no use for d2.
583
+ */
584
+ static int disjunct_matches_alam(Disjunct * d1, Disjunct * d2)
585
+ {
586
+ Connector *e1, *e2;
587
+ if (d1->cost > d2->cost) return FALSE;
588
+ e1 = d1->left;
589
+ e2 = d2->left;
590
+ while((e1!=NULL) && (e2!=NULL)) {
591
+ if (!connector_matches_alam(e1,e2)) break;
592
+ e1 = e1->next;
593
+ e2 = e2->next;
594
+ }
595
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
596
+ e1 = d1->right;
597
+ e2 = d2->right;
598
+ while((e1!=NULL) && (e2!=NULL)) {
599
+ if (!connector_matches_alam(e1,e2)) break;
600
+ e1 = e1->next;
601
+ e2 = e2->next;
602
+ }
603
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
604
+ return (strcmp(d1->string, d2->string) == 0);
605
+ }
606
+
607
+ /**
608
+ * Takes the list of disjuncts pointed to by d, eliminates all
609
+ * duplicates, and returns a pointer to a new list.
610
+ * It frees the disjuncts that are eliminated.
611
+ */
612
+ Disjunct * eliminate_duplicate_disjuncts(Disjunct * d)
613
+ {
614
+ int i, h, count;
615
+ Disjunct *dn, *dx, *dxn, *front;
616
+ count = 0;
617
+ disjunct_dup_table *dt;
618
+
619
+ dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d)));
620
+
621
+ for (;d!=NULL; d = dn)
622
+ {
623
+ dn = d->next;
624
+ h = hash_disjunct(d);
625
+
626
+ front = NULL;
627
+ for (dx = dt->dup_table[h]; dx != NULL; dx = dxn)
628
+ {
629
+ dxn = dx->next;
630
+ if (disjunct_matches_alam(dx,d))
631
+ {
632
+ /* we know that d should be killed */
633
+ d->next = NULL;
634
+ free_disjuncts(d);
635
+ count++;
636
+ front = catenate_disjuncts(front, dx);
637
+ break;
638
+ } else if (disjunct_matches_alam(d,dx)) {
639
+ /* we know that dx should be killed off */
640
+ dx->next = NULL;
641
+ free_disjuncts(dx);
642
+ count++;
643
+ } else {
644
+ /* neither should be killed off */
645
+ dx->next = front;
646
+ front = dx;
647
+ }
648
+ }
649
+ if (dx == NULL) {
650
+ /* we put d in the table */
651
+ d->next = front;
652
+ front = d;
653
+ }
654
+ dt->dup_table[h] = front;
655
+ }
656
+
657
+ /* d is now NULL */
658
+ for (i = 0; i < dt->dup_table_size; i++)
659
+ {
660
+ for (dx = dt->dup_table[i]; dx != NULL; dx = dxn)
661
+ {
662
+ dxn = dx->next;
663
+ dx->next = d;
664
+ d = dx;
665
+ }
666
+ }
667
+
668
+ if ((verbosity > 2) && (count != 0)) printf("killed %d duplicates\n", count);
669
+
670
+ disjunct_dup_table_delete(dt);
671
+ return d;
672
+ }
673
+
674
+ /* ============================================================x */
675
+ #endif
676
+
677
+ /* ================================================================= */
678
+ /**
679
+ * Here is expression pruning. This is done even before the expressions
680
+ * are turned into lists of disjuncts.
681
+ *
682
+ * This uses many of the same data structures and functions that are used
683
+ * by prune.
684
+ *
685
+ * The purge operations remove all irrelevant stuff from the expression,
686
+ * and free the purged stuff. A connector is deemed irrelevant if its
687
+ * string pointer has been set to NULL. The passes through the sentence
688
+ * have the job of doing this.
689
+ *
690
+ * If an OR or AND type expression node has one child, we can replace it
691
+ * by its child. This, of course, is not really necessary, except for
692
+ * performance(?)
693
+ */
694
+
695
+ static Exp* purge_Exp(Exp *);
696
+
697
+ /**
698
+ * Get rid of the elements with null expressions
699
+ */
700
+ static E_list * or_purge_E_list(E_list * l)
701
+ {
702
+ E_list * el;
703
+ if (l == NULL) return NULL;
704
+ if ((l->e = purge_Exp(l->e)) == NULL)
705
+ {
706
+ el = or_purge_E_list(l->next);
707
+ xfree((char *)l, sizeof(E_list));
708
+ return el;
709
+ }
710
+ l->next = or_purge_E_list(l->next);
711
+ return l;
712
+ }
713
+
714
+ /**
715
+ * Returns 0 iff the length of the disjunct list is 0.
716
+ * If this is the case, it frees the structure rooted at l.
717
+ */
718
+ static int and_purge_E_list(E_list * l)
719
+ {
720
+ if (l == NULL) return 1;
721
+ if ((l->e = purge_Exp(l->e)) == NULL)
722
+ {
723
+ free_E_list(l->next);
724
+ xfree((char *)l, sizeof(E_list));
725
+ return 0;
726
+ }
727
+ if (and_purge_E_list(l->next) == 0)
728
+ {
729
+ free_Exp(l->e);
730
+ xfree((char *)l, sizeof(E_list));
731
+ return 0;
732
+ }
733
+ return 1;
734
+ }
735
+
736
+ /**
737
+ * Must be called with a non-null expression.
738
+ * Return NULL iff the expression has no disjuncts.
739
+ */
740
+ static Exp* purge_Exp(Exp *e)
741
+ {
742
+ if (e->type == CONNECTOR_type)
743
+ {
744
+ if (e->u.string == NULL)
745
+ {
746
+ xfree((char *)e, sizeof(Exp));
747
+ return NULL;
748
+ }
749
+ else
750
+ {
751
+ return e;
752
+ }
753
+ }
754
+ if (e->type == AND_type)
755
+ {
756
+ if (and_purge_E_list(e->u.l) == 0)
757
+ {
758
+ xfree((char *)e, sizeof(Exp));
759
+ return NULL;
760
+ }
761
+ }
762
+ else
763
+ {
764
+ e->u.l = or_purge_E_list(e->u.l);
765
+ if (e->u.l == NULL)
766
+ {
767
+ xfree((char *)e, sizeof(Exp));
768
+ return NULL;
769
+ }
770
+ }
771
+
772
+ /* This code makes it kill off nodes that have just one child
773
+ (1) It's going to give an insignificant speed-up
774
+ (2) Costs have not been handled correctly here.
775
+ The code is excised for these reasons.
776
+ */
777
+ /*
778
+ if ((e->u.l != NULL) && (e->u.l->next == NULL))
779
+ {
780
+ ne = e->u.l->e;
781
+ xfree((char *) e->u.l, sizeof(E_list));
782
+ xfree((char *) e, sizeof(Exp));
783
+ return ne;
784
+ }
785
+ */
786
+ return e;
787
+ }
788
+
789
+ /**
790
+ * Returns TRUE if c can match anything in the set S.
791
+ */
792
+ static inline int matches_S(connector_table *ct, Connector * c, int dir)
793
+ {
794
+ Connector * e;
795
+ int h = hash_S(c);
796
+
797
+ if (dir == '-')
798
+ {
799
+ for (e = ct[h]; e != NULL; e = e->tableNext)
800
+ {
801
+ if (prune_match(0, e, c)) return TRUE;
802
+ }
803
+ return FALSE;
804
+ }
805
+ else
806
+ {
807
+ for (e = ct[h]; e != NULL; e = e->tableNext)
808
+ {
809
+ if (prune_match(0, c, e)) return TRUE;
810
+ }
811
+ return FALSE;
812
+ }
813
+ }
814
+
815
+ /**
816
+ * Mark as dead all of the dir-pointing connectors
817
+ * in e that are not matched by anything in the current set.
818
+ * Returns the number of connectors so marked.
819
+ */
820
+ static int mark_dead_connectors(connector_table *ct, Exp * e, int dir)
821
+ {
822
+ int count;
823
+ count = 0;
824
+ if (e->type == CONNECTOR_type)
825
+ {
826
+ if (e->dir == dir)
827
+ {
828
+ Connector dummy;
829
+ init_connector(&dummy);
830
+ dummy.label = NORMAL_LABEL;
831
+ dummy.priority = THIN_priority;
832
+ dummy.string = e->u.string;
833
+ if (!matches_S(ct, &dummy, dir))
834
+ {
835
+ e->u.string = NULL;
836
+ count++;
837
+ }
838
+ }
839
+ }
840
+ else
841
+ {
842
+ E_list *l;
843
+ for (l = e->u.l; l != NULL; l = l->next)
844
+ {
845
+ count += mark_dead_connectors(ct, l->e, dir);
846
+ }
847
+ }
848
+ return count;
849
+ }
850
+
851
+ /**
852
+ * Put into the set S all of the dir-pointing connectors still in e.
853
+ * Return a list of allocated dummy connectors; these will need to be
854
+ * freed.
855
+ */
856
+ static Connector * insert_connectors(connector_table *ct, Exp * e,
857
+ Connector *alloc_list, int dir)
858
+ {
859
+ if (e->type == CONNECTOR_type)
860
+ {
861
+ if (e->dir == dir)
862
+ {
863
+ Connector *dummy = connector_new();
864
+ dummy->string = e->u.string;
865
+ insert_connector(ct, dummy);
866
+ dummy->next = alloc_list;
867
+ alloc_list = dummy;
868
+ }
869
+ }
870
+ else
871
+ {
872
+ E_list *l;
873
+ for (l=e->u.l; l!=NULL; l=l->next)
874
+ {
875
+ alloc_list = insert_connectors(ct, l->e, alloc_list, dir);
876
+ }
877
+ }
878
+ return alloc_list;
879
+ }
880
+
881
+ /**
882
+ * This removes the expressions that are empty from the list corresponding
883
+ * to word w of the sentence.
884
+ */
885
+ static void clean_up_expressions(Sentence sent, int w)
886
+ {
887
+ X_node head_node, *d, *d1;
888
+ d = &head_node;
889
+ d->next = sent->word[w].x;
890
+ while (d->next != NULL)
891
+ {
892
+ if (d->next->exp == NULL)
893
+ {
894
+ d1 = d->next;
895
+ d->next = d1->next;
896
+ xfree((char *)d1, sizeof(X_node));
897
+ }
898
+ else
899
+ {
900
+ d = d->next;
901
+ }
902
+ }
903
+ sent->word[w].x = head_node.next;
904
+ }
905
+
906
+ void expression_prune(Sentence sent)
907
+ {
908
+ int N_deleted;
909
+ X_node * x;
910
+ int w;
911
+ Connector *ct[CONTABSZ];
912
+ Connector *dummy_list = NULL;
913
+
914
+ zero_connector_table(ct);
915
+
916
+ N_deleted = 1; /* a lie to make it always do at least 2 passes */
917
+
918
+ while(1)
919
+ {
920
+ /* Left-to-right pass */
921
+ /* For every word */
922
+ for (w = 0; w < sent->length; w++)
923
+ {
924
+ /* For every expression in word */
925
+ for (x = sent->word[w].x; x != NULL; x = x->next)
926
+ {
927
+ /* printf("before marking: "); print_expression(x->exp); printf("\n"); */
928
+ N_deleted += mark_dead_connectors(ct, x->exp, '-');
929
+ /* printf(" after marking: "); print_expression(x->exp); printf("\n"); */
930
+ }
931
+ for (x = sent->word[w].x; x != NULL; x = x->next)
932
+ {
933
+ /* printf("before purging: "); print_expression(x->exp); printf("\n"); */
934
+ x->exp = purge_Exp(x->exp);
935
+ /* printf("after purging: "); print_expression(x->exp); printf("\n"); */
936
+ }
937
+
938
+ /* gets rid of X_nodes with NULL exp */
939
+ clean_up_expressions(sent, w);
940
+ for (x = sent->word[w].x; x != NULL; x = x->next)
941
+ {
942
+ dummy_list = insert_connectors(ct, x->exp, dummy_list, '+');
943
+ }
944
+ }
945
+
946
+ if (verbosity > 2)
947
+ {
948
+ printf("l->r pass removed %d\n", N_deleted);
949
+ print_expression_sizes(sent);
950
+ }
951
+
952
+ /* Free the allocated dummy connectors */
953
+ free_connectors(dummy_list);
954
+ dummy_list = NULL;
955
+ zero_connector_table(ct);
956
+
957
+ if (N_deleted == 0) break;
958
+
959
+ /* Right-to-left pass */
960
+ N_deleted = 0;
961
+ for (w = sent->length-1; w >= 0; w--)
962
+ {
963
+ for (x = sent->word[w].x; x != NULL; x = x->next)
964
+ {
965
+ /* printf("before marking: "); print_expression(x->exp); printf("\n"); */
966
+ N_deleted += mark_dead_connectors(ct, x->exp, '+');
967
+ /* printf("after marking: "); print_expression(x->exp); printf("\n"); */
968
+ }
969
+ for (x = sent->word[w].x; x != NULL; x = x->next)
970
+ {
971
+ /* printf("before perging: "); print_expression(x->exp); printf("\n"); */
972
+ x->exp = purge_Exp(x->exp);
973
+ /* printf("after perging: "); print_expression(x->exp); printf("\n"); */
974
+ }
975
+ clean_up_expressions(sent, w); /* gets rid of X_nodes with NULL exp */
976
+ for (x = sent->word[w].x; x != NULL; x = x->next)
977
+ {
978
+ dummy_list = insert_connectors(ct, x->exp, dummy_list, '-');
979
+ }
980
+ }
981
+
982
+ if (verbosity > 2)
983
+ {
984
+ printf("r->l pass removed %d\n", N_deleted);
985
+ print_expression_sizes(sent);
986
+ }
987
+
988
+ /* Free the allocated dummy connectors */
989
+ free_connectors(dummy_list);
990
+ dummy_list = NULL;
991
+ zero_connector_table(ct);
992
+ if (N_deleted == 0) break;
993
+ N_deleted = 0;
994
+ }
995
+ }
996
+
997
+
998
+
999
+ /*
1000
+ Here is what you've been waiting for: POWER-PRUNE
1001
+
1002
+ The kinds of constraints it checks for are the following:
1003
+
1004
+ 1) successive connectors on the same disjunct have to go to
1005
+ nearer and nearer words.
1006
+
1007
+ 2) two deep connectors cannot attach to each other
1008
+ (A connector is deep if it is not the first in its list, it
1009
+ is shallow if it is the first in its list, it is deepest if it
1010
+ is the last on its list.)
1011
+
1012
+ 3) on two adjacent words, a pair of connectors can be used
1013
+ only if they're the deepest ones on their disjuncts
1014
+
1015
+ 4) on two non-adjacent words, a pair of connectors can be used only
1016
+ if not [both of them are the deepest].
1017
+
1018
+ The data structure consists of a pair of hash tables on every word.
1019
+ Each bucket of a hash table has a list of pointers to connectors.
1020
+ These nodes also store if the chosen connector is shallow.
1021
+ */
1022
+ /*
1023
+ As with normal pruning, we make alternate left->right and right->left
1024
+ passes. In the R->L pass, when we're on a word w, we make use of
1025
+ all the left-pointing hash tables on the words to the right of w.
1026
+ After the pruning on this word, we build the left-pointing hash table
1027
+ for this word. This guarantees idempotence of the pass -- after doing an
1028
+ L->R, doing another would change nothing.
1029
+
1030
+ Each connector has an integer c_word field. This refers to the closest
1031
+ word that it could be connected to. These are initially determined by
1032
+ how deep the connector is. For example, a deepest connector can connect
1033
+ to the neighboring word, so its c_word field is w+1 (w-1 if this is a left
1034
+ pointing connector). Its neighboring shallow connector has a c_word
1035
+ value of w+2, etc.
1036
+
1037
+ The pruning process adjusts these c_word values as it goes along,
1038
+ accumulating information about any way of linking this sentence.
1039
+ The pruning process stops only after no disjunct is deleted and no
1040
+ c_word values change.
1041
+
1042
+ The difference between RUTHLESS and GENTLE power pruning is simply
1043
+ that GENTLE uses the deletable region array, and RUTHLESS does not.
1044
+ So we can get the effect of these two different methods simply by
1045
+ always ensuring that deletable[][] has been defined. With nothing
1046
+ deletable, this is equivalent to RUTHLESS. --DS, 7/97
1047
+ */
1048
+
1049
+ /**
1050
+ * returns the number of connectors in the left lists of the disjuncts.
1051
+ */
1052
+ static int left_connector_count(Disjunct * d)
1053
+ {
1054
+ Connector *c;
1055
+ int i=0;
1056
+ for (;d!=NULL; d=d->next) {
1057
+ for (c = d->left; c!=NULL; c = c->next) i++;
1058
+ }
1059
+ return i;
1060
+ }
1061
+
1062
+ static int right_connector_count(Disjunct * d)
1063
+ {
1064
+ Connector *c;
1065
+ int i=0;
1066
+ for (;d!=NULL; d=d->next) {
1067
+ for (c = d->right; c!=NULL; c = c->next) i++;
1068
+ }
1069
+ return i;
1070
+ }
1071
+
1072
+ static void free_C_list(C_list * t)
1073
+ {
1074
+ C_list *xt;
1075
+ for (; t!=NULL; t=xt) {
1076
+ xt = t->next;
1077
+ xfree((char *)t, sizeof(C_list));
1078
+ }
1079
+ }
1080
+
1081
+ /**
1082
+ * free all of the hash tables and C_lists
1083
+ */
1084
+ static void power_table_delete(power_table *pt)
1085
+ {
1086
+ int w;
1087
+ int i;
1088
+
1089
+ for (w = 0; w < pt->power_table_size; w++)
1090
+ {
1091
+ for (i = 0; i < pt->l_table_size[w]; i++)
1092
+ {
1093
+ free_C_list(pt->l_table[w][i]);
1094
+ }
1095
+ xfree((char *)pt->l_table[w], pt->l_table_size[w] * sizeof (C_list *));
1096
+
1097
+ for (i = 0; i < pt->r_table_size[w]; i++)
1098
+ {
1099
+ free_C_list(pt->r_table[w][i]);
1100
+ }
1101
+ xfree((char *)pt->r_table[w], pt->r_table_size[w] * sizeof (C_list *));
1102
+ }
1103
+ free(pt);
1104
+ }
1105
+
1106
+ /**
1107
+ * The disjunct d (whose left or right pointer points to c) is put
1108
+ * into the appropriate hash table
1109
+ */
1110
+ static void put_into_power_table(int size, C_list ** t, Connector * c, int shal)
1111
+ {
1112
+ int h;
1113
+ C_list * m;
1114
+ h = connector_hash(c) & (size-1);
1115
+ m = (C_list *) xalloc (sizeof(C_list));
1116
+ m->next = t[h];
1117
+ t[h] = m;
1118
+ m->c = c;
1119
+ m->shallow = shal;
1120
+ }
1121
+
1122
+ static int set_dist_fields(Connector * c, int w, int delta)
1123
+ {
1124
+ int i;
1125
+ if (c==NULL) return w;
1126
+ i = set_dist_fields(c->next, w, delta) + delta;
1127
+ c->word = i;
1128
+ return i;
1129
+ }
1130
+
1131
+ /**
1132
+ * Allocates and builds the initial power hash tables
1133
+ */
1134
+ static power_table * power_table_new(Sentence sent)
1135
+ {
1136
+ power_table *pt;
1137
+ int w, len, size, i;
1138
+ C_list ** t;
1139
+ Disjunct * d, * xd, * head;
1140
+ Connector * c;
1141
+
1142
+ pt = (power_table *) malloc (sizeof(power_table));
1143
+ pt->power_table_size = sent->length;
1144
+
1145
+ /* first we initialize the word fields of the connectors, and
1146
+ eliminate those disjuncts with illegal connectors */
1147
+ for (w=0; w<sent->length; w++)
1148
+ {
1149
+ head = NULL;
1150
+ for (d=sent->word[w].d; d!=NULL; d=xd) {
1151
+ xd = d->next;
1152
+ if ((set_dist_fields(d->left, w, -1) < 0) ||
1153
+ (set_dist_fields(d->right, w, 1) >= sent->length)) {
1154
+ d->next = NULL;
1155
+ free_disjuncts(d);
1156
+ } else {
1157
+ d->next = head;
1158
+ head = d;
1159
+ }
1160
+ }
1161
+ sent->word[w].d = head;
1162
+ }
1163
+
1164
+ for (w=0; w<sent->length; w++)
1165
+ {
1166
+ len = left_connector_count(sent->word[w].d);
1167
+ size = next_power_of_two_up(len);
1168
+ pt->l_table_size[w] = size;
1169
+ t = pt->l_table[w] = (C_list **) xalloc(size * sizeof(C_list *));
1170
+ for (i=0; i<size; i++) t[i] = NULL;
1171
+
1172
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
1173
+ c = d->left;
1174
+ if (c != NULL) {
1175
+ put_into_power_table(size, t, c, TRUE);
1176
+ for (c=c->next; c!=NULL; c=c->next){
1177
+ put_into_power_table(size, t, c, FALSE);
1178
+ }
1179
+ }
1180
+ }
1181
+
1182
+ len = right_connector_count(sent->word[w].d);
1183
+ size = next_power_of_two_up(len);
1184
+ pt->r_table_size[w] = size;
1185
+ t = pt->r_table[w] = (C_list **) xalloc(size * sizeof(C_list *));
1186
+ for (i=0; i<size; i++) t[i] = NULL;
1187
+
1188
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
1189
+ c = d->right;
1190
+ if (c != NULL) {
1191
+ put_into_power_table(size, t, c, TRUE);
1192
+ for (c=c->next; c!=NULL; c=c->next){
1193
+ put_into_power_table(size, t, c, FALSE);
1194
+ }
1195
+ }
1196
+ }
1197
+ }
1198
+
1199
+ return pt;
1200
+ }
1201
+
1202
+ /**
1203
+ * This runs through all the connectors in this table, and eliminates those
1204
+ * who are obsolete. The word fields of an obsolete one has been set to
1205
+ * BAD_WORD.
1206
+ */
1207
+ static void clean_table(int size, C_list ** t)
1208
+ {
1209
+ int i;
1210
+ C_list * m, * xm, * head;
1211
+ for (i=0; i<size; i++) {
1212
+ head = NULL;
1213
+ for (m=t[i]; m!=NULL; m=xm) {
1214
+ xm = m->next;
1215
+ if (m->c->word != BAD_WORD) {
1216
+ m->next = head;
1217
+ head = m;
1218
+ } else {
1219
+ xfree((char *) m, sizeof(C_list));
1220
+ }
1221
+ }
1222
+ t[i] = head;
1223
+ }
1224
+ }
1225
+
1226
+ /**
1227
+ * This takes two connectors (and whether these are shallow or not)
1228
+ * (and the two words that these came from) and returns TRUE if it is
1229
+ * possible for these two to match based on local considerations.
1230
+ */
1231
+ static int possible_connection(prune_context *pc,
1232
+ Connector *lc, Connector *rc,
1233
+ int lshallow, int rshallow,
1234
+ int lword, int rword)
1235
+ {
1236
+ if ((!lshallow) && (!rshallow)) return FALSE;
1237
+ /* two deep connectors can't work */
1238
+ if ((lc->word > rword) || (rc->word < lword)) return FALSE;
1239
+ /* word range constraints */
1240
+
1241
+ assert(lword < rword, "Bad word order in possible connection.");
1242
+
1243
+ /* Now, notice that the only differences between the following two
1244
+ cases is that (1) ruthless uses match and gentle uses prune_match.
1245
+ and (2) ruthless doesn't use deletable[][]. This latter fact is
1246
+ irrelevant, since deletable[][] is now guaranteed to have been
1247
+ created. */
1248
+
1249
+ if (pc->power_prune_mode == RUTHLESS) {
1250
+ if (lword == rword-1) {
1251
+ if (!((lc->next == NULL) && (rc->next == NULL))) return FALSE;
1252
+ } else {
1253
+ if ((!pc->null_links) &&
1254
+ (lc->next == NULL) && (rc->next == NULL) && (!lc->multi) && (!rc->multi)) {
1255
+ return FALSE;
1256
+ }
1257
+ }
1258
+ return do_match(pc->sent, lc, rc, lword, rword);
1259
+ } else {
1260
+ if (lword == rword-1) {
1261
+ if (!((lc->next == NULL) && (rc->next == NULL))) return FALSE;
1262
+ } else {
1263
+ if ((!pc->null_links) &&
1264
+ (lc->next == NULL) && (rc->next == NULL) && (!lc->multi) && (!rc->multi) &&
1265
+ !pc->deletable[lword][rword]) {
1266
+ return FALSE;
1267
+ }
1268
+ }
1269
+ return prune_match(pc->effective_dist[lword][rword], lc, rc);
1270
+ }
1271
+ }
1272
+
1273
+
1274
+ /**
1275
+ * This returns TRUE if the right table of word w contains
1276
+ * a connector that can match to c. shallow tells if c is shallow.
1277
+ */
1278
+ static int right_table_search(prune_context *pc, int w, Connector *c, int shallow, int word_c)
1279
+ {
1280
+ int size, h;
1281
+ C_list *cl;
1282
+ power_table *pt;
1283
+
1284
+ pt = pc->pt;
1285
+ size = pt->r_table_size[w];
1286
+ h = connector_hash(c) & (size-1);
1287
+ for (cl = pt->r_table[w][h]; cl != NULL; cl = cl->next)
1288
+ {
1289
+ if (possible_connection(pc, cl->c, c, cl->shallow, shallow, w, word_c))
1290
+ {
1291
+ return TRUE;
1292
+ }
1293
+ }
1294
+ return FALSE;
1295
+ }
1296
+
1297
+ /**
1298
+ * This returns TRUE if the right table of word w contains
1299
+ * a connector that can match to c. shallows tells if c is shallow
1300
+ */
1301
+ static int left_table_search(prune_context *pc, int w, Connector *c, int shallow, int word_c)
1302
+ {
1303
+ int size, h;
1304
+ C_list *cl;
1305
+ power_table *pt;
1306
+
1307
+ pt = pc->pt;
1308
+ size = pt->l_table_size[w];
1309
+ h = connector_hash(c) & (size-1);
1310
+ for (cl = pt->l_table[w][h]; cl != NULL; cl = cl->next)
1311
+ {
1312
+ if (possible_connection(pc, c, cl->c, shallow, cl->shallow, word_c, w))
1313
+ {
1314
+ return TRUE;
1315
+ }
1316
+ }
1317
+ return FALSE;
1318
+ }
1319
+
1320
+ #if NOT_USED_NOW
1321
+ static int ok_cwords(Sentence sent, Connector *c)
1322
+ {
1323
+ for (; c != NULL; c=c->next) {
1324
+ if (c->word == BAD_WORD) return FALSE;
1325
+ if (c->word >= sent->length) return FALSE;
1326
+ }
1327
+ return TRUE;
1328
+ }
1329
+ #endif
1330
+
1331
+ /**
1332
+ * take this connector list, and try to match it with the words
1333
+ * w-1, w-2, w-3...Returns the word to which the first connector of the
1334
+ * list could possibly be matched. If c is NULL, returns w. If there
1335
+ * is no way to match this list, it returns a negative number.
1336
+ * If it does find a way to match it, it updates the c->word fields
1337
+ * correctly.
1338
+ */
1339
+ static int left_connector_list_update(prune_context *pc, Connector *c, int word_c, int w, int shallow)
1340
+ {
1341
+ int n;
1342
+ int foundmatch;
1343
+
1344
+ if (c==NULL) return w;
1345
+ n = left_connector_list_update(pc, c->next, word_c, w, FALSE) - 1;
1346
+ if (((int) c->word) < n) n = c->word;
1347
+
1348
+ /* n is now the rightmost word we need to check */
1349
+ foundmatch = FALSE;
1350
+ for (; (n >= 0) && ((w-n) < MAX_SENTENCE); n--) {
1351
+ pc->power_cost++;
1352
+ if (right_table_search(pc, n, c, shallow, word_c)) {
1353
+ foundmatch = TRUE;
1354
+ break;
1355
+ }
1356
+ }
1357
+ if (n < ((int) c->word)) {
1358
+ c->word = n;
1359
+ pc->N_changed++;
1360
+ }
1361
+ return (foundmatch ? n : -1);
1362
+ }
1363
+
1364
+ /**
1365
+ * take this connector list, and try to match it with the words
1366
+ * w+1, w+2, w+3...Returns the word to which the first connector of the
1367
+ * list could possibly be matched. If c is NULL, returns w. If there
1368
+ * is no way to match this list, it returns a number greater than N_words-1
1369
+ * If it does find a way to match it, it updates the c->word fields
1370
+ * correctly.
1371
+ */
1372
+ static int right_connector_list_update(prune_context *pc, Sentence sent, Connector *c,
1373
+ int word_c, int w, int shallow)
1374
+ {
1375
+ int n;
1376
+ int foundmatch;
1377
+
1378
+ if (c==NULL) return w;
1379
+ n = right_connector_list_update(pc, sent, c->next, word_c, w, FALSE) + 1;
1380
+ if (c->word > n) n = c->word;
1381
+
1382
+ /* n is now the leftmost word we need to check */
1383
+ foundmatch = FALSE;
1384
+ for (; (n < sent->length) && ((n-w) < MAX_SENTENCE); n++) {
1385
+ pc->power_cost++;
1386
+ if (left_table_search(pc, n, c, shallow, word_c)) {
1387
+ foundmatch = TRUE;
1388
+ break;
1389
+ }
1390
+ }
1391
+ if (n > c->word) {
1392
+ c->word = n;
1393
+ pc->N_changed++;
1394
+ }
1395
+ return (foundmatch ? n : sent->length);
1396
+ }
1397
+
1398
+ /** The return value is the number of disjuncts deleted */
1399
+ int power_prune(Sentence sent, int mode, Parse_Options opts)
1400
+ {
1401
+ power_table *pt;
1402
+ prune_context *pc;
1403
+ Disjunct *d, *free_later, *dx, *nd;
1404
+ Connector *c;
1405
+ int w, N_deleted, total_deleted;
1406
+
1407
+ pc = (prune_context *) malloc (sizeof(prune_context));
1408
+ pc->power_cost = 0;
1409
+ pc->power_prune_mode = mode;
1410
+ pc->null_links = (opts->min_null_count > 0);
1411
+ pc->N_changed = 1; /* forces it always to make at least two passes */
1412
+ pc->deletable = sent->deletable;
1413
+ pc->effective_dist = sent->effective_dist;
1414
+ pc->sent = sent;
1415
+
1416
+ count_set_effective_distance(sent);
1417
+
1418
+ pt = power_table_new(sent);
1419
+ pc->pt = pt;
1420
+
1421
+ free_later = NULL;
1422
+ N_deleted = 0;
1423
+
1424
+ total_deleted = 0;
1425
+
1426
+ while (1)
1427
+ {
1428
+ /* left-to-right pass */
1429
+ for (w = 0; w < sent->length; w++) {
1430
+ if (parse_options_resources_exhausted(opts)) break;
1431
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1432
+ if (d->left == NULL) continue;
1433
+ if (left_connector_list_update(pc, d->left, w, w, TRUE) < 0) {
1434
+ for (c=d->left ;c!=NULL; c = c->next) c->word = BAD_WORD;
1435
+ for (c=d->right ;c!=NULL; c = c->next) c->word = BAD_WORD;
1436
+ N_deleted++;
1437
+ total_deleted++;
1438
+ }
1439
+ }
1440
+
1441
+ clean_table(pt->r_table_size[w], pt->r_table[w]);
1442
+ nd = NULL;
1443
+ for (d = sent->word[w].d; d != NULL; d = dx) {
1444
+ dx = d->next;
1445
+ if ((d->left != NULL) && (d->left->word == BAD_WORD)) {
1446
+ d->next = free_later;
1447
+ free_later = d;
1448
+ } else {
1449
+ d->next = nd;
1450
+ nd = d;
1451
+ }
1452
+ }
1453
+ sent->word[w].d = nd;
1454
+ }
1455
+ if (verbosity > 2) {
1456
+ printf("l->r pass changed %d and deleted %d\n",pc->N_changed,N_deleted);
1457
+ }
1458
+
1459
+ if (pc->N_changed == 0) break;
1460
+
1461
+ pc->N_changed = N_deleted = 0;
1462
+ /* right-to-left pass */
1463
+
1464
+ for (w = sent->length-1; w >= 0; w--) {
1465
+ if (parse_options_resources_exhausted(opts)) break;
1466
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1467
+ if (d->right == NULL) continue;
1468
+ if (right_connector_list_update(pc, sent, d->right,w,w,TRUE) >= sent->length){
1469
+ for (c=d->right;c!=NULL; c = c->next) c->word = BAD_WORD;
1470
+ for (c=d->left ;c!=NULL; c = c->next) c->word = BAD_WORD;
1471
+ N_deleted++;
1472
+ total_deleted++;
1473
+ }
1474
+ }
1475
+ clean_table(pt->l_table_size[w], pt->l_table[w]);
1476
+ nd = NULL;
1477
+ for (d = sent->word[w].d; d != NULL; d = dx) {
1478
+ dx = d->next;
1479
+ if ((d->right != NULL) && (d->right->word == BAD_WORD)) {
1480
+ d->next = free_later;
1481
+ free_later = d;
1482
+ } else {
1483
+ d->next = nd;
1484
+ nd = d;
1485
+ }
1486
+ }
1487
+ sent->word[w].d = nd;
1488
+ }
1489
+
1490
+ if (verbosity > 2) {
1491
+ printf("r->l pass changed %d and deleted %d\n", pc->N_changed,N_deleted);
1492
+ }
1493
+
1494
+ if (pc->N_changed == 0) break;
1495
+ pc->N_changed = N_deleted = 0;
1496
+ }
1497
+ free_disjuncts(free_later);
1498
+ power_table_delete(pt);
1499
+ pt = NULL;
1500
+ pc->pt = NULL;
1501
+
1502
+ if (verbosity > 2) printf("%d power prune cost:\n", pc->power_cost);
1503
+
1504
+ if (mode == RUTHLESS) {
1505
+ print_time(opts, "power pruned (ruthless)");
1506
+ } else {
1507
+ print_time(opts, "power pruned (gentle)");
1508
+ }
1509
+
1510
+ if (verbosity > 2) {
1511
+ if (mode == RUTHLESS) {
1512
+ printf("\nAfter power_pruning (ruthless):\n");
1513
+ } else {
1514
+ printf("\nAfter power_pruning (gentle):\n");
1515
+ }
1516
+ print_disjunct_counts(sent);
1517
+ }
1518
+
1519
+ free(pc);
1520
+ return total_deleted;
1521
+ }
1522
+
1523
+ /* ===================================================================
1524
+ PP Pruning
1525
+
1526
+ The "contains one" post-processing constraints give us a new way to
1527
+ prune. Suppose there's a rule that says "a group that contains foo
1528
+ must contain a bar or a baz." Here foo, bar, and baz are connector
1529
+ types. foo is the trigger link, bar and baz are called the criterion
1530
+ links. If, after considering the disjuncts we find that there is
1530
+ a foo, but neither a bar, nor a baz, then we can eliminate the disjunct
1531
+ containing foo.
1533
+
1534
+ Things are actually a bit more complex, because of the matching rules
1535
+ and subscripts. The problem is that post-processing deals with link
1536
+ names, while at this point all we have to work with is connector
1537
+ names. Consider the foo part. Consider a connector C. When does
1538
+ foo match C for our purposes? It matches it if every possible link
1539
+ name L (that can result from C being at one end of that link) results
1540
+ in post_process_match(foo,L) being true. Suppose foo contains a "*".
1541
+ Then there is no C that has this property. This is because the *s in
1542
+ C may be replaced by some other subscripts in the construction of L.
1543
+ And the non-* chars in L will not post_process_match the * in foo.
1544
+
1545
+ So let's assume that foo has no *. Now the result we want is simply
1546
+ given by post_process_match(foo, C). Proof: L is the same as C but
1547
+ with some *s replaced by some other letters. Since foo contains no *
1548
+ the replacement in C of some * by some other letter could change
1549
+ post_process_match from FALSE to TRUE, but not vice versa. Therefore
1550
+ it's conservative to use this test.
1551
+
1552
+ For the criterion parts, we need to determine if there is a
1553
+ collection of connectors C1, C2,... such that by combining them you
1554
+ can get a link name L that post_process_matches bar or baz. Here's a
1555
+ way to do this. Say bar="Xabc". Then we see if there are connector
1556
+ names that post_process_match "Xa##", "X#b#", and "X##c". They must
1557
+ all be there in order for it to be possible to create a link name
1558
+ "Xabc". A "*" in the criterion part is a little different. In this
1559
+ case we can simply skip the * (treat it like an upper case letter)
1560
+ for this purpose. So if bar="X*ab" then we look for "X*#b" and
1561
+ "X*a#". (The * in this case could be treated the same as another
1562
+ subscript without breaking it.) Note also that it's only necessary
1563
+ to find a way to match one of the many criterion links that may be in
1564
+ the rule. If none can match, then we can delete the disjunct
1565
+ containing C.
1566
+
1567
+ Here's how we're going to implement this. We'll maintain a multiset
1568
+ of connector names. We'll represent them in a hash table, where the
1569
+ hash function uses only the upper case letters of the connector name.
1570
+ We'll insert all the connectors into the multiset. The multiset will
1571
+ support the operation of deletion (probably simplest to just
1572
+ decrement the count). Here's the algorithm.
1573
+
1574
+ Insert all the connectors into M.
1575
+
1576
+ While the previous pass caused a count to go to 0 do:
1577
+ For each connector C do
1578
+ For each rule R do
1579
+ if C is a trigger for R and the criterion links
1580
+ of the rule cannot be satisfied by the connectors in
1581
+ M, Then:
1582
+ We delete C's disjunct. But before we do,
1583
+ we remove all the connectors of this disjunct
1584
+ from the multiset. Keep tabs on whether or not
1585
+ any of the counts went to 0.
1586
+
1587
+
1588
+
1589
+ Efficiency hacks to be added later:
1590
+ Note that a given rule can only become less and less satisfiable.
1591
+ That is, rule_satisfiable(r) for a given rule r can change from
1592
+ TRUE to FALSE, but not vice versa. So once it's FALSE, we can just
1593
+ remember that.
1594
+
1595
+ Consider the effect of a pass p on the set of rules that are
1596
+ satisfiable. Suppose this set does not change. Then pass p+1
1597
+ will do nothing. This is true even if pass p caused some
1598
+ disjuncts to be deleted. (This observation will only obviate
1599
+ the need for the last pass.)
1600
+
1601
+ */
1602
+
1603
+ static multiset_table * cms_table_new(void)
1604
+ {
1605
+ multiset_table *mt;
1606
+ int i;
1607
+
1608
+ mt = (multiset_table *) malloc(sizeof(multiset_table));
1609
+
1610
+ for (i=0; i<CMS_SIZE; i++) {
1611
+ mt->cms_table[i] = NULL;
1612
+ }
1613
+ return mt;
1614
+ }
1615
+
1616
+ static void cms_table_delete(multiset_table *mt)
1617
+ {
1618
+ Cms * cms, *xcms;
1619
+ int i;
1620
+ for (i=0; i<CMS_SIZE; i++)
1621
+ {
1622
+ for (cms = mt->cms_table[i]; cms != NULL; cms = xcms)
1623
+ {
1624
+ xcms = cms->next;
1625
+ xfree(cms, sizeof(Cms));
1626
+ }
1627
+ }
1628
+ free(mt);
1629
+ }
1630
+
1631
+ static int cms_hash(const char * s)
1632
+ {
1633
+ unsigned int i = 5381;
1634
+ while (isupper((int) *s)) /* connector names are not yet UTF8-capable */
1635
+ {
1636
+ i = ((i << 5) + i) + *s;
1637
+ s++;
1638
+ }
1639
+ return (i & (CMS_SIZE-1));
1640
+ }
1641
+
1642
+ /**
1643
+ * This returns TRUE if there is a connector name C in the table
1644
+ * such that post_process_match(pp_match_name, C) is TRUE
1645
+ */
1646
+ static int match_in_cms_table(multiset_table *cmt, const char * pp_match_name)
1647
+ {
1648
+ Cms * cms;
1649
+ for (cms = cmt->cms_table[cms_hash(pp_match_name)]; cms != NULL; cms = cms->next)
1650
+ {
1651
+ if(post_process_match(pp_match_name, cms->name)) return TRUE;
1652
+ }
1653
+ return FALSE;
1654
+ }
1655
+
1656
+ static Cms * lookup_in_cms_table(multiset_table *cmt, const char * str)
1657
+ {
1658
+ Cms * cms;
1659
+ for (cms = cmt->cms_table[cms_hash(str)]; cms != NULL; cms = cms->next)
1660
+ {
1661
+ if(strcmp(str, cms->name) == 0) return cms;
1662
+ }
1663
+ return NULL;
1664
+ }
1665
+
1666
+ static void insert_in_cms_table(multiset_table *cmt, const char * str)
1667
+ {
1668
+ Cms * cms;
1669
+ int h;
1670
+ cms = lookup_in_cms_table(cmt, str);
1671
+ if (cms != NULL) {
1672
+ cms->count++;
1673
+ } else {
1674
+ cms = (Cms *) xalloc(sizeof(Cms));
1675
+ cms->name = str; /* don't copy the string...just keep a pointer to it.
1676
+ we won't free these later */
1677
+ cms->count = 1;
1678
+ h = cms_hash(str);
1679
+ cms->next = cmt->cms_table[h];
1680
+ cmt->cms_table[h] = cms;
1681
+ }
1682
+ }
1683
+
1684
+ /**
1685
+ * Delete the given string from the table. Return TRUE if
1686
+ * this caused a count to go to 0, return FALSE otherwise.
1687
+ */
1688
+ static int delete_from_cms_table(multiset_table *cmt, const char * str)
1689
+ {
1690
+ Cms * cms;
1691
+ cms = lookup_in_cms_table(cmt, str);
1692
+ if (cms != NULL && cms->count > 0) {
1693
+ cms->count--;
1694
+ return (cms->count == 0);
1695
+ }
1696
+ return FALSE;
1697
+ }
1698
+
1699
+ static int rule_satisfiable(multiset_table *cmt, pp_linkset *ls)
1700
+ {
1701
+ int hashval;
1702
+ const char * t;
1703
+ char name[20], *s;
1704
+ pp_linkset_node *p;
1705
+ int bad, n_subscripts;
1706
+
1707
+ for (hashval = 0; hashval < ls->hash_table_size; hashval++)
1708
+ {
1709
+ for (p = ls->hash_table[hashval]; p!=NULL; p=p->next)
1710
+ {
1711
+ /* ok, we've got our hands on one of the criterion links */
1712
+ strncpy(name, p->str, sizeof(name)-1);
1713
+ /* could actually use the string in place because we change it back */
1714
+ name[sizeof(name)-1] = '\0';
1715
+ /* now we want to see if we can satisfy this criterion link */
1716
+ /* with a collection of the links in the cms table */
1717
+
1718
+ for (s = name; isupper((int)*s); s++) {}
1719
+ for (;*s != '\0'; s++) if (*s != '*') *s = '#';
1720
+ for (s = name, t = p->str; isupper((int) *s); s++, t++) {}
1721
+
1722
+ /* s and t remain in lockstep */
1723
+ bad = 0;
1724
+ n_subscripts = 0;
1725
+ for (;*s != '\0' && bad==0; s++, t++) {
1726
+ if (*s == '*') continue;
1727
+ n_subscripts++;
1728
+ /* after the upper case part, and is not a * so must be a regular subscript */
1729
+ *s = *t;
1730
+ if (!match_in_cms_table(cmt, name)) bad++;
1731
+ *s = '#';
1732
+ }
1733
+
1734
+ if (n_subscripts == 0) {
1735
+ /* now we handle the special case which occurs if there
1736
+ were 0 subscripts */
1737
+ if (!match_in_cms_table(cmt, name)) bad++;
1738
+ }
1739
+
1740
+ /* now if bad==0 this criterion link does the job
1741
+ to satisfy the needs of the trigger link */
1742
+
1743
+ if (bad == 0) return TRUE;
1744
+ }
1745
+ }
1746
+ return FALSE;
1747
+ }
1748
+
1749
+ static int pp_prune(Sentence sent, Parse_Options opts)
1750
+ {
1751
+ pp_knowledge * knowledge;
1752
+ pp_rule rule;
1753
+ const char * selector;
1754
+ pp_linkset * link_set;
1755
+ int i, w, dir;
1756
+ Disjunct *d;
1757
+ Connector *c;
1758
+ int change, total_deleted, N_deleted, deleteme;
1759
+ multiset_table *cmt;
1760
+
1761
+ if (sent->dict->postprocessor == NULL) return 0;
1762
+
1763
+ knowledge = sent->dict->postprocessor->knowledge;
1764
+
1765
+ cmt = cms_table_new();
1766
+
1767
+ for (w = 0; w < sent->length; w++) {
1768
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1769
+ d->marked = TRUE;
1770
+ for (dir=0; dir < 2; dir++) {
1771
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1772
+ insert_in_cms_table(cmt, c->string);
1773
+ }
1774
+ }
1775
+ }
1776
+ }
1777
+
1778
+ total_deleted = 0;
1779
+ change = 1;
1780
+ while (change > 0) {
1781
+ change = 0;
1782
+ N_deleted = 0;
1783
+ for (w = 0; w < sent->length; w++) {
1784
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1785
+ if (!d->marked) continue;
1786
+ deleteme = FALSE;
1787
+ for (dir=0; dir < 2; dir++) {
1788
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1789
+ for (i=0; i<knowledge->n_contains_one_rules; i++) {
1790
+
1791
+ rule = knowledge->contains_one_rules[i]; /* the ith rule */
1792
+ selector = rule.selector; /* selector string for this rule */
1793
+ link_set = rule.link_set; /* the set of criterion links */
1794
+
1795
+ if (strchr(selector, '*') != NULL) continue; /* If it has a * forget it */
1796
+
1797
+ if (!post_process_match(selector, c->string)) continue;
1798
+
1799
+ /*
1800
+ printf("pp_prune: trigger ok. selector = %s c->string = %s\n", selector, c->string);
1801
+ */
1802
+
1803
+ /* We know c matches the trigger link of the rule. */
1804
+ /* Now check the criterion links */
1805
+
1806
+ if (!rule_satisfiable(cmt, link_set)) {
1807
+ deleteme = TRUE;
1808
+ }
1809
+ if (deleteme) break;
1810
+ }
1811
+ if (deleteme) break;
1812
+ }
1813
+ if (deleteme) break;
1814
+ }
1815
+
1816
+ if (deleteme) { /* now we delete this disjunct */
1817
+ N_deleted++;
1818
+ total_deleted++;
1819
+ d->marked = FALSE; /* mark for deletion later */
1820
+ for (dir=0; dir < 2; dir++) {
1821
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1822
+ change += delete_from_cms_table(cmt, c->string);
1823
+ }
1824
+ }
1825
+ }
1826
+ }
1827
+ }
1828
+
1829
+ if (verbosity > 2) {
1830
+ printf("pp_prune pass deleted %d\n", N_deleted);
1831
+ }
1832
+
1833
+ }
1834
+ delete_unmarked_disjuncts(sent);
1835
+ cms_table_delete(cmt);
1836
+
1837
+ if (verbosity > 2) {
1838
+ printf("\nAfter pp_pruning:\n");
1839
+ print_disjunct_counts(sent);
1840
+ }
1841
+
1842
+ print_time(opts, "pp pruning");
1843
+
1844
+ return total_deleted;
1845
+ }
1846
+
1847
+
1848
+ /**
1849
+ * Do the following pruning steps until nothing happens:
1850
+ * power pp power pp power pp....
1851
+ * Make sure you do them both at least once.
1852
+ */
1853
+ void pp_and_power_prune(Sentence sent, int mode, Parse_Options opts)
1854
+ {
1855
+ power_prune(sent, mode, opts);
1856
+
1857
+ for (;;) {
1858
+ if (parse_options_resources_exhausted(opts)) break;
1859
+ if (pp_prune(sent, opts) == 0) break;
1860
+ if (parse_options_resources_exhausted(opts)) break;
1861
+ if (power_prune(sent, mode, opts) == 0) break;
1862
+ }
1863
+ }
1864
+