grammar_cop 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,16 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+ void print_disjunct_counts(Sentence sent);
14
+ void print_sentence(FILE *fp, Sentence sent, int w);
15
+ void print_expression_sizes(Sentence sent);
16
+ void compute_chosen_words(Sentence sent, Linkage linkage);
@@ -0,0 +1,1864 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ #include "api.h"
15
+ #include "disjunct-utils.h"
16
+
17
+ #define CONTABSZ 8192
18
+ typedef Connector * connector_table;
19
+
20
+ /*
21
+ typedef struct disjunct_dup_table_s disjunct_dup_table;
22
+ struct disjunct_dup_table_s
23
+ {
24
+ int dup_table_size;
25
+ Disjunct ** dup_table;
26
+ };
27
+ */
28
+
29
+ /* the indication in a word field that this connector cannot
30
+ * be used -- is obsolete.
31
+ */
32
+ #define BAD_WORD (MAX_SENTENCE+1)
33
+
34
+ typedef struct c_list_s C_list; /* node of a singly linked list of Connector pointers */
35
+ struct c_list_s
36
+ {
37
+ Connector * c; /* the connector this node refers to */
38
+ int shallow; /* NOTE(review): presumably a flag marking c as a "shallow" connector of its disjunct -- confirm against the code that fills this in */
39
+ C_list * next; /* following node of the list */
40
+ };
41
+
42
+ typedef struct power_table_s power_table; /* hash tables of C_list chains, MAX_SENTENCE slots per direction */
43
+ struct power_table_s
44
+ {
45
+ int power_table_size; /* NOTE(review): presumably the number of slots actually in use -- confirm */
46
+ int l_table_size[MAX_SENTENCE]; /* the sizes of the hash tables */
47
+ int r_table_size[MAX_SENTENCE]; /* same role as l_table_size, for r_table */
48
+ C_list ** l_table[MAX_SENTENCE]; /* one array of C_list chain heads per slot; presumably indexed by word and holding left-pointing connectors -- confirm */
49
+ C_list ** r_table[MAX_SENTENCE]; /* counterpart of l_table; presumably for right-pointing connectors -- confirm */
50
+ };
51
+
52
+ typedef struct cms_struct Cms; /* one entry of a multiset, chained through next */
53
+ struct cms_struct
54
+ {
55
+ Cms * next; /* following entry in the same chain */
56
+ const char * name; /* NOTE(review): presumably the connector name this entry counts -- confirm */
57
+ int count; /* the number of times this is in the multiset */
58
+ };
59
+
60
+ #define CMS_SIZE (2<<10)
61
+ typedef struct multiset_table_s multiset_table; /* table of CMS_SIZE chains of Cms entries */
62
+ struct multiset_table_s
63
+ {
64
+ Cms * cms_table[CMS_SIZE]; /* chain heads; each chain is linked through Cms.next */
65
+ };
66
+
67
+ typedef struct prune_context_s prune_context; /* working state bundled together for the pruning code */
68
+ struct prune_context_s
69
+ {
70
+ int null_links; /* NOTE(review): presumably whether/how many null links are allowed -- confirm */
71
+ char ** deletable; /* NOTE(review): two-level table; presumably indexed by word pairs -- confirm */
72
+ char ** effective_dist; /* NOTE(review): two-level table; presumably the effective distance between word pairs -- confirm */
73
+ int power_cost; /* NOTE(review): presumably a work counter for power pruning -- confirm */
74
+ int power_prune_mode; /* either GENTLE or RUTHLESS */
75
+ int N_changed; /* counts the number of changes
76
+ of c->word fields in a pass */
77
+
78
+ power_table *pt; /* the power table this context works on */
79
+ Sentence sent; /* the sentence this context belongs to */
80
+ };
81
+
82
+ /*
83
+
84
+ The algorithms in this file prune disjuncts from the disjunct list
85
+ of the sentence that can be eliminated by simple checks. The first
86
+ check works as follows:
87
+
88
+ A series of passes are made through the sentence, alternating
89
+ left-to-right and right-to-left. Consider the left-to-right pass (the
90
+ other is symmetric). A set S of connectors is maintained (initialized
91
+ to be empty). Now the disjuncts of the current word are processed.
92
+ If a given disjunct's left pointing connectors have the property that
93
+ at least one of them has no connector in S to which it can be matched,
94
+ then that disjunct is deleted. Now the set S is augmented by the right
95
+ connectors of the remaining disjuncts of that word. This completes
96
+ one word. The process continues through the words from left to right.
97
+ Alternate passes are made until no disjunct is deleted.
98
+
99
+ It worries me a little that if there are some really huge disjuncts lists,
100
+ then this process will probably do nothing. (This fear turns out to be
101
+ unfounded.)
102
+
103
+ Notes: Power pruning will not work if applied before generating the
104
+ "and" disjuncts. This is because certain of its tricks don't work.
105
+ Think about this, and finish this note later....
106
+ Also, currently I use the standard connector match procedure instead
107
+ of the pruning one, since I know power pruning will not be used before
108
+ and generation. Replace this to allow power pruning to work before
109
+ generating and disjuncts.
110
+
111
+ Currently it seems that normal pruning, power pruning, and generation,
112
+ pruning, and power pruning (after "and" generation) and parsing take
113
+ about the same amount of time. This is why doing power pruning before
114
+ "and" generation might be a very good idea.
115
+
116
+ New idea: Suppose all the disjuncts of a word have a connector of type
117
+ c pointing to the right. And further, suppose that there is exactly one
118
+ word to its right containing that type of connector pointing to the left.
119
+ Then all the other disjuncts on the latter word can be deleted.
120
+ (This situation is created by the processing of "either...or", and by
121
+ the extra disjuncts added to a "," neighboring a conjunction.)
122
+
123
+ */
124
+
125
+ /**
126
+ * This hash function only looks at the leading upper case letters of
127
+ * the connector string, and the label fields. This ensures that if two
128
+ * strings match (formally), then they must hash to the same place.
129
+ */
130
+ static inline int hash_S(Connector * c)
131
+ {
132
+ int h = connector_hash(c);
133
+ return (h & (CONTABSZ-1));
134
+ }
135
+
136
+ /**
137
+ * This is almost identical to match(). Its reason for existance
138
+ * is the rather subtle fact that with "and" can transform a "Ss"
139
+ * connector into "Sp". This means that in order for pruning to
140
+ * work, we must allow a "Ss" connector on word match an "Sp" connector
141
+ * on a word to its right. This is what this version of match allows.
142
+ * We assume that a is on a word to the left of b.
143
+ */
144
+ int prune_match(int dist, Connector *a, Connector *b)
145
+ {
146
+ const char *s, *t;
147
+ int x, y;
148
+
149
+ if (a->label != b->label) return FALSE;
150
+
151
+ x = hash_S(a);
152
+ y = hash_S(b);
153
+ if (x != y) return FALSE;
154
+
155
+ s = a->string;
156
+ t = b->string;
157
+
158
+ while(s < a->prune_string || t < b->prune_string)
159
+ {
160
+ if (*s != *t) return FALSE;
161
+ s++;
162
+ t++;
163
+ }
164
+
165
+ /* printf("PM: a=%4s b=%4s ap=%d bp=%d a->ll=%d b->ll=%d dist=%d\n",
166
+ s, t, x, y, a->length_limit, b->length_limit, dist); */
167
+ if (dist > a->length_limit || dist > b->length_limit) return FALSE;
168
+
169
+ x = a->priority;
170
+ y = b->priority;
171
+
172
+ if ((x == THIN_priority) && (y == THIN_priority))
173
+ {
174
+ #if defined(PLURALIZATION)
175
+ /*
176
+ if ((*(a->string)=='S') && ((*s=='s') || (*s=='p')) && (*t=='p')) {
177
+ return TRUE;
178
+ }
179
+ */
180
+ /*
181
+ The above is a kludge to stop pruning from killing off disjuncts
182
+ which (because of pluralization in and) might become valid later.
183
+ Recall that "and" converts a singular subject into a plural one.
184
+ The (*s=='p') part is so that "he and I are good" doesn't get killed off.
185
+ The above hack is subsumed by the following one:
186
+ */
187
+ if ((*(a->string)=='S') && ((*s=='s') || (*s=='p')) &&
188
+ ((*t=='p') || (*t=='s')) &&
189
+ ((s-1 == a->string) || ((s-2 == a->string) && (*(s-1) == 'I')))){
190
+ return TRUE;
191
+ }
192
+ /*
193
+ This change is to accommodate "nor". In particular we need to
194
+ prevent "neither John nor I likes dogs" from being killed off.
195
+ We want to allow this to apply to "are neither a dog nor a cat here"
196
+ and "is neither a dog nor a cat here". This uses the "SI" connector.
197
+ The third line above ensures that the connector is either "S" or "SI".
198
+ */
199
+ #endif
200
+ while ((*s != '\0') && (*t != '\0'))
201
+ {
202
+ if ((*s == '*') || (*t == '*') ||
203
+ ((*s == *t) && (*s != '^')))
204
+ {
205
+ /* this last case here is rather obscure. It prevents
206
+ '^' from matching '^'.....Is this necessary?
207
+ ......yes, I think it is. */
208
+ s++;
209
+ t++;
210
+ }
211
+ else
212
+ return FALSE;
213
+ }
214
+ return TRUE;
215
+ }
216
+ else if ((x == UP_priority) && (y == DOWN_priority))
217
+ {
218
+ while ((*s!='\0') && (*t!='\0'))
219
+ {
220
+ if ((*s == *t) || (*s == '*') || (*t == '^'))
221
+ {
222
+ /* that '^' should match on the DOWN_priority
223
+ node is subtle, but correct */
224
+ s++;
225
+ t++;
226
+ }
227
+ else
228
+ return FALSE;
229
+ }
230
+ return TRUE;
231
+ }
232
+ else if ((y == UP_priority) && (x == DOWN_priority))
233
+ {
234
+ while ((*s!='\0') && (*t!='\0'))
235
+ {
236
+ if ((*s == *t) || (*t == '*') || (*s == '^'))
237
+ {
238
+ s++;
239
+ t++;
240
+ }
241
+ else
242
+ return FALSE;
243
+ }
244
+ return TRUE;
245
+ }
246
+ else
247
+ return FALSE;
248
+ }
249
+
250
+ static void zero_connector_table(connector_table *ct)
251
+ {
252
+ memset(ct, 0, sizeof(Connector *) * CONTABSZ);
253
+ }
254
+
255
+ /**
256
+ * This function puts connector c into the connector table
257
+ * if one like it isn't already there.
258
+ */
259
+ static void insert_connector(connector_table *ct, Connector * c)
260
+ {
261
+ int h;
262
+ Connector * e;
263
+
264
+ h = hash_S(c);
265
+
266
+ for (e = ct[h]; e != NULL; e = e->tableNext)
267
+ {
268
+ if ((strcmp(c->string, e->string) == 0) &&
269
+ (c->label == e->label) &&
270
+ (c->priority == e->priority))
271
+ return;
272
+ }
273
+ c->tableNext = ct[h];
274
+ ct[h] = c;
275
+ }
276
+
277
+ void prune(Sentence sent)
278
+ {
279
+ Connector *e, *f;
280
+ int w;
281
+ int N_deleted;
282
+ Connector *ct[CONTABSZ];
283
+ Disjunct fake_head, *d, *d1;
284
+
285
+ /* XXX why is this here ?? */
286
+ count_set_effective_distance(sent);
287
+
288
+ N_deleted = 1; /* a lie to make it always do at least 2 passes */
289
+ while(1)
290
+ {
291
+ /* Left-to-right pass */
292
+ zero_connector_table(ct);
293
+
294
+ /* For every word */
295
+ for (w = 0; w < sent->length; w++)
296
+ {
297
+ d = &fake_head;
298
+ d->next = sent->word[w].d;
299
+
300
+ /* For every disjunct of word */
301
+ while ((d1 = d->next))
302
+ {
303
+ e = d1->left;
304
+
305
+ /* For every left clause of this disjunct */
306
+ while (e)
307
+ {
308
+ int h = hash_S(e);
309
+ for (f = ct[h]; f != NULL; f = f->tableNext)
310
+ {
311
+ if (prune_match(0, f, e)) break;
312
+ }
313
+ if (!f) break; /* If f null, not a single match was found */
314
+ e = e->next;
315
+ }
316
+
317
+ /* We know this disjunct is dead since no match
318
+ * can be found on a required clause. */
319
+ if (e)
320
+ {
321
+ N_deleted ++;
322
+ free_connectors(d1->left);
323
+ free_connectors(d1->right);
324
+ d->next = d1->next;
325
+ xfree(d1, sizeof(Disjunct));
326
+ }
327
+ else
328
+ {
329
+ /* Store surviving disjunct in hash table */
330
+ for (e = d1->right; e != NULL; e = e->next)
331
+ {
332
+ insert_connector(ct, e);
333
+ }
334
+ d = d1; /* move on to next disjunct*/
335
+ }
336
+ }
337
+ sent->word[w].d = fake_head.next;
338
+ }
339
+
340
+ if (2 < verbosity)
341
+ {
342
+ printf("l->r pass removed %d\n", N_deleted);
343
+ print_disjunct_counts(sent);
344
+ }
345
+
346
+ /* We did nothing (and this is not the 1st pass) */
347
+ if (N_deleted == 0) break;
348
+
349
+ /* Right-to-left pass */
350
+ zero_connector_table(ct);
351
+ N_deleted = 0;
352
+
353
+ /* For every word */
354
+ for (w = sent->length-1; w >= 0; w--)
355
+ {
356
+ d = &fake_head;
357
+ d->next = sent->word[w].d;
358
+
359
+ while ((d1 = d->next))
360
+ {
361
+ e = d1->right;
362
+
363
+ while (e)
364
+ {
365
+ int h = hash_S(e);
366
+ for (f = ct[h]; f != NULL; f = f->tableNext)
367
+ {
368
+ if (prune_match(0, e, f)) break;
369
+ }
370
+ if (!f) break; /* If f null, not a single match was found */
371
+ e = e->next;
372
+ }
373
+
374
+ /* We know this disjunct is dead since it can't match
375
+ * to the right*/
376
+ if(e)
377
+ {
378
+ N_deleted ++;
379
+ free_connectors(d1->left);
380
+ free_connectors(d1->right);
381
+ d->next = d1->next;
382
+ xfree(d1, sizeof(Disjunct));
383
+ }
384
+ else
385
+ {
386
+ /* Store surviving disjunct in hash table */
387
+ for (e = d1->left; e != NULL; e = e->next)
388
+ {
389
+ insert_connector(ct, e);
390
+ }
391
+ d = d1; /* move on to next disjunct*/
392
+ }
393
+ sent->word[w].d = fake_head.next;
394
+ }
395
+ }
396
+
397
+ if (verbosity > 2)
398
+ {
399
+ printf("r->l pass removed %d\n", N_deleted);
400
+ print_disjunct_counts(sent);
401
+ }
402
+
403
+ /* We made no change on this pass */
404
+ if (N_deleted == 0) break;
405
+ N_deleted = 0;
406
+ }
407
+ }
408
+
409
+ /*
410
+ The second algorithm eliminates disjuncts that are dominated by
411
+ another. It works by hashing them all, and checking for domination.
412
+ */
413
+
414
+ #if FALSE
415
+ /* ============================================================x */
416
+
417
+ /*
418
+ Consider the idea of deleting a disjunct if it is dominated (in terms of
419
+ what it can match) by some other disjunct on the same word. This has
420
+ been implemented below. There are three problems with it:
421
+
422
+ (1) It is almost never the case that any disjuncts are eliminated.
423
+ (The code below works correctly with fat links, but because
424
+ all of the fat connectors on a fat disjunct have the same matching
425
+ string, the only time a disjunct will die is if it is the same
426
+ as another one. This is captured by the simplistic version below.)
427
+
428
+ (2) connector_matches_alam may not be exactly correct. I don't
429
+ think it does the fat link matches properly. (See the comment
430
+ in and.c for more information about matching fat links.) This is
431
+ irrelevant because of (1).
432
+
433
+ (3) The linkage that is eliminated by this, might just be the one that
434
+ passes post-processing, as the following example shows.
435
+ This is pretty silly, and should probably be changed.
436
+
437
+ > telling John how our program works would be stupid
438
+ Accepted (2 linkages, 1 with no P.P. violations)
439
+ Linkage 1, cost vector = (0, 0, 7)
440
+
441
+ +------------------G-----------------+
442
+ +-----R-----+----CL----+ |
443
+ +---O---+ | +---D--+---S---+ +--I-+-AI-+
444
+ | | | | | | | | |
445
+ telling.g John how our program.n works would be stupid
446
+
447
+ ///// CLg <---CLg---> CL telling.g
448
+ (g) telling.g G <---G-----> G would
449
+ (g) (d) telling.g R <---R-----> R how
450
+ (g) (d) telling.g O <---O-----> O John
451
+ (g) (d) how CLe <---CLe---> CL program.n
452
+ (g) (d) (e) our D <---Ds----> Ds program.n
453
+ (g) (d) (e) program.n Ss <---Ss----> Ss works
454
+ (g) would I <---Ix----> Ix be
455
+ (g) be AI <---AIi---> AIi stupid
456
+
457
+ (press return for another)
458
+ >
459
+ Linkage 2 (bad), cost vector = (0, 0, 7)
460
+
461
+ +------------------G-----------------+
462
+ +-----R-----+----CL----+ |
463
+ +---O---+ | +---D--+---S---+ +--I-+-AI-+
464
+ | | | | | | | | |
465
+ telling.g John how our program.n works would be stupid
466
+
467
+ ///// CLg <---CLg---> CL telling.g
468
+ (g) telling.g G <---G-----> G would
469
+ (g) (d) telling.g R <---R-----> R how
470
+ (g) (d) telling.g O <---O-----> O John
471
+ (g) (d) how CLe <---CLe---> CL program.n
472
+ (g) (d) (e) our D <---Ds----> Ds program.n
473
+ (g) (d) (e) program.n Ss <---Ss----> Ss works
474
+ (g) would I <---Ix----> Ix be
475
+ (g) be AI <---AI----> AI stupid
476
+
477
+ P.P. violations:
478
+ Special subject rule violated
479
+ */
480
+
481
+ /**
482
+ * hash function that takes a string and a seed value i
483
+ */
484
+ static int string_hash(disjunct_dup_table *dt, const char * s, int i)
485
+ {
486
+ for(;*s != '\0';s++) i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)];
487
+ return (i & (dt->dup_table_size-1));
488
+ }
489
+
490
+ /**
491
+ * This returns true if the connector a matches everything that b
492
+ * matches, and possibly more. (alam=at least as much)
493
+ *
494
+ * TRUE for equal connectors.
495
+ * remains TRUE if multi-match added to the first.
496
+ * remains TRUE if subsrcripts deleted from the first.
497
+ */
498
+ int connector_matches_alam(Connector * a, Connector * b)
499
+ {
500
+ char * s, * t, *u;
501
+ if (((!a->multi) && b->multi) ||
502
+ (a->label != b->label) ||
503
+ (a->priority != b->priority)) return FALSE;
504
+ s = a->string;
505
+ t = b->string;
506
+
507
+ /* isupper -- connectors cannot be UTF8 at this time */
508
+ while(isupper(*s) || isupper(*t))
509
+ {
510
+ if (*s == *t) {
511
+ s++;
512
+ t++;
513
+ } else return FALSE;
514
+ }
515
+ if (a->priority == DOWN_priority) {
516
+ u = s;
517
+ s = t;
518
+ t = u;
519
+ }
520
+ while((*s != '\0') && (*t != '\0')) {
521
+ if ((*s == *t) || (*s == '*') || (*t == '^')) {
522
+ s++;
523
+ t++;
524
+ } else return FALSE;
525
+ }
526
+ while ((*s != '\0') && (*s == '*')) s++;
527
+ return (*s == '\0');
528
+ }
529
+
530
+
531
+ /**
532
+ * This hash function that takes a connector and a seed value i.
533
+ * It only looks at the leading upper case letters of
534
+ * the string, and the label. This ensures that if two connectors
535
+ * match, then they must hash to the same place.
536
+ */
537
+ static int conn_hash(Connector * c, int i)
538
+ {
539
+ int nb;
540
+ const char * s;
541
+ s = c->string;
542
+
543
+ i = i + (i<<1) + randtable[(c->label + i) & (RTSIZE-1)];
544
+ nb = is_utf8_upper(s);
545
+ while(nb)
546
+ {
547
+ i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)];
548
+ s += nb;
549
+ nb = is_utf8_upper(s);
550
+ }
551
+ return i;
552
+ }
553
+
554
+ static inline int pconnector_hash(disjunct_dup_table *dt, Connector * c, int i)
555
+ {
556
+ i = conn_hash(c, i);
557
+ return (i & (ct->dup_table_size-1));
558
+ }
559
+
560
+ /**
561
+ * This is a hash function for disjuncts
562
+ */
563
+ static int hash_disjunct(disjunct_dup_table *dt, Disjunct * d)
564
+ {
565
+ int i;
566
+ Connector *e;
567
+ i = 0;
568
+ for (e = d->left ; e != NULL; e = e->next)
569
+ {
570
+ i = pconnector_hash(dt, e, i);
571
+ }
572
+ for (e = d->right ; e != NULL; e = e->next)
573
+ {
574
+ i = pconnector_hash(dt, e, i);
575
+ }
576
+ return string_hash(dt, d->string, i);
577
+ }
578
+
579
+ /**
580
+ * Returns TRUE if disjunct d1 can match anything that d2 can
581
+ * if this happens, it constitutes a proof that there is absolutely
582
+ * no use for d2.
583
+ */
584
+ static int disjunct_matches_alam(Disjunct * d1, Disjunct * d2)
585
+ {
586
+ Connector *e1, *e2;
587
+ if (d1->cost > d2->cost) return FALSE;
588
+ e1 = d1->left;
589
+ e2 = d2->left;
590
+ while((e1!=NULL) && (e2!=NULL)) {
591
+ if (!connector_matches_alam(e1,e2)) break;
592
+ e1 = e1->next;
593
+ e2 = e2->next;
594
+ }
595
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
596
+ e1 = d1->right;
597
+ e2 = d2->right;
598
+ while((e1!=NULL) && (e2!=NULL)) {
599
+ if (!connector_matches_alam(e1,e2)) break;
600
+ e1 = e1->next;
601
+ e2 = e2->next;
602
+ }
603
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
604
+ return (strcmp(d1->string, d2->string) == 0);
605
+ }
606
+
607
+ /**
608
+ * Takes the list of disjuncts pointed to by d, eliminates all
609
+ * duplicates, and returns a pointer to a new list.
610
+ * It frees the disjuncts that are eliminated.
611
+ */
612
+ Disjunct * eliminate_duplicate_disjuncts(Disjunct * d)
613
+ {
614
+ int i, h, count;
615
+ Disjunct *dn, *dx, *dxn, *front;
616
+ count = 0;
617
+ disjunct_dup_table *dt;
618
+
619
+ dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d)));
620
+
621
+ for (;d!=NULL; d = dn)
622
+ {
623
+ dn = d->next;
624
+ h = hash_disjunct(d);
625
+
626
+ front = NULL;
627
+ for (dx = dt->dup_table[h]; dx != NULL; dx = dxn)
628
+ {
629
+ dxn = dx->next;
630
+ if (disjunct_matches_alam(dx,d))
631
+ {
632
+ /* we know that d should be killed */
633
+ d->next = NULL;
634
+ free_disjuncts(d);
635
+ count++;
636
+ front = catenate_disjuncts(front, dx);
637
+ break;
638
+ } else if (disjunct_matches_alam(d,dx)) {
639
+ /* we know that dx should be killed off */
640
+ dx->next = NULL;
641
+ free_disjuncts(dx);
642
+ count++;
643
+ } else {
644
+ /* neither should be killed off */
645
+ dx->next = front;
646
+ front = dx;
647
+ }
648
+ }
649
+ if (dx == NULL) {
650
+ /* we put d in the table */
651
+ d->next = front;
652
+ front = d;
653
+ }
654
+ dt->dup_table[h] = front;
655
+ }
656
+
657
+ /* d is now NULL */
658
+ for (i = 0; i < dt->dup_table_size; i++)
659
+ {
660
+ for (dx = dt->dup_table[i]; dx != NULL; dx = dxn)
661
+ {
662
+ dxn = dx->next;
663
+ dx->next = d;
664
+ d = dx;
665
+ }
666
+ }
667
+
668
+ if ((verbosity > 2) && (count != 0)) printf("killed %d duplicates\n", count);
669
+
670
+ disjunct_dup_table_delete(dt);
671
+ return d;
672
+ }
673
+
674
+ /* ============================================================x */
675
+ #endif
676
+
677
+ /* ================================================================= */
678
+ /**
679
+ * Here is expression pruning. This is done even before the expressions
680
+ * are turned into lists of disjuncts.
681
+ *
682
+ * This uses many of the same data structures and functions that are used
683
+ * by prune.
684
+ *
685
+ * The purge operations remove all irrelevant stuff from the expression,
686
+ * and free the purged stuff. A connector is deemed irrelevant if its
687
+ * string pointer has been set to NULL. The passes through the sentence
688
+ * have the job of doing this.
689
+ *
690
+ * If an OR or AND type expression node has one child, we can replace it
691
+ * by its child. This, of course, is not really necessary, except for
692
+ * performance(?)
693
+ */
694
+
695
+ static Exp* purge_Exp(Exp *);
696
+
697
+ /**
698
+ * Get rid of the elements with null expressions
699
+ */
700
+ static E_list * or_purge_E_list(E_list * l)
701
+ {
702
+ E_list * el;
703
+ if (l == NULL) return NULL;
704
+ if ((l->e = purge_Exp(l->e)) == NULL)
705
+ {
706
+ el = or_purge_E_list(l->next);
707
+ xfree((char *)l, sizeof(E_list));
708
+ return el;
709
+ }
710
+ l->next = or_purge_E_list(l->next);
711
+ return l;
712
+ }
713
+
714
+ /**
715
+ * Returns 0 iff the length of the disjunct list is 0.
716
+ * If this is the case, it frees the structure rooted at l.
717
+ */
718
+ static int and_purge_E_list(E_list * l)
719
+ {
720
+ if (l == NULL) return 1;
721
+ if ((l->e = purge_Exp(l->e)) == NULL)
722
+ {
723
+ free_E_list(l->next);
724
+ xfree((char *)l, sizeof(E_list));
725
+ return 0;
726
+ }
727
+ if (and_purge_E_list(l->next) == 0)
728
+ {
729
+ free_Exp(l->e);
730
+ xfree((char *)l, sizeof(E_list));
731
+ return 0;
732
+ }
733
+ return 1;
734
+ }
735
+
736
+ /**
737
+ * Must be called with a non-null expression.
738
+ * Return NULL iff the expression has no disjuncts.
739
+ */
740
+ static Exp* purge_Exp(Exp *e)
741
+ {
742
+ if (e->type == CONNECTOR_type)
743
+ {
744
+ if (e->u.string == NULL)
745
+ {
746
+ xfree((char *)e, sizeof(Exp));
747
+ return NULL;
748
+ }
749
+ else
750
+ {
751
+ return e;
752
+ }
753
+ }
754
+ if (e->type == AND_type)
755
+ {
756
+ if (and_purge_E_list(e->u.l) == 0)
757
+ {
758
+ xfree((char *)e, sizeof(Exp));
759
+ return NULL;
760
+ }
761
+ }
762
+ else
763
+ {
764
+ e->u.l = or_purge_E_list(e->u.l);
765
+ if (e->u.l == NULL)
766
+ {
767
+ xfree((char *)e, sizeof(Exp));
768
+ return NULL;
769
+ }
770
+ }
771
+
772
+ /* This code makes it kill off nodes that have just one child
773
+ (1) It's going to give an insignificant speed-up
774
+ (2) Costs have not been handled correctly here.
775
+ The code is excised for these reasons.
776
+ */
777
+ /*
778
+ if ((e->u.l != NULL) && (e->u.l->next == NULL))
779
+ {
780
+ ne = e->u.l->e;
781
+ xfree((char *) e->u.l, sizeof(E_list));
782
+ xfree((char *) e, sizeof(Exp));
783
+ return ne;
784
+ }
785
+ */
786
+ return e;
787
+ }
788
+
789
+ /**
790
+ * Returns TRUE if c can match anything in the set S.
791
+ */
792
+ static inline int matches_S(connector_table *ct, Connector * c, int dir)
793
+ {
794
+ Connector * e;
795
+ int h = hash_S(c);
796
+
797
+ if (dir == '-')
798
+ {
799
+ for (e = ct[h]; e != NULL; e = e->tableNext)
800
+ {
801
+ if (prune_match(0, e, c)) return TRUE;
802
+ }
803
+ return FALSE;
804
+ }
805
+ else
806
+ {
807
+ for (e = ct[h]; e != NULL; e = e->tableNext)
808
+ {
809
+ if (prune_match(0, c, e)) return TRUE;
810
+ }
811
+ return FALSE;
812
+ }
813
+ }
814
+
815
+ /**
816
+ * Mark as dead all of the dir-pointing connectors
817
+ * in e that are not matched by anything in the current set.
818
+ * Returns the number of connectors so marked.
819
+ */
820
+ static int mark_dead_connectors(connector_table *ct, Exp * e, int dir)
821
+ {
822
+ int count;
823
+ count = 0;
824
+ if (e->type == CONNECTOR_type)
825
+ {
826
+ if (e->dir == dir)
827
+ {
828
+ Connector dummy;
829
+ init_connector(&dummy);
830
+ dummy.label = NORMAL_LABEL;
831
+ dummy.priority = THIN_priority;
832
+ dummy.string = e->u.string;
833
+ if (!matches_S(ct, &dummy, dir))
834
+ {
835
+ e->u.string = NULL;
836
+ count++;
837
+ }
838
+ }
839
+ }
840
+ else
841
+ {
842
+ E_list *l;
843
+ for (l = e->u.l; l != NULL; l = l->next)
844
+ {
845
+ count += mark_dead_connectors(ct, l->e, dir);
846
+ }
847
+ }
848
+ return count;
849
+ }
850
+
851
+ /**
852
+ * Put into the set S all of the dir-pointing connectors still in e.
853
+ * Return a list of allocated dummy connectors; these will need to be
854
+ * freed.
855
+ */
856
+ static Connector * insert_connectors(connector_table *ct, Exp * e,
857
+ Connector *alloc_list, int dir)
858
+ {
859
+ if (e->type == CONNECTOR_type)
860
+ {
861
+ if (e->dir == dir)
862
+ {
863
+ Connector *dummy = connector_new();
864
+ dummy->string = e->u.string;
865
+ insert_connector(ct, dummy);
866
+ dummy->next = alloc_list;
867
+ alloc_list = dummy;
868
+ }
869
+ }
870
+ else
871
+ {
872
+ E_list *l;
873
+ for (l=e->u.l; l!=NULL; l=l->next)
874
+ {
875
+ alloc_list = insert_connectors(ct, l->e, alloc_list, dir);
876
+ }
877
+ }
878
+ return alloc_list;
879
+ }
880
+
881
+ /**
882
+ * This removes the expressions that are empty from the list corresponding
883
+ * to word w of the sentence.
884
+ */
885
+ static void clean_up_expressions(Sentence sent, int w)
886
+ {
887
+ X_node head_node, *d, *d1;
888
+ d = &head_node;
889
+ d->next = sent->word[w].x;
890
+ while (d->next != NULL)
891
+ {
892
+ if (d->next->exp == NULL)
893
+ {
894
+ d1 = d->next;
895
+ d->next = d1->next;
896
+ xfree((char *)d1, sizeof(X_node));
897
+ }
898
+ else
899
+ {
900
+ d = d->next;
901
+ }
902
+ }
903
+ sent->word[w].x = head_node.next;
904
+ }
905
+
906
+ void expression_prune(Sentence sent)
907
+ {
908
+ int N_deleted;
909
+ X_node * x;
910
+ int w;
911
+ Connector *ct[CONTABSZ];
912
+ Connector *dummy_list = NULL;
913
+
914
+ zero_connector_table(ct);
915
+
916
+ N_deleted = 1; /* a lie to make it always do at least 2 passes */
917
+
918
+ while(1)
919
+ {
920
+ /* Left-to-right pass */
921
+ /* For every word */
922
+ for (w = 0; w < sent->length; w++)
923
+ {
924
+ /* For every expression in word */
925
+ for (x = sent->word[w].x; x != NULL; x = x->next)
926
+ {
927
+ /* printf("before marking: "); print_expression(x->exp); printf("\n"); */
928
+ N_deleted += mark_dead_connectors(ct, x->exp, '-');
929
+ /* printf(" after marking: "); print_expression(x->exp); printf("\n"); */
930
+ }
931
+ for (x = sent->word[w].x; x != NULL; x = x->next)
932
+ {
933
+ /* printf("before purging: "); print_expression(x->exp); printf("\n"); */
934
+ x->exp = purge_Exp(x->exp);
935
+ /* printf("after purging: "); print_expression(x->exp); printf("\n"); */
936
+ }
937
+
938
+ /* gets rid of X_nodes with NULL exp */
939
+ clean_up_expressions(sent, w);
940
+ for (x = sent->word[w].x; x != NULL; x = x->next)
941
+ {
942
+ dummy_list = insert_connectors(ct, x->exp, dummy_list, '+');
943
+ }
944
+ }
945
+
946
+ if (verbosity > 2)
947
+ {
948
+ printf("l->r pass removed %d\n", N_deleted);
949
+ print_expression_sizes(sent);
950
+ }
951
+
952
+ /* Free the allocated dummy connectors */
953
+ free_connectors(dummy_list);
954
+ dummy_list = NULL;
955
+ zero_connector_table(ct);
956
+
957
+ if (N_deleted == 0) break;
958
+
959
+ /* Right-to-left pass */
960
+ N_deleted = 0;
961
+ for (w = sent->length-1; w >= 0; w--)
962
+ {
963
+ for (x = sent->word[w].x; x != NULL; x = x->next)
964
+ {
965
+ /* printf("before marking: "); print_expression(x->exp); printf("\n"); */
966
+ N_deleted += mark_dead_connectors(ct, x->exp, '+');
967
+ /* printf("after marking: "); print_expression(x->exp); printf("\n"); */
968
+ }
969
+ for (x = sent->word[w].x; x != NULL; x = x->next)
970
+ {
971
+ /* printf("before perging: "); print_expression(x->exp); printf("\n"); */
972
+ x->exp = purge_Exp(x->exp);
973
+ /* printf("after perging: "); print_expression(x->exp); printf("\n"); */
974
+ }
975
+ clean_up_expressions(sent, w); /* gets rid of X_nodes with NULL exp */
976
+ for (x = sent->word[w].x; x != NULL; x = x->next)
977
+ {
978
+ dummy_list = insert_connectors(ct, x->exp, dummy_list, '-');
979
+ }
980
+ }
981
+
982
+ if (verbosity > 2)
983
+ {
984
+ printf("r->l pass removed %d\n", N_deleted);
985
+ print_expression_sizes(sent);
986
+ }
987
+
988
+ /* Free the allocated dummy connectors */
989
+ free_connectors(dummy_list);
990
+ dummy_list = NULL;
991
+ zero_connector_table(ct);
992
+ if (N_deleted == 0) break;
993
+ N_deleted = 0;
994
+ }
995
+ }
996
+
997
+
998
+
999
+ /*
1000
+ Here is what you've been waiting for: POWER-PRUNE
1001
+
1002
+ The kinds of constraints it checks for are the following:
1003
+
1004
+ 1) successive connectors on the same disjunct have to go to
1005
+ nearer and nearer words.
1006
+
1007
+ 2) two deep connectors cannot attach to each other
1008
+ (A connector is deep if it is not the first in its list, it
1009
+ is shallow if it is the first in its list, it is deepest if it
1010
+ is the last on its list.)
1011
+
1012
+ 3) on two adjacent words, a pair of connectors can be used
1013
+ only if they're the deepest ones on their disjuncts
1014
+
1015
+ 4) on two non-adjacent words, a pair of connectors can be used only
1016
+ if not [both of them are the deepest].
1017
+
1018
+ The data structure consists of a pair of hash tables on every word.
1019
+ Each bucket of a hash table has a list of pointers to connectors.
1020
+ These nodes also store if the chosen connector is shallow.
1021
+ */
1022
+ /*
1023
+ As with normal pruning, we make alternate left->right and right->left
1024
+ passes. In the R->L pass, when we're on a word w, we make use of
1025
+ all the left-pointing hash tables on the words to the right of w.
1026
+ After the pruning on this word, we build the left-pointing hash table
1027
+ for this word. This guarantees idempotence of the pass -- after doing an
1028
+ L->R, doing another would change nothing.
1029
+
1030
+ Each connector has an integer c_word field. This refers to the closest
1031
+ word that it could be connected to. These are initially determined by
1032
+ how deep the connector is. For example, a deepest connector can connect
1033
+ to the neighboring word, so its c_word field is w+1 (w-1 if this is a left
1034
+ pointing connector). Its neighboring shallow connector has a c_word
1035
+ value of w+2, etc.
1036
+
1037
+ The pruning process adjusts these c_word values as it goes along,
1038
+ accumulating information about any way of linking this sentence.
1039
+ The pruning process stops only after no disjunct is deleted and no
1040
+ c_word values change.
1041
+
1042
+ The difference between RUTHLESS and GENTLE power pruning is simply
1043
+ that GENTLE uses the deletable region array, and RUTHLESS does not.
1044
+ So we can get the effect of these two different methods simply by
1045
+ always ensuring that deletable[][] has been defined. With nothing
1046
+ deletable, this is equivalent to RUTHLESS. --DS, 7/97
1047
+ */
1048
+
1049
+ /**
1050
+ * returns the number of connectors in the left lists of the disjuncts.
1051
+ */
1052
+ static int left_connector_count(Disjunct * d)
1053
+ {
1054
+ Connector *c;
1055
+ int i=0;
1056
+ for (;d!=NULL; d=d->next) {
1057
+ for (c = d->left; c!=NULL; c = c->next) i++;
1058
+ }
1059
+ return i;
1060
+ }
1061
+
1062
+ static int right_connector_count(Disjunct * d)
1063
+ {
1064
+ Connector *c;
1065
+ int i=0;
1066
+ for (;d!=NULL; d=d->next) {
1067
+ for (c = d->right; c!=NULL; c = c->next) i++;
1068
+ }
1069
+ return i;
1070
+ }
1071
+
1072
+ static void free_C_list(C_list * t)
1073
+ {
1074
+ C_list *xt;
1075
+ for (; t!=NULL; t=xt) {
1076
+ xt = t->next;
1077
+ xfree((char *)t, sizeof(C_list));
1078
+ }
1079
+ }
1080
+
1081
+ /**
1082
+ * free all of the hash tables and C_lists
1083
+ */
1084
+ static void power_table_delete(power_table *pt)
1085
+ {
1086
+ int w;
1087
+ int i;
1088
+
1089
+ for (w = 0; w < pt->power_table_size; w++)
1090
+ {
1091
+ for (i = 0; i < pt->l_table_size[w]; i++)
1092
+ {
1093
+ free_C_list(pt->l_table[w][i]);
1094
+ }
1095
+ xfree((char *)pt->l_table[w], pt->l_table_size[w] * sizeof (C_list *));
1096
+
1097
+ for (i = 0; i < pt->r_table_size[w]; i++)
1098
+ {
1099
+ free_C_list(pt->r_table[w][i]);
1100
+ }
1101
+ xfree((char *)pt->r_table[w], pt->r_table_size[w] * sizeof (C_list *));
1102
+ }
1103
+ free(pt);
1104
+ }
1105
+
1106
+ /**
1107
+ * The disjunct d (whose left or right pointer points to c) is put
1108
+ * into the appropriate hash table
1109
+ */
1110
+ static void put_into_power_table(int size, C_list ** t, Connector * c, int shal)
1111
+ {
1112
+ int h;
1113
+ C_list * m;
1114
+ h = connector_hash(c) & (size-1);
1115
+ m = (C_list *) xalloc (sizeof(C_list));
1116
+ m->next = t[h];
1117
+ t[h] = m;
1118
+ m->c = c;
1119
+ m->shallow = shal;
1120
+ }
1121
+
1122
+ static int set_dist_fields(Connector * c, int w, int delta)
1123
+ {
1124
+ int i;
1125
+ if (c==NULL) return w;
1126
+ i = set_dist_fields(c->next, w, delta) + delta;
1127
+ c->word = i;
1128
+ return i;
1129
+ }
1130
+
1131
+ /**
1132
+ * Allocates and builds the initial power hash tables
1133
+ */
1134
+ static power_table * power_table_new(Sentence sent)
1135
+ {
1136
+ power_table *pt;
1137
+ int w, len, size, i;
1138
+ C_list ** t;
1139
+ Disjunct * d, * xd, * head;
1140
+ Connector * c;
1141
+
1142
+ pt = (power_table *) malloc (sizeof(power_table));
1143
+ pt->power_table_size = sent->length;
1144
+
1145
+ /* first we initialize the word fields of the connectors, and
1146
+ eliminate those disjuncts with illegal connectors */
1147
+ for (w=0; w<sent->length; w++)
1148
+ {
1149
+ head = NULL;
1150
+ for (d=sent->word[w].d; d!=NULL; d=xd) {
1151
+ xd = d->next;
1152
+ if ((set_dist_fields(d->left, w, -1) < 0) ||
1153
+ (set_dist_fields(d->right, w, 1) >= sent->length)) {
1154
+ d->next = NULL;
1155
+ free_disjuncts(d);
1156
+ } else {
1157
+ d->next = head;
1158
+ head = d;
1159
+ }
1160
+ }
1161
+ sent->word[w].d = head;
1162
+ }
1163
+
1164
+ for (w=0; w<sent->length; w++)
1165
+ {
1166
+ len = left_connector_count(sent->word[w].d);
1167
+ size = next_power_of_two_up(len);
1168
+ pt->l_table_size[w] = size;
1169
+ t = pt->l_table[w] = (C_list **) xalloc(size * sizeof(C_list *));
1170
+ for (i=0; i<size; i++) t[i] = NULL;
1171
+
1172
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
1173
+ c = d->left;
1174
+ if (c != NULL) {
1175
+ put_into_power_table(size, t, c, TRUE);
1176
+ for (c=c->next; c!=NULL; c=c->next){
1177
+ put_into_power_table(size, t, c, FALSE);
1178
+ }
1179
+ }
1180
+ }
1181
+
1182
+ len = right_connector_count(sent->word[w].d);
1183
+ size = next_power_of_two_up(len);
1184
+ pt->r_table_size[w] = size;
1185
+ t = pt->r_table[w] = (C_list **) xalloc(size * sizeof(C_list *));
1186
+ for (i=0; i<size; i++) t[i] = NULL;
1187
+
1188
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
1189
+ c = d->right;
1190
+ if (c != NULL) {
1191
+ put_into_power_table(size, t, c, TRUE);
1192
+ for (c=c->next; c!=NULL; c=c->next){
1193
+ put_into_power_table(size, t, c, FALSE);
1194
+ }
1195
+ }
1196
+ }
1197
+ }
1198
+
1199
+ return pt;
1200
+ }
1201
+
1202
+ /**
1203
+ * This runs through all the connectors in this table, and eliminates those
1204
+ * who are obsolete. The word fields of an obsolete one has been set to
1205
+ * BAD_WORD.
1206
+ */
1207
+ static void clean_table(int size, C_list ** t)
1208
+ {
1209
+ int i;
1210
+ C_list * m, * xm, * head;
1211
+ for (i=0; i<size; i++) {
1212
+ head = NULL;
1213
+ for (m=t[i]; m!=NULL; m=xm) {
1214
+ xm = m->next;
1215
+ if (m->c->word != BAD_WORD) {
1216
+ m->next = head;
1217
+ head = m;
1218
+ } else {
1219
+ xfree((char *) m, sizeof(C_list));
1220
+ }
1221
+ }
1222
+ t[i] = head;
1223
+ }
1224
+ }
1225
+
1226
+ /**
1227
+ * This takes two connectors (and whether these are shallow or not)
1228
+ * (and the two words that these came from) and returns TRUE if it is
1229
+ * possible for these two to match based on local considerations.
1230
+ */
1231
+ static int possible_connection(prune_context *pc,
1232
+ Connector *lc, Connector *rc,
1233
+ int lshallow, int rshallow,
1234
+ int lword, int rword)
1235
+ {
1236
+ if ((!lshallow) && (!rshallow)) return FALSE;
1237
+ /* two deep connectors can't work */
1238
+ if ((lc->word > rword) || (rc->word < lword)) return FALSE;
1239
+ /* word range constraints */
1240
+
1241
+ assert(lword < rword, "Bad word order in possible connection.");
1242
+
1243
+ /* Now, notice that the only differences between the following two
1244
+ cases is that (1) ruthless uses match and gentle uses prune_match.
1245
+ and (2) ruthless doesn't use deletable[][]. This latter fact is
1246
+ irrelevant, since deletable[][] is now guaranteed to have been
1247
+ created. */
1248
+
1249
+ if (pc->power_prune_mode == RUTHLESS) {
1250
+ if (lword == rword-1) {
1251
+ if (!((lc->next == NULL) && (rc->next == NULL))) return FALSE;
1252
+ } else {
1253
+ if ((!pc->null_links) &&
1254
+ (lc->next == NULL) && (rc->next == NULL) && (!lc->multi) && (!rc->multi)) {
1255
+ return FALSE;
1256
+ }
1257
+ }
1258
+ return do_match(pc->sent, lc, rc, lword, rword);
1259
+ } else {
1260
+ if (lword == rword-1) {
1261
+ if (!((lc->next == NULL) && (rc->next == NULL))) return FALSE;
1262
+ } else {
1263
+ if ((!pc->null_links) &&
1264
+ (lc->next == NULL) && (rc->next == NULL) && (!lc->multi) && (!rc->multi) &&
1265
+ !pc->deletable[lword][rword]) {
1266
+ return FALSE;
1267
+ }
1268
+ }
1269
+ return prune_match(pc->effective_dist[lword][rword], lc, rc);
1270
+ }
1271
+ }
1272
+
1273
+
1274
+ /**
1275
+ * This returns TRUE if the right table of word w contains
1276
+ * a connector that can match to c. shallow tells if c is shallow.
1277
+ */
1278
+ static int right_table_search(prune_context *pc, int w, Connector *c, int shallow, int word_c)
1279
+ {
1280
+ int size, h;
1281
+ C_list *cl;
1282
+ power_table *pt;
1283
+
1284
+ pt = pc->pt;
1285
+ size = pt->r_table_size[w];
1286
+ h = connector_hash(c) & (size-1);
1287
+ for (cl = pt->r_table[w][h]; cl != NULL; cl = cl->next)
1288
+ {
1289
+ if (possible_connection(pc, cl->c, c, cl->shallow, shallow, w, word_c))
1290
+ {
1291
+ return TRUE;
1292
+ }
1293
+ }
1294
+ return FALSE;
1295
+ }
1296
+
1297
+ /**
1298
+ * This returns TRUE if the right table of word w contains
1299
+ * a connector that can match to c. shallows tells if c is shallow
1300
+ */
1301
+ static int left_table_search(prune_context *pc, int w, Connector *c, int shallow, int word_c)
1302
+ {
1303
+ int size, h;
1304
+ C_list *cl;
1305
+ power_table *pt;
1306
+
1307
+ pt = pc->pt;
1308
+ size = pt->l_table_size[w];
1309
+ h = connector_hash(c) & (size-1);
1310
+ for (cl = pt->l_table[w][h]; cl != NULL; cl = cl->next)
1311
+ {
1312
+ if (possible_connection(pc, c, cl->c, shallow, cl->shallow, word_c, w))
1313
+ {
1314
+ return TRUE;
1315
+ }
1316
+ }
1317
+ return FALSE;
1318
+ }
1319
+
1320
+ #if NOT_USED_NOW
1321
+ static int ok_cwords(Sentence sent, Connector *c)
1322
+ {
1323
+ for (; c != NULL; c=c->next) {
1324
+ if (c->word == BAD_WORD) return FALSE;
1325
+ if (c->word >= sent->length) return FALSE;
1326
+ }
1327
+ return TRUE;
1328
+ }
1329
+ #endif
1330
+
1331
+ /**
1332
+ * take this connector list, and try to match it with the words
1333
+ * w-1, w-2, w-3...Returns the word to which the first connector of the
1334
+ * list could possibly be matched. If c is NULL, returns w. If there
1335
+ * is no way to match this list, it returns a negative number.
1336
+ * If it does find a way to match it, it updates the c->word fields
1337
+ * correctly.
1338
+ */
1339
+ static int left_connector_list_update(prune_context *pc, Connector *c, int word_c, int w, int shallow)
1340
+ {
1341
+ int n;
1342
+ int foundmatch;
1343
+
1344
+ if (c==NULL) return w;
1345
+ n = left_connector_list_update(pc, c->next, word_c, w, FALSE) - 1;
1346
+ if (((int) c->word) < n) n = c->word;
1347
+
1348
+ /* n is now the rightmost word we need to check */
1349
+ foundmatch = FALSE;
1350
+ for (; (n >= 0) && ((w-n) < MAX_SENTENCE); n--) {
1351
+ pc->power_cost++;
1352
+ if (right_table_search(pc, n, c, shallow, word_c)) {
1353
+ foundmatch = TRUE;
1354
+ break;
1355
+ }
1356
+ }
1357
+ if (n < ((int) c->word)) {
1358
+ c->word = n;
1359
+ pc->N_changed++;
1360
+ }
1361
+ return (foundmatch ? n : -1);
1362
+ }
1363
+
1364
+ /**
1365
+ * take this connector list, and try to match it with the words
1366
+ * w+1, w+2, w+3...Returns the word to which the first connector of the
1367
+ * list could possibly be matched. If c is NULL, returns w. If there
1368
+ * is no way to match this list, it returns a number greater than N_words-1
1369
+ * If it does find a way to match it, it updates the c->word fields
1370
+ * correctly.
1371
+ */
1372
+ static int right_connector_list_update(prune_context *pc, Sentence sent, Connector *c,
1373
+ int word_c, int w, int shallow)
1374
+ {
1375
+ int n;
1376
+ int foundmatch;
1377
+
1378
+ if (c==NULL) return w;
1379
+ n = right_connector_list_update(pc, sent, c->next, word_c, w, FALSE) + 1;
1380
+ if (c->word > n) n = c->word;
1381
+
1382
+ /* n is now the leftmost word we need to check */
1383
+ foundmatch = FALSE;
1384
+ for (; (n < sent->length) && ((n-w) < MAX_SENTENCE); n++) {
1385
+ pc->power_cost++;
1386
+ if (left_table_search(pc, n, c, shallow, word_c)) {
1387
+ foundmatch = TRUE;
1388
+ break;
1389
+ }
1390
+ }
1391
+ if (n > c->word) {
1392
+ c->word = n;
1393
+ pc->N_changed++;
1394
+ }
1395
+ return (foundmatch ? n : sent->length);
1396
+ }
1397
+
1398
+ /** The return value is the number of disjuncts deleted */
1399
+ int power_prune(Sentence sent, int mode, Parse_Options opts)
1400
+ {
1401
+ power_table *pt;
1402
+ prune_context *pc;
1403
+ Disjunct *d, *free_later, *dx, *nd;
1404
+ Connector *c;
1405
+ int w, N_deleted, total_deleted;
1406
+
1407
+ pc = (prune_context *) malloc (sizeof(prune_context));
1408
+ pc->power_cost = 0;
1409
+ pc->power_prune_mode = mode;
1410
+ pc->null_links = (opts->min_null_count > 0);
1411
+ pc->N_changed = 1; /* forces it always to make at least two passes */
1412
+ pc->deletable = sent->deletable;
1413
+ pc->effective_dist = sent->effective_dist;
1414
+ pc->sent = sent;
1415
+
1416
+ count_set_effective_distance(sent);
1417
+
1418
+ pt = power_table_new(sent);
1419
+ pc->pt = pt;
1420
+
1421
+ free_later = NULL;
1422
+ N_deleted = 0;
1423
+
1424
+ total_deleted = 0;
1425
+
1426
+ while (1)
1427
+ {
1428
+ /* left-to-right pass */
1429
+ for (w = 0; w < sent->length; w++) {
1430
+ if (parse_options_resources_exhausted(opts)) break;
1431
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1432
+ if (d->left == NULL) continue;
1433
+ if (left_connector_list_update(pc, d->left, w, w, TRUE) < 0) {
1434
+ for (c=d->left ;c!=NULL; c = c->next) c->word = BAD_WORD;
1435
+ for (c=d->right ;c!=NULL; c = c->next) c->word = BAD_WORD;
1436
+ N_deleted++;
1437
+ total_deleted++;
1438
+ }
1439
+ }
1440
+
1441
+ clean_table(pt->r_table_size[w], pt->r_table[w]);
1442
+ nd = NULL;
1443
+ for (d = sent->word[w].d; d != NULL; d = dx) {
1444
+ dx = d->next;
1445
+ if ((d->left != NULL) && (d->left->word == BAD_WORD)) {
1446
+ d->next = free_later;
1447
+ free_later = d;
1448
+ } else {
1449
+ d->next = nd;
1450
+ nd = d;
1451
+ }
1452
+ }
1453
+ sent->word[w].d = nd;
1454
+ }
1455
+ if (verbosity > 2) {
1456
+ printf("l->r pass changed %d and deleted %d\n",pc->N_changed,N_deleted);
1457
+ }
1458
+
1459
+ if (pc->N_changed == 0) break;
1460
+
1461
+ pc->N_changed = N_deleted = 0;
1462
+ /* right-to-left pass */
1463
+
1464
+ for (w = sent->length-1; w >= 0; w--) {
1465
+ if (parse_options_resources_exhausted(opts)) break;
1466
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1467
+ if (d->right == NULL) continue;
1468
+ if (right_connector_list_update(pc, sent, d->right,w,w,TRUE) >= sent->length){
1469
+ for (c=d->right;c!=NULL; c = c->next) c->word = BAD_WORD;
1470
+ for (c=d->left ;c!=NULL; c = c->next) c->word = BAD_WORD;
1471
+ N_deleted++;
1472
+ total_deleted++;
1473
+ }
1474
+ }
1475
+ clean_table(pt->l_table_size[w], pt->l_table[w]);
1476
+ nd = NULL;
1477
+ for (d = sent->word[w].d; d != NULL; d = dx) {
1478
+ dx = d->next;
1479
+ if ((d->right != NULL) && (d->right->word == BAD_WORD)) {
1480
+ d->next = free_later;
1481
+ free_later = d;
1482
+ } else {
1483
+ d->next = nd;
1484
+ nd = d;
1485
+ }
1486
+ }
1487
+ sent->word[w].d = nd;
1488
+ }
1489
+
1490
+ if (verbosity > 2) {
1491
+ printf("r->l pass changed %d and deleted %d\n", pc->N_changed,N_deleted);
1492
+ }
1493
+
1494
+ if (pc->N_changed == 0) break;
1495
+ pc->N_changed = N_deleted = 0;
1496
+ }
1497
+ free_disjuncts(free_later);
1498
+ power_table_delete(pt);
1499
+ pt = NULL;
1500
+ pc->pt = NULL;
1501
+
1502
+ if (verbosity > 2) printf("%d power prune cost:\n", pc->power_cost);
1503
+
1504
+ if (mode == RUTHLESS) {
1505
+ print_time(opts, "power pruned (ruthless)");
1506
+ } else {
1507
+ print_time(opts, "power pruned (gentle)");
1508
+ }
1509
+
1510
+ if (verbosity > 2) {
1511
+ if (mode == RUTHLESS) {
1512
+ printf("\nAfter power_pruning (ruthless):\n");
1513
+ } else {
1514
+ printf("\nAfter power_pruning (gentle):\n");
1515
+ }
1516
+ print_disjunct_counts(sent);
1517
+ }
1518
+
1519
+ free(pc);
1520
+ return total_deleted;
1521
+ }
1522
+
1523
+ /* ===================================================================
1524
+ PP Pruning
1525
+
1526
+ The "contains one" post-processing constraints give us a new way to
1527
+ prune. Suppose there's a rule that says "a group that contains foo
1528
+ must contain a bar or a baz." Here foo, bar, and baz are connector
1529
+ types. foo is the trigger link, bar and baz are called the criterion
1530
+ links. If, after considering the disjuncts we find that there is
1531
+ a foo, but neither a bar, nor a baz, then we can eliminate the disjunct
1532
+ containing foo.
1533
+
1534
+ Things are actually a bit more complex, because of the matching rules
1535
+ and subscripts. The problem is that post-processing deals with link
1536
+ names, while at this point all we have to work with is connector
1537
+ names. Consider the foo part. Consider a connector C. When does
1538
+ foo match C for our purposes? It matches it if every possible link
1539
+ name L (that can result from C being at one end of that link) results
1540
+ in post_process_match(foo,L) being true. Suppose foo contains a "*".
1541
+ Then there is no C that has this property. This is because the *s in
1542
+ C may be replaced by some other subscripts in the construction of L.
1543
+ And the non-* chars in L will not post_process_match the * in foo.
1544
+
1545
+ So let's assume that foo has no *. Now the result we want is simply
1546
+ given by post_process_match(foo, C). Proof: L is the same as C but
1547
+ with some *s replaced by some other letters. Since foo contains no *
1548
+ the replacement in C of some * by some other letter could change
1549
+ post_process_match from FALSE to TRUE, but not vice versa. Therefore
1550
+ it's conservative to use this test.
1551
+
1552
+ For the criterion parts, we need to determine if there is a
1553
+ collection of connectors C1, C2,... such that by combining them you
1554
+ can get a link name L that post_process_matches bar or baz. Here's a
1555
+ way to do this. Say bar="Xabc". Then we see if there are connector
1556
+ names that post_process_match "Xa##", "X#b#", and "X##c". They must
1557
+ all be there in order for it to be possible to create a link name
1558
+ "Xabc". A "*" in the criterion part is a little different. In this
1559
+ case we can simply skip the * (treat it like an upper case letter)
1560
+ for this purpose. So if bar="X*ab" then we look for "X*#b" and
1561
+ "X*a#". (The * in this case could be treated the same as another
1562
+ subscript without breaking it.) Note also that it's only necessary
1563
+ to find a way to match one of the many criterion links that may be in
1564
+ the rule. If none can match, then we can delete the disjunct
1565
+ containing C.
1566
+
1567
+ Here's how we're going to implement this. We'll maintain a multiset
1568
+ of connector names. We'll represent them in a hash table, where the
1569
+ hash function uses only the upper case letters of the connector name.
1570
+ We'll insert all the connectors into the multiset. The multiset will
1571
+ support the operation of deletion (probably simplest to just
1572
+ decrement the count). Here's the algorithm.
1573
+
1574
+ Insert all the connectors into M.
1575
+
1576
+ While the previous pass caused a count to go to 0 do:
1577
+ For each connector C do
1578
+ For each rule R do
1579
+ if C is a trigger for R and the criterion links
1580
+ of the rule cannot be satisfied by the connectors in
1581
+ M, Then:
1582
+ We delete C's disjunct. But before we do,
1583
+ we remove all the connectors of this disjunct
1584
+ from the multiset. Keep tabs on whether or not
1585
+ any of the counts went to 0.
1586
+
1587
+
1588
+
1589
+ Efficiency hacks to be added later:
1590
+ Note that a given rule can only become less and less satisfiable.
1591
+ That is, rule_satisfiable(r) for a given rule r can change from
1592
+ TRUE to FALSE, but not vice versa. So once it's FALSE, we can just
1593
+ remember that.
1594
+
1595
+ Consider the effect of a pass p on the set of rules that are
1596
+ satisfiable. Suppose this set does not change. Then pass p+1
1597
+ will do nothing. This is true even if pass p caused some
1598
+ disjuncts to be deleted. (This observation will only obviate
1599
+ the need for the last pass.)
1600
+
1601
+ */
1602
+
1603
+ static multiset_table * cms_table_new(void)
1604
+ {
1605
+ multiset_table *mt;
1606
+ int i;
1607
+
1608
+ mt = (multiset_table *) malloc(sizeof(multiset_table));
1609
+
1610
+ for (i=0; i<CMS_SIZE; i++) {
1611
+ mt->cms_table[i] = NULL;
1612
+ }
1613
+ return mt;
1614
+ }
1615
+
1616
+ static void cms_table_delete(multiset_table *mt)
1617
+ {
1618
+ Cms * cms, *xcms;
1619
+ int i;
1620
+ for (i=0; i<CMS_SIZE; i++)
1621
+ {
1622
+ for (cms = mt->cms_table[i]; cms != NULL; cms = xcms)
1623
+ {
1624
+ xcms = cms->next;
1625
+ xfree(cms, sizeof(Cms));
1626
+ }
1627
+ }
1628
+ free(mt);
1629
+ }
1630
+
1631
+ static int cms_hash(const char * s)
1632
+ {
1633
+ unsigned int i = 5381;
1634
+ while (isupper((int) *s)) /* connector names are not yet UTF8-capable */
1635
+ {
1636
+ i = ((i << 5) + i) + *s;
1637
+ s++;
1638
+ }
1639
+ return (i & (CMS_SIZE-1));
1640
+ }
1641
+
1642
+ /**
1643
+ * This returns TRUE if there is a connector name C in the table
1644
+ * such that post_process_match(pp_match_name, C) is TRUE
1645
+ */
1646
+ static int match_in_cms_table(multiset_table *cmt, const char * pp_match_name)
1647
+ {
1648
+ Cms * cms;
1649
+ for (cms = cmt->cms_table[cms_hash(pp_match_name)]; cms != NULL; cms = cms->next)
1650
+ {
1651
+ if(post_process_match(pp_match_name, cms->name)) return TRUE;
1652
+ }
1653
+ return FALSE;
1654
+ }
1655
+
1656
+ static Cms * lookup_in_cms_table(multiset_table *cmt, const char * str)
1657
+ {
1658
+ Cms * cms;
1659
+ for (cms = cmt->cms_table[cms_hash(str)]; cms != NULL; cms = cms->next)
1660
+ {
1661
+ if(strcmp(str, cms->name) == 0) return cms;
1662
+ }
1663
+ return NULL;
1664
+ }
1665
+
1666
+ static void insert_in_cms_table(multiset_table *cmt, const char * str)
1667
+ {
1668
+ Cms * cms;
1669
+ int h;
1670
+ cms = lookup_in_cms_table(cmt, str);
1671
+ if (cms != NULL) {
1672
+ cms->count++;
1673
+ } else {
1674
+ cms = (Cms *) xalloc(sizeof(Cms));
1675
+ cms->name = str; /* don't copy the string...just keep a pointer to it.
1676
+ we won't free these later */
1677
+ cms->count = 1;
1678
+ h = cms_hash(str);
1679
+ cms->next = cmt->cms_table[h];
1680
+ cmt->cms_table[h] = cms;
1681
+ }
1682
+ }
1683
+
1684
+ /**
1685
+ * Delete the given string from the table. Return TRUE if
1686
+ * this caused a count to go to 0, return FALSE otherwise.
1687
+ */
1688
+ static int delete_from_cms_table(multiset_table *cmt, const char * str)
1689
+ {
1690
+ Cms * cms;
1691
+ cms = lookup_in_cms_table(cmt, str);
1692
+ if (cms != NULL && cms->count > 0) {
1693
+ cms->count--;
1694
+ return (cms->count == 0);
1695
+ }
1696
+ return FALSE;
1697
+ }
1698
+
1699
+ static int rule_satisfiable(multiset_table *cmt, pp_linkset *ls)
1700
+ {
1701
+ int hashval;
1702
+ const char * t;
1703
+ char name[20], *s;
1704
+ pp_linkset_node *p;
1705
+ int bad, n_subscripts;
1706
+
1707
+ for (hashval = 0; hashval < ls->hash_table_size; hashval++)
1708
+ {
1709
+ for (p = ls->hash_table[hashval]; p!=NULL; p=p->next)
1710
+ {
1711
+ /* ok, we've got our hands on one of the criterion links */
1712
+ strncpy(name, p->str, sizeof(name)-1);
1713
+ /* could actually use the string in place because we change it back */
1714
+ name[sizeof(name)-1] = '\0';
1715
+ /* now we want to see if we can satisfy this criterion link */
1716
+ /* with a collection of the links in the cms table */
1717
+
1718
+ for (s = name; isupper((int)*s); s++) {}
1719
+ for (;*s != '\0'; s++) if (*s != '*') *s = '#';
1720
+ for (s = name, t = p->str; isupper((int) *s); s++, t++) {}
1721
+
1722
+ /* s and t remain in lockstep */
1723
+ bad = 0;
1724
+ n_subscripts = 0;
1725
+ for (;*s != '\0' && bad==0; s++, t++) {
1726
+ if (*s == '*') continue;
1727
+ n_subscripts++;
1728
+ /* after the upper case part, and is not a * so must be a regular subscript */
1729
+ *s = *t;
1730
+ if (!match_in_cms_table(cmt, name)) bad++;
1731
+ *s = '#';
1732
+ }
1733
+
1734
+ if (n_subscripts == 0) {
1735
+ /* now we handle the special case which occurs if there
1736
+ were 0 subscripts */
1737
+ if (!match_in_cms_table(cmt, name)) bad++;
1738
+ }
1739
+
1740
+ /* now if bad==0 this criterion link does the job
1741
+ to satisfy the needs of the trigger link */
1742
+
1743
+ if (bad == 0) return TRUE;
1744
+ }
1745
+ }
1746
+ return FALSE;
1747
+ }
1748
+
1749
+ static int pp_prune(Sentence sent, Parse_Options opts)
1750
+ {
1751
+ pp_knowledge * knowledge;
1752
+ pp_rule rule;
1753
+ const char * selector;
1754
+ pp_linkset * link_set;
1755
+ int i, w, dir;
1756
+ Disjunct *d;
1757
+ Connector *c;
1758
+ int change, total_deleted, N_deleted, deleteme;
1759
+ multiset_table *cmt;
1760
+
1761
+ if (sent->dict->postprocessor == NULL) return 0;
1762
+
1763
+ knowledge = sent->dict->postprocessor->knowledge;
1764
+
1765
+ cmt = cms_table_new();
1766
+
1767
+ for (w = 0; w < sent->length; w++) {
1768
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1769
+ d->marked = TRUE;
1770
+ for (dir=0; dir < 2; dir++) {
1771
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1772
+ insert_in_cms_table(cmt, c->string);
1773
+ }
1774
+ }
1775
+ }
1776
+ }
1777
+
1778
+ total_deleted = 0;
1779
+ change = 1;
1780
+ while (change > 0) {
1781
+ change = 0;
1782
+ N_deleted = 0;
1783
+ for (w = 0; w < sent->length; w++) {
1784
+ for (d = sent->word[w].d; d != NULL; d = d->next) {
1785
+ if (!d->marked) continue;
1786
+ deleteme = FALSE;
1787
+ for (dir=0; dir < 2; dir++) {
1788
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1789
+ for (i=0; i<knowledge->n_contains_one_rules; i++) {
1790
+
1791
+ rule = knowledge->contains_one_rules[i]; /* the ith rule */
1792
+ selector = rule.selector; /* selector string for this rule */
1793
+ link_set = rule.link_set; /* the set of criterion links */
1794
+
1795
+ if (strchr(selector, '*') != NULL) continue; /* If it has a * forget it */
1796
+
1797
+ if (!post_process_match(selector, c->string)) continue;
1798
+
1799
+ /*
1800
+ printf("pp_prune: trigger ok. selector = %s c->string = %s\n", selector, c->string);
1801
+ */
1802
+
1803
+ /* We know c matches the trigger link of the rule. */
1804
+ /* Now check the criterion links */
1805
+
1806
+ if (!rule_satisfiable(cmt, link_set)) {
1807
+ deleteme = TRUE;
1808
+ }
1809
+ if (deleteme) break;
1810
+ }
1811
+ if (deleteme) break;
1812
+ }
1813
+ if (deleteme) break;
1814
+ }
1815
+
1816
+ if (deleteme) { /* now we delete this disjunct */
1817
+ N_deleted++;
1818
+ total_deleted++;
1819
+ d->marked = FALSE; /* mark for deletion later */
1820
+ for (dir=0; dir < 2; dir++) {
1821
+ for (c = ( (dir)?(d->left):(d->right) ); c!=NULL; c=c->next) {
1822
+ change += delete_from_cms_table(cmt, c->string);
1823
+ }
1824
+ }
1825
+ }
1826
+ }
1827
+ }
1828
+
1829
+ if (verbosity > 2) {
1830
+ printf("pp_prune pass deleted %d\n", N_deleted);
1831
+ }
1832
+
1833
+ }
1834
+ delete_unmarked_disjuncts(sent);
1835
+ cms_table_delete(cmt);
1836
+
1837
+ if (verbosity > 2) {
1838
+ printf("\nAfter pp_pruning:\n");
1839
+ print_disjunct_counts(sent);
1840
+ }
1841
+
1842
+ print_time(opts, "pp pruning");
1843
+
1844
+ return total_deleted;
1845
+ }
1846
+
1847
+
1848
+ /**
1849
+ * Do the following pruning steps until nothing happens:
1850
+ * power pp power pp power pp....
1851
+ * Make sure you do them both at least once.
1852
+ */
1853
+ void pp_and_power_prune(Sentence sent, int mode, Parse_Options opts)
1854
+ {
1855
+ power_prune(sent, mode, opts);
1856
+
1857
+ for (;;) {
1858
+ if (parse_options_resources_exhausted(opts)) break;
1859
+ if (pp_prune(sent, opts) == 0) break;
1860
+ if (parse_options_resources_exhausted(opts)) break;
1861
+ if (power_prune(sent, mode, opts) == 0) break;
1862
+ }
1863
+ }
1864
+