grammar_cop 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,24 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ void init_analyze(Sentence);
15
+ void free_analyze(Sentence);
16
+
17
+ void extract_thin_linkage(Sentence, Parse_Options, Linkage);
18
+ void extract_fat_linkage (Sentence, Parse_Options, Linkage);
19
+ Linkage_info analyze_fat_linkage (Sentence, Parse_Options, int pass);
20
+ Linkage_info analyze_thin_linkage(Sentence, Parse_Options, int pass);
21
+ void free_DIS_tree(DIS_node *);
22
+
23
+ void zero_sublinkage(Sublinkage *s);
24
+
@@ -0,0 +1,1603 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ #include "api.h"
15
+ #include "disjunct-utils.h"
16
+
17
+ /*
18
+ Notes about AND
19
+
20
+ A large fraction of the code of this parser seems to deal with handling
21
+ conjunctions. This comment (combined with reading the paper) should
22
+ give an idea of how it works.
23
+
24
+ First of all, we need a more detailed discussion of strings, what they
25
+ match, etc. (This entire discussion ignores the labels, which are
26
+ semantically the same as the leading upper case letters of the
27
+ connector.)
28
+
29
+ We'll deal with infinite strings from an alphabet of three types of
30
+ characters: "*". "^" and ordinary characters (denoted "a" and "b").
31
+ (The end of a string should be thought of as an infinite sequence of
32
+ "*"s).
33
+
34
+ Let match(s) be the set of strings that will match the string s. This
35
+ is defined as follows. A string t is in match(s) if (1) its leading
36
+ upper case letters exactly match those of s. (2) traversing through
37
+ both strings, from left to right in step, no missmatch is found
38
+ between corresponding letters. A missmatch is a pair of differing
39
+ ordinary characters, or a "^" and any ordinary letter or two "^"s.
40
+ In other words, a match is exactly a "*" and anything, or two
41
+ identical ordinary letters.
42
+
43
+ Alternative definition of the set match(s):
44
+ {t | t is obtained from s by replacing each "^" and any other characters
45
+ by "*"s, and replacing any original "*" in s by any other character
46
+ (or "^").}
47
+
48
+ Theorem: if t in match(s) then s in match(t).
49
+
50
+ It is also a theorem that given any two strings s and t, there exists a
51
+ unique new string u with the property that:
52
+
53
+ match(u) = match(s) intersect match(t)
54
+
55
+ This string is called the GCD of s and t. Here are some examples.
56
+
57
+ GCD(N*a,Nb) = Nba
58
+ GCD(Na, Nb) = N^
59
+ GCD(Nab,Nb) = N^b
60
+ GCD(N^,N*a) = N^a
61
+ GCD(N^, N) = N^
62
+ GCD(N^^,N^) = N^^
63
+
64
+ We need an algorithm for computing the GCD of two strings. Here is
65
+ one.
66
+
67
+ First get by the upper case letters (which must be equal, otherwise
68
+ there is no intersection), issuing them. Traverse the rest of the
69
+ characters of s and t in lockstep until there is nothing left but
70
+ "*"s. If the two characters are:
71
+
72
+ "a" and "a", issue "a"
73
+ "a" and "b", issue "^"
74
+ "a" and "*", issue "a"
75
+ "*" and "*", issue "*"
76
+ "*" and "^", issue "^"
77
+ "a" and "^", issue "^"
78
+ "^" and "^", issue "^"
79
+
80
+ A simple case analysis suffices to show that any string that matches
81
+ the right side, must match both of the left sides, and any string not
82
+ matching the right side must not match at least one of the left sides.
83
+
84
+ This proves that the GCD operator is associative and commutative.
85
+ (There must be a name for a mathematical structure with these properties.)
86
+
87
+ To elaborate further on this theory, define the notion of two strings
88
+ matching in the dual sense as follows: s and t dual-match if
89
+ match(s) is contained in match(t) or vice versa---
90
+
91
+ Full development of this theory could lead to a more efficient
92
+ algorithm for this problem. I'll defer this until such time as it
93
+ appears necessary.
94
+
95
+
96
+ We need a data structure that stores a set of fat links. Each fat
97
+ link has a number (called its label). The fat link operates in liu of
98
+ a collection of links. The particular stuff it is a substitute for is
99
+ defined by a disjunct. This disjunct is stored in the data structure.
100
+
101
+ The type of a disjunct is defined by the sequence of connector types
102
+ (defined by their upper case letters) that comprises it. Each entry
103
+ of the label_table[] points to a list of disjuncts that have the same
104
+ type (a hash table is uses so that, given a disjunct, we can efficiently
105
+ compute the element of the label table in which it belongs).
106
+
107
+ We begin by loading up the label table with all of the possible
108
+ fat links that occur through the words of the sentence. These are
109
+ obtained by taking every sub-range of the connectors of each disjunct
110
+ (containing the center). We also compute the closure (under the GCD
111
+ operator) of these disjuncts and store also store these in the
112
+ label_table. Each disjunct in this table has a string which represents
113
+ the subscripts of all of its connectors (and their multi-connector bits).
114
+
115
+ It is possible to generate a fat connector for any one of the
116
+ disjuncts in the label_table. This connector's label field is given
117
+ the label from the disjunct from which it arose. It's string field
118
+ is taken from the string of the disjunct (mentioned above). It will be
119
+ given a priority with a value of UP_priority or DOWN_priority (depending
120
+ on how it will be used). A connector of UP_priority can match one of
121
+ DOWN_priority, but neither of these can match any other priority.
122
+ (Of course, a fat connector can match only another fat connector with
123
+ the same label.)
124
+
125
+ The paper describes in some detail how disjuncts are given to words
126
+ and to "and" and ",", etc. Each word in the sentence gets many more
127
+ new disjuncts. For each contiguous set of connectors containing (or
128
+ adjacent to) the center of the disjunct, we generate a fat link, and
129
+ replace these connector in the word by a fat link. (Actually we do
130
+ this twice. Once pointing to the right, once to the left.) These fat
131
+ links have priority UP_priority.
132
+
133
+ What do we generate for ","? For each type of fat link (each label)
134
+ we make a disjunct that has two down connectors (to the right and left)
135
+ and one up connector (to the right). There will be a unique way of
136
+ hooking together a comma-separated and-list.
137
+
138
+ The disjuncts on "and" are more complicated. Here we have to do just what
139
+ we did for comma (but also include the up link to the left), then
140
+ we also have to allow the process to terminate. So, there is a disjunct
141
+ with two down fat links, and between them are the original thin links.
142
+ These are said to "blossom" out. However, this is not all that is
143
+ necessary. It's possible for an and-list to be part of another and list
144
+ with a different labeled fat connector. To make this possible, we
145
+ regroup the just blossomed disjuncts (in all possible ways about the center)
146
+ and install them as fat links. If this sounds like a lot of disjuncts --
147
+ it is! The program is currently fairly slow on long sentence with and.
148
+
149
+ It is slightly non-obvious that the fat-links in a linkage constructed
150
+ from disjuncts defined in this way form a binary tree. Naturally,
151
+ connectors with UP_priority point up the tree, and those with DOWN_priority
152
+ point down the tree.
153
+
154
+ Think of the string x on the connector as representing a set X of strings.
155
+ X = match(x). So, for example, if x="S^" then match(x) = {"S", "S*a",
156
+ "S*b", etc}. The matching rules for UP and DOWN priority connectors
157
+ are such that as you go up (the tree of ands) the X sets get no larger.
158
+ So, for example, a "Sb" pointing up can match an "S^" pointing down.
159
+ (Because more stuff can match "Sb" than can match "S^".)
160
+ This guarantees that whatever connector ultimately gets used after the
161
+ fat connector blossoms out (see below), it is a powerful enough connector
162
+ to be able to match to any of the connectors associated with it.
163
+
164
+ One problem with the scheme just descibed is that it sometimes generates
165
+ essentially the same linkage several times. This happens if there is
166
+ a gap in the connective power, and the mismatch can be moved around in
167
+ different ways. Here is an example of how this happens.
168
+
169
+ (Left is DOWN, right is UP)
170
+
171
+ Sa <---> S^ <---> S or Sa <---> Sa <---> S
172
+ fat thin fat thin
173
+
174
+ Here two of the disjunct types are given by "S^" and "Sa". Notice that
175
+ the criterion of shrinking the matching set is satisfied by the the fat
176
+ link (traversing from left to right). How do I eliminate one of these?
177
+
178
+ I use the technique of canonization. I generate all the linkages. There
179
+ is then a procedure that can check to see of a linkage is canonical.
180
+ If it is, it's used, otherwise it's ignored. It's claimed that exactly
181
+ one canonical one of each equivalence class will be generated.
182
+ We basically insist that the intermediate fat disjuncts (ones that
183
+ have a fat link pointing down) are all minimal -- that is, that they
184
+ cannot be replaced by by another (with a strictly) smaller match set.
185
+ If one is not minimal, then the linkage is rejected.
186
+
187
+ Here's a proof that this is correct. Consider the set of equivalent
188
+ linkages that are generated. These Pick a disjunct that is the root of
189
+ its tree. Consider the set of all disjuncts which occur in that positon
190
+ among the equivalent linkages. The GCD of all of these can fit in that
191
+ position (it matches down the tree, since its match set has gotten
192
+ smaller, and it also matches to the THIN links.) Since the GCD is put
193
+ on "and" this particular one will be generated. Therefore rejecting
194
+ a linkage in which a root fat disjunct can be replaced by a smaller one
195
+ is ok (since the smaller one will be generated separately). What about
196
+ a fat disjunct that is not the root. We consider the set of linkages in
197
+ which the root is minimal (the ones for which it's not have already been
198
+ eliminated). Now, consider one of the children of the root in precisely
199
+ the way we just considered the root. The same argument holds. The only
200
+ difference is that the root node gives another constraint on how small
201
+ you can make the disjunct -- so, within these constraints, if we can go
202
+ smaller, we reject.
203
+
204
+ The code to do all of this is fairly ugly, but I think it works.
205
+
206
+
207
+ Problems with this stuff:
208
+
209
+ 1) There is obviously a combinatorial explosion that takes place.
210
+ As the number of disjuncts (and the number of their subscripts
211
+ increase) the number of disjuncts that get put onto "and" will
212
+ increase tremendously. When we made the transcript for the tech
213
+ report (Around August 1991) most of the sentences were processed
214
+ in well under 10 seconds. Now (Jan 1992), some of these sentences
215
+ take ten times longer. As of this writing I don't really know the
216
+ reason, other than just the fact that the dictionary entries are
217
+ more complex than they used to be. The number of linkages has also
218
+ increased significantly.
219
+
220
+ 2) Each element of an and list must be attached through only one word.
221
+ This disallows "there is time enough and space enough for both of us",
222
+ and many other reasonable sounding things. The combinatorial
223
+ explosion that would occur if you allowed two different connection
224
+ points would be tremendous, and the number of solutions would also
225
+ probably go up by another order of magnitude. Perhaps if there
226
+ were strong constraints on the type of connectors in which this
227
+ would be allowed, then this would be a conceivable prospect.
228
+
229
+ 3) A multi-connector must be either all "outside" or all "inside" the and.
230
+ For example, "the big black dog and cat ran" has only two
231
+ linkages (instead of three).
232
+
233
+ Possible bug: It seems that the following two linkages should be the
234
+ same under the canonical linkage test. Could this have to do with the
235
+ pluralization system?
236
+
237
+ > I am big and the bike and the car were broken
238
+ Accepted (4 linkages, 4 with no P.P. violations) at stage 1
239
+ Linkage 1, cost vector = (0, 0, 18)
240
+
241
+ +------Spx-----+
242
+ +-----CC-----+------Wd------+-d^^*i^-+ |
243
+ +-Wd-+Spi+-Pa+ | +--Ds-+d^^*+ +-Ds-+ +--Pv-+
244
+ | | | | | | | | | | | |
245
+ ///// I.p am big.a and the bike.n and the car.n were broken
246
+
247
+ ///// RW <---RW----> RW /////
248
+ ///// Wd <---Wd----> Wd I.p
249
+ I.p CC <---CC----> CC and
250
+ I.p Sp*i <---Spii--> Spi am
251
+ am Pa <---Pa----> Pa big.a
252
+ and Wd <---Wd----> Wd and
253
+ bike.n d^s** 6<---d^^*i-> d^^*i 6 and
254
+ the D <---Ds----> Ds bike.n
255
+ and Sp <---Spx---> Spx were
256
+ and d^^*i 6<---d^^*i-> d^s** 6 car.n
257
+ the D <---Ds----> Ds car.n
258
+ were Pv <---Pv----> Pv broken
259
+
260
+ (press return for another)
261
+ >
262
+ Linkage 2, cost vector = (0, 0, 18)
263
+
264
+ +------Spx-----+
265
+ +-----CC-----+------Wd------+-d^s**^-+ |
266
+ +-Wd-+Spi+-Pa+ | +--Ds-+d^s*+ +-Ds-+ +--Pv-+
267
+ | | | | | | | | | | | |
268
+ ///// I.p am big.a and the bike.n and the car.n were broken
269
+
270
+ ///// RW <---RW----> RW /////
271
+ ///// Wd <---Wd----> Wd I.p
272
+ I.p CC <---CC----> CC and
273
+ I.p Sp*i <---Spii--> Spi am
274
+ am Pa <---Pa----> Pa big.a
275
+ and Wd <---Wd----> Wd and
276
+ bike.n d^s** 6<---d^s**-> d^s** 6 and
277
+ the D <---Ds----> Ds bike.n
278
+ and Sp <---Spx---> Spx were
279
+ and d^s** 6<---d^s**-> d^s** 6 car.n
280
+ the D <---Ds----> Ds car.n
281
+ were Pv <---Pv----> Pv broken
282
+
283
+ */
284
+
285
+ static void init_LT(Sentence sent)
286
+ {
287
+ sent->and_data.LT_bound = 20;
288
+ sent->and_data.LT_size = 0;
289
+ sent->and_data.label_table =
290
+ (Disjunct **) xalloc(sent->and_data.LT_bound * sizeof(Disjunct *));
291
+ }
292
+
293
+ static void grow_LT(Sentence sent)
294
+ {
295
+ size_t oldsize = sent->and_data.LT_bound * sizeof(Disjunct *);
296
+ sent->and_data.LT_bound = (3*sent->and_data.LT_bound)/2;
297
+ sent->and_data.label_table =
298
+ (Disjunct **) xrealloc(sent->and_data.label_table,
299
+ oldsize,
300
+ sent->and_data.LT_bound * sizeof(Disjunct *));
301
+ }
302
+
303
+ static void init_HT(Sentence sent)
304
+ {
305
+ memset(sent->and_data.hash_table, 0, HT_SIZE*sizeof(Label_node *));
306
+ }
307
+
308
+ static void free_HT(Sentence sent)
309
+ {
310
+ int i;
311
+ Label_node * la, * la1;
312
+ for (i=0; i<HT_SIZE; i++) {
313
+ for (la=sent->and_data.hash_table[i]; la != NULL; la = la1) {
314
+ la1 = la->next;
315
+ xfree((char *)la, sizeof(Label_node));
316
+ }
317
+ sent->and_data.hash_table[i] = NULL;
318
+ }
319
+ }
320
+
321
+ static void free_LT(Sentence sent)
322
+ {
323
+ int i;
324
+ for (i=0; i<sent->and_data.LT_size; i++) {
325
+ free_disjuncts(sent->and_data.label_table[i]);
326
+ }
327
+ xfree((char *) sent->and_data.label_table,
328
+ sent->and_data.LT_bound * sizeof(Disjunct*));
329
+ sent->and_data.LT_bound = 0;
330
+ sent->and_data.LT_size = 0;
331
+ sent->and_data.label_table = NULL;
332
+ }
333
+
334
+ void free_AND_tables(Sentence sent)
335
+ {
336
+ free_LT(sent);
337
+ free_HT(sent);
338
+ }
339
+
340
+ void initialize_conjunction_tables(Sentence sent)
341
+ {
342
+ int i;
343
+ sent->and_data.LT_bound = 0;
344
+ sent->and_data.LT_size = 0;
345
+ sent->and_data.label_table = NULL;
346
+ for (i=0; i<HT_SIZE; i++) {
347
+ sent->and_data.hash_table[i] = NULL;
348
+ }
349
+ }
350
+
351
+ /**
352
+ * This is a hash function for disjuncts
353
+ */
354
+ static inline int and_hash_disjunct(Disjunct *d)
355
+ {
356
+ unsigned int i;
357
+ Connector *e;
358
+ i = 0;
359
+ for (e = d->left ; e != NULL; e = e->next) {
360
+ i += connector_hash(e);
361
+ }
362
+ i += (i<<5);
363
+ for (e = d->right ; e != NULL; e = e->next) {
364
+ i += connector_hash(e);
365
+ }
366
+ return (i & (HT_SIZE-1));
367
+ }
368
+
369
+ /**
370
+ * Returns TRUE if the disjunct is appropriate to be made into fat links.
371
+ * Check here that the connectors are from some small set.
372
+ * This will disallow, for example "the and their dog ran".
373
+ */
374
+ static int is_appropriate(Sentence sent, Disjunct * d)
375
+ {
376
+ Connector * c;
377
+
378
+ if (sent->dict->andable_connector_set == NULL) return TRUE;
379
+ /* if no set, then everything is considered andable */
380
+ for (c = d->right; c!=NULL; c=c->next) {
381
+ if (!match_in_connector_set(sent, sent->dict->andable_connector_set, c, '+')) return FALSE;
382
+ }
383
+ for (c = d->left; c!=NULL; c=c->next) {
384
+ if (!match_in_connector_set(sent, sent->dict->andable_connector_set, c, '-')) return FALSE;
385
+ }
386
+ return TRUE;
387
+ }
388
+
389
+ /**
390
+ * Two connectors are said to be of the same type if they have
391
+ * the same label, and the initial upper case letters of their
392
+ * strings match.
393
+ */
394
+ static int connector_types_equal(Connector * c1, Connector * c2)
395
+ {
396
+ if (c1->label != c2->label) return FALSE;
397
+ return utf8_upper_match(c1->string, c2->string);
398
+ }
399
+
400
+ /**
401
+ * Two disjuncts are said to be the same type if they're the same
402
+ * ignoring the multi fields, the priority fields, and the subscripts
403
+ * of the connectors (and the string field of the disjunct of course).
404
+ * Disjuncts of the same type are located in the same label_table list.
405
+ *
406
+ * This returns TRUE if they are of the same type.
407
+ */
408
+ static int disjunct_types_equal(Disjunct * d1, Disjunct * d2)
409
+ {
410
+ Connector *e1, *e2;
411
+
412
+ e1 = d1->left;
413
+ e2 = d2->left;
414
+ while((e1!=NULL) && (e2!=NULL)) {
415
+ if (!connector_types_equal(e1,e2)) break;
416
+ e1 = e1->next;
417
+ e2 = e2->next;
418
+ }
419
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
420
+ e1 = d1->right;
421
+ e2 = d2->right;
422
+ while((e1!=NULL) && (e2!=NULL)) {
423
+ if (!connector_types_equal(e1,e2)) break;
424
+ e1 = e1->next;
425
+ e2 = e2->next;
426
+ }
427
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
428
+ return TRUE;
429
+ }
430
+
431
+ /**
432
+ * This returns a string that is the the GCD of the two given strings.
433
+ * If the GCD is equal to one of them, a pointer to it is returned.
434
+ * Otherwise a new string for the GCD is xalloced and put on the
435
+ * "free later" list.
436
+ */
437
+ const char * intersect_strings(Sentence sent, const char * s, const char * t)
438
+ {
439
+ int len, i, j, d;
440
+ const char *w, *s0;
441
+ char u0[MAX_TOKEN_LENGTH]; /* Links are *always* less than 10 chars long */
442
+ char *u;
443
+ if (strcmp(s,t)==0) return s; /* would work without this */
444
+ i = strlen(s);
445
+ j = strlen(t);
446
+ if (j > i) {
447
+ w = s; s = t; t = w;
448
+ len = j;
449
+ } else {
450
+ len = i;
451
+ }
452
+ /* s is now the longer (at least not the shorter) string */
453
+ /* and len is its length */
454
+ u = u0;
455
+ d = 0;
456
+ s0 = s;
457
+ while (*t != '\0') {
458
+ if ((*s == *t) || (*t == '*')) {
459
+ *u = *s;
460
+ } else {
461
+ d++;
462
+ if (*s == '*') *u = *t;
463
+ else *u = '^';
464
+ }
465
+ s++; t++; u++;
466
+ }
467
+ if (d==0) {
468
+ return s0;
469
+ } else {
470
+ strcpy(u, s); /* get the remainder of s */
471
+ return string_set_add(u0, sent->string_set);
472
+ }
473
+ }
474
+
475
+ /**
476
+ * Two connectors are said to be equal if they are of the same type
477
+ * (defined above), they have the same multi field, and they have
478
+ * exactly the same connectors (including lower case chars).
479
+ * (priorities ignored).
480
+ */
481
+ static int connectors_equal_AND(Connector *c1, Connector *c2)
482
+ {
483
+ return (c1->label == c2->label) &&
484
+ (c1->multi == c2->multi) &&
485
+ (strcmp(c1->string, c2->string) == 0);
486
+ }
487
+
488
+ /**
489
+ * Return true if the disjuncts are equal (ignoring priority fields)
490
+ * and the string of the disjunct.
491
+ */
492
+ static int disjuncts_equal_AND(Sentence sent, Disjunct * d1, Disjunct * d2)
493
+ {
494
+ Connector *e1, *e2;
495
+ sent->and_data.STAT_calls_to_equality_test++;
496
+ e1 = d1->left;
497
+ e2 = d2->left;
498
+ while((e1!=NULL) && (e2!=NULL)) {
499
+ if (!connectors_equal_AND(e1, e2)) break;
500
+ e1 = e1->next;
501
+ e2 = e2->next;
502
+ }
503
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
504
+ e1 = d1->right;
505
+ e2 = d2->right;
506
+ while((e1!=NULL) && (e2!=NULL)) {
507
+ if (!connectors_equal_AND(e1, e2)) break;
508
+ e1 = e1->next;
509
+ e2 = e2->next;
510
+ }
511
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
512
+ return TRUE;
513
+ }
514
+
515
+ /**
516
+ * Create a new disjunct that is the GCD of d1 and d2.
517
+ * It assumes that the disjuncts are of the same type, so the
518
+ * GCD will not be empty.
519
+ */
520
+ static Disjunct * intersect_disjuncts(Sentence sent, Disjunct * d1, Disjunct * d2)
521
+ {
522
+ Disjunct * d;
523
+ Connector *c1, *c2, *c;
524
+ d = copy_disjunct(d1);
525
+ c = d->left;
526
+ c1 = d1->left;
527
+ c2 = d2->left;
528
+ while (c1!=NULL) {
529
+ connector_set_string (c, intersect_strings(sent, c1->string, c2->string));
530
+ c->multi = (c1->multi) && (c2->multi);
531
+ c = c->next; c1 = c1->next; c2 = c2->next;
532
+ }
533
+ c = d->right;
534
+ c1 = d1->right;
535
+ c2 = d2->right;
536
+ while (c1!=NULL) {
537
+ connector_set_string (c, intersect_strings(sent, c1->string, c2->string));
538
+ c->multi = (c1->multi) && (c2->multi);
539
+ c = c->next; c1 = c1->next; c2 = c2->next;
540
+ }
541
+ return d;
542
+ }
543
+
544
+ /**
545
+ * (1) look for the given disjunct in the table structures
546
+ * if it's already in the table structures, do nothing
547
+ * (2) otherwise make a copy of it, and put it into the table structures
548
+ * (3) also put all of the GCDs of this disjunct with all of the
549
+ * other matching disjuncts into the table.
550
+ *
551
+ * The costs are set to zero.
552
+ * Note that this has no effect on disjunct d.
553
+ */
554
+ static void put_disjunct_into_table(Sentence sent, Disjunct *d)
555
+ {
556
+ Disjunct *d1=NULL, *d2, *di, *d_copy;
557
+ Label_node * lp;
558
+ int h, k;
559
+
560
+ h = and_hash_disjunct(d);
561
+
562
+ for (lp = sent->and_data.hash_table[h]; lp != NULL; lp = lp->next)
563
+ {
564
+ d1 = sent->and_data.label_table[lp->label];
565
+ if (disjunct_types_equal(d,d1)) break;
566
+ }
567
+ if (lp != NULL)
568
+ {
569
+ /* there is already a label for disjuncts of this type */
570
+ /* d1 points to the list of disjuncts of this type already there */
571
+ while(d1 != NULL)
572
+ {
573
+ if (disjuncts_equal_AND(sent, d1, d)) return;
574
+ d1 = d1->next;
575
+ }
576
+ /* now we must put the d disjunct in there, and all of the GCDs of
577
+ it with the ones already there.
578
+
579
+ This is done as follows. We scan through the list of disjuncts
580
+ computing the gcd of the new one with each of the others, putting
581
+ the resulting disjuncts onto another list rooted at d2.
582
+ Now insert d into the the list already there. Now for each
583
+ one on the d2 list, put it in if it isn't already there.
584
+
585
+ Here we're making use of the following theorem: Given a
586
+ collection of sets s1, s2 ... sn closed under intersection,
587
+ to if we add a new set s to the collection and also add
588
+ all the intersections between s and s1...sn to the collection,
589
+ then the collection is still closed under intersection.
590
+
591
+ Use a Venn diagram to prove this theorem.
592
+
593
+ */
594
+ d_copy = copy_disjunct(d);
595
+ d_copy->cost = 0;
596
+ k = lp->label;
597
+ d2 = NULL;
598
+ for (d1=sent->and_data.label_table[k]; d1!=NULL; d1 = d1->next) {
599
+ di = intersect_disjuncts(sent, d_copy, d1);
600
+ di->next = d2;
601
+ d2 = di;
602
+ }
603
+ d_copy->next = sent->and_data.label_table[k];
604
+ sent->and_data.label_table[k] = d_copy;
605
+ for (;d2 != NULL; d2 = di) {
606
+ di = d2->next;
607
+ for (d1 = sent->and_data.label_table[k]; d1 != NULL; d1 = d1->next) {
608
+ if (disjuncts_equal_AND(sent, d1, d2)) break;
609
+ }
610
+ if (d1 == NULL) {
611
+ sent->and_data.STAT_N_disjuncts++;
612
+ d2->next = sent->and_data.label_table[k];
613
+ sent->and_data.label_table[k] = d2;
614
+ } else {
615
+ d2->next = NULL;
616
+ free_disjuncts(d2);
617
+ }
618
+ }
619
+ } else {
620
+ /* create a new label for disjuncts of this type */
621
+ d_copy = copy_disjunct(d);
622
+ d_copy->cost = 0;
623
+ d_copy->next = NULL;
624
+ if (sent->and_data.LT_size == sent->and_data.LT_bound) grow_LT(sent);
625
+ lp = (Label_node *) xalloc(sizeof(Label_node));
626
+ lp->next = sent->and_data.hash_table[h];
627
+ sent->and_data.hash_table[h] = lp;
628
+ lp->label = sent->and_data.LT_size;
629
+ sent->and_data.label_table[sent->and_data.LT_size] = d_copy;
630
+ sent->and_data.LT_size++;
631
+ sent->and_data.STAT_N_disjuncts++;
632
+ }
633
+ }
634
+
635
+ /**
636
+ * A sub disjuct of d is any disjunct obtained by killing the tail
637
+ * of either connector list at any point.
638
+ * Here we go through each sub-disjunct of d, and put it into our
639
+ * table data structure.
640
+ *
641
+ * The function has no side effects on d.
642
+ */
643
+ static void extract_all_fat_links(Sentence sent, Disjunct * d)
644
+ {
645
+ Connector * cl, * cr, *tl, *tr;
646
+ tl = d->left;
647
+ d->left = NULL;
648
+ for (cr = d->right; cr!=NULL; cr = cr->next) {
649
+ tr = cr->next;
650
+ cr->next = NULL;
651
+ if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);
652
+ cr->next = tr;
653
+ }
654
+ d->left = tl;
655
+
656
+ tr = d->right;
657
+ d->right = NULL;
658
+ for (cl = d->left; cl!=NULL; cl = cl->next) {
659
+ tl = cl->next;
660
+ cl->next = NULL;
661
+ if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);
662
+ cl->next = tl;
663
+ }
664
+ d->right = tr;
665
+
666
+ for (cl = d->left; cl!=NULL; cl = cl->next) {
667
+ for (cr = d->right; cr!=NULL; cr = cr->next) {
668
+ tl = cl->next;
669
+ tr = cr->next;
670
+ cl->next = cr->next = NULL;
671
+
672
+ if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);
673
+
674
+ cl->next = tl;
675
+ cr->next = tr;
676
+ }
677
+ }
678
+ }
679
+
680
+ /**
681
+ * put the next len characters from c->string (skipping upper
682
+ * case ones) into s. If there are fewer than this, pad with '*'s.
683
+ * Then put in a character for the multi match bit of c.
684
+ * Then put in a '\0', and return a pointer to this place.
685
+ */
686
+ static char * stick_in_one_connector(char *s, Connector *c, int len)
687
+ {
688
+ const char * t;
689
+
690
+ t = skip_utf8_upper(c->string);
691
+
692
+ while (*t != '\0') {
693
+ *s++ = *t++;
694
+ len--;
695
+ }
696
+ while (len > 0) {
697
+ *s++ = '*';
698
+ len--;
699
+ }
700
+ if (c->multi) *s++ = '*'; else *s++ = '^'; /* check this sometime */
701
+ *s = '\0';
702
+ return s;
703
+ }
704
+
705
+ /**
706
+ * This takes a label k, modifies the list of disjuncts with that
707
+ * label. For each such disjunct, it computes the string that
708
+ * will be used in the fat connector that represents it.
709
+ *
710
+ * The only hard part is finding the length of each of the strings
711
+ * so that "*" can be put in. A better explanation will have to wait.
712
+ */
713
+ static void compute_matchers_for_a_label(Sentence sent, int k)
714
+ {
715
+ char buff[2*MAX_WORD];
716
+ int lengths[MAX_LINKS];
717
+ int N_connectors, i, j;
718
+ Connector * c;
719
+ Disjunct * d;
720
+ const char *cs;
721
+ char *s;
722
+
723
+ d = sent->and_data.label_table[k];
724
+
725
+ N_connectors = 0;
726
+ for (c=d->left; c != NULL; c = c->next) N_connectors ++;
727
+ for (c=d->right; c != NULL; c = c->next) N_connectors ++;
728
+
729
+ for (i=0; i<N_connectors; i++) lengths[i] = 0;
730
+ while(d != NULL) {
731
+ i = 0;
732
+ for (c=d->left; c != NULL; c = c->next) {
733
+ cs = skip_utf8_upper(c->string);
734
+ j = strlen(cs);
735
+ if (j > lengths[i]) lengths[i] = j;
736
+ i++;
737
+ }
738
+ for (c=d->right; c != NULL; c = c->next) {
739
+ cs = c->string;
740
+ cs = skip_utf8_upper(cs);
741
+ j = strlen(cs);
742
+ if (j > lengths[i]) lengths[i] = j;
743
+ i++;
744
+ }
745
+ d = d->next;
746
+ }
747
+
748
+ for (d = sent->and_data.label_table[k]; d!= NULL; d = d->next)
749
+ {
750
+ i=0;
751
+ s = buff;
752
+ for (c=d->left; c != NULL; c = c->next) {
753
+ s = stick_in_one_connector(s, c, lengths[i]);
754
+ i++;
755
+ }
756
+ for (c=d->right; c != NULL; c = c->next) {
757
+ s = stick_in_one_connector(s, c, lengths[i]);
758
+ i++;
759
+ }
760
+ d->string = string_set_add(buff, sent->string_set);
761
+ }
762
+ }
763
+
764
+ /**
765
+ * Goes through the entire sentence and builds the fat link tables
766
+ * for all the disjuncts of all the words.
767
+ */
768
+ void build_conjunction_tables(Sentence sent)
769
+ {
770
+ int w;
771
+ int k;
772
+ Disjunct * d;
773
+
774
+ init_HT(sent);
775
+ init_LT(sent);
776
+ sent->and_data.STAT_N_disjuncts = 0;
777
+ sent->and_data.STAT_calls_to_equality_test = 0;
778
+
779
+ for (w=0; w<sent->length; w++) {
780
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
781
+ extract_all_fat_links(sent, d);
782
+ }
783
+ }
784
+
785
+ for (k=0; k<sent->and_data.LT_size; k++) {
786
+ compute_matchers_for_a_label(sent, k);
787
+ }
788
+ }
789
+
790
+ void print_AND_statistics(Sentence sent)
791
+ {
792
+ printf("Number of disjunct types (labels): %d\n", sent->and_data.LT_size);
793
+ printf("Number of disjuncts in the table: %d\n", sent->and_data.STAT_N_disjuncts);
794
+ if (sent->and_data.LT_size != 0) {
795
+ printf("average list length: %f\n",
796
+ (float)sent->and_data.STAT_N_disjuncts/sent->and_data.LT_size);
797
+ }
798
+ printf("Number of equality tests: %d\n", sent->and_data.STAT_calls_to_equality_test);
799
+ }
800
+
801
+ /**
802
+ * Fill in the fields of c for the disjunct. This must be in
803
+ * the table data structures. The label field and the string field
804
+ * are filled in appropriately. Priority is set to UP_priority.
805
+ */
806
+ static void connector_for_disjunct(Sentence sent, Disjunct * d, Connector * c)
807
+ {
808
+ int h;
809
+ Disjunct * d1 = NULL;
810
+ Label_node * lp;
811
+
812
+ h = and_hash_disjunct(d);
813
+
814
+ for (lp = sent->and_data.hash_table[h]; lp != NULL; lp = lp->next) {
815
+ d1 = sent->and_data.label_table[lp->label];
816
+ if (disjunct_types_equal(d,d1)) break;
817
+ }
818
+ assert(lp != NULL, "A disjunct I inserted was not there. (1)");
819
+
820
+ while(d1 != NULL) {
821
+ if (disjuncts_equal_AND(sent, d1, d)) break;
822
+ d1 = d1->next;
823
+ }
824
+
825
+ assert(d1 != NULL, "A disjunct I inserted was not there. (2)");
826
+
827
+ c->label = lp->label;
828
+ connector_set_string(c, d1->string);
829
+ c->priority = UP_priority;
830
+ c->multi = FALSE;
831
+ }
832
+
833
+
834
+ /**
835
+ * This function allocates and returns a list of disjuncts.
836
+ * This is the one obtained by substituting each contiguous
837
+ * non-empty subrange of d (incident on the center) by an appropriate
838
+ * fat link, in two possible positions. Does not effect d.
839
+ * The cost of d is inherited by all of the disjuncts in the result.
840
+ */
841
+ static Disjunct * build_fat_link_substitutions(Sentence sent, Disjunct *d)
842
+ {
843
+ Connector * cl, * cr, *tl, *tr, *wc, work_connector;
844
+ Disjunct *d1, *wd, work_disjunct, *d_list;
845
+ if (d==NULL) return NULL;
846
+ wd = &work_disjunct;
847
+ wc = init_connector(&work_connector);
848
+ d_list = NULL;
849
+ *wd = *d;
850
+ tl = d->left;
851
+ d->left = NULL;
852
+ for (cr = d->right; cr!=NULL; cr = cr->next) {
853
+ tr = cr->next;
854
+ cr->next = NULL;
855
+ if (is_appropriate(sent, d)) {
856
+ connector_for_disjunct(sent, d, wc);
857
+ wd->left = tl;
858
+ wd->right = wc;
859
+ wc->next = tr;
860
+ d1 = copy_disjunct(wd);
861
+ d1->next = d_list;
862
+ d_list = d1;
863
+ wd->left = wc;
864
+ wc->next = tl;
865
+ wd->right = tr;
866
+ d1 = copy_disjunct(wd);
867
+ d1->next = d_list;
868
+ d_list = d1;
869
+ }
870
+ cr->next = tr;
871
+ }
872
+ d->left = tl;
873
+
874
+ tr = d->right;
875
+ d->right = NULL;
876
+ for (cl = d->left; cl!=NULL; cl = cl->next) {
877
+ tl = cl->next;
878
+ cl->next = NULL;
879
+ if (is_appropriate(sent, d)) {
880
+ connector_for_disjunct(sent, d, wc);
881
+ wd->left = tl;
882
+ wd->right = wc;
883
+ wc->next = tr;
884
+ d1 = copy_disjunct(wd);
885
+ d1->next = d_list;
886
+ d_list = d1;
887
+ wd->left = wc;
888
+ wc->next = tl;
889
+ wd->right = tr;
890
+ d1 = copy_disjunct(wd);
891
+ d1->next = d_list;
892
+ d_list = d1;
893
+ }
894
+ cl->next = tl;
895
+ }
896
+ d->right = tr;
897
+
898
+ for (cl = d->left; cl!=NULL; cl = cl->next) {
899
+ for (cr = d->right; cr!=NULL; cr = cr->next) {
900
+ tl = cl->next;
901
+ tr = cr->next;
902
+ cl->next = cr->next = NULL;
903
+ if (is_appropriate(sent, d)) {
904
+ connector_for_disjunct(sent, d, wc);
905
+ wd->left = tl;
906
+ wd->right = wc;
907
+ wc->next = tr;
908
+ d1 = copy_disjunct(wd);
909
+ d1->next = d_list;
910
+ d_list = d1;
911
+ wd->left = wc;
912
+ wc->next = tl;
913
+ wd->right = tr;
914
+ d1 = copy_disjunct(wd);
915
+ d1->next = d_list;
916
+ d_list = d1;
917
+ }
918
+ cl->next = tl;
919
+ cr->next = tr;
920
+ }
921
+ }
922
+ return d_list;
923
+ }
924
+
925
+ /**
926
+ * This is basically a "map" function for build_fat_link_substitutions.
927
+ * It's applied to the disjuncts for all regular words of the sentence.
928
+ */
929
+ Disjunct * explode_disjunct_list(Sentence sent, Disjunct *d)
930
+ {
931
+ Disjunct *d1;
932
+
933
+ d1 = NULL;
934
+
935
+ for (; d!=NULL; d = d->next) {
936
+ d1 = catenate_disjuncts(d1, build_fat_link_substitutions(sent, d));
937
+ }
938
+ return d1;
939
+ }
940
+
941
+ /**
942
+ * Builds and returns a disjunct list for the comma. These are the
943
+ * disjuncts that are used when "," operates in conjunction with "and".
944
+ * Does not deal with the ", and" issue, nor the other uses
945
+ * of comma.
946
+ */
947
+ Disjunct * build_COMMA_disjunct_list(Sentence sent)
948
+ {
949
+ int lab;
950
+ Disjunct *d1, *d2, *d, work_disjunct, *wd;
951
+ Connector work_connector1, work_connector2, *c1, *c2;
952
+ Connector work_connector3, *c3;
953
+ c1 = init_connector(&work_connector1);
954
+ c2 = init_connector(&work_connector2);
955
+ c3 = init_connector(&work_connector3);
956
+ wd = &work_disjunct;
957
+
958
+ d1 = NULL; /* where we put the list we're building */
959
+
960
+ c1->next = NULL;
961
+ c2->next = c3;
962
+ c3->next = NULL;
963
+ c1->priority = c3->priority = DOWN_priority;
964
+ c2->priority = UP_priority;
965
+ c1->multi = c2->multi = c3->multi = FALSE;
966
+ wd->left = c1;
967
+ wd->right = c2;
968
+ wd->string = ","; /* *** fix this later?? */
969
+ wd->next = NULL;
970
+ wd->cost = 0;
971
+ for (lab = 0; lab < sent->and_data.LT_size; lab++) {
972
+ for (d = sent->and_data.label_table[lab]; d!=NULL; d=d->next) {
973
+ c1->string = c2->string = c3->string = d->string;
974
+ c1->label = c2->label = c3->label = lab;
975
+ d2 = copy_disjunct(wd);
976
+ d2->next = d1;
977
+ d1 = d2;
978
+ }
979
+ }
980
+ return d1;
981
+ }
982
+
983
+ /**
984
+ * Builds and returns a disjunct list for "and", "or" and "nor"
985
+ * for each disjunct in the label_table, we build three disjuncts
986
+ * this means that "Danny and Tycho and Billy" will be parsable in
987
+ * two ways. I don't know an easy way to avoid this
988
+ * the string is either "and", or "or", or "nor" at the moment.
989
+ */
990
+ Disjunct * build_AND_disjunct_list(Sentence sent, char * s)
991
+ {
992
+ int lab;
993
+ Disjunct *d_list, *d1, *d3, *d, *d_copy;
994
+ Connector *c1, *c2, *c3;
995
+
996
+ d_list = NULL; /* where we put the list we're building */
997
+
998
+ for (lab = 0; lab < sent->and_data.LT_size; lab++) {
999
+ for (d = sent->and_data.label_table[lab]; d!=NULL; d=d->next) {
1000
+ d1 = build_fat_link_substitutions(sent, d);
1001
+ d_copy = copy_disjunct(d); /* also include the thing itself! */
1002
+ d_copy->next = d1;
1003
+ d1 = d_copy;
1004
+ for(;d1 != NULL; d1 = d3) {
1005
+ d3 = d1->next;
1006
+
1007
+ c1 = connector_new();
1008
+ c2 = connector_new();
1009
+ c1->priority = c2->priority = DOWN_priority;
1010
+ connector_set_string(c1, d->string);
1011
+ connector_set_string(c2, d->string);
1012
+ c1->label = c2->label = lab;
1013
+
1014
+ d1->string = s;
1015
+
1016
+ if (d1->right == NULL) {
1017
+ d1->right = c2;
1018
+ } else {
1019
+ for (c3=d1->right; c3->next != NULL; c3 = c3->next)
1020
+ ;
1021
+ c3->next = c2;
1022
+ }
1023
+ if (d1->left == NULL) {
1024
+ d1->left = c1;
1025
+ } else {
1026
+ for (c3=d1->left; c3->next != NULL; c3 = c3->next)
1027
+ ;
1028
+ c3->next = c1;
1029
+ }
1030
+ d1->next = d_list;
1031
+ d_list = d1;
1032
+ }
1033
+ }
1034
+ }
1035
+ #if defined(PLURALIZATION)
1036
+ /* here is where "and" makes singular into plural. */
1037
+ /* must accommodate "he and I are good", "Davy and I are good"
1038
+ "Danny and Davy are good", and reject all of these with "is"
1039
+ instead of "are".
1040
+
1041
+ The SI connectors must also be modified to accommodate "are John
1042
+ and Dave here", but kill "is John and Dave here"
1043
+ */
1044
+ if (strcmp(s, "and") == 0)
1045
+ {
1046
+ for (d1 = d_list; d1 != NULL; d1 = d1->next)
1047
+ {
1048
+ for (c1 = d1->right; c1 != NULL; c1 = c1->next)
1049
+ {
1050
+ if ((c1->string[0] == 'S') &&
1051
+ ((c1->string[1] == '^') ||
1052
+ (c1->string[1] == 's') ||
1053
+ (c1->string[1] == 'p') ||
1054
+ (c1->string[1] == '\0')))
1055
+ {
1056
+ connector_set_string(c1, "Sp");
1057
+ }
1058
+ }
1059
+ for (c1 = d1->left; c1 != NULL; c1 = c1->next)
1060
+ {
1061
+ if ((c1->string[0] == 'S') && (c1->string[1] == 'I') &&
1062
+ ((c1->string[2] == '^') ||
1063
+ (c1->string[2] == 's') ||
1064
+ (c1->string[2] == 'p') ||
1065
+ (c1->string[2] == '\0')))
1066
+ {
1067
+ connector_set_string(c1, "SIp");
1068
+ }
1069
+ }
1070
+ }
1071
+ }
1072
+ /*
1073
+ "a cat or a dog is here" vs "a cat or a dog are here"
1074
+ The first seems right, the second seems wrong. I'll stick with this.
1075
+
1076
+ That is, "or" has the property that if both parts are the same in
1077
+ number, we use that but if they differ, we use plural.
1078
+
1079
+ The connectors on "I" must be handled specially. We accept
1080
+ "I or the dogs are here" but reject "I or the dogs is here"
1081
+ */
1082
+
1083
+ /* the code here still does now work "right", rejecting "is John or I invited"
1084
+ and accepting "I or my friend know what happened"
1085
+
1086
+ The more generous code for "nor" has been used instead
1087
+ */
1088
+ /*
1089
+ else if (strcmp(s, "or") == 0) {
1090
+ for (d1 = d_list; d1!=NULL; d1=d1->next) {
1091
+ for (c1=d1->right; c1!=NULL; c1=c1->next) {
1092
+ if (c1->string[0] == 'S') {
1093
+ if (c1->string[1]=='^') {
1094
+ if (c1->string[2]=='a') {
1095
+ connector_set_string(c1, "Ss");
1096
+ } else {
1097
+ connector_set_string(c1, "Sp");
1098
+ }
1099
+ } else if ((c1->string[1]=='p') && (c1->string[2]=='a')){
1100
+ connector_set_string(c1, "Sp");
1101
+ }
1102
+ }
1103
+ }
1104
+ for (c1=d1->left; c1!=NULL; c1=c1->next) {
1105
+ if ((c1->string[0] == 'S') && (c1->string[1] == 'I')) {
1106
+ if (c1->string[2]=='^') {
1107
+ if (c1->string[3]=='a') {
1108
+ connector_set_string(c1, "Ss");
1109
+ } else {
1110
+ connector_set_string(c1, "Sp");
1111
+ }
1112
+ } else if ((c1->string[2]=='p') && (c1->string[3]=='a')){
1113
+ connector_set_string(c1, "Sp");
1114
+ }
1115
+ }
1116
+ }
1117
+ }
1118
+ }
1119
+ */
1120
+ /*
1121
+ It appears that the "nor" of two things can be either singular or
1122
+ plural. "neither she nor John likes dogs"
1123
+ "neither she nor John like dogs"
1124
+
1125
+ */
1126
+ else if ((strcmp(s,"nor")==0) || (strcmp(s,"or")==0)) {
1127
+ for (d1 = d_list; d1!=NULL; d1=d1->next) {
1128
+ for (c1=d1->right; c1!=NULL; c1=c1->next) {
1129
+ if ((c1->string[0] == 'S') &&
1130
+ ((c1->string[1]=='^') ||
1131
+ (c1->string[1]=='s') ||
1132
+ (c1->string[1]=='p'))) {
1133
+ connector_set_string(c1, "S");
1134
+ }
1135
+ }
1136
+ for (c1=d1->left; c1!=NULL; c1=c1->next) {
1137
+ if ((c1->string[0] == 'S') && (c1->string[1] == 'I') &&
1138
+ ((c1->string[2]=='^') ||
1139
+ (c1->string[2]=='s') ||
1140
+ (c1->string[2]=='p'))) {
1141
+ connector_set_string(c1, "SI");
1142
+ }
1143
+ }
1144
+ }
1145
+ }
1146
+
1147
+ #endif
1148
+ return d_list;
1149
+ }
1150
+
1151
+
1152
+ /* The following routines' purpose is to eliminate all but the
1153
+ canonical linkage (of a collection of linkages that are identical
1154
+ except for fat links). An example of the problem is
1155
+ "I went to a talk and ate lunch". Without the canonical checker
1156
+ this has two linkages with identical structure.
1157
+
1158
+ We restrict our attention to a collection of linkages that are all
1159
+ isomorphic. Consider the set of all disjuncts that are used on one
1160
+ word (over the collection of linkages). This set is closed under GCD,
1161
+ since if two disjuncts could both be used in that position, then so could
1162
+ their GCD. The GCD has been constructed and put in the label table.
1163
+
1164
+ The canonical linkage is the one in which the minimal disjunct that
1165
+ ever occurs in a position is used in that position. It is easy to
1166
+ prove that a disjunct is not canonical -- just find one of its fat
1167
+ disjuncts that can be replaced by a smaller one. If this can not be
1168
+ done, then the linkage is canonical.
1169
+
1170
+ The algorithm uses link_array[] and chosen_disjuncts[] as input to
1171
+ describe the linkage, and also uses the label_table.
1172
+
1173
+ (1) find all the words with fat disjuncts
1174
+ (2) scan all links and build, for each fat disjunct used,
1175
+ an "image" structure that contains what this disjunct must
1176
+ connect to in the rest of the linkage.
1177
+ (3) For each fat disjunct, run through the label_table for disjuncts
1178
+ with the same label, considering only those with strictly more
1179
+ restricted match sets (this uses the string fields of the disjuncts
1180
+ from the table).
1181
+ (4) For each that passes this test, we see if it can replace the chosen
1182
+ disjunct. This is performed by examining how this disjunct
1183
+ compares with the image structure for this word.
1184
+ */
1185
+
1186
+ struct Image_node_struct {
1187
+ Image_node * next;
1188
+ Connector * c; /* the connector the place on the disjunct must match */
1189
+ int place; /* Indicates the place in the fat disjunct where this
1190
+ connector must connect. If 0 then this is a fat
1191
+ connector. If >0 then go place to the right, if
1192
+ <0 then go -place to the left. */
1193
+ };
1194
+
1195
+ /**
1196
+ * Fill in the has_fat_down array. Uses link_array[].
1197
+ * Returns TRUE if there exists at least one word with a
1198
+ * fat down label.
1199
+ */
1200
+ int set_has_fat_down(Sentence sent)
1201
+ {
1202
+ int link, w, N_fat;
1203
+ Parse_info pi = sent->parse_info;
1204
+
1205
+ N_fat = 0;
1206
+
1207
+ for (w = 0; w < pi->N_words; w++)
1208
+ {
1209
+ pi->has_fat_down[w] = FALSE;
1210
+ }
1211
+
1212
+ for (link = 0; link < pi->N_links; link++)
1213
+ {
1214
+ if (pi->link_array[link].lc->priority == DOWN_priority)
1215
+ {
1216
+ N_fat ++;
1217
+ pi->has_fat_down[pi->link_array[link].l] = TRUE;
1218
+ }
1219
+ else if (pi->link_array[link].rc->priority == DOWN_priority)
1220
+ {
1221
+ N_fat ++;
1222
+ pi->has_fat_down[pi->link_array[link].r] = TRUE;
1223
+ }
1224
+ }
1225
+ return (N_fat > 0);
1226
+ }
1227
+
1228
+ static void free_image_array(Parse_info pi)
1229
+ {
1230
+ int w;
1231
+ Image_node * in, * inx;
1232
+ for (w = 0; w < pi->N_words; w++)
1233
+ {
1234
+ for (in = pi->image_array[w]; in != NULL; in = inx)
1235
+ {
1236
+ inx = in->next;
1237
+ xfree((char *)in, sizeof(Image_node));
1238
+ }
1239
+ pi->image_array[w] = NULL;
1240
+ }
1241
+ }
1242
+
1243
+ /**
1244
+ * Uses link_array, chosen_disjuncts, and down_label to construct
1245
+ * image_array
1246
+ */
1247
+ static void build_image_array(Sentence sent)
1248
+ {
1249
+ int link, end, word;
1250
+ Connector * this_end_con, *other_end_con, * upcon, * updiscon, *clist;
1251
+ Disjunct * dis, * updis;
1252
+ Image_node * in;
1253
+ Parse_info pi = sent->parse_info;
1254
+
1255
+ for (word=0; word<pi->N_words; word++)
1256
+ {
1257
+ pi->image_array[word] = NULL;
1258
+ }
1259
+
1260
+ for (end = -1; end <= 1; end += 2)
1261
+ {
1262
+ for (link = 0; link < pi->N_links; link++)
1263
+ {
1264
+ if (end < 0)
1265
+ {
1266
+ word = pi->link_array[link].l;
1267
+ if (!pi->has_fat_down[word]) continue;
1268
+ this_end_con = pi->link_array[link].lc;
1269
+ other_end_con = pi->link_array[link].rc;
1270
+ dis = pi->chosen_disjuncts[word];
1271
+ clist = dis->right;
1272
+ }
1273
+ else
1274
+ {
1275
+ word = pi->link_array[link].r;
1276
+ if (!pi->has_fat_down[word]) continue;
1277
+ this_end_con = pi->link_array[link].rc;
1278
+ other_end_con = pi->link_array[link].lc;
1279
+ dis = pi->chosen_disjuncts[word];
1280
+ clist = dis->left;
1281
+ }
1282
+
1283
+ if (this_end_con->priority == DOWN_priority) continue;
1284
+ if ((this_end_con->label != NORMAL_LABEL) &&
1285
+ (this_end_con->label < 0)) continue;
1286
+ /* no need to construct an image node for down links,
1287
+ or commas links or either/neither links */
1288
+
1289
+ in = (Image_node *) xalloc(sizeof(Image_node));
1290
+ in->next = pi->image_array[word];
1291
+ pi->image_array[word] = in;
1292
+ in->c = other_end_con;
1293
+
1294
+ /* the rest of this code is for computing in->place */
1295
+ if (this_end_con->priority == UP_priority)
1296
+ {
1297
+ in->place = 0;
1298
+ }
1299
+ else
1300
+ {
1301
+ in->place = 1;
1302
+ if ((dis->left != NULL) &&
1303
+ (dis->left->priority == UP_priority))
1304
+ {
1305
+ upcon = dis->left;
1306
+ }
1307
+ else if ((dis->right != NULL) &&
1308
+ (dis->right->priority == UP_priority))
1309
+ {
1310
+ upcon = dis->right;
1311
+ }
1312
+ else
1313
+ {
1314
+ upcon = NULL;
1315
+ }
1316
+ if (upcon != NULL)
1317
+ {
1318
+ /* add on extra for a fat up link */
1319
+ updis = sent->and_data.label_table[upcon->label];
1320
+ if (end > 0)
1321
+ {
1322
+ updiscon = updis->left;
1323
+ }
1324
+ else
1325
+ {
1326
+ updiscon = updis->right;
1327
+ }
1328
+ for (;updiscon != NULL; updiscon = updiscon->next)
1329
+ {
1330
+ in->place ++;
1331
+ }
1332
+ }
1333
+ for (; clist != this_end_con; clist = clist->next)
1334
+ {
1335
+ if (clist->label < 0) in->place++;
1336
+ }
1337
+ in->place = in->place * (-end);
1338
+ }
1339
+ }
1340
+ }
1341
+ }
1342
+
1343
+ /**
1344
+ * returns TRUE if string s represents a strictly smaller match set
1345
+ * than does t
1346
+ */
1347
+ static int strictly_smaller(const char * s, const char * t)
1348
+ {
1349
+ int strictness;
1350
+ strictness = 0;
1351
+ for (;(*s!='\0') && (*t!='\0'); s++,t++) {
1352
+ if (*s == *t) continue;
1353
+ if ((*t == '*') || (*s == '^')) {
1354
+ strictness++;
1355
+ } else {
1356
+ return FALSE;
1357
+ }
1358
+ }
1359
+ assert(! ((*s!='\0') || (*t!='\0')), "s and t should be the same length!");
1360
+ return (strictness > 0);
1361
+ }
1362
+
1363
+ /**
1364
+ * dis points to a disjunct in the label_table. label is the label
1365
+ * of a different set of disjuncts. These can be derived from the label
1366
+ * of dis. Find the specific disjunct of in label_table[label]
1367
+ * which corresponds to dis.
1368
+ */
1369
+ static Disjunct * find_subdisjunct(Sentence sent, Disjunct * dis, int label)
1370
+ {
1371
+ Disjunct * d;
1372
+ Connector * cx, *cy;
1373
+ for (d=sent->and_data.label_table[label]; d!=NULL; d=d->next)
1374
+ {
1375
+ for (cx=d->left, cy=dis->left; cx!=NULL; cx=cx->next,cy=cy->next)
1376
+ {
1377
+ /* if ((cx->string != cy->string) || */
1378
+ if ((strcmp(connector_get_string(cx),
1379
+ connector_get_string(cy)) != 0) ||
1380
+ (cx->multi != cy->multi)) break;/* have to check multi? */
1381
+ }
1382
+ if (cx!=NULL) continue;
1383
+ for (cx=d->right, cy=dis->right; cx!=NULL; cx=cx->next,cy=cy->next)
1384
+ {
1385
+ /* if ((cx->string != cy->string) || */
1386
+ if ((strcmp(connector_get_string(cx),
1387
+ connector_get_string(cy)) != 0) ||
1388
+ (cx->multi != cy->multi)) break;
1389
+ }
1390
+ if (cx==NULL) break;
1391
+ }
1392
+ assert(d!=NULL, "Never found subdisjunct");
1393
+ return d;
1394
+ }
1395
+
1396
+ /**
1397
+ * is_canonical_linkage --
1398
+ * This uses link_array[], chosen_disjuncts[], has_fat_down[].
1399
+ * It assumes that there is a fat link in the current linkage.
1400
+ * See the comments above for more information about how it works
1401
+ */
1402
+ int is_canonical_linkage(Sentence sent)
1403
+ {
1404
+ int w, d_label=0, place;
1405
+ Connector *d_c, *c, dummy_connector, *upcon;
1406
+ Disjunct *dis, *chosen_d;
1407
+ Image_node * in;
1408
+ Parse_info pi = sent->parse_info;
1409
+
1410
+ init_connector(&dummy_connector);
1411
+ dummy_connector.priority = UP_priority;
1412
+
1413
+ build_image_array(sent);
1414
+
1415
+ for (w=0; w<pi->N_words; w++)
1416
+ {
1417
+ if (!pi->has_fat_down[w]) continue;
1418
+ chosen_d = pi->chosen_disjuncts[w];
1419
+
1420
+ /* there must be a down connector in both the left and right list */
1421
+ for (d_c = chosen_d->left; d_c!=NULL; d_c=d_c->next)
1422
+ {
1423
+ if (d_c->priority == DOWN_priority)
1424
+ {
1425
+ d_label = d_c->label;
1426
+ break;
1427
+ }
1428
+ }
1429
+ assert(d_c != NULL, "Should have found the down link.");
1430
+
1431
+ if ((chosen_d->left != NULL) &&
1432
+ (chosen_d->left->priority == UP_priority)) {
1433
+ upcon = chosen_d->left;
1434
+ } else if ((chosen_d->right != NULL) &&
1435
+ (chosen_d->right->priority == UP_priority)) {
1436
+ upcon = chosen_d->right;
1437
+ } else {
1438
+ upcon = NULL;
1439
+ }
1440
+
1441
+ /* check that the disjunct on w is minimal (canonical) */
1442
+
1443
+ for (dis=sent->and_data.label_table[d_label]; dis!=NULL; dis=dis->next)
1444
+ {
1445
+ /* now, reject a disjunct if it's not strictly below the old */
1446
+ if(!strictly_smaller(dis->string,
1447
+ connector_get_string(d_c))) continue;
1448
+
1449
+ /* Now, it has to match the image connectors */
1450
+ for (in = pi->image_array[w]; in != NULL; in = in->next)
1451
+ {
1452
+ place = in->place;
1453
+ if (place == 0)
1454
+ {
1455
+ assert(upcon != NULL, "Should have found an up link");
1456
+ dummy_connector.label = upcon->label;
1457
+
1458
+ /* now we have to compute the string of the
1459
+ disjunct with upcon->label that corresponds
1460
+ to dis */
1461
+ if (upcon->label == d_label)
1462
+ {
1463
+ connector_set_string(&dummy_connector, dis->string);
1464
+ } else {
1465
+ connector_set_string(&dummy_connector,
1466
+ find_subdisjunct(sent, dis, upcon->label)->string);
1467
+ }
1468
+
1469
+ /* I hope using x_match here is right */
1470
+ if (!x_match(sent, &dummy_connector, in->c)) break;
1471
+ } else if (place > 0) {
1472
+ for (c=dis->right; place > 1; place--) {
1473
+ c = c->next;
1474
+ }
1475
+ if (!x_match(sent, c, in->c)) break; /* Ditto above comment --DS 07/97*/
1476
+ } else {
1477
+ for (c=dis->left; place < -1; place++) {
1478
+ c = c->next;
1479
+ }
1480
+ if (!x_match(sent, c, in->c)) break; /* Ditto Ditto */
1481
+ }
1482
+ }
1483
+
1484
+ if (in == NULL) break;
1485
+ }
1486
+ if (dis != NULL) break;
1487
+ /* there is a better disjunct that the one we're using, so this
1488
+ word is bad, so we're done */
1489
+ }
1490
+ free_image_array(pi);
1491
+ return (w == pi->N_words);
1492
+ }
1493
+
1494
+ /**
1495
+ * This takes as input link_array[], sublinkage->link[]->l and
1496
+ * sublinkage->link[]->r (and also has_fat_down[word], which has been
1497
+ * computed in a prior call to is_canonical()), and from these
1498
+ * computes sublinkage->link[].lc and .rc. We assume these have
1499
+ * been initialized with the values from link_array. We also assume
1500
+ * that there are fat links.
1501
+ */
1502
+ void compute_pp_link_array_connectors(Sentence sent, Sublinkage *sublinkage)
1503
+ {
1504
+ int link, end, word, place;
1505
+ Connector * this_end_con, * upcon, * updiscon, *clist, *con, *mycon;
1506
+ Disjunct * dis, * updis, *mydis;
1507
+ Parse_info pi = sent->parse_info;
1508
+
1509
+ for (end = -1; end <= 1; end += 2)
1510
+ {
1511
+ for (link=0; link<pi->N_links; link++)
1512
+ {
1513
+ if (sublinkage->link[link]->l == -1) continue;
1514
+ if (end < 0)
1515
+ {
1516
+ word = pi->link_array[link].l;
1517
+ if (!pi->has_fat_down[word]) continue;
1518
+ this_end_con = pi->link_array[link].lc;
1519
+ dis = pi->chosen_disjuncts[word];
1520
+ mydis = pi->chosen_disjuncts[sublinkage->link[link]->l];
1521
+ clist = dis->right;
1522
+ }
1523
+ else
1524
+ {
1525
+ word = pi->link_array[link].r;
1526
+ if (!pi->has_fat_down[word]) continue;
1527
+ this_end_con = pi->link_array[link].rc;
1528
+ dis = pi->chosen_disjuncts[word];
1529
+ mydis = pi->chosen_disjuncts[sublinkage->link[link]->r];
1530
+ clist = dis->left;
1531
+ }
1532
+
1533
+ if (this_end_con->label != NORMAL_LABEL) continue;
1534
+ /* no need to construct a connector for up links,
1535
+ or commas links or either/neither links */
1536
+
1537
+ /* Now compute the place */
1538
+ place = 0;
1539
+ if ((dis->left != NULL) &&
1540
+ (dis->left->priority == UP_priority)) {
1541
+ upcon = dis->left;
1542
+ } else if ((dis->right != NULL) &&
1543
+ (dis->right->priority == UP_priority)) {
1544
+ upcon = dis->right;
1545
+ } else {
1546
+ upcon = NULL;
1547
+ }
1548
+ if (upcon != NULL) { /* add on extra for a fat up link */
1549
+ updis = sent->and_data.label_table[upcon->label];
1550
+ if (end > 0) {
1551
+ updiscon = updis->left;
1552
+ } else {
1553
+ updiscon = updis->right;
1554
+ }
1555
+ for (;updiscon != NULL; updiscon = updiscon->next) {
1556
+ place ++;
1557
+ }
1558
+ }
1559
+ for (; clist != this_end_con; clist = clist->next) {
1560
+ if (clist->label < 0) place++;
1561
+ }
1562
+ /* place has just been computed */
1563
+
1564
+ /* now find the right disjunct in the table */
1565
+ if ((mydis->left != NULL) &&
1566
+ (mydis->left->priority == UP_priority)) {
1567
+ mycon = mydis->left;
1568
+ } else if ((mydis->right != NULL) &&
1569
+ (mydis->right->priority == UP_priority)) {
1570
+ mycon = mydis->right;
1571
+ } else {
1572
+ printf("word = %d\n", word);
1573
+ printf("fat link: [%d, %d]\n",
1574
+ pi->link_array[link].l, pi->link_array[link].r);
1575
+ printf("thin link: [%d, %d]\n",
1576
+ sublinkage->link[link]->l, sublinkage->link[link]->r);
1577
+ assert(FALSE, "There should be a fat UP link here");
1578
+ }
1579
+
1580
+ for (dis=sent->and_data.label_table[mycon->label];
1581
+ dis != NULL; dis=dis->next) {
1582
+ if (dis->string == connector_get_string(mycon)) break;
1583
+ }
1584
+ assert(dis!=NULL, "Should have found this connector string");
1585
+ /* the disjunct in the table has just been found */
1586
+
1587
+ if (end < 0)
1588
+ {
1589
+ for (con = dis->right; place > 0; place--, con=con->next) {}
1590
+ /* sublinkage->link[link]->lc = con; OLD CODE */
1591
+ exfree_connectors(sublinkage->link[link]->lc);
1592
+ sublinkage->link[link]->lc = excopy_connectors(con);
1593
+ }
1594
+ else
1595
+ {
1596
+ for (con = dis->left; place > 0; place--, con=con->next) {}
1597
+ /* sublinkage->link[link]->rc = con; OLD CODE */
1598
+ exfree_connectors(sublinkage->link[link]->rc);
1599
+ sublinkage->link[link]->rc = excopy_connectors(con);
1600
+ }
1601
+ }
1602
+ }
1603
+ }