grammar_cop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,24 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ void init_analyze(Sentence);
15
+ void free_analyze(Sentence);
16
+
17
+ void extract_thin_linkage(Sentence, Parse_Options, Linkage);
18
+ void extract_fat_linkage (Sentence, Parse_Options, Linkage);
19
+ Linkage_info analyze_fat_linkage (Sentence, Parse_Options, int pass);
20
+ Linkage_info analyze_thin_linkage(Sentence, Parse_Options, int pass);
21
+ void free_DIS_tree(DIS_node *);
22
+
23
+ void zero_sublinkage(Sublinkage *s);
24
+
@@ -0,0 +1,1603 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ #include "api.h"
15
+ #include "disjunct-utils.h"
16
+
17
+ /*
18
+ Notes about AND
19
+
20
+ A large fraction of the code of this parser seems to deal with handling
21
+ conjunctions. This comment (combined with reading the paper) should
22
+ give an idea of how it works.
23
+
24
+ First of all, we need a more detailed discussion of strings, what they
25
+ match, etc. (This entire discussion ignores the labels, which are
26
+ semantically the same as the leading upper case letters of the
27
+ connector.)
28
+
29
+ We'll deal with infinite strings from an alphabet of three types of
30
+ characters: "*". "^" and ordinary characters (denoted "a" and "b").
31
+ (The end of a string should be thought of as an infinite sequence of
32
+ "*"s).
33
+
34
+ Let match(s) be the set of strings that will match the string s. This
35
+ is defined as follows. A string t is in match(s) if (1) its leading
36
+ upper case letters exactly match those of s. (2) traversing through
37
+ both strings, from left to right in step, no mismatch is found
38
+ between corresponding letters. A mismatch is a pair of differing
39
+ ordinary characters, or a "^" and any ordinary letter or two "^"s.
40
+ In other words, a match is exactly a "*" and anything, or two
41
+ identical ordinary letters.
42
+
43
+ Alternative definition of the set match(s):
44
+ {t | t is obtained from s by replacing each "^" and any other characters
45
+ by "*"s, and replacing any original "*" in s by any other character
46
+ (or "^").}
47
+
48
+ Theorem: if t in match(s) then s in match(t).
49
+
50
+ It is also a theorem that given any two strings s and t, there exists a
51
+ unique new string u with the property that:
52
+
53
+ match(u) = match(s) intersect match(t)
54
+
55
+ This string is called the GCD of s and t. Here are some examples.
56
+
57
+ GCD(N*a,Nb) = Nba
58
+ GCD(Na, Nb) = N^
59
+ GCD(Nab,Nb) = N^b
60
+ GCD(N^,N*a) = N^a
61
+ GCD(N^, N) = N^
62
+ GCD(N^^,N^) = N^^
63
+
64
+ We need an algorithm for computing the GCD of two strings. Here is
65
+ one.
66
+
67
+ First get by the upper case letters (which must be equal, otherwise
68
+ there is no intersection), issuing them. Traverse the rest of the
69
+ characters of s and t in lockstep until there is nothing left but
70
+ "*"s. If the two characters are:
71
+
72
+ "a" and "a", issue "a"
73
+ "a" and "b", issue "^"
74
+ "a" and "*", issue "a"
75
+ "*" and "*", issue "*"
76
+ "*" and "^", issue "^"
77
+ "a" and "^", issue "^"
78
+ "^" and "^", issue "^"
79
+
80
+ A simple case analysis suffices to show that any string that matches
81
+ the right side, must match both of the left sides, and any string not
82
+ matching the right side must not match at least one of the left sides.
83
+
84
+ This proves that the GCD operator is associative and commutative.
85
+ (There must be a name for a mathematical structure with these properties.)
86
+
87
+ To elaborate further on this theory, define the notion of two strings
88
+ matching in the dual sense as follows: s and t dual-match if
89
+ match(s) is contained in match(t) or vice versa---
90
+
91
+ Full development of this theory could lead to a more efficient
92
+ algorithm for this problem. I'll defer this until such time as it
93
+ appears necessary.
94
+
95
+
96
+ We need a data structure that stores a set of fat links. Each fat
97
+ link has a number (called its label). The fat link operates in lieu of
98
+ a collection of links. The particular stuff it is a substitute for is
99
+ defined by a disjunct. This disjunct is stored in the data structure.
100
+
101
+ The type of a disjunct is defined by the sequence of connector types
102
+ (defined by their upper case letters) that comprises it. Each entry
103
+ of the label_table[] points to a list of disjuncts that have the same
104
+ type (a hash table is used so that, given a disjunct, we can efficiently
105
+ compute the element of the label table in which it belongs).
106
+
107
+ We begin by loading up the label table with all of the possible
108
+ fat links that occur through the words of the sentence. These are
109
+ obtained by taking every sub-range of the connectors of each disjunct
110
+ (containing the center). We also compute the closure (under the GCD
111
+ operator) of these disjuncts and also store these in the
112
+ label_table. Each disjunct in this table has a string which represents
113
+ the subscripts of all of its connectors (and their multi-connector bits).
114
+
115
+ It is possible to generate a fat connector for any one of the
116
+ disjuncts in the label_table. This connector's label field is given
117
+ the label from the disjunct from which it arose. Its string field
118
+ is taken from the string of the disjunct (mentioned above). It will be
119
+ given a priority with a value of UP_priority or DOWN_priority (depending
120
+ on how it will be used). A connector of UP_priority can match one of
121
+ DOWN_priority, but neither of these can match any other priority.
122
+ (Of course, a fat connector can match only another fat connector with
123
+ the same label.)
124
+
125
+ The paper describes in some detail how disjuncts are given to words
126
+ and to "and" and ",", etc. Each word in the sentence gets many more
127
+ new disjuncts. For each contiguous set of connectors containing (or
128
+ adjacent to) the center of the disjunct, we generate a fat link, and
129
+ replace these connectors in the word by a fat link. (Actually we do
130
+ this twice. Once pointing to the right, once to the left.) These fat
131
+ links have priority UP_priority.
132
+
133
+ What do we generate for ","? For each type of fat link (each label)
134
+ we make a disjunct that has two down connectors (to the right and left)
135
+ and one up connector (to the right). There will be a unique way of
136
+ hooking together a comma-separated and-list.
137
+
138
+ The disjuncts on "and" are more complicated. Here we have to do just what
139
+ we did for comma (but also include the up link to the left), then
140
+ we also have to allow the process to terminate. So, there is a disjunct
141
+ with two down fat links, and between them are the original thin links.
142
+ These are said to "blossom" out. However, this is not all that is
143
+ necessary. It's possible for an and-list to be part of another and list
144
+ with a different labeled fat connector. To make this possible, we
145
+ regroup the just blossomed disjuncts (in all possible ways about the center)
146
+ and install them as fat links. If this sounds like a lot of disjuncts --
147
+ it is! The program is currently fairly slow on long sentences with and.
148
+
149
+ It is slightly non-obvious that the fat-links in a linkage constructed
150
+ from disjuncts defined in this way form a binary tree. Naturally,
151
+ connectors with UP_priority point up the tree, and those with DOWN_priority
152
+ point down the tree.
153
+
154
+ Think of the string x on the connector as representing a set X of strings.
155
+ X = match(x). So, for example, if x="S^" then match(x) = {"S", "S*a",
156
+ "S*b", etc}. The matching rules for UP and DOWN priority connectors
157
+ are such that as you go up (the tree of ands) the X sets get no larger.
158
+ So, for example, a "Sb" pointing up can match an "S^" pointing down.
159
+ (Because more stuff can match "Sb" than can match "S^".)
160
+ This guarantees that whatever connector ultimately gets used after the
161
+ fat connector blossoms out (see below), it is a powerful enough connector
162
+ to be able to match to any of the connectors associated with it.
163
+
164
+ One problem with the scheme just described is that it sometimes generates
165
+ essentially the same linkage several times. This happens if there is
166
+ a gap in the connective power, and the mismatch can be moved around in
167
+ different ways. Here is an example of how this happens.
168
+
169
+ (Left is DOWN, right is UP)
170
+
171
+ Sa <---> S^ <---> S or Sa <---> Sa <---> S
172
+ fat thin fat thin
173
+
174
+ Here two of the disjunct types are given by "S^" and "Sa". Notice that
175
+ the criterion of shrinking the matching set is satisfied by the fat
176
+ link (traversing from left to right). How do I eliminate one of these?
177
+
178
+ I use the technique of canonization. I generate all the linkages. There
179
+ is then a procedure that can check to see if a linkage is canonical.
180
+ If it is, it's used, otherwise it's ignored. It's claimed that exactly
181
+ one canonical one of each equivalence class will be generated.
182
+ We basically insist that the intermediate fat disjuncts (ones that
183
+ have a fat link pointing down) are all minimal -- that is, that they
184
+ cannot be replaced by another (with a strictly smaller) match set.
185
+ If one is not minimal, then the linkage is rejected.
186
+
187
+ Here's a proof that this is correct. Consider the set of equivalent
188
+ linkages that are generated. Pick a disjunct that is the root of
189
+ its tree. Consider the set of all disjuncts which occur in that position
190
+ among the equivalent linkages. The GCD of all of these can fit in that
191
+ position (it matches down the tree, since its match set has gotten
192
+ smaller, and it also matches to the THIN links.) Since the GCD is put
193
+ on "and" this particular one will be generated. Therefore rejecting
194
+ a linkage in which a root fat disjunct can be replaced by a smaller one
195
+ is ok (since the smaller one will be generated separately). What about
196
+ a fat disjunct that is not the root. We consider the set of linkages in
197
+ which the root is minimal (the ones for which it's not have already been
198
+ eliminated). Now, consider one of the children of the root in precisely
199
+ the way we just considered the root. The same argument holds. The only
200
+ difference is that the root node gives another constraint on how small
201
+ you can make the disjunct -- so, within these constraints, if we can go
202
+ smaller, we reject.
203
+
204
+ The code to do all of this is fairly ugly, but I think it works.
205
+
206
+
207
+ Problems with this stuff:
208
+
209
+ 1) There is obviously a combinatorial explosion that takes place.
210
+ As the number of disjuncts (and the number of their subscripts
211
+ increase) the number of disjuncts that get put onto "and" will
212
+ increase tremendously. When we made the transcript for the tech
213
+ report (Around August 1991) most of the sentences were processed
214
+ in well under 10 seconds. Now (Jan 1992), some of these sentences
215
+ take ten times longer. As of this writing I don't really know the
216
+ reason, other than just the fact that the dictionary entries are
217
+ more complex than they used to be. The number of linkages has also
218
+ increased significantly.
219
+
220
+ 2) Each element of an and list must be attached through only one word.
221
+ This disallows "there is time enough and space enough for both of us",
222
+ and many other reasonable sounding things. The combinatorial
223
+ explosion that would occur if you allowed two different connection
224
+ points would be tremendous, and the number of solutions would also
225
+ probably go up by another order of magnitude. Perhaps if there
226
+ were strong constraints on the type of connectors in which this
227
+ would be allowed, then this would be a conceivable prospect.
228
+
229
+ 3) A multi-connector must be either all "outside" or all "inside" the and.
230
+ For example, "the big black dog and cat ran" has only two
231
+ linkages (instead of three).
232
+
233
+ Possible bug: It seems that the following two linkages should be the
234
+ same under the canonical linkage test. Could this have to do with the
235
+ pluralization system?
236
+
237
+ > I am big and the bike and the car were broken
238
+ Accepted (4 linkages, 4 with no P.P. violations) at stage 1
239
+ Linkage 1, cost vector = (0, 0, 18)
240
+
241
+ +------Spx-----+
242
+ +-----CC-----+------Wd------+-d^^*i^-+ |
243
+ +-Wd-+Spi+-Pa+ | +--Ds-+d^^*+ +-Ds-+ +--Pv-+
244
+ | | | | | | | | | | | |
245
+ ///// I.p am big.a and the bike.n and the car.n were broken
246
+
247
+ ///// RW <---RW----> RW /////
248
+ ///// Wd <---Wd----> Wd I.p
249
+ I.p CC <---CC----> CC and
250
+ I.p Sp*i <---Spii--> Spi am
251
+ am Pa <---Pa----> Pa big.a
252
+ and Wd <---Wd----> Wd and
253
+ bike.n d^s** 6<---d^^*i-> d^^*i 6 and
254
+ the D <---Ds----> Ds bike.n
255
+ and Sp <---Spx---> Spx were
256
+ and d^^*i 6<---d^^*i-> d^s** 6 car.n
257
+ the D <---Ds----> Ds car.n
258
+ were Pv <---Pv----> Pv broken
259
+
260
+ (press return for another)
261
+ >
262
+ Linkage 2, cost vector = (0, 0, 18)
263
+
264
+ +------Spx-----+
265
+ +-----CC-----+------Wd------+-d^s**^-+ |
266
+ +-Wd-+Spi+-Pa+ | +--Ds-+d^s*+ +-Ds-+ +--Pv-+
267
+ | | | | | | | | | | | |
268
+ ///// I.p am big.a and the bike.n and the car.n were broken
269
+
270
+ ///// RW <---RW----> RW /////
271
+ ///// Wd <---Wd----> Wd I.p
272
+ I.p CC <---CC----> CC and
273
+ I.p Sp*i <---Spii--> Spi am
274
+ am Pa <---Pa----> Pa big.a
275
+ and Wd <---Wd----> Wd and
276
+ bike.n d^s** 6<---d^s**-> d^s** 6 and
277
+ the D <---Ds----> Ds bike.n
278
+ and Sp <---Spx---> Spx were
279
+ and d^s** 6<---d^s**-> d^s** 6 car.n
280
+ the D <---Ds----> Ds car.n
281
+ were Pv <---Pv----> Pv broken
282
+
283
+ */
284
+
285
+ static void init_LT(Sentence sent)
286
+ {
287
+ sent->and_data.LT_bound = 20;
288
+ sent->and_data.LT_size = 0;
289
+ sent->and_data.label_table =
290
+ (Disjunct **) xalloc(sent->and_data.LT_bound * sizeof(Disjunct *));
291
+ }
292
+
293
+ static void grow_LT(Sentence sent)
294
+ {
295
+ size_t oldsize = sent->and_data.LT_bound * sizeof(Disjunct *);
296
+ sent->and_data.LT_bound = (3*sent->and_data.LT_bound)/2;
297
+ sent->and_data.label_table =
298
+ (Disjunct **) xrealloc(sent->and_data.label_table,
299
+ oldsize,
300
+ sent->and_data.LT_bound * sizeof(Disjunct *));
301
+ }
302
+
303
+ static void init_HT(Sentence sent)
304
+ {
305
+ memset(sent->and_data.hash_table, 0, HT_SIZE*sizeof(Label_node *));
306
+ }
307
+
308
+ static void free_HT(Sentence sent)
309
+ {
310
+ int i;
311
+ Label_node * la, * la1;
312
+ for (i=0; i<HT_SIZE; i++) {
313
+ for (la=sent->and_data.hash_table[i]; la != NULL; la = la1) {
314
+ la1 = la->next;
315
+ xfree((char *)la, sizeof(Label_node));
316
+ }
317
+ sent->and_data.hash_table[i] = NULL;
318
+ }
319
+ }
320
+
321
+ static void free_LT(Sentence sent)
322
+ {
323
+ int i;
324
+ for (i=0; i<sent->and_data.LT_size; i++) {
325
+ free_disjuncts(sent->and_data.label_table[i]);
326
+ }
327
+ xfree((char *) sent->and_data.label_table,
328
+ sent->and_data.LT_bound * sizeof(Disjunct*));
329
+ sent->and_data.LT_bound = 0;
330
+ sent->and_data.LT_size = 0;
331
+ sent->and_data.label_table = NULL;
332
+ }
333
+
334
/**
 * Release both conjunction tables: the label table (and the disjunct
 * lists it owns) and the hash table of Label_nodes.
 */
void free_AND_tables(Sentence sent)
{
	free_LT(sent);
	free_HT(sent);
}
339
+
340
+ void initialize_conjunction_tables(Sentence sent)
341
+ {
342
+ int i;
343
+ sent->and_data.LT_bound = 0;
344
+ sent->and_data.LT_size = 0;
345
+ sent->and_data.label_table = NULL;
346
+ for (i=0; i<HT_SIZE; i++) {
347
+ sent->and_data.hash_table[i] = NULL;
348
+ }
349
+ }
350
+
351
+ /**
352
+ * This is a hash function for disjuncts
353
+ */
354
+ static inline int and_hash_disjunct(Disjunct *d)
355
+ {
356
+ unsigned int i;
357
+ Connector *e;
358
+ i = 0;
359
+ for (e = d->left ; e != NULL; e = e->next) {
360
+ i += connector_hash(e);
361
+ }
362
+ i += (i<<5);
363
+ for (e = d->right ; e != NULL; e = e->next) {
364
+ i += connector_hash(e);
365
+ }
366
+ return (i & (HT_SIZE-1));
367
+ }
368
+
369
/**
 * Returns TRUE if the disjunct is appropriate to be made into fat links:
 * every one of its connectors (both directions) must belong to the
 * dictionary's "andable" connector set.
 * This will disallow, for example "the and their dog ran".
 */
static int is_appropriate(Sentence sent, Disjunct * d)
{
	Connector * c;

	if (sent->dict->andable_connector_set == NULL) return TRUE;
	/* if no set, then everything is considered andable */
	/* right-pointing connectors are matched with direction '+' ... */
	for (c = d->right; c!=NULL; c=c->next) {
		if (!match_in_connector_set(sent, sent->dict->andable_connector_set, c, '+')) return FALSE;
	}
	/* ... and left-pointing ones with '-' */
	for (c = d->left; c!=NULL; c=c->next) {
		if (!match_in_connector_set(sent, sent->dict->andable_connector_set, c, '-')) return FALSE;
	}
	return TRUE;
}
388
+
389
+ /**
390
+ * Two connectors are said to be of the same type if they have
391
+ * the same label, and the initial upper case letters of their
392
+ * strings match.
393
+ */
394
+ static int connector_types_equal(Connector * c1, Connector * c2)
395
+ {
396
+ if (c1->label != c2->label) return FALSE;
397
+ return utf8_upper_match(c1->string, c2->string);
398
+ }
399
+
400
+ /**
401
+ * Two disjuncts are said to be the same type if they're the same
402
+ * ignoring the multi fields, the priority fields, and the subscripts
403
+ * of the connectors (and the string field of the disjunct of course).
404
+ * Disjuncts of the same type are located in the same label_table list.
405
+ *
406
+ * This returns TRUE if they are of the same type.
407
+ */
408
+ static int disjunct_types_equal(Disjunct * d1, Disjunct * d2)
409
+ {
410
+ Connector *e1, *e2;
411
+
412
+ e1 = d1->left;
413
+ e2 = d2->left;
414
+ while((e1!=NULL) && (e2!=NULL)) {
415
+ if (!connector_types_equal(e1,e2)) break;
416
+ e1 = e1->next;
417
+ e2 = e2->next;
418
+ }
419
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
420
+ e1 = d1->right;
421
+ e2 = d2->right;
422
+ while((e1!=NULL) && (e2!=NULL)) {
423
+ if (!connector_types_equal(e1,e2)) break;
424
+ e1 = e1->next;
425
+ e2 = e2->next;
426
+ }
427
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
428
+ return TRUE;
429
+ }
430
+
431
+ /**
432
+ * This returns a string that is the the GCD of the two given strings.
433
+ * If the GCD is equal to one of them, a pointer to it is returned.
434
+ * Otherwise a new string for the GCD is xalloced and put on the
435
+ * "free later" list.
436
+ */
437
+ const char * intersect_strings(Sentence sent, const char * s, const char * t)
438
+ {
439
+ int len, i, j, d;
440
+ const char *w, *s0;
441
+ char u0[MAX_TOKEN_LENGTH]; /* Links are *always* less than 10 chars long */
442
+ char *u;
443
+ if (strcmp(s,t)==0) return s; /* would work without this */
444
+ i = strlen(s);
445
+ j = strlen(t);
446
+ if (j > i) {
447
+ w = s; s = t; t = w;
448
+ len = j;
449
+ } else {
450
+ len = i;
451
+ }
452
+ /* s is now the longer (at least not the shorter) string */
453
+ /* and len is its length */
454
+ u = u0;
455
+ d = 0;
456
+ s0 = s;
457
+ while (*t != '\0') {
458
+ if ((*s == *t) || (*t == '*')) {
459
+ *u = *s;
460
+ } else {
461
+ d++;
462
+ if (*s == '*') *u = *t;
463
+ else *u = '^';
464
+ }
465
+ s++; t++; u++;
466
+ }
467
+ if (d==0) {
468
+ return s0;
469
+ } else {
470
+ strcpy(u, s); /* get the remainder of s */
471
+ return string_set_add(u0, sent->string_set);
472
+ }
473
+ }
474
+
475
+ /**
476
+ * Two connectors are said to be equal if they are of the same type
477
+ * (defined above), they have the same multi field, and they have
478
+ * exactly the same connectors (including lower case chars).
479
+ * (priorities ignored).
480
+ */
481
+ static int connectors_equal_AND(Connector *c1, Connector *c2)
482
+ {
483
+ return (c1->label == c2->label) &&
484
+ (c1->multi == c2->multi) &&
485
+ (strcmp(c1->string, c2->string) == 0);
486
+ }
487
+
488
+ /**
489
+ * Return true if the disjuncts are equal (ignoring priority fields)
490
+ * and the string of the disjunct.
491
+ */
492
+ static int disjuncts_equal_AND(Sentence sent, Disjunct * d1, Disjunct * d2)
493
+ {
494
+ Connector *e1, *e2;
495
+ sent->and_data.STAT_calls_to_equality_test++;
496
+ e1 = d1->left;
497
+ e2 = d2->left;
498
+ while((e1!=NULL) && (e2!=NULL)) {
499
+ if (!connectors_equal_AND(e1, e2)) break;
500
+ e1 = e1->next;
501
+ e2 = e2->next;
502
+ }
503
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
504
+ e1 = d1->right;
505
+ e2 = d2->right;
506
+ while((e1!=NULL) && (e2!=NULL)) {
507
+ if (!connectors_equal_AND(e1, e2)) break;
508
+ e1 = e1->next;
509
+ e2 = e2->next;
510
+ }
511
+ if ((e1!=NULL) || (e2!=NULL)) return FALSE;
512
+ return TRUE;
513
+ }
514
+
515
+ /**
516
+ * Create a new disjunct that is the GCD of d1 and d2.
517
+ * It assumes that the disjuncts are of the same type, so the
518
+ * GCD will not be empty.
519
+ */
520
+ static Disjunct * intersect_disjuncts(Sentence sent, Disjunct * d1, Disjunct * d2)
521
+ {
522
+ Disjunct * d;
523
+ Connector *c1, *c2, *c;
524
+ d = copy_disjunct(d1);
525
+ c = d->left;
526
+ c1 = d1->left;
527
+ c2 = d2->left;
528
+ while (c1!=NULL) {
529
+ connector_set_string (c, intersect_strings(sent, c1->string, c2->string));
530
+ c->multi = (c1->multi) && (c2->multi);
531
+ c = c->next; c1 = c1->next; c2 = c2->next;
532
+ }
533
+ c = d->right;
534
+ c1 = d1->right;
535
+ c2 = d2->right;
536
+ while (c1!=NULL) {
537
+ connector_set_string (c, intersect_strings(sent, c1->string, c2->string));
538
+ c->multi = (c1->multi) && (c2->multi);
539
+ c = c->next; c1 = c1->next; c2 = c2->next;
540
+ }
541
+ return d;
542
+ }
543
+
544
/**
 * (1) look for the given disjunct in the table structures
 *     if it's already in the table structures, do nothing
 * (2) otherwise make a copy of it, and put it into the table structures
 * (3) also put all of the GCDs of this disjunct with all of the
 *     other matching disjuncts into the table.
 *
 * The costs are set to zero.
 * Note that this has no effect on disjunct d.
 */
static void put_disjunct_into_table(Sentence sent, Disjunct *d)
{
	Disjunct *d1=NULL, *d2, *di, *d_copy;
	Label_node * lp;
	int h, k;

	h = and_hash_disjunct(d);

	/* Find the label (if any) for disjuncts of this type. */
	for (lp = sent->and_data.hash_table[h]; lp != NULL; lp = lp->next)
	{
		d1 = sent->and_data.label_table[lp->label];
		if (disjunct_types_equal(d,d1)) break;
	}
	if (lp != NULL)
	{
		/* there is already a label for disjuncts of this type */
		/* d1 points to the list of disjuncts of this type already there */
		while(d1 != NULL)
		{
			if (disjuncts_equal_AND(sent, d1, d)) return;
			d1 = d1->next;
		}
		/* now we must put the d disjunct in there, and all of the GCDs of
		   it with the ones already there.

		   This is done as follows.  We scan through the list of disjuncts
		   computing the gcd of the new one with each of the others, putting
		   the resulting disjuncts onto another list rooted at d2.
		   Now insert d into the list already there.  Now for each
		   one on the d2 list, put it in if it isn't already there.

		   Here we're making use of the following theorem: Given a
		   collection of sets s1, s2 ... sn closed under intersection,
		   if we add a new set s to the collection and also add
		   all the intersections between s and s1...sn to the collection,
		   then the collection is still closed under intersection.

		   Use a Venn diagram to prove this theorem.

		*/
		d_copy = copy_disjunct(d);
		d_copy->cost = 0;
		k = lp->label;
		/* Build the list of GCDs of d_copy with every existing entry. */
		d2 = NULL;
		for (d1=sent->and_data.label_table[k]; d1!=NULL; d1 = d1->next) {
			di = intersect_disjuncts(sent, d_copy, d1);
			di->next = d2;
			d2 = di;
		}
		/* Prepend the new disjunct itself. */
		d_copy->next = sent->and_data.label_table[k];
		sent->and_data.label_table[k] = d_copy;
		/* Insert each GCD unless an equal disjunct is already present. */
		for (;d2 != NULL; d2 = di) {
			di = d2->next;
			for (d1 = sent->and_data.label_table[k]; d1 != NULL; d1 = d1->next) {
				if (disjuncts_equal_AND(sent, d1, d2)) break;
			}
			if (d1 == NULL) {
				sent->and_data.STAT_N_disjuncts++;
				d2->next = sent->and_data.label_table[k];
				sent->and_data.label_table[k] = d2;
			} else {
				/* Duplicate: detach and free just this one node. */
				d2->next = NULL;
				free_disjuncts(d2);
			}
		}
	} else {
		/* create a new label for disjuncts of this type */
		d_copy = copy_disjunct(d);
		d_copy->cost = 0;
		d_copy->next = NULL;
		if (sent->and_data.LT_size == sent->and_data.LT_bound) grow_LT(sent);
		lp = (Label_node *) xalloc(sizeof(Label_node));
		lp->next = sent->and_data.hash_table[h];
		sent->and_data.hash_table[h] = lp;
		lp->label = sent->and_data.LT_size;
		sent->and_data.label_table[sent->and_data.LT_size] = d_copy;
		sent->and_data.LT_size++;
		sent->and_data.STAT_N_disjuncts++;
	}
}
634
+
635
/**
 * A sub-disjunct of d is any disjunct obtained by killing the tail
 * of either connector list at any point.
 * Here we go through each sub-disjunct of d, and put it into our
 * table data structure.
 *
 * The lists are truncated *in place* by temporarily NULLing a next
 * pointer (and/or an entire side), and restored before returning --
 * so the function has no side effects on d.
 */
static void extract_all_fat_links(Sentence sent, Disjunct * d)
{
	Connector * cl, * cr, *tl, *tr;
	/* Case 1: no left connectors; every prefix of the right list. */
	tl = d->left;
	d->left = NULL;
	for (cr = d->right; cr!=NULL; cr = cr->next) {
		tr = cr->next;
		cr->next = NULL;
		if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);
		cr->next = tr;
	}
	d->left = tl;

	/* Case 2: no right connectors; every prefix of the left list. */
	tr = d->right;
	d->right = NULL;
	for (cl = d->left; cl!=NULL; cl = cl->next) {
		tl = cl->next;
		cl->next = NULL;
		if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);
		cl->next = tl;
	}
	d->right = tr;

	/* Case 3: every combination of a non-empty left prefix with a
	   non-empty right prefix. */
	for (cl = d->left; cl!=NULL; cl = cl->next) {
		for (cr = d->right; cr!=NULL; cr = cr->next) {
			tl = cl->next;
			tr = cr->next;
			cl->next = cr->next = NULL;

			if (is_appropriate(sent, d)) put_disjunct_into_table(sent, d);

			cl->next = tl;
			cr->next = tr;
		}
	}
}
679
+
680
+ /**
681
+ * put the next len characters from c->string (skipping upper
682
+ * case ones) into s. If there are fewer than this, pad with '*'s.
683
+ * Then put in a character for the multi match bit of c.
684
+ * Then put in a '\0', and return a pointer to this place.
685
+ */
686
+ static char * stick_in_one_connector(char *s, Connector *c, int len)
687
+ {
688
+ const char * t;
689
+
690
+ t = skip_utf8_upper(c->string);
691
+
692
+ while (*t != '\0') {
693
+ *s++ = *t++;
694
+ len--;
695
+ }
696
+ while (len > 0) {
697
+ *s++ = '*';
698
+ len--;
699
+ }
700
+ if (c->multi) *s++ = '*'; else *s++ = '^'; /* check this sometime */
701
+ *s = '\0';
702
+ return s;
703
+ }
704
+
705
/**
 * This takes a label k, modifies the list of disjuncts with that
 * label.  For each such disjunct, it computes the string that
 * will be used in the fat connector that represents it.
 *
 * The only hard part is finding the length of each of the strings
 * so that "*" can be put in.  First pass: compute, per connector
 * position, the maximum lower-case-tail length over all disjuncts
 * with this label.  Second pass: build each disjunct's matcher
 * string, padding every position to that maximum.
 */
static void compute_matchers_for_a_label(Sentence sent, int k)
{
	char buff[2*MAX_WORD];
	int lengths[MAX_LINKS];    /* max tail length per connector position */
	int N_connectors, i, j;
	Connector * c;
	Disjunct * d;
	const char *cs;
	char *s;

	d = sent->and_data.label_table[k];

	/* All disjuncts under one label have the same shape, so the count
	   from the first one applies to all. */
	N_connectors = 0;
	for (c=d->left; c != NULL; c = c->next) N_connectors ++;
	for (c=d->right; c != NULL; c = c->next) N_connectors ++;

	for (i=0; i<N_connectors; i++) lengths[i] = 0;
	while(d != NULL) {
		i = 0;
		for (c=d->left; c != NULL; c = c->next) {
			cs = skip_utf8_upper(c->string);
			j = strlen(cs);
			if (j > lengths[i]) lengths[i] = j;
			i++;
		}
		for (c=d->right; c != NULL; c = c->next) {
			cs = c->string;
			cs = skip_utf8_upper(cs);
			j = strlen(cs);
			if (j > lengths[i]) lengths[i] = j;
			i++;
		}
		d = d->next;
	}

	/* Second pass: emit each disjunct's matcher string and intern it. */
	for (d = sent->and_data.label_table[k]; d!= NULL; d = d->next)
	{
		i=0;
		s = buff;
		for (c=d->left; c != NULL; c = c->next) {
			s = stick_in_one_connector(s, c, lengths[i]);
			i++;
		}
		for (c=d->right; c != NULL; c = c->next) {
			s = stick_in_one_connector(s, c, lengths[i]);
			i++;
		}
		d->string = string_set_add(buff, sent->string_set);
	}
}
763
+
764
+ /**
765
+ * Goes through the entire sentence and builds the fat link tables
766
+ * for all the disjuncts of all the words.
767
+ */
768
+ void build_conjunction_tables(Sentence sent)
769
+ {
770
+ int w;
771
+ int k;
772
+ Disjunct * d;
773
+
774
+ init_HT(sent);
775
+ init_LT(sent);
776
+ sent->and_data.STAT_N_disjuncts = 0;
777
+ sent->and_data.STAT_calls_to_equality_test = 0;
778
+
779
+ for (w=0; w<sent->length; w++) {
780
+ for (d=sent->word[w].d; d!=NULL; d=d->next) {
781
+ extract_all_fat_links(sent, d);
782
+ }
783
+ }
784
+
785
+ for (k=0; k<sent->and_data.LT_size; k++) {
786
+ compute_matchers_for_a_label(sent, k);
787
+ }
788
+ }
789
+
790
+ void print_AND_statistics(Sentence sent)
791
+ {
792
+ printf("Number of disjunct types (labels): %d\n", sent->and_data.LT_size);
793
+ printf("Number of disjuncts in the table: %d\n", sent->and_data.STAT_N_disjuncts);
794
+ if (sent->and_data.LT_size != 0) {
795
+ printf("average list length: %f\n",
796
+ (float)sent->and_data.STAT_N_disjuncts/sent->and_data.LT_size);
797
+ }
798
+ printf("Number of equality tests: %d\n", sent->and_data.STAT_calls_to_equality_test);
799
+ }
800
+
801
/**
 * Fill in the fields of c for the disjunct.  This must be in
 * the table data structures.  The label field and the string field
 * are filled in appropriately.  Priority is set to UP_priority.
 */
static void connector_for_disjunct(Sentence sent, Disjunct * d, Connector * c)
{
	int h;
	Disjunct * d1 = NULL;
	Label_node * lp;

	h = and_hash_disjunct(d);

	/* Locate the label for disjuncts of d's type. */
	for (lp = sent->and_data.hash_table[h]; lp != NULL; lp = lp->next) {
		d1 = sent->and_data.label_table[lp->label];
		if (disjunct_types_equal(d,d1)) break;
	}
	assert(lp != NULL, "A disjunct I inserted was not there. (1)");

	/* Find the table entry equal to d, to borrow its matcher string. */
	while(d1 != NULL) {
		if (disjuncts_equal_AND(sent, d1, d)) break;
		d1 = d1->next;
	}

	assert(d1 != NULL, "A disjunct I inserted was not there. (2)");

	c->label = lp->label;
	connector_set_string(c, d1->string);
	c->priority = UP_priority;
	c->multi = FALSE;
}
832
+
833
+
834
/**
 * This function allocates and returns a list of disjuncts.
 * This is the one obtained by substituting each contiguous
 * non-empty subrange of d (incident on the center) by an appropriate
 * fat link, in two possible positions (the fat connector replacing
 * the subrange either on the left side or on the right side).
 * Does not affect d.
 * The cost of d is inherited by all of the disjuncts in the result.
 *
 * Implementation note: a stack-allocated "work" disjunct and "work"
 * connector are repeatedly re-pointed at pieces of d and then deep
 * copied; d's own lists are temporarily truncated in place and always
 * restored before returning.
 */
static Disjunct * build_fat_link_substitutions(Sentence sent, Disjunct *d)
{
	Connector * cl, * cr, *tl, *tr, *wc, work_connector;
	Disjunct *d1, *wd, work_disjunct, *d_list;
	if (d==NULL) return NULL;
	wd = &work_disjunct;
	wc = init_connector(&work_connector);
	d_list = NULL;
	*wd = *d;   /* inherits d's cost and string */
	/* Case 1: substitute a prefix of the right list only. */
	tl = d->left;
	d->left = NULL;
	for (cr = d->right; cr!=NULL; cr = cr->next) {
		tr = cr->next;
		cr->next = NULL;
		if (is_appropriate(sent, d)) {
			connector_for_disjunct(sent, d, wc);
			/* fat connector placed on the right ... */
			wd->left = tl;
			wd->right = wc;
			wc->next = tr;
			d1 = copy_disjunct(wd);
			d1->next = d_list;
			d_list = d1;
			/* ... and placed on the left */
			wd->left = wc;
			wc->next = tl;
			wd->right = tr;
			d1 = copy_disjunct(wd);
			d1->next = d_list;
			d_list = d1;
		}
		cr->next = tr;
	}
	d->left = tl;

	/* Case 2: substitute a prefix of the left list only. */
	tr = d->right;
	d->right = NULL;
	for (cl = d->left; cl!=NULL; cl = cl->next) {
		tl = cl->next;
		cl->next = NULL;
		if (is_appropriate(sent, d)) {
			connector_for_disjunct(sent, d, wc);
			wd->left = tl;
			wd->right = wc;
			wc->next = tr;
			d1 = copy_disjunct(wd);
			d1->next = d_list;
			d_list = d1;
			wd->left = wc;
			wc->next = tl;
			wd->right = tr;
			d1 = copy_disjunct(wd);
			d1->next = d_list;
			d_list = d1;
		}
		cl->next = tl;
	}
	d->right = tr;

	/* Case 3: substitute a non-empty prefix of each list. */
	for (cl = d->left; cl!=NULL; cl = cl->next) {
		for (cr = d->right; cr!=NULL; cr = cr->next) {
			tl = cl->next;
			tr = cr->next;
			cl->next = cr->next = NULL;
			if (is_appropriate(sent, d)) {
				connector_for_disjunct(sent, d, wc);
				wd->left = tl;
				wd->right = wc;
				wc->next = tr;
				d1 = copy_disjunct(wd);
				d1->next = d_list;
				d_list = d1;
				wd->left = wc;
				wc->next = tl;
				wd->right = tr;
				d1 = copy_disjunct(wd);
				d1->next = d_list;
				d_list = d1;
			}
			cl->next = tl;
			cr->next = tr;
		}
	}
	return d_list;
}
924
+
925
+ /**
926
+ * This is basically a "map" function for build_fat_link_substitutions.
927
+ * It's applied to the disjuncts for all regular words of the sentence.
928
+ */
929
+ Disjunct * explode_disjunct_list(Sentence sent, Disjunct *d)
930
+ {
931
+ Disjunct *d1;
932
+
933
+ d1 = NULL;
934
+
935
+ for (; d!=NULL; d = d->next) {
936
+ d1 = catenate_disjuncts(d1, build_fat_link_substitutions(sent, d));
937
+ }
938
+ return d1;
939
+ }
940
+
941
/**
 * Builds and returns a disjunct list for the comma.  These are the
 * disjuncts that are used when "," operates in conjunction with "and".
 * Does not deal with the ", and" issue, nor the other uses
 * of comma.
 *
 * Implementation: a stack-allocated template disjunct (one DOWN
 * connector on the left, an UP connector followed by a DOWN connector
 * on the right) is re-labelled for every entry of the label table and
 * deep-copied onto the result list.
 */
Disjunct * build_COMMA_disjunct_list(Sentence sent)
{
	int lab;
	Disjunct *d1, *d2, *d, work_disjunct, *wd;
	Connector work_connector1, work_connector2, *c1, *c2;
	Connector work_connector3, *c3;
	c1 = init_connector(&work_connector1);
	c2 = init_connector(&work_connector2);
	c3 = init_connector(&work_connector3);
	wd = &work_disjunct;

	d1 = NULL; /* where we put the list we're building */

	c1->next = NULL;
	c2->next = c3;
	c3->next = NULL;
	c1->priority = c3->priority = DOWN_priority;
	c2->priority = UP_priority;
	c1->multi = c2->multi = c3->multi = FALSE;
	wd->left = c1;
	wd->right = c2;
	wd->string = ","; /* *** fix this later?? */
	wd->next = NULL;
	wd->cost = 0;
	/* One disjunct per (label, table entry) pair, all connectors sharing
	   that entry's matcher string. */
	for (lab = 0; lab < sent->and_data.LT_size; lab++) {
		for (d = sent->and_data.label_table[lab]; d!=NULL; d=d->next) {
			c1->string = c2->string = c3->string = d->string;
			c1->label = c2->label = c3->label = lab;
			d2 = copy_disjunct(wd);
			d2->next = d1;
			d1 = d2;
		}
	}
	return d1;
}
982
+
983
/**
 * Builds and returns a disjunct list for "and", "or" and "nor".
 * For each disjunct in the label_table, we build three disjuncts;
 * this means that "Danny and Tycho and Billy" will be parsable in
 * two ways.  I don't know an easy way to avoid this.
 * The string s is either "and", or "or", or "nor" at the moment.
 */
Disjunct * build_AND_disjunct_list(Sentence sent, char * s)
{
	int lab;
	Disjunct *d_list, *d1, *d3, *d, *d_copy;
	Connector *c1, *c2, *c3;

	d_list = NULL; /* where we put the list we're building */

	for (lab = 0; lab < sent->and_data.LT_size; lab++) {
		for (d = sent->and_data.label_table[lab]; d!=NULL; d=d->next) {
			d1 = build_fat_link_substitutions(sent, d);
			d_copy = copy_disjunct(d); /* also include the thing itself! */
			d_copy->next = d1;
			d1 = d_copy;
			for(;d1 != NULL; d1 = d3) {
				d3 = d1->next;

				/* Two fresh DOWN-priority fat connectors, appended to
				   the ends of the left and right lists. */
				c1 = connector_new();
				c2 = connector_new();
				c1->priority = c2->priority = DOWN_priority;
				connector_set_string(c1, d->string);
				connector_set_string(c2, d->string);
				c1->label = c2->label = lab;

				d1->string = s;

				if (d1->right == NULL) {
					d1->right = c2;
				} else {
					for (c3=d1->right; c3->next != NULL; c3 = c3->next)
					  ;
					c3->next = c2;
				}
				if (d1->left == NULL) {
					d1->left = c1;
				} else {
					for (c3=d1->left; c3->next != NULL; c3 = c3->next)
					  ;
					c3->next = c1;
				}
				d1->next = d_list;
				d_list = d1;
			}
		}
	}
#if defined(PLURALIZATION)
/* here is where "and" makes singular into plural. */
/* must accommodate "he and I are good", "Davy and I are good"
   "Danny and Davy are good", and reject all of these with "is"
   instead of "are".

   The SI connectors must also be modified to accommodate "are John
   and Dave here", but kill "is John and Dave here"
*/
	if (strcmp(s, "and") == 0)
	{
		for (d1 = d_list; d1 != NULL; d1 = d1->next)
		{
			for (c1 = d1->right; c1 != NULL; c1 = c1->next)
			{
				if ((c1->string[0] == 'S') &&
					((c1->string[1] == '^') ||
					 (c1->string[1] == 's') ||
					 (c1->string[1] == 'p') ||
					 (c1->string[1] == '\0')))
				{
					connector_set_string(c1, "Sp");
				}
			}
			for (c1 = d1->left; c1 != NULL; c1 = c1->next)
			{
				if ((c1->string[0] == 'S') && (c1->string[1] == 'I') &&
					((c1->string[2] == '^') ||
					 (c1->string[2] == 's') ||
					 (c1->string[2] == 'p') ||
					 (c1->string[2] == '\0')))
				{
					connector_set_string(c1, "SIp");
				}
			}
		}
	}
/*
   "a cat or a dog is here" vs "a cat or a dog are here"
   The first seems right, the second seems wrong.  I'll stick with this.

   That is, "or" has the property that if both parts are the same in
   number, we use that but if they differ, we use plural.

   The connectors on "I" must be handled specially.  We accept
   "I or the dogs are here" but reject "I or the dogs is here"
*/

/* the code here still does not work "right", rejecting "is John or I invited"
   and accepting "I or my friend know what happened"

   The more generous code for "nor" has been used instead
*/
/*
	else if (strcmp(s, "or") == 0) {
		for (d1 = d_list; d1!=NULL; d1=d1->next) {
			for (c1=d1->right; c1!=NULL; c1=c1->next) {
				if (c1->string[0] == 'S') {
					if (c1->string[1]=='^') {
						if (c1->string[2]=='a') {
							connector_set_string(c1, "Ss");
						} else {
							connector_set_string(c1, "Sp");
						}
					} else if ((c1->string[1]=='p') && (c1->string[2]=='a')){
						connector_set_string(c1, "Sp");
					}
				}
			}
			for (c1=d1->left; c1!=NULL; c1=c1->next) {
				if ((c1->string[0] == 'S') && (c1->string[1] == 'I')) {
					if (c1->string[2]=='^') {
						if (c1->string[3]=='a') {
							connector_set_string(c1, "Ss");
						} else {
							connector_set_string(c1, "Sp");
						}
					} else if ((c1->string[2]=='p') && (c1->string[3]=='a')){
						connector_set_string(c1, "Sp");
					}
				}
			}
		}
	}
*/
/*
   It appears that the "nor" of two things can be either singular or
   plural.  "neither she nor John likes dogs"
            "neither she nor John like dogs"

*/
	else if ((strcmp(s,"nor")==0) || (strcmp(s,"or")==0)) {
		for (d1 = d_list; d1!=NULL; d1=d1->next) {
			for (c1=d1->right; c1!=NULL; c1=c1->next) {
				if ((c1->string[0] == 'S') &&
					((c1->string[1]=='^') ||
					 (c1->string[1]=='s') ||
					 (c1->string[1]=='p'))) {
					connector_set_string(c1, "S");
				}
			}
			for (c1=d1->left; c1!=NULL; c1=c1->next) {
				if ((c1->string[0] == 'S') && (c1->string[1] == 'I') &&
					((c1->string[2]=='^') ||
					 (c1->string[2]=='s') ||
					 (c1->string[2]=='p'))) {
					connector_set_string(c1, "SI");
				}
			}
		}
	}

#endif
	return d_list;
}
1150
+
1151
+
1152
+ /* The following routines' purpose is to eliminate all but the
1153
+ canonical linkage (of a collection of linkages that are identical
1154
+ except for fat links). An example of the problem is
1155
+ "I went to a talk and ate lunch". Without the canonical checker
1156
+ this has two linkages with identical structure.
1157
+
1158
+ We restrict our attention to a collection of linkages that are all
1159
+ isomorphic. Consider the set of all disjuncts that are used on one
1160
+ word (over the collection of linkages). This set is closed under GCD,
1161
+ since two linkages could both be used in that position, then so could
1162
+ their GCD. The GCD has been constructed and put in the label table.
1163
+
1164
+ The canonical linkage is the one in which the minimal disjunct that
1165
+ ever occurs in a position is used in that position. It is easy to
1166
+ prove that a disjunct is not canonical -- just find one of its fat
1167
+ disjuncts that can be replaced by a smaller one. If this can not be
1168
+ done, then the linkage is canonical.
1169
+
1170
+ The algorithm uses link_array[] and chosen_disjuncts[] as input to
1171
+ describe the linkage, and also uses the label_table.
1172
+
1173
+ (1) find all the words with fat disjuncts
1174
+ (2) scan all links and build, for each fat disjunct used,
1175
+ an "image" structure that contains what this disjunct must
1176
+ connect to in the rest of the linkage.
1177
+ (3) For each fat disjunct, run through the label_table for disjuncts
1178
+ with the same label, considering only those with strictly more
1179
+ restricted match sets (this uses the string fields of the disjuncts
1180
+ from the table).
1181
+ (4) For each that passes this test, we see if it can replace the chosen
1182
+ disjunct. This is performed by examining how this disjunct
1183
+ compares with the image structure for this word.
1184
+ */
1185
+
1186
/* One node of a per-word "image": what the fat disjunct chosen at that
   word must connect to in the rest of the linkage.  Lists of these are
   built by build_image_array() and consumed by is_canonical_linkage(). */
struct Image_node_struct {
    Image_node * next;
    Connector * c;  /* the connector the place on the disjunct must match */
    int place;      /* Indicates the place in the fat disjunct where this
                       connector must connect.  If 0 then this is a fat
                       connector.  If >0 then go place to the right, if
                       <0 then go -place to the left. */
};
1194
+
1195
+ /**
1196
+ * Fill in the has_fat_down array. Uses link_array[].
1197
+ * Returns TRUE if there exists at least one word with a
1198
+ * fat down label.
1199
+ */
1200
+ int set_has_fat_down(Sentence sent)
1201
+ {
1202
+ int link, w, N_fat;
1203
+ Parse_info pi = sent->parse_info;
1204
+
1205
+ N_fat = 0;
1206
+
1207
+ for (w = 0; w < pi->N_words; w++)
1208
+ {
1209
+ pi->has_fat_down[w] = FALSE;
1210
+ }
1211
+
1212
+ for (link = 0; link < pi->N_links; link++)
1213
+ {
1214
+ if (pi->link_array[link].lc->priority == DOWN_priority)
1215
+ {
1216
+ N_fat ++;
1217
+ pi->has_fat_down[pi->link_array[link].l] = TRUE;
1218
+ }
1219
+ else if (pi->link_array[link].rc->priority == DOWN_priority)
1220
+ {
1221
+ N_fat ++;
1222
+ pi->has_fat_down[pi->link_array[link].r] = TRUE;
1223
+ }
1224
+ }
1225
+ return (N_fat > 0);
1226
+ }
1227
+
1228
+ static void free_image_array(Parse_info pi)
1229
+ {
1230
+ int w;
1231
+ Image_node * in, * inx;
1232
+ for (w = 0; w < pi->N_words; w++)
1233
+ {
1234
+ for (in = pi->image_array[w]; in != NULL; in = inx)
1235
+ {
1236
+ inx = in->next;
1237
+ xfree((char *)in, sizeof(Image_node));
1238
+ }
1239
+ pi->image_array[w] = NULL;
1240
+ }
1241
+ }
1242
+
1243
/**
 * Uses link_array, chosen_disjuncts, and the down labels to construct
 * image_array: for every word with a fat down connector, a list of
 * Image_nodes describing what that word's disjunct must connect to
 * elsewhere in the linkage (see Image_node_struct for the encoding
 * of "place").
 */
static void build_image_array(Sentence sent)
{
    int link, end, word;
    Connector * this_end_con, *other_end_con, * upcon, * updiscon, *clist;
    Disjunct * dis, * updis;
    Image_node * in;
    Parse_info pi = sent->parse_info;

    for (word=0; word<pi->N_words; word++)
    {
        pi->image_array[word] = NULL;
    }

    /* Two passes over the links: end == -1 looks at each link from its
       left word, end == +1 from its right word. */
    for (end = -1; end <= 1; end += 2)
    {
        for (link = 0; link < pi->N_links; link++)
        {
            if (end < 0)
            {
                word = pi->link_array[link].l;
                if (!pi->has_fat_down[word]) continue;
                this_end_con = pi->link_array[link].lc;
                other_end_con = pi->link_array[link].rc;
                dis = pi->chosen_disjuncts[word];
                clist = dis->right;  /* connectors between here and this_end_con */
            }
            else
            {
                word = pi->link_array[link].r;
                if (!pi->has_fat_down[word]) continue;
                this_end_con = pi->link_array[link].rc;
                other_end_con = pi->link_array[link].lc;
                dis = pi->chosen_disjuncts[word];
                clist = dis->left;
            }

            if (this_end_con->priority == DOWN_priority) continue;
            if ((this_end_con->label != NORMAL_LABEL) &&
                (this_end_con->label < 0)) continue;
            /* no need to construct an image node for down links,
               or commas links or either/neither links */

            /* Prepend a new image node for this word. */
            in = (Image_node *) xalloc(sizeof(Image_node));
            in->next = pi->image_array[word];
            pi->image_array[word] = in;
            in->c = other_end_con;

            /* the rest of this code is for computing in->place */
            if (this_end_con->priority == UP_priority)
            {
                in->place = 0;  /* the fat (up) connector itself */
            }
            else
            {
                in->place = 1;
                /* Find the fat UP connector on this disjunct, if any;
                   it can be at the head of either list. */
                if ((dis->left != NULL) &&
                    (dis->left->priority == UP_priority))
                {
                    upcon = dis->left;
                }
                else if ((dis->right != NULL) &&
                         (dis->right->priority == UP_priority))
                {
                    upcon = dis->right;
                }
                else
                {
                    upcon = NULL;
                }
                if (upcon != NULL)
                {
                    /* add on extra for a fat up link */
                    updis = sent->and_data.label_table[upcon->label];
                    if (end > 0)
                    {
                        updiscon = updis->left;
                    }
                    else
                    {
                        updiscon = updis->right;
                    }
                    /* count the connectors contributed by the up disjunct */
                    for (;updiscon != NULL; updiscon = updiscon->next)
                    {
                        in->place ++;
                    }
                }
                /* count intervening fat connectors before this_end_con */
                for (; clist != this_end_con; clist = clist->next)
                {
                    if (clist->label < 0) in->place++;
                }
                /* sign encodes direction: negative = left, positive = right */
                in->place = in->place * (-end);
            }
        }
    }
}
1342
+
1343
+ /**
1344
+ * returns TRUE if string s represents a strictly smaller match set
1345
+ * than does t
1346
+ */
1347
+ static int strictly_smaller(const char * s, const char * t)
1348
+ {
1349
+ int strictness;
1350
+ strictness = 0;
1351
+ for (;(*s!='\0') && (*t!='\0'); s++,t++) {
1352
+ if (*s == *t) continue;
1353
+ if ((*t == '*') || (*s == '^')) {
1354
+ strictness++;
1355
+ } else {
1356
+ return FALSE;
1357
+ }
1358
+ }
1359
+ assert(! ((*s!='\0') || (*t!='\0')), "s and t should be the same length!");
1360
+ return (strictness > 0);
1361
+ }
1362
+
1363
/**
 * dis points to a disjunct in the label_table.  label is the label
 * of a different set of disjuncts.  These can be derived from the label
 * of dis.  Find the specific disjunct in label_table[label]
 * which corresponds to dis (same connector strings and multi flags on
 * both the left and right lists).  Asserts if none matches.
 */
static Disjunct * find_subdisjunct(Sentence sent, Disjunct * dis, int label)
{
    Disjunct * d;
    Connector * cx, *cy;
    for (d=sent->and_data.label_table[label]; d!=NULL; d=d->next)
    {
        /* walk the left lists in lock-step; stop at the first mismatch */
        for (cx=d->left, cy=dis->left; cx!=NULL; cx=cx->next,cy=cy->next)
        {
            /* if ((cx->string != cy->string) || */
            if ((strcmp(connector_get_string(cx),
                        connector_get_string(cy)) != 0) ||
                (cx->multi != cy->multi)) break;/* have to check multi? */
        }
        if (cx!=NULL) continue;  /* left lists differ: try the next candidate */
        /* left lists matched; now compare the right lists the same way */
        for (cx=d->right, cy=dis->right; cx!=NULL; cx=cx->next,cy=cy->next)
        {
            /* if ((cx->string != cy->string) || */
            if ((strcmp(connector_get_string(cx),
                        connector_get_string(cy)) != 0) ||
                (cx->multi != cy->multi)) break;
        }
        if (cx==NULL) break;  /* both lists matched: d is the one */
    }
    assert(d!=NULL, "Never found subdisjunct");
    return d;
}
1395
+
1396
/**
 * is_canonical_linkage --
 * This uses link_array[], chosen_disjuncts[], has_fat_down[].
 * It assumes that there is a fat link in the current linkage.
 * Returns TRUE iff no word's chosen fat disjunct can be replaced by a
 * strictly smaller one that still matches the word's image connectors.
 * See the comments above for more information about how it works.
 */
int is_canonical_linkage(Sentence sent)
{
    int w, d_label=0, place;
    Connector *d_c, *c, dummy_connector, *upcon;
    Disjunct *dis, *chosen_d;
    Image_node * in;
    Parse_info pi = sent->parse_info;

    /* dummy_connector stands in for the fat UP connector when matching
       a candidate disjunct at place 0 */
    init_connector(&dummy_connector);
    dummy_connector.priority = UP_priority;

    build_image_array(sent);

    for (w=0; w<pi->N_words; w++)
    {
        if (!pi->has_fat_down[w]) continue;
        chosen_d = pi->chosen_disjuncts[w];

        /* there must be a down connector in both the left and right list */
        for (d_c = chosen_d->left; d_c!=NULL; d_c=d_c->next)
        {
            if (d_c->priority == DOWN_priority)
            {
                d_label = d_c->label;
                break;
            }
        }
        assert(d_c != NULL, "Should have found the down link.");

        /* locate the fat UP connector, if this disjunct has one */
        if ((chosen_d->left != NULL) &&
            (chosen_d->left->priority == UP_priority)) {
            upcon = chosen_d->left;
        } else if ((chosen_d->right != NULL) &&
                   (chosen_d->right->priority == UP_priority)) {
            upcon = chosen_d->right;
        } else {
            upcon = NULL;
        }

        /* check that the disjunct on w is minimal (canonical) */

        for (dis=sent->and_data.label_table[d_label]; dis!=NULL; dis=dis->next)
        {
            /* now, reject a disjunct if it's not strictly below the old */
            if(!strictly_smaller(dis->string,
                                 connector_get_string(d_c))) continue;

            /* Now, it has to match the image connectors */
            for (in = pi->image_array[w]; in != NULL; in = in->next)
            {
                place = in->place;
                if (place == 0)
                {
                    /* place 0 means the image connects via the fat UP link */
                    assert(upcon != NULL, "Should have found an up link");
                    dummy_connector.label = upcon->label;

                    /* now we have to compute the string of the
                       disjunct with upcon->label that corresponds
                       to dis */
                    if (upcon->label == d_label)
                    {
                        connector_set_string(&dummy_connector, dis->string);
                    } else {
                        connector_set_string(&dummy_connector,
                            find_subdisjunct(sent, dis, upcon->label)->string);
                    }

                    /* I hope using x_match here is right */
                    if (!x_match(sent, &dummy_connector, in->c)) break;
                } else if (place > 0) {
                    /* step place-1 connectors to the right */
                    for (c=dis->right; place > 1; place--) {
                        c = c->next;
                    }
                    if (!x_match(sent, c, in->c)) break; /* Ditto above comment --DS 07/97*/
                } else {
                    /* step -place-1 connectors to the left */
                    for (c=dis->left; place < -1; place++) {
                        c = c->next;
                    }
                    if (!x_match(sent, c, in->c)) break; /* Ditto Ditto */
                }
            }

            /* in == NULL: every image connector matched, so dis could
               replace chosen_d */
            if (in == NULL) break;
        }
        if (dis != NULL) break;
        /* there is a better disjunct than the one we're using, so this
           word is bad, so we're done */
    }
    free_image_array(pi);
    /* canonical iff the word scan completed without an early break */
    return (w == pi->N_words);
}
1493
+
1494
/**
 * This takes as input link_array[], sublinkage->link[]->l and
 * sublinkage->link[]->r (and also has_fat_down[word], which has been
 * computed in a prior call to is_canonical()), and from these
 * computes sublinkage->link[].lc and .rc.  We assume these have
 * been initialized with the values from link_array.  We also assume
 * that there are fat links.
 */
void compute_pp_link_array_connectors(Sentence sent, Sublinkage *sublinkage)
{
    int link, end, word, place;
    Connector * this_end_con, * upcon, * updiscon, *clist, *con, *mycon;
    Disjunct * dis, * updis, *mydis;
    Parse_info pi = sent->parse_info;

    /* end == -1 handles the left end of each link, end == +1 the right */
    for (end = -1; end <= 1; end += 2)
    {
        for (link=0; link<pi->N_links; link++)
        {
            /* l == -1 marks a link not present in this sublinkage */
            if (sublinkage->link[link]->l == -1) continue;
            if (end < 0)
            {
                word = pi->link_array[link].l;
                if (!pi->has_fat_down[word]) continue;
                this_end_con = pi->link_array[link].lc;
                dis = pi->chosen_disjuncts[word];
                mydis = pi->chosen_disjuncts[sublinkage->link[link]->l];
                clist = dis->right;
            }
            else
            {
                word = pi->link_array[link].r;
                if (!pi->has_fat_down[word]) continue;
                this_end_con = pi->link_array[link].rc;
                dis = pi->chosen_disjuncts[word];
                mydis = pi->chosen_disjuncts[sublinkage->link[link]->r];
                clist = dis->left;
            }

            if (this_end_con->label != NORMAL_LABEL) continue;
            /* no need to construct a connector for up links,
               or commas links or either/neither links */

            /* Now compute the place (same scheme as build_image_array) */
            place = 0;
            if ((dis->left != NULL) &&
                (dis->left->priority == UP_priority)) {
                upcon = dis->left;
            } else if ((dis->right != NULL) &&
                       (dis->right->priority == UP_priority)) {
                upcon = dis->right;
            } else {
                upcon = NULL;
            }
            if (upcon != NULL) { /* add on extra for a fat up link */
                updis = sent->and_data.label_table[upcon->label];
                if (end > 0) {
                    updiscon = updis->left;
                } else {
                    updiscon = updis->right;
                }
                for (;updiscon != NULL; updiscon = updiscon->next) {
                    place ++;
                }
            }
            for (; clist != this_end_con; clist = clist->next) {
                if (clist->label < 0) place++;
            }
            /* place has just been computed */

            /* now find the right disjunct in the table */
            if ((mydis->left != NULL) &&
                (mydis->left->priority == UP_priority)) {
                mycon = mydis->left;
            } else if ((mydis->right != NULL) &&
                       (mydis->right->priority == UP_priority)) {
                mycon = mydis->right;
            } else {
                /* diagnostic dump before the fatal assert */
                printf("word = %d\n", word);
                printf("fat link: [%d, %d]\n",
                       pi->link_array[link].l, pi->link_array[link].r);
                printf("thin link: [%d, %d]\n",
                       sublinkage->link[link]->l, sublinkage->link[link]->r);
                assert(FALSE, "There should be a fat UP link here");
            }

            /* pointer comparison: label_table strings are interned */
            for (dis=sent->and_data.label_table[mycon->label];
                 dis != NULL; dis=dis->next) {
                if (dis->string == connector_get_string(mycon)) break;
            }
            assert(dis!=NULL, "Should have found this connector string");
            /* the disjunct in the table has just been found */

            if (end < 0)
            {
                /* walk `place` connectors rightward, then install a copy */
                for (con = dis->right; place > 0; place--, con=con->next) {}
                /* sublinkage->link[link]->lc = con; OLD CODE */
                exfree_connectors(sublinkage->link[link]->lc);
                sublinkage->link[link]->lc = excopy_connectors(con);
            }
            else
            {
                for (con = dis->left; place > 0; place--, con=con->next) {}
                /* sublinkage->link[link]->rc = con; OLD CODE */
                exfree_connectors(sublinkage->link[link]->rc);
                sublinkage->link[link]->rc = excopy_connectors(con);
            }
        }
    }
}