grammar_cop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (344)
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,15 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+
15
+
@@ -0,0 +1,1836 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* All rights reserved */
5
+ /* */
6
+ /* Use of the link grammar parsing system is subject to the terms of the */
7
+ /* license set forth in the LICENSE file included with this software, */
8
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
9
+ /* This license allows free redistribution and use in source and binary */
10
+ /* forms, with or without modification, subject to certain conditions. */
11
+ /* */
12
+ /*************************************************************************/
13
+
14
+ #include <stdarg.h>
15
+ #include <string.h>
16
+ #include <link-grammar/api.h>
17
+ #include "error.h"
18
+ #include "constituents.h"
19
+
20
+ #define MAXCONSTITUENTS 8192
21
+ #define MAXSUBL 16
22
+ #define OPEN_BRACKET '['
23
+ #define CLOSE_BRACKET ']'
24
+
25
+ typedef enum {OPEN_TOK, CLOSE_TOK, WORD_TOK} CType;
26
+ typedef enum {NONE, STYPE, PTYPE, QTYPE, QDTYPE} WType;
27
+
28
+ typedef struct
29
+ {
30
+ int left;
31
+ int right;
32
+ const char * type;
33
+ char domain_type;
34
+ const char * start_link;
35
+ int start_num;
36
+ int subl;
37
+ int canon;
38
+ int valid;
39
+ #ifdef AUX_CODE_IS_DEAD
40
+ /* The only code that actually sets aux to a non-zero value is code
41
+ * followed by code that sets it to zero -- it's dead code, and so
42
+ * aux is never actually used. Comment this code out.
43
+ */
44
+ int aux;
45
+ /* 0: it's an ordinary VP (or other type);
46
+ * 1: it's an AUX, don't print it;
47
+ * 2: it's an AUX, and print it
48
+ */
49
+ #endif /* AUX_CODE_IS_DEAD */
50
+ } constituent_t;
51
+
52
+ /* XXX it seems like the old code worked fine with MAX_ELTS=10 */
53
+ #define MAX_ELTS 100
54
+ typedef struct
55
+ {
56
+ int num;
57
+ int e[MAX_ELTS];
58
+ int valid;
59
+ } andlist_t;
60
+
61
+ /*
62
+ * Context used to store assorted intermediate data
63
+ * when the constituent string is being generated.
64
+ */
65
+ #define MAX_ANDS 1024
66
+ typedef struct
67
+ {
68
+ String_set * phrase_ss;
69
+ WType wordtype[MAX_SENTENCE];
70
+ int word_used[MAXSUBL][MAX_SENTENCE];
71
+ int templist[MAX_ELTS];
72
+ constituent_t constituent[MAXCONSTITUENTS];
73
+ andlist_t andlist[MAX_ANDS];
74
+ } con_context_t;
75
+
76
+ /* ================================================================ */
77
+
78
+ static inline int uppercompare(const char * s, const char * t)
79
+ {
80
+ return (FALSE == utf8_upper_match(s,t));
81
+ }
82
+
83
+ /**
84
+ * If a constituent c has a comma at either end, we exclude the
85
+ * comma. (We continue to shift the boundary until we get to
86
+ * something inside the current sublinkage)
87
+ */
88
+ static void adjust_for_left_comma(con_context_t * ctxt, Linkage linkage, int c)
89
+ {
90
+ int w;
91
+ w = ctxt->constituent[c].left;
92
+ if (strcmp(linkage->word[w], ",") == 0)
93
+ {
94
+ w++;
95
+ while (1) {
96
+ if (ctxt->word_used[linkage->current][w] == 1) break;
97
+ w++;
98
+ }
99
+ }
100
+ ctxt->constituent[c].left = w;
101
+ }
102
+
103
+ static void adjust_for_right_comma(con_context_t *ctxt, Linkage linkage, int c)
104
+ {
105
+ int w;
106
+ w = ctxt->constituent[c].right;
107
+ if ((strcmp(linkage->word[w], ",") == 0) ||
108
+ (strcmp(linkage->word[w], "RIGHT-WALL") == 0))
109
+ {
110
+ w--;
111
+ while (1)
112
+ {
113
+ if (ctxt->word_used[linkage->current][w]==1) break;
114
+ w--;
115
+ }
116
+ }
117
+ ctxt->constituent[c].right = w;
118
+ }
119
+
120
+ static void print_constituent(con_context_t *ctxt, Linkage linkage, int c)
121
+ {
122
+ int w;
123
+ if (verbosity < 2) return;
124
+
125
+ printf(" c %2d %4s [%c] (%2d-%2d): ",
126
+ c, ctxt->constituent[c].type, ctxt->constituent[c].domain_type,
127
+ ctxt->constituent[c].left, ctxt->constituent[c].right);
128
+ for (w = ctxt->constituent[c].left; w <= ctxt->constituent[c].right; w++) {
129
+ printf("%s ", linkage->word[w]); /**PV**/
130
+ }
131
+ printf("\n");
132
+ }
133
+
134
+ /******************************************************
135
+ * These functions do the bulk of the actual
136
+ * constituent-generating; they're called once for each
137
+ * sublinkage
138
+ *********************************************************/
139
+
140
+ /**
141
+ * This function looks for constituents of type ctype1. Say it finds
142
+ * one, call it c1. It searches for the next larger constituent of
143
+ * type ctype2, call it c2. It then generates a new constituent of
144
+ * ctype3, containing all the words in c2 but not c1.
145
+ */
146
+ static int gen_comp(con_context_t *ctxt, Linkage linkage,
147
+ int numcon_total, int numcon_subl,
148
+ const char * ctype1, const char * ctype2,
149
+ const char * ctype3, int x)
150
+ {
151
+ int w, w2, w3, c, c1, c2, done;
152
+ c = numcon_total + numcon_subl;
153
+
154
+ for (c1=numcon_total; c1<numcon_total + numcon_subl; c1++)
155
+ {
156
+ /* If ctype1 is NP, it has to be an appositive to continue */
157
+ if ((x==4) && (post_process_match("MX#*", ctxt->constituent[c1].start_link)==0))
158
+ continue;
159
+
160
+ /* If ctype1 is X, and domain_type is t, it's an infinitive - skip it */
161
+ if ((x==2) && (ctxt->constituent[c1].domain_type=='t'))
162
+ continue;
163
+
164
+ /* If it's domain-type z, it's a subject-relative clause;
165
+ the VP doesn't need an NP */
166
+ if (ctxt->constituent[c1].domain_type=='z')
167
+ continue;
168
+
169
+ /* If ctype1 is X or VP, and it's not started by an S, don't generate an NP
170
+ (Neither of the two previous checks is necessary now, right?) */
171
+ if ((x==1 || x==2) &&
172
+ (((post_process_match("S", ctxt->constituent[c1].start_link) == 0) &&
173
+ (post_process_match("SX", ctxt->constituent[c1].start_link) == 0) &&
174
+ (post_process_match("SF", ctxt->constituent[c1].start_link) == 0)) ||
175
+ (post_process_match("S##w", ctxt->constituent[c1].start_link) != 0)))
176
+ continue;
177
+
178
+ /* If it's an SBAR (relative clause case), it has to be a relative clause */
179
+ if ((x==3) &&
180
+ ((post_process_match("Rn", ctxt->constituent[c1].start_link) == 0) &&
181
+ (post_process_match("R*", ctxt->constituent[c1].start_link) == 0) &&
182
+ (post_process_match("MX#r", ctxt->constituent[c1].start_link) == 0) &&
183
+ (post_process_match("Mr", ctxt->constituent[c1].start_link) == 0) &&
184
+ (post_process_match("MX#d", ctxt->constituent[c1].start_link) == 0)))
185
+ continue;
186
+
187
+ /* If ctype1 is SBAR (clause opener case), it has to be an f domain */
188
+ if ((x==5) && (ctxt->constituent[c1].domain_type!='f'))
189
+ continue;
190
+
191
+ /* If ctype1 is SBAR (pp opener case), it has to be a g domain */
192
+ if ((x==6) && (ctxt->constituent[c1].domain_type!='g'))
193
+ continue;
194
+
195
+ /* If ctype1 is NP (paraphrase case), it has to be started by an SI */
196
+ if ((x==7) && (post_process_match("SI", ctxt->constituent[c1].start_link)==0))
197
+ continue;
198
+
199
+ /* If ctype1 is VP (participle modifier case), it has to be
200
+ started by an Mv or Mg */
201
+ if ((x==8) && (post_process_match("M", ctxt->constituent[c1].start_link)==0))
202
+ continue;
203
+
204
+ /* If ctype1 is VP (participle opener case), it has
205
+ to be started by a COp */
206
+ if ((x==9) && (post_process_match("COp", ctxt->constituent[c1].start_link)==0))
207
+ continue;
208
+
209
+ /* Now start at the bounds of c1, and work outwards until you
210
+ find a larger constituent of type ctype2 */
211
+ if (!(strcmp(ctxt->constituent[c1].type, ctype1)==0))
212
+ continue;
213
+
214
+ if (verbosity >= 2)
215
+ printf("Generating complement constituent for c %d of type %s\n",
216
+ c1, ctype1);
217
+ done = 0;
218
+ for (w2=ctxt->constituent[c1].left; (done==0) && (w2>=0); w2--) {
219
+ for (w3=ctxt->constituent[c1].right; w3<linkage->num_words; w3++) {
220
+ for (c2=numcon_total; (done==0) &&
221
+ (c2 < numcon_total + numcon_subl); c2++) {
222
+ if (!((ctxt->constituent[c2].left==w2) &&
223
+ (ctxt->constituent[c2].right==w3)) || (c2==c1))
224
+ continue;
225
+ if (!(strcmp(ctxt->constituent[c2].type, ctype2)==0))
226
+ continue;
227
+
228
+ /* if the new constituent (c) is to the left
229
+ of c1, its right edge should be adjacent to the
230
+ left edge of c1 - or as close as possible
231
+ without going outside the current sublinkage.
232
+ (Or substituting right and left as necessary.) */
233
+
234
+ if ((x==5) || (x==6) || (x==9)) {
235
+ /* This is the case where c is to the
236
+ RIGHT of c1 */
237
+ w = ctxt->constituent[c1].right+1;
238
+ while(1) {
239
+ if (ctxt->word_used[linkage->current][w]==1)
240
+ break;
241
+ w++;
242
+ }
243
+ if (w > ctxt->constituent[c2].right)
244
+ {
245
+ done=1;
246
+ continue;
247
+ }
248
+ ctxt->constituent[c].left = w;
249
+ ctxt->constituent[c].right = ctxt->constituent[c2].right;
250
+ }
251
+ else {
252
+ w = ctxt->constituent[c1].left-1;
253
+ while(1) {
254
+ if (ctxt->word_used[linkage->current][w] == 1)
255
+ break;
256
+ w--;
257
+ }
258
+ if (w < ctxt->constituent[c2].left) {
259
+ done=1;
260
+ continue;
261
+ }
262
+ ctxt->constituent[c].right = w;
263
+ ctxt->constituent[c].left = ctxt->constituent[c2].left;
264
+ }
265
+
266
+ adjust_for_left_comma(ctxt, linkage, c1);
267
+ adjust_for_right_comma(ctxt, linkage, c1);
268
+
269
+ ctxt->constituent[c].type =
270
+ string_set_add(ctype3, ctxt->phrase_ss);
271
+ ctxt->constituent[c].domain_type = 'x';
272
+ ctxt->constituent[c].start_link =
273
+ string_set_add("XX", ctxt->phrase_ss);
274
+ ctxt->constituent[c].start_num =
275
+ ctxt->constituent[c1].start_num; /* bogus */
276
+ if (verbosity >= 2)
277
+ {
278
+ printf("Larger c found: c %d (%s); ",
279
+ c2, ctype2);
280
+ printf("Adding constituent:\n");
281
+ print_constituent(ctxt, linkage, c);
282
+ }
283
+ c++;
284
+ if (MAXCONSTITUENTS <= c)
285
+ {
286
+ err_ctxt ec;
287
+ ec.sent = linkage->sent;
288
+ err_msg(&ec, Error, "Error: Too many constituents (a).\n");
289
+ c--;
290
+ }
291
+ done = 1;
292
+ }
293
+ }
294
+ }
295
+ if (verbosity >= 2)
296
+ {
297
+ if (done == 0)
298
+ printf("No constituent added, because no larger %s " \
299
+ " was found\n", ctype2);
300
+ }
301
+ }
302
+ numcon_subl = c - numcon_total;
303
+ return numcon_subl;
304
+ }
305
+
306
+ /**
307
+ * Look for a constituent started by an MVs or MVg.
308
+ * Find any VP's or ADJP's that contain it (without going
309
+ * beyond a larger S or NP). Adjust them so that
310
+ * they end right before the m domain starts.
311
+ */
312
+ static void adjust_subordinate_clauses(con_context_t *ctxt, Linkage linkage,
313
+ int numcon_total,
314
+ int numcon_subl)
315
+ {
316
+ int c, w, c2, w2, done;
317
+
318
+ for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
319
+ if ((post_process_match("MVs", ctxt->constituent[c].start_link) == 1) ||
320
+ (post_process_match("MVg", ctxt->constituent[c].start_link)==1)) {
321
+ done=0;
322
+ for (w2=ctxt->constituent[c].left-1; (done==0) && w2>=0; w2--) {
323
+ for (c2=numcon_total; c2<numcon_total + numcon_subl; c2++) {
324
+ if (!((ctxt->constituent[c2].left==w2) &&
325
+ (ctxt->constituent[c2].right >= ctxt->constituent[c].right)))
326
+ continue;
327
+ if ((strcmp(ctxt->constituent[c2].type, "S") == 0) ||
328
+ (strcmp(ctxt->constituent[c2].type, "NP") == 0)) {
329
+ done=1;
330
+ break;
331
+ }
332
+ if ((ctxt->constituent[c2].domain_type == 'v') ||
333
+ (ctxt->constituent[c2].domain_type == 'a')) {
334
+ w = ctxt->constituent[c].left-1;
335
+ while (1) {
336
+ if (ctxt->word_used[linkage->current][w] == 1) break;
337
+ w--;
338
+ }
339
+ ctxt->constituent[c2].right = w;
340
+
341
+ if (verbosity >= 2)
342
+ printf("Adjusting constituent %d:\n", c2);
343
+ print_constituent(ctxt, linkage, c2);
344
+ }
345
+ }
346
+ }
347
+ if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
348
+ ctxt->constituent[c].left++;
349
+ }
350
+ }
351
+ }
352
+
353
+ /******************************************************
354
+ * These functions are called once, after constituents
355
+ * for each sublinkage have been generated, to merge them
356
+ * together and fix up some other things.
357
+ *
358
+ ********************************************************/
359
+
360
+ /**
361
+ * Here we're looking for the next andlist element to add on
362
+ * to a conjectural andlist, stored in the array templist.
363
+ * We go through the constituents, starting at "start".
364
+ */
365
+ static int find_next_element(con_context_t *ctxt,
366
+ Linkage linkage,
367
+ int start,
368
+ int numcon_total,
369
+ int num_elements,
370
+ int num_lists)
371
+ {
372
+ int c, a, ok, c2, c3, addedone=0, n;
373
+
374
+ assert(num_elements <= MAX_ELTS, "Constituent element array overflow!\n");
375
+
376
+ n = num_lists;
377
+ for (c=start+1; c<numcon_total; c++)
378
+ {
379
+ constituent_t *cc = &ctxt->constituent[c];
380
+
381
+ if (cc->valid == 0)
382
+ continue;
383
+ if (strcmp(ctxt->constituent[ctxt->templist[0]].type, cc->type)!=0)
384
+ continue;
385
+ ok = 1;
386
+
387
+ /* We're considering adding constituent c to the andlist.
388
+ If c is in the same sublinkage as one of the other andlist
389
+ elements, don't add it. If it overlaps with one of the other
390
+ constituents, don't add it. If there's a constituent
391
+ identical to c that occurs in a sublinkage in which one of
392
+ the other elements occurs, don't add it. */
393
+
394
+ for (a=0; a<num_elements; a++)
395
+ {
396
+ int t = ctxt->templist[a];
397
+ constituent_t *ct = &ctxt->constituent[t];
398
+
399
+ if (cc->subl == ct->subl)
400
+ ok=0;
401
+ if (((cc->left < ct->left) && (cc->right > ct->left))
402
+ ||
403
+ ((cc->right > ct->right) && (cc->left < ct->right))
404
+ ||
405
+ ((cc->right > ct->right) && (cc->left < ct->right))
406
+ ||
407
+ ((cc->left > ct->left) && (cc->right < ct->right)))
408
+ ok=0;
409
+
410
+ for (c2=0; c2<numcon_total; c2++)
411
+ {
412
+ if (ctxt->constituent[c2].canon != cc->canon)
413
+ continue;
414
+ for (c3=0; c3<numcon_total; c3++)
415
+ {
416
+ if ((ctxt->constituent[c3].canon == ct->canon)
417
+ && (ctxt->constituent[c3].subl == ctxt->constituent[c2].subl))
418
+ ok=0;
419
+ }
420
+ }
421
+ }
422
+ if (ok == 0) continue;
423
+
424
+ ctxt->templist[num_elements] = c;
425
+ addedone = 1;
426
+ num_lists = find_next_element(ctxt, linkage, c, numcon_total,
427
+ num_elements+1, num_lists);
428
+
429
+ /* Test for overflow of the and-list.
430
+ * With the current parser, the following will cause an
431
+ * overflow:
432
+ *
433
+ * I have not seen the grysbok, or the suni, or the dibitag, or
434
+ * the lechwi, or the aoul, or the gerenuk, or the blaauwbok,
435
+ * or the chevrotain, or lots of others, but who in the world
436
+ * could guess what they were or what they looked like, judging
437
+ * only from the names?
438
+ */
439
+ if (MAX_ANDS <= num_lists)
440
+ {
441
+ err_ctxt ec;
442
+ ec.sent = linkage->sent;
443
+ err_msg(&ec, Error, "Error: Constituent overflowed andlist!\n");
444
+ return MAX_ANDS;
445
+ }
446
+ }
447
+
448
+ if (addedone == 0 && num_elements > 1)
449
+ {
450
+ for (a=0; a<num_elements; a++) {
451
+ ctxt->andlist[num_lists].e[a] = ctxt->templist[a];
452
+ ctxt->andlist[num_lists].num = num_elements;
453
+ }
454
+ num_lists++;
455
+ }
456
+ return num_lists;
457
+ }
458
+
459
+ static int merge_constituents(con_context_t *ctxt, Linkage linkage, int numcon_total)
460
+ {
461
+ int c1, c2=0, c3, ok, a, n, a2, n2, match, listmatch, a3;
462
+ int num_lists, num_elements;
463
+ int leftend, rightend;
464
+
465
+ for (c1=0; c1<numcon_total; c1++)
466
+ {
467
+ ctxt->constituent[c1].valid = 1;
468
+
469
+ /* Find and invalidate any constituents with negative length */
470
+ if(ctxt->constituent[c1].right < ctxt->constituent[c1].left)
471
+ {
472
+ if(verbosity >= 2)
473
+ {
474
+ err_ctxt ec;
475
+ ec.sent = linkage->sent;
476
+ err_msg(&ec, Warn,
477
+ "Warning: Constituent %d has negative length. Deleting it.\n", c1);
478
+ }
479
+ ctxt->constituent[c1].valid = 0;
480
+ }
481
+ ctxt->constituent[c1].canon = c1;
482
+ }
483
+
484
+ /* First go through and give each constituent a canonical number
485
+ (the index number of the lowest-numbered constituent
486
+ identical to it) */
487
+
488
+ for (c1 = 0; c1 < numcon_total; c1++)
489
+ {
490
+ if (ctxt->constituent[c1].canon != c1) continue;
491
+ for (c2 = c1 + 1; c2 < numcon_total; c2++)
492
+ {
493
+ if ((ctxt->constituent[c1].left == ctxt->constituent[c2].left) &&
494
+ (ctxt->constituent[c1].right == ctxt->constituent[c2].right) &&
495
+ (strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0))
496
+ {
497
+ ctxt->constituent[c2].canon = c1;
498
+ }
499
+ }
500
+ }
501
+
502
+ /* If constituents A and B in different sublinkages X and Y
503
+ * have one endpoint in common, but A is larger at the other end,
504
+ * and B has no duplicate in X, then declare B invalid. (Example:
505
+ * " [A [B We saw the cat B] and the dog A] "
506
+ */
507
+ for (c1 = 0; c1 < numcon_total; c1++)
508
+ {
509
+ if (ctxt->constituent[c1].valid == 0) continue;
510
+ for (c2 = 0; c2 < numcon_total; c2++)
511
+ {
512
+ if (ctxt->constituent[c2].subl == ctxt->constituent[c1].subl) continue;
513
+ ok = 1;
514
+ /* Does c2 have a duplicate in the sublinkage containing c1?
515
+ If so, bag it */
516
+ for (c3 = 0; c3 < numcon_total; c3++)
517
+ {
518
+ if ((ctxt->constituent[c2].canon == ctxt->constituent[c3].canon) &&
519
+ (ctxt->constituent[c3].subl == ctxt->constituent[c1].subl))
520
+ ok = 0;
521
+ }
522
+ for (c3 = 0; c3 < numcon_total; c3++)
523
+ {
524
+ if ((ctxt->constituent[c1].canon == ctxt->constituent[c3].canon) &&
525
+ (ctxt->constituent[c3].subl == ctxt->constituent[c2].subl))
526
+ ok = 0;
527
+ }
528
+ if (ok == 0) continue;
529
+ if ((ctxt->constituent[c1].left == ctxt->constituent[c2].left) &&
530
+ (ctxt->constituent[c1].right > ctxt->constituent[c2].right) &&
531
+ (strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0))
532
+ {
533
+ ctxt->constituent[c2].valid = 0;
534
+ }
535
+
536
+ if ((ctxt->constituent[c1].left < ctxt->constituent[c2].left) &&
537
+ (ctxt->constituent[c1].right == ctxt->constituent[c2].right) &&
538
+ (strcmp(ctxt->constituent[c1].type, ctxt->constituent[c2].type) == 0))
539
+ {
540
+ ctxt->constituent[c2].valid = 0;
541
+ }
542
+ }
543
+ }
544
+
545
+ /* Now go through and find duplicates; if a pair is found,
546
+ * mark one as invalid. (It doesn't matter if they're in the
547
+ * same sublinkage or not)
548
+ */
549
+ for (c1 = 0; c1 < numcon_total; c1++)
550
+ {
551
+ if (ctxt->constituent[c1].valid == 0) continue;
552
+ for (c2 = c1 + 1; c2 < numcon_total; c2++)
553
+ {
554
+ if (ctxt->constituent[c2].canon == ctxt->constituent[c1].canon)
555
+ ctxt->constituent[c2].valid = 0;
556
+ }
557
+ }
558
+
559
+ /* Now we generate the and-lists. An and-list is a set of mutually
560
+ * exclusive constituents. Each constituent in the list may not
561
+ * be present in the same sublinkage as any of the others.
562
+ */
563
+ num_lists = 0;
564
+ for (c1 = 0; c1 < numcon_total; c1++)
565
+ {
566
+ if (ctxt->constituent[c1].valid == 0) continue;
567
+ num_elements = 1;
568
+ ctxt->templist[0] = c1;
569
+ num_lists = find_next_element(ctxt, linkage, c1, numcon_total,
570
+ num_elements, num_lists);
571
+
572
+ /* If we're overflowing, then punt */
573
+ if (MAX_ANDS <= num_lists)
574
+ break;
575
+ }
576
+
577
+ if (verbosity >= 2)
578
+ {
579
+ printf("And-lists:\n");
580
+ for (n=0; n<num_lists; n++)
581
+ {
582
+ printf(" %d: ", n);
583
+ for (a=0; a < ctxt->andlist[n].num; a++)
584
+ {
585
+ printf("%d ", ctxt->andlist[n].e[a]);
586
+ }
587
+ printf("\n");
588
+ }
589
+ }
590
+
591
+ /* Now we prune out any andlists that are subsumed by other
592
+ * andlists--e.g. if andlist X contains constituents A and B,
593
+ * and Y contains A B and C, we throw out X
594
+ */
595
+ for (n = 0; n < num_lists; n++)
596
+ {
597
+ ctxt->andlist[n].valid = 1;
598
+ for (n2 = 0; n2 < num_lists; n2++)
599
+ {
600
+ if (n2 == n) continue;
601
+ if (ctxt->andlist[n2].num < ctxt->andlist[n].num)
602
+ continue;
603
+
604
+ listmatch = 1;
605
+ for (a = 0; a < ctxt->andlist[n].num; a++)
606
+ {
607
+ match = 0;
608
+ for (a2 = 0; a2 < ctxt->andlist[n2].num; a2++)
609
+ {
610
+ if (ctxt->andlist[n2].e[a2] == ctxt->andlist[n].e[a])
611
+ match = 1;
612
+ }
613
+ if (match == 0) listmatch = 0;
614
+ /* At least one element was not matched by n2 */
615
+ }
616
+ if (listmatch == 1) ctxt->andlist[n].valid = 0;
617
+ }
618
+ }
619
+
620
+ /* If an element of an andlist contains an element of another
621
+ * andlist, it must contain the entire andlist.
622
+ */
623
+ for (n = 0; n < num_lists; n++)
624
+ {
625
+ if (ctxt->andlist[n].valid == 0)
626
+ continue;
627
+ for (a = 0; (a < ctxt->andlist[n].num) && (ctxt->andlist[n].valid); a++)
628
+ {
629
+ for (n2 = 0; (n2 < num_lists) && (ctxt->andlist[n].valid); n2++)
630
+ {
631
+ if ((n2 == n) || (ctxt->andlist[n2].valid == 0))
632
+ continue;
633
+ for (a2 = 0; (a2 < ctxt->andlist[n2].num) && (ctxt->andlist[n].valid); a2++)
634
+ {
635
+ c1 = ctxt->andlist[n].e[a];
636
+ c2 = ctxt->andlist[n2].e[a2];
637
+ if (c1 == c2)
638
+ continue;
639
+ if (!((ctxt->constituent[c2].left <= ctxt->constituent[c1].left) &&
640
+ (ctxt->constituent[c2].right >= ctxt->constituent[c1].right)))
641
+ continue;
642
+ if (verbosity >= 2)
643
+ printf("Found that c%d in list %d is bigger " \
644
+ "than c%d in list %d\n", c2, n2, c1, n);
645
+ ok = 1;
646
+
647
+ /* An element of n2 contains an element of n.
648
+ * Now, we check to see if that element of n2
649
+ * contains ALL the elements of n.
650
+ * If not, n is invalid.
651
+ */
652
+ for (a3 = 0; a3 < ctxt->andlist[n].num; a3++)
653
+ {
654
+ c3 = ctxt->andlist[n].e[a3];
655
+ if ((ctxt->constituent[c2].left>ctxt->constituent[c3].left) ||
656
+ (ctxt->constituent[c2].right<ctxt->constituent[c3].right))
657
+ ok = 0;
658
+ }
659
+ if (ok != 0)
660
+ continue;
661
+ ctxt->andlist[n].valid = 0;
662
+ if (verbosity >= 2)
663
+ {
664
+ printf("Eliminating andlist, " \
665
+ "n=%d, a=%d, n2=%d, a2=%d: ",
666
+ n, a, n2, a2);
667
+ for (a3 = 0; a3 < ctxt->andlist[n].num; a3++)
668
+ {
669
+ printf("%d ", ctxt->andlist[n].e[a3]);
670
+ }
671
+ printf("\n");
672
+ }
673
+ }
674
+ }
675
+ }
676
+ }
677
+
678
+ if (verbosity >= 2)
679
+ {
680
+ printf("And-lists after pruning:\n");
681
+ for (n=0; n<num_lists; n++) {
682
+ if (ctxt->andlist[n].valid==0)
683
+ continue;
684
+ printf(" %d: ", n);
685
+ for (a=0; a<ctxt->andlist[n].num; a++) {
686
+ printf("%d ", ctxt->andlist[n].e[a]);
687
+ }
688
+ printf("\n");
689
+ }
690
+ }
691
+
692
+ c1 = numcon_total;
693
+ for (n = 0; n < num_lists; n++)
694
+ {
695
+ if (ctxt->andlist[n].valid == 0) continue;
696
+ leftend = 256;
697
+ rightend = -1;
698
+ for (a = 0; a < ctxt->andlist[n].num; a++)
699
+ {
700
+ c2 = ctxt->andlist[n].e[a];
701
+ if (ctxt->constituent[c2].left < leftend)
702
+ {
703
+ leftend = ctxt->constituent[c2].left;
704
+ }
705
+ if (ctxt->constituent[c2].right > rightend)
706
+ {
707
+ rightend=ctxt->constituent[c2].right;
708
+ }
709
+ }
710
+
711
+ ctxt->constituent[c1].left = leftend;
712
+ ctxt->constituent[c1].right = rightend;
713
+ ctxt->constituent[c1].type = ctxt->constituent[c2].type;
714
+ ctxt->constituent[c1].domain_type = 'x';
715
+ ctxt->constituent[c1].valid = 1;
716
+ ctxt->constituent[c1].start_link = ctxt->constituent[c2].start_link; /* bogus */
717
+ ctxt->constituent[c1].start_num = ctxt->constituent[c2].start_num; /* bogus */
718
+
719
+ #ifdef AUX_CODE_IS_DEAD /* See comments above */
720
+ /* If a constituent within the andlist is an aux (aux==1),
721
+ * set aux for the whole-list constituent to 2, also set
722
+ * aux for the smaller constituent to 2, meaning they'll both
723
+ * be printed (as an "X"). (If aux is 2 for the smaller
724
+ * constituent going in, the same thing should be done,
725
+ * though I doubt this ever happens.)
726
+ */
727
+ for (a = 0; a < ctxt->andlist[n].num; a++)
728
+ {
729
+ c2 = ctxt->andlist[n].e[a];
730
+ if ((ctxt->constituent[c2].aux == 1) || (ctxt->constituent[c2].aux == 2))
731
+ {
732
+ ctxt->constituent[c1].aux = 2;
733
+ ctxt->constituent[c2].aux = 2;
734
+ }
735
+ }
736
+ #endif /* AUX_CODE_IS_DEAD */
737
+
738
+ if (verbosity >= 2)
739
+ printf("Adding constituent:\n");
740
+ print_constituent(ctxt, linkage, c1);
741
+ c1++;
742
+ }
743
+ numcon_total = c1;
744
+ return numcon_total;
745
+ }
746
+
747
+ /**
748
+ * Go through all the words. If a word is on the right end of
749
+ * an S (or SF or SX), wordtype[w]=STYPE. If it's also on the left end of a
750
+ * Pg*b, I, PP, or Pv, wordtype[w]=PTYPE. If it's a question-word
751
+ * used in an indirect question, wordtype[w]=QTYPE. If it's a
752
+ * question-word determiner, wordtype[w]=QDTYPE. Else wordtype[w]=NONE.
753
+ * (This function is called once for each sublinkage.)
754
+ */
755
+ static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage)
756
+ {
757
+ int l1, l2, w1, w2;
758
+ const char * label1, * label2;
759
+
760
+ for (w1=0; w1<linkage->num_words; w1++)
761
+ ctxt->wordtype[w1]=NONE;
762
+
763
+ for (l1=0; l1<linkage_get_num_links(linkage); l1++) {
764
+ w1=linkage_get_link_rword(linkage, l1);
765
+ label1 = linkage_get_link_label(linkage, l1);
766
+ if ((uppercompare(label1, "S")==0) ||
767
+ (uppercompare(label1, "SX")==0) ||
768
+ (uppercompare(label1, "SF")==0)) {
769
+ ctxt->wordtype[w1] = STYPE;
770
+ for (l2=0; l2<linkage_get_num_links(linkage); l2++) {
771
+ w2=linkage_get_link_lword(linkage, l2);
772
+ label2 = linkage_get_link_label(linkage, l2);
773
+ if ((w1==w2) &&
774
+ ((post_process_match("Pg#b", label2)==1) ||
775
+ (uppercompare(label2, "I")==0) ||
776
+ (uppercompare(label2, "PP")==0) ||
777
+ (post_process_match("Pv", label2)==1))) {
778
+ /* Pvf, Pgf? */
779
+ ctxt->wordtype[w1] = PTYPE;
780
+ }
781
+ }
782
+ }
783
+ if (post_process_match("QI#d", label1)==1) {
784
+ ctxt->wordtype[w1] = QTYPE;
785
+ for (l2=0; l2<linkage_get_num_links(linkage); l2++) {
786
+ w2=linkage_get_link_lword(linkage, l2);
787
+ label2 = linkage_get_link_label(linkage, l2);
788
+ if ((w1==w2) && (post_process_match("D##w", label2)==1)) {
789
+ ctxt->wordtype[w1] = QDTYPE;
790
+ }
791
+ }
792
+ }
793
+ if (post_process_match("Mr", label1)==1) ctxt->wordtype[w1] = QDTYPE;
794
+ if (post_process_match("MX#d", label1)==1) ctxt->wordtype[w1] = QDTYPE;
795
+ }
796
+ }
797
+
798
+ static int last_minute_fixes(con_context_t *ctxt, Linkage linkage, int numcon_total)
799
+ {
800
+ int c, c2, global_leftend_found, adjustment_made,
801
+ global_rightend_found, lastword, newcon_total = 0;
802
+ Sentence sent;
803
+ sent = linkage_get_sentence(linkage);
804
+
805
+ for (c = 0; c < numcon_total; c++)
806
+ {
807
+ /* In a paraphrase construction ("John ran, he said"),
808
+ the paraphrasing clause doesn't get
809
+ an S. (This is true in Treebank II, not Treebank I) */
810
+
811
+ if (uppercompare(ctxt->constituent[c].start_link, "CP") == 0)
812
+ {
813
+ ctxt->constituent[c].valid = 0;
814
+ }
815
+
816
+ /* If it's a possessive with an "'s", the NP on the left
817
+ should be extended to include the "'s". */
818
+ if ((uppercompare(ctxt->constituent[c].start_link, "YS") == 0) ||
819
+ (uppercompare(ctxt->constituent[c].start_link, "YP") == 0))
820
+ {
821
+ ctxt->constituent[c].right++;
822
+ }
823
+
824
+ /* If a constituent has starting link MVpn, it's a time
825
+ expression like "last week"; label it as a noun phrase
826
+ (incorrectly) */
827
+
828
+ if (strcmp(ctxt->constituent[c].start_link, "MVpn") == 0)
829
+ {
830
+ ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
831
+ }
832
+ if (strcmp(ctxt->constituent[c].start_link, "COn") == 0)
833
+ {
834
+ ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
835
+ }
836
+ if (strcmp(ctxt->constituent[c].start_link, "Mpn") == 0)
837
+ {
838
+ ctxt->constituent[c].type = string_set_add("NP", ctxt->phrase_ss);
839
+ }
840
+
841
+ /* If the constituent is an S started by "but" or "and" at
842
+ the beginning of the sentence, it should be ignored. */
843
+
844
+ if ((strcmp(ctxt->constituent[c].start_link, "Wdc") == 0) &&
845
+ (ctxt->constituent[c].left == 2))
846
+ {
847
+ ctxt->constituent[c].valid = 0;
848
+ }
849
+
850
+ /* For prenominal adjectives, an ADJP constituent is assigned
851
+ if it's a hyphenated (Ah) or comparative (Am) adjective;
852
+ otherwise no ADJP is assigned, unless the phrase is more
853
+ than one word long (e.g. "very big"). The same with certain
854
+ types of adverbs. */
855
+ /* That was for Treebank I. For Treebank II, the rule only
856
+ seems to apply to prenominal adjectives (of all kinds).
857
+ However, it also applies to number expressions ("QP"). */
858
+
859
+ if ((post_process_match("A", ctxt->constituent[c].start_link) == 1) ||
860
+ (ctxt->constituent[c].domain_type == 'd') ||
861
+ (ctxt->constituent[c].domain_type == 'h')) {
862
+ if (ctxt->constituent[c].right-ctxt->constituent[c].left == 0)
863
+ {
864
+ ctxt->constituent[c].valid = 0;
865
+ }
866
+ }
867
+
868
+ if ((ctxt->constituent[c].domain_type == 'h') &&
869
+ (strcmp(linkage->word[ctxt->constituent[c].left - 1], "$") == 0))
870
+ {
871
+ ctxt->constituent[c].left--;
872
+ }
873
+
874
+ #ifdef AUX_CODE_IS_DEAD /* See comments at top */
875
+ /* If a constituent has type VP and its aux value is 2,
876
+ this means it's an aux that should be printed; change its
877
+ type to "X". If its aux value is 1, set "valid" to 0. (This
878
+ applies to Treebank I only) */
879
+
880
+ if (ctxt->constituent[c].aux == 2)
881
+ {
882
+ ctxt->constituent[c].type = string_set_add("X", ctxt->phrase_ss);
883
+ }
884
+ if (ctxt->constituent[c].aux == 1)
885
+ {
886
+ ctxt->constituent[c].valid = 0;
887
+ }
888
+ #endif /* AUX_CODE_IS_DEAD */
889
+ }
890
+
891
+ numcon_total = numcon_total + newcon_total;
892
+
893
+ /* If there's a global S constituent that includes everything
894
+ except a final period or question mark, extend it by one word */
895
+
896
+ for (c = 0; c < numcon_total; c++)
897
+ {
898
+ if ((ctxt->constituent[c].right == linkage->num_words -3) &&
899
+ (ctxt->constituent[c].left == 1) &&
900
+ (strcmp(ctxt->constituent[c].type, "S") == 0) &&
901
+ (strcmp(sent->word[linkage->num_words -2].string, ".") == 0))
902
+ ctxt->constituent[c].right++;
903
+ }
904
+
905
+ /* If there's no S boundary at the very left end of the sentence,
906
+ or the very right end, create a new S spanning the entire sentence */
907
+
908
+ lastword = linkage->num_words - 2;
909
+ global_leftend_found = 0;
910
+ global_rightend_found = 0;
911
+ for (c = 0; c < numcon_total; c++)
912
+ {
913
+ if ((ctxt->constituent[c].left == 1) && (strcmp(ctxt->constituent[c].type, "S") == 0) &&
914
+ (ctxt->constituent[c].valid == 1))
915
+ {
916
+ global_leftend_found = 1;
917
+ }
918
+ }
919
+ for (c = 0; c < numcon_total; c++)
920
+ {
921
+ if ((ctxt->constituent[c].right >= lastword) &&
922
+ (strcmp(ctxt->constituent[c].type, "S") == 0) && (ctxt->constituent[c].valid == 1))
923
+ {
924
+ global_rightend_found = 1;
925
+ }
926
+ }
927
+ if ((global_leftend_found == 0) || (global_rightend_found == 0))
928
+ {
929
+ c = numcon_total;
930
+ ctxt->constituent[c].left = 1;
931
+ ctxt->constituent[c].right = linkage->num_words-1;
932
+ ctxt->constituent[c].type = string_set_add("S", ctxt->phrase_ss);
933
+ ctxt->constituent[c].valid = 1;
934
+ ctxt->constituent[c].domain_type = 'x';
935
+ numcon_total++;
936
+ if (verbosity >= 2)
937
+ printf("Adding global sentence constituent:\n");
938
+ print_constituent(ctxt, linkage, c);
939
+ }
940
+
941
+ /* Check once more to see if constituents are nested (checking BETWEEN sublinkages
942
+ this time) */
943
+
944
+ while (1)
945
+ {
946
+ adjustment_made=0;
947
+ for (c = 0; c < numcon_total; c++)
948
+ {
949
+ if(ctxt->constituent[c].valid == 0) continue;
950
+ for (c2 = 0; c2 < numcon_total; c2++)
951
+ {
952
+ if(ctxt->constituent[c2].valid == 0) continue;
953
+ if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
954
+ (ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
955
+ (ctxt->constituent[c].right >= ctxt->constituent[c2].left))
956
+ {
957
+ if (verbosity >= 2)
958
+ {
959
+ err_ctxt ec;
960
+ ec.sent = linkage->sent;
961
+ err_msg(&ec, Warn, "Warning: the constituents aren't nested! "
962
+ "Adjusting them. (%d, %d)\n", c, c2);
963
+ }
964
+ ctxt->constituent[c].left = ctxt->constituent[c2].left;
965
+ }
966
+ }
967
+ }
968
+ if (adjustment_made == 0) break;
969
+ }
970
+ return numcon_total;
971
+ }
972
+
973
+ /**
974
+ * This function generates a table, word_used[i][w], showing
975
+ * whether each word w is used in each sublinkage i; if so,
976
+ * the value for that cell of the table is 1.
977
+ */
978
+ static void count_words_used(con_context_t *ctxt, Linkage linkage)
979
+ {
980
+ int i, w, link, num_subl;
981
+
982
+ num_subl = linkage->num_sublinkages;
983
+ if(linkage->unionized == 1 && num_subl > 1) num_subl--;
984
+
985
+ if (verbosity >= 2)
986
+ printf("Number of sublinkages = %d\n", num_subl);
987
+
988
+ for (i=0; i<num_subl; i++)
989
+ {
990
+ for (w = 0; w < linkage->num_words; w++) ctxt->word_used[i][w] = 0;
991
+ linkage->current = i;
992
+ for (link = 0; link < linkage_get_num_links(linkage); link++)
993
+ {
994
+ ctxt->word_used[i][linkage_get_link_lword(linkage, link)] = 1;
995
+ ctxt->word_used[i][linkage_get_link_rword(linkage, link)] = 1;
996
+ }
997
+ if (verbosity >= 2)
998
+ {
999
+ printf("Sublinkage %d: ", i);
1000
+ for (w = 0; w < linkage->num_words; w++)
1001
+ {
1002
+ if (ctxt->word_used[i][w] == 0) printf("0 ");
1003
+ if (ctxt->word_used[i][w] == 1) printf("1 ");
1004
+ }
1005
+ printf("\n");
1006
+ }
1007
+ }
1008
+ }
1009
+
1010
+ static int add_constituent(con_context_t *ctxt, int c, Linkage linkage, Domain domain,
1011
+ int l, int r, const char * name)
1012
+ {
1013
+ int nwords = linkage->num_words-2;
1014
+ c++;
1015
+
1016
+ /* Avoid running off end, to walls. */
1017
+ if (l < 1) l=1;
1018
+ if (r > nwords) r = nwords;
1019
+ if (l > nwords) l = nwords;
1020
+ assert(l <= r, "negative constituent length!" );
1021
+
1022
+ ctxt->constituent[c].left = l;
1023
+ ctxt->constituent[c].right = r;
1024
+ ctxt->constituent[c].domain_type = domain.type;
1025
+ ctxt->constituent[c].start_link =
1026
+ linkage_get_link_label(linkage, domain.start_link);
1027
+ ctxt->constituent[c].start_num = domain.start_link;
1028
+ ctxt->constituent[c].type = string_set_add(name, ctxt->phrase_ss);
1029
+ return c;
1030
+ }
1031
+
1032
+ static const char * cons_of_domain(Linkage linkage, char domain_type)
1033
+ {
1034
+ switch (domain_type) {
1035
+ case 'a':
1036
+ return "ADJP";
1037
+ case 'b':
1038
+ return "SBAR";
1039
+ case 'c':
1040
+ return "VP";
1041
+ case 'd':
1042
+ return "QP";
1043
+ case 'e':
1044
+ return "ADVP";
1045
+ case 'f':
1046
+ return "SBAR";
1047
+ case 'g':
1048
+ return "PP";
1049
+ case 'h':
1050
+ return "QP";
1051
+ case 'i':
1052
+ return "ADVP";
1053
+ case 'k':
1054
+ return "PRT";
1055
+ case 'n':
1056
+ return "NP";
1057
+ case 'p':
1058
+ return "PP";
1059
+ case 'q':
1060
+ return "SINV";
1061
+ case 's':
1062
+ return "S";
1063
+ case 't':
1064
+ return "VP";
1065
+ case 'u':
1066
+ return "ADJP";
1067
+ case 'v':
1068
+ return "VP";
1069
+ case 'y':
1070
+ return "NP";
1071
+ case 'z':
1072
+ return "VP";
1073
+ default:
1074
+ {
1075
+ err_ctxt ec;
1076
+ ec.sent = linkage->sent;
1077
+ err_msg(&ec, Error, "Error: Illegal domain: %c\n", domain_type);
1078
+ return "";
1079
+ }
1080
+ }
1081
+ }
1082
+
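+ /* For example, cons_of_domain(linkage, 'n') yields "NP" and
+ * cons_of_domain(linkage, 's') yields "S"; an unrecognized domain letter is
+ * reported as an error and mapped to the empty string.
+ */
+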
1083
+ static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
1084
+ int numcon_total, int s)
1085
+ {
1086
+ int d, c, leftlimit, l, leftmost, rightmost, w, c2, numcon_subl = 0, w2;
1087
+ List_o_links * dlink;
1088
+ int rootright, rootleft, adjustment_made;
1089
+ Sublinkage * subl;
1090
+ const char * name;
1091
+ Domain domain;
1092
+
1093
+ subl = &linkage->sublinkage[s];
1094
+
1095
+ for (d = 0, c = numcon_total; d < subl->pp_data.N_domains; d++, c++)
1096
+ {
1097
+ domain = subl->pp_data.domain_array[d];
1098
+ rootright = linkage_get_link_rword(linkage, domain.start_link);
1099
+ rootleft = linkage_get_link_lword(linkage, domain.start_link);
1100
+
1101
+ if ((domain.type=='c') ||
1102
+ (domain.type=='d') ||
1103
+ (domain.type=='e') ||
1104
+ (domain.type=='f') ||
1105
+ (domain.type=='g') ||
1106
+ (domain.type=='u') ||
1107
+ (domain.type=='y'))
1108
+ {
1109
+ leftlimit = 0;
1110
+ leftmost = linkage_get_link_lword(linkage, domain.start_link);
1111
+ rightmost = linkage_get_link_lword(linkage, domain.start_link);
1112
+ }
1113
+ else
1114
+ {
1115
+ leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1;
1116
+ leftmost = linkage_get_link_rword(linkage, domain.start_link);
1117
+ rightmost = linkage_get_link_rword(linkage, domain.start_link);
1118
+ }
1119
+
1120
+ /* Start by assigning both the left and right limits to one word
1121
+ * of the start link. That word will always be contained
1122
+ * in the constituent. This will also handle the case
1123
+ * where the domain contains no links.
1124
+ */
1125
+ for (dlink = domain.lol; dlink != NULL; dlink = dlink->next)
1126
+ {
1127
+ l = dlink->link;
1128
+
1129
+ if ((linkage_get_link_lword(linkage, l) < leftmost) &&
1130
+ (linkage_get_link_lword(linkage, l) >= leftlimit))
1131
+ {
1132
+ leftmost = linkage_get_link_lword(linkage, l);
1133
+ }
1134
+
1135
+ if (linkage_get_link_rword(linkage, l) > rightmost)
1136
+ {
1137
+ rightmost = linkage_get_link_rword(linkage, l);
1138
+ }
1139
+ }
1140
+
1141
+ c--;
1142
+ c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost,
1143
+ cons_of_domain(linkage, domain.type));
1144
+
1145
+ if (domain.type == 'z')
1146
+ {
1147
+ c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
1148
+ }
1149
+ if (domain.type=='c')
1150
+ {
1151
+ c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
1152
+ }
1153
+ if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
1154
+ (post_process_match("Rn", ctxt->constituent[c].start_link)==1))
1155
+ {
1156
+ c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "SBAR");
1157
+ }
1158
+ if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
1159
+ (post_process_match("MX#r", ctxt->constituent[c].start_link)==1))
1160
+ {
1161
+ w = leftmost;
1162
+ if (strcmp(linkage->word[w], ",") == 0) w++;
1163
+ c = add_constituent(ctxt, c, linkage, domain, w, w, "WHNP");
1164
+ }
1165
+ if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
1166
+ {
1167
+ w = leftmost;
1168
+ if (strcmp(linkage->word[w], ",") == 0) w++;
1169
+ c = add_constituent(ctxt, c, linkage, domain, w, w+1, "WHPP");
1170
+ c = add_constituent(ctxt, c, linkage, domain, w+1, w+1, "WHNP");
1171
+ }
1172
+ if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
1173
+ (post_process_match("B#d", ctxt->constituent[c].start_link)==1))
1174
+ {
1175
+ c = add_constituent(ctxt, c, linkage, domain, rootleft, rootleft, "WHNP");
1176
+ c = add_constituent(ctxt, c, linkage, domain,
1177
+ rootleft, ctxt->constituent[c-1].right, "SBAR");
1178
+ }
1179
+ if (post_process_match("CP", ctxt->constituent[c].start_link)==1)
1180
+ {
1181
+ if (strcmp(linkage->word[leftmost], ",") == 0)
1182
+ ctxt->constituent[c].left++;
1183
+ c = add_constituent(ctxt, c, linkage, domain, 1, linkage->num_words-1, "S");
1184
+ }
1185
+ if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
1186
+ (domain.type=='f'))
1187
+ {
1188
+ w = ctxt->constituent[c].left;
1189
+ if (strcmp(linkage->word[w], ",") == 0)
1190
+ w++;
1191
+ if (strcmp(linkage->word[w], "when") == 0)
1192
+ {
1193
+ c = add_constituent(ctxt, c, linkage, domain, w, w, "WHADVP");
1194
+ }
1195
+ }
1196
+ if (domain.type=='t')
1197
+ {
1198
+ c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
1199
+ }
1200
+ if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
1201
+ (post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
1202
+ (post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
1203
+ {
1204
+ w = leftmost;
1205
+ if (strcmp(linkage->word[w], ",") == 0) w++;
1206
+ if (ctxt->wordtype[w] == NONE)
1207
+ name = "WHADVP";
1208
+ else if (ctxt->wordtype[w] == QTYPE)
1209
+ name = "WHNP";
1210
+ else if (ctxt->wordtype[w] == QDTYPE)
1211
+ name = "WHNP";
1212
+ else
1213
+ assert(0, "Unexpected word type");
1214
+ c = add_constituent(ctxt, c, linkage, domain, w, w, name);
1215
+
1216
+ if (ctxt->wordtype[w] == QDTYPE)
1217
+ {
1218
+ /* Now find the finite verb to the right, start an S */
1219
+ /* Limit w2 to sentence length. */
1220
+ // for( w2=w+1; w2 < ctxt->r_limit-1; w2++ )
1221
+ for (w2 = w+1; w2 < rightmost; w2++)
1222
+ if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;
1223
+
1224
+ /* Adjust the right boundary of previous constituent */
1225
+ ctxt->constituent[c].right = w2 - 1;
1226
+ c = add_constituent(ctxt, c, linkage, domain, w2, rightmost, "S");
1227
+ }
1228
+ }
1229
+
1230
+ if (ctxt->constituent[c].domain_type == '\0')
1231
+ {
1232
+ err_ctxt ec;
1233
+ ec.sent = linkage->sent;
1234
+ err_msg(&ec, Error, "Error: no domain type assigned to constituent\n");
1235
+ }
1236
+ if (ctxt->constituent[c].start_link == NULL)
1237
+ {
1238
+ err_ctxt ec;
1239
+ ec.sent = linkage->sent;
1240
+ err_msg(&ec, Error, "Error: no type assigned to constituent\n");
1241
+ }
1242
+ }
1243
+
1244
+ numcon_subl = c - numcon_total;
1245
+ /* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */
1246
+
1247
+ if (verbosity >= 2)
1248
+ printf("Constituents added at first stage for subl %d:\n",
1249
+ linkage->current);
1250
+ for (c = numcon_total; c < numcon_total + numcon_subl; c++)
1251
+ {
1252
+ print_constituent(ctxt, linkage, c);
1253
+ }
1254
+
1255
+ /* Opener case - generates S around main clause.
1256
+ (This must be done first; the S generated will be needed for
1257
+ later cases.) */
1258
+ numcon_subl =
1259
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", 5);
1260
+
1261
+ /* pp opener case */
1262
+ numcon_subl =
1263
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", 6);
1264
+
1265
+ /* participle opener case */
1266
+ numcon_subl =
1267
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", 9);
1268
+
1269
+ /* Subject-phrase case; every main VP generates an S */
1270
+ numcon_subl =
1271
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", 1);
1272
+
1273
+ /* Relative clause case; an SBAR generates a complement NP */
1274
+ numcon_subl =
1275
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", 3);
1276
+
1277
+ /* Participle modifier case */
1278
+ numcon_subl =
1279
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", 8);
1280
+
1281
+ /* PP modifying NP */
1282
+ numcon_subl =
1283
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", 8);
1284
+
1285
+ /* Appositive case */
1286
+ numcon_subl =
1287
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", 4);
1288
+
1289
+ /* S-V inversion case; an NP generates a complement VP */
1290
+ numcon_subl =
1291
+ gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", 7);
1292
+
1293
+ adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
1294
+ for (c = numcon_total; c < numcon_total + numcon_subl; c++)
1295
+ {
1296
+ if ((ctxt->constituent[c].domain_type=='p') &&
1297
+ (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0))
1298
+ {
1299
+ ctxt->constituent[c].left++;
1300
+ }
1301
+ }
1302
+
1303
+ /* Make sure the constituents are nested. If two constituents
1304
+ * are not nested: whichever constituent has the furthest left
1305
+ * boundary, shift that boundary rightwards to the left boundary
1306
+ * of the other one.
1307
+ */
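+ /* For example (hypothetical spans): constituents [2..5] and [4..8] overlap
+ * without nesting; unless one of the comma adjustments below applies, the
+ * left boundary of the first is moved right, giving [4..5] inside [4..8].
+ */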
1308
+ while (1)
1309
+ {
1310
+ adjustment_made = 0;
1311
+ for (c = numcon_total; c < numcon_total + numcon_subl; c++)
1312
+ {
1313
+ for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
1314
+ {
1315
+ if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
1316
+ (ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
1317
+ (ctxt->constituent[c].right >= ctxt->constituent[c2].left))
1318
+ {
1319
+ /* We've found two overlapping constituents.
1320
+ If one would contain the other, except that the smaller one
1321
+ includes an extra comma, adjust the smaller one
1322
+ to exclude the comma */
1323
+
1324
+ if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
1325
+ (strcmp(linkage->word[ctxt->constituent[c2].right],
1326
+ "RIGHT-WALL") == 0))
1327
+ {
1328
+ if (verbosity >= 2)
1329
+ printf("Adjusting %d to fix comma overlap\n", c2);
1330
+ adjust_for_right_comma(ctxt, linkage, c2);
1331
+ adjustment_made = 1;
1332
+ }
1333
+ else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
1334
+ {
1335
+ if (verbosity >= 2)
1336
+ printf("Adjusting c %d to fix comma overlap\n", c);
1337
+ adjust_for_left_comma(ctxt, linkage, c);
1338
+ adjustment_made = 1;
1339
+ }
1340
+ else
1341
+ {
1342
+ if (verbosity >= 2)
1343
+ {
1344
+ err_ctxt ec;
1345
+ ec.sent = linkage->sent;
1346
+ err_msg(&ec, Warn,
1347
+ "Warning: the constituents aren't nested! "
1348
+ "Adjusting them. (%d, %d)\n", c, c2);
1349
+ }
1350
+ ctxt->constituent[c].left = ctxt->constituent[c2].left;
1351
+ }
1352
+ }
1353
+ }
1354
+ }
1355
+ if (adjustment_made == 0) break;
1356
+ }
1357
+
1358
+ #ifdef AUX_CODE_IS_DEAD
1359
+ /* This code is ifdef'd out because it appears to be dead: the computation it does
1360
+ * is immediately undone in the very next block.
1361
+ */
1362
+ /* This labels certain words as auxiliaries (such as forms of "be"
1363
+ * with passives, forms of "have" wth past participles,
1364
+ * "to" with infinitives). These words start VP's which include
1365
+ * them. In Treebank I, these don't get printed unless they're part of an
1366
+ * andlist, in which case they get labeled "X". (this is why we need to
1367
+ * label them as "aux".) In Treebank II, however, they seem to be treated
1368
+ * just like other verbs, so the "aux" stuff isn't needed.
1369
+ */
1370
+ for (c = numcon_total; c < numcon_total + numcon_subl; c++)
1371
+ {
1372
+ ctxt->constituent[c].subl = linkage->current;
1373
+ if (((ctxt->constituent[c].domain_type == 'v') &&
1374
+ (ctxt->wordtype[linkage_get_link_rword(linkage,
1375
+ ctxt->constituent[c].start_num)] == PTYPE))
1376
+ ||
1377
+ ((ctxt->constituent[c].domain_type == 't') &&
1378
+ (strcmp(ctxt->constituent[c].type, "VP") == 0)))
1379
+ {
1380
+ ctxt->constituent[c].aux = 1;
1381
+ }
1382
+ else
1383
+ {
1384
+ ctxt->constituent[c].aux = 0;
1385
+ }
1386
+ }
1387
+ #endif /* AUX_CODE_IS_DEAD */
1388
+
1389
+ if (MAXCONSTITUENTS <= numcon_total + numcon_subl)
1390
+ {
1391
+ err_ctxt ec;
1392
+ ec.sent = linkage->sent;
1393
+ err_msg(&ec, Error, "Error: Too many constituents (a2).\n");
1394
+ numcon_total = MAXCONSTITUENTS - numcon_subl;
1395
+ }
1396
+ for (c = numcon_total; c < numcon_total + numcon_subl; c++)
1397
+ {
1398
+ ctxt->constituent[c].subl = linkage->current;
1399
+ #ifdef AUX_CODE_IS_DEAD /* See comments at top */
1400
+ ctxt->constituent[c].aux = 0;
1401
+ #endif /* AUX_CODE_IS_DEAD */
1402
+ }
1403
+
1404
+ return numcon_subl;
1405
+ }
1406
+
1407
+ static char * exprint_constituent_structure(con_context_t *ctxt, Linkage linkage, int numcon_total)
1408
+ {
1409
+ int have_opened = 1;
1410
+ int c, w;
1411
+ int leftdone[MAXCONSTITUENTS];
1412
+ int rightdone[MAXCONSTITUENTS];
1413
+ int best, bestright, bestleft;
1414
+ Sentence sent;
1415
+ char s[100], * p;
1416
+ String * cs = string_new();
1417
+
1418
+ assert (numcon_total < MAXCONSTITUENTS, "Too many constituents (b)");
1419
+ sent = linkage_get_sentence(linkage);
1420
+
1421
+ for (c = 0; c < numcon_total; c++)
1422
+ {
1423
+ leftdone[c] = 0;
1424
+ rightdone[c] = 0;
1425
+ }
1426
+
1427
+ if (verbosity >= 2)
1428
+ printf("\n");
1429
+
1430
+ for (w = 1; w < linkage->num_words; w++)
1431
+ {
1432
+ /* Skip left wall; don't skip right wall, since it may
1433
+ have constituent boundaries */
1434
+
1435
+ while(1)
1436
+ {
1437
+ best = -1;
1438
+ bestright = -1;
1439
+ for (c = 0; c < numcon_total; c++)
1440
+ {
1441
+ if ((ctxt->constituent[c].left == w) &&
1442
+ (leftdone[c] == 0) && (ctxt->constituent[c].valid == 1) &&
1443
+ (ctxt->constituent[c].right >= bestright)) {
1444
+ best = c;
1445
+ bestright = ctxt->constituent[c].right;
1446
+ }
1447
+ }
1448
+ if (best == -1)
1449
+ break;
1450
+
1451
+ leftdone[best] = 1;
1452
+ /* have_opened is a hack to avoid printing anything until
1453
+ * bracket is opened */
1454
+ if (w == 1) have_opened = 0;
1455
+ #ifdef AUX_CODE_IS_DEAD /* See comments at top */
1456
+ if (ctxt->constituent[best].aux == 1) continue;
1457
+ #endif /* AUX_CODE_IS_DEAD */
1458
+ have_opened = 1;
1459
+ append_string(cs, "%c%s ", OPEN_BRACKET, ctxt->constituent[best].type);
1460
+ }
1461
+
1462
+ /* Don't print out right wall */
1463
+ if (have_opened && (w < linkage->num_words - 1))
1464
+ {
1465
+ char *p;
1466
+ strcpy(s, sent->word[w].string);
1467
+
1468
+ /* Constituent processing will crash if the sentence contains
1469
+ * square brackets, so we have to do something ... replace
1470
+ * them with curly braces ... will have to do.
1471
+ */
1472
+ p = strchr(s, OPEN_BRACKET);
1473
+ while(p)
1474
+ {
1475
+ *p = '{';
1476
+ p = strchr(p, OPEN_BRACKET);
1477
+ }
1478
+
1479
+ p = strchr(s, CLOSE_BRACKET);
1480
+ while(p)
1481
+ {
1482
+ *p = '}';
1483
+ p = strchr(p, CLOSE_BRACKET);
1484
+ }
1485
+
1486
+ /* Now, if the first character of the word was
1487
+ originally uppercase, we put it back that way */
1488
+ if (sent->word[w].firstupper == 1)
1489
+ upcase_utf8_str(s, s, MAX_WORD);
1490
+ append_string(cs, "%s ", s);
1491
+ }
1492
+
1493
+ while(1)
1494
+ {
1495
+ best = -1;
1496
+ bestleft = -1;
1497
+ for(c = 0; c < numcon_total; c++)
1498
+ {
1499
+ if ((ctxt->constituent[c].right == w) &&
1500
+ (rightdone[c] == 0) && (ctxt->constituent[c].valid == 1) &&
1501
+ (ctxt->constituent[c].left > bestleft)) {
1502
+ best = c;
1503
+ bestleft = ctxt->constituent[c].left;
1504
+ }
1505
+ }
1506
+ if (best == -1)
1507
+ break;
1508
+ rightdone[best] = 1;
1509
+ #ifdef AUX_CODE_IS_DEAD /* See comments at top */
1510
+ if (ctxt->constituent[best].aux == 1)
1511
+ continue;
1512
+ #endif /* AUX_CODE_IS_DEAD */
1513
+ append_string(cs, "%s%c ", ctxt->constituent[best].type, CLOSE_BRACKET);
1514
+ }
1515
+ }
1516
+
1517
+ append_string(cs, "\n");
1518
+ p = string_copy(cs);
1519
+ string_delete(cs);
1520
+ return p;
1521
+ }
1522
+
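+ /* The result is a single flat, bracketed line such as (hypothetical output)
+ *
+ * [S [NP the dog NP] [VP barked VP] S]
+ *
+ * terminated by a newline. Square brackets occurring in the words themselves
+ * have already been replaced by curly braces above, so the constituent
+ * brackets remain unambiguous.
+ */
+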
1523
+ static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage)
1524
+ {
1525
+ int num_words;
1526
+ Sentence sent;
1527
+ Postprocessor * pp;
1528
+ int s, numcon_total, numcon_subl, num_subl;
1529
+ char * q;
1530
+
1531
+ sent = linkage_get_sentence(linkage);
1532
+ ctxt->phrase_ss = string_set_create();
1533
+ pp = linkage->sent->dict->constituent_pp;
1534
+ numcon_total = 0;
1535
+
1536
+ count_words_used(ctxt, linkage);
1537
+
1538
+ num_subl = linkage->num_sublinkages;
1539
+ if (num_subl > MAXSUBL)
1540
+ {
1541
+ num_subl = MAXSUBL;
1542
+ if (verbosity >= 2)
1543
+ printf("Number of sublinkages exceeds maximum: only considering first %d sublinkages\n", MAXSUBL);
1544
+ }
1545
+
1546
+ if (linkage->unionized == 1 && num_subl > 1) num_subl--;
1547
+ for (s = 0; s < num_subl; s++)
1548
+ {
1549
+ linkage_set_current_sublinkage(linkage, s);
1550
+ linkage_post_process(linkage, pp);
1551
+ num_words = linkage_get_num_words(linkage);
1552
+ generate_misc_word_info(ctxt, linkage);
1553
+ numcon_subl = read_constituents_from_domains(ctxt, linkage, numcon_total, s);
1554
+ numcon_total = numcon_total + numcon_subl;
1555
+ if (MAXCONSTITUENTS <= numcon_total)
1556
+ {
1557
+ err_ctxt ec;
1558
+ ec.sent = linkage->sent;
1559
+ err_msg(&ec, Error, "Error: Too many constituents (c).\n");
1560
+ numcon_total = MAXCONSTITUENTS-1;
1561
+ break;
1562
+ }
1563
+ }
1564
+ numcon_total = merge_constituents(ctxt, linkage, numcon_total);
1565
+ if (MAXCONSTITUENTS <= numcon_total)
1566
+ {
1567
+ err_ctxt ec;
1568
+ ec.sent = linkage->sent;
1569
+ err_msg(&ec, Error, "Error: Too many constituents (d).\n");
1570
+ numcon_total = MAXCONSTITUENTS-1;
1571
+ }
1572
+ numcon_total = last_minute_fixes(ctxt, linkage, numcon_total);
1573
+ if (MAXCONSTITUENTS <= numcon_total)
1574
+ {
1575
+ err_ctxt ec;
1576
+ ec.sent = linkage->sent;
1577
+ err_msg(&ec, Error, "Error: Too many constituents (e).\n");
1578
+ numcon_total = MAXCONSTITUENTS-1;
1579
+ }
1580
+ q = exprint_constituent_structure(ctxt, linkage, numcon_total);
1581
+ string_set_delete(ctxt->phrase_ss);
1582
+ ctxt->phrase_ss = NULL;
1583
+ return q;
1584
+ }
1585
+
1586
+ static char * print_flat_constituents(Linkage linkage)
1587
+ {
1588
+ /* In principle, the ctxt could be allocated on stack, instead of
1589
+ * with malloc(). However, the Java 6 JVM (and MS Windows JVMs)
1590
+ * gives JNI clients only a small amount of stack space. Alloc'ing
1591
+ * this (rather large) structure on the stack will blow up the JVM.
1592
+ * This was discovered only after much work. Bummer.
1593
+ */
1594
+ char * p;
1595
+ con_context_t *ctxt = (con_context_t *) malloc (sizeof(con_context_t));
1596
+ memset(ctxt, 0, sizeof(con_context_t));
1597
+ p = do_print_flat_constituents(ctxt, linkage);
1598
+ free(ctxt);
1599
+ return p;
1600
+ }
1601
+
1602
+ static CType token_type (char *token)
1603
+ {
1604
+ if ((token[0] == OPEN_BRACKET) && (strlen(token) > 1))
1605
+ return OPEN_TOK;
1606
+ if ((strlen(token) > 1) && (token[strlen(token) - 1] == CLOSE_BRACKET))
1607
+ return CLOSE_TOK;
1608
+ return WORD_TOK;
1609
+ }
1610
+
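+ /* For example, with '[' and ']' as the bracket characters, "[NP" is an
+ * OPEN_TOK, "NP]" is a CLOSE_TOK, and a bare word such as "dog" is a
+ * WORD_TOK; a lone "[" or "]" (length 1) is treated as a WORD_TOK.
+ */
+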
1611
+ static CNode * make_CNode(char *q)
1612
+ {
1613
+ CNode * cn;
1614
+ cn = (CNode *) exalloc(sizeof(CNode));
1615
+ cn->label = (char *) exalloc(sizeof(char)*(strlen(q)+1));
1616
+ strcpy(cn->label, q);
1617
+ cn->child = cn->next = (CNode *) NULL;
1618
+ cn->next = (CNode *) NULL;
1619
+ cn->start = cn->end = -1;
1620
+ return cn;
1621
+ }
1622
+
1623
+ static CNode * parse_string(CNode * n, char **saveptr)
1624
+ {
1625
+ char *q;
1626
+ CNode *m, *last_child=NULL;
1627
+
1628
+ while ((q = strtok_r(NULL, " ", saveptr))) {
1629
+ switch (token_type(q)) {
1630
+ case CLOSE_TOK :
1631
+ q[strlen(q)-1]='\0';
1632
+ assert(strcmp(q, n->label)==0,
1633
+ "Constituent tree: Labels do not match.");
1634
+ return n;
1635
+ break;
1636
+ case OPEN_TOK:
1637
+ m = make_CNode(q+1);
1638
+ m = parse_string(m, saveptr);
1639
+ break;
1640
+ case WORD_TOK:
1641
+ m = make_CNode(q);
1642
+ break;
1643
+ default:
1644
+ assert(0, "Constituent tree: Illegal token type");
1645
+ }
1646
+ if (n->child == NULL) {
1647
+ last_child = n->child = m;
1648
+ }
1649
+ else {
1650
+ last_child->next = m;
1651
+ last_child = m;
1652
+ }
1653
+ }
1654
+ assert(0, "Constituent tree: Constituent did not close");
1655
+ return NULL;
1656
+ }
1657
+
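+ /* For example, after the caller consumes the leading "[S" of the flat string
+ * "[S [NP the dog NP] barked S]", this builds an S node whose children are
+ * an NP node (with word children "the" and "dog") and the word node
+ * "barked"; the matching "S]" token ends the recursion for that node.
+ */
+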
1658
+ static void print_tree(String * cs, int indent, CNode * n, int o1, int o2)
1659
+ {
1660
+ int i, child_offset;
1661
+ CNode * m;
1662
+
1663
+ if (n == NULL) return;
1664
+
1665
+ if (indent)
1666
+ for (i = 0; i < o1; ++i)
1667
+ append_string(cs, " ");
1668
+ append_string(cs, "(%s ", n->label);
1669
+ child_offset = o2 + strlen(n->label) + 2;
1670
+
1671
+ for (m = n->child; m != NULL; m = m->next)
1672
+ {
1673
+ if (m->child == NULL)
1674
+ {
1675
+ char * p;
1676
+ /* If the original string has left or right parens in it,
1677
+ * the printed string will be messed up by these ...
1678
+ * so replace them by curly braces. What else can one do?
1679
+ */
1680
+ p = strchr(m->label, '(');
1681
+ while(p)
1682
+ {
1683
+ *p = '{';
1684
+ p = strchr(p, '(');
1685
+ }
1686
+
1687
+ p = strchr(m->label, ')');
1688
+ while(p)
1689
+ {
1690
+ *p = '}';
1691
+ p = strchr(p, ')');
1692
+ }
1693
+
1694
+ append_string(cs, "%s", m->label);
1695
+ if ((m->next != NULL) && (m->next->child == NULL))
1696
+ append_string(cs, " ");
1697
+ }
1698
+ else
1699
+ {
1700
+ if (m != n->child)
1701
+ {
1702
+ if (indent) append_string(cs, "\n");
1703
+ else append_string(cs, " ");
1704
+ print_tree(cs, indent, m, child_offset, child_offset);
1705
+ }
1706
+ else
1707
+ {
1708
+ print_tree(cs, indent, m, 0, child_offset);
1709
+ }
1710
+ if ((m->next != NULL) && (m->next->child == NULL))
1711
+ {
1712
+ if (indent)
1713
+ {
1714
+ append_string(cs, "\n");
1715
+ for (i = 0; i < child_offset; ++i)
1716
+ append_string(cs, " ");
1717
+ }
1718
+ else append_string(cs, " ");
1719
+ }
1720
+ }
1721
+ }
1722
+ append_string(cs, ")");
1723
+ }
1724
+
1725
+ static int assign_spans(CNode * n, int start) {
1726
+ int num_words=0;
1727
+ CNode * m=NULL;
1728
+ if (n==NULL) return 0;
1729
+ n->start = start;
1730
+ if (n->child == NULL) {
1731
+ n->end = start;
1732
+ return 1;
1733
+ }
1734
+ else {
1735
+ for (m=n->child; m!=NULL; m=m->next) {
1736
+ num_words += assign_spans(m, start+num_words);
1737
+ }
1738
+ n->end = start+num_words-1;
1739
+ }
1740
+ return num_words;
1741
+ }
1742
+
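+ /* For example, for the tree (S (NP the dog) barked), the leaves are assigned
+ * the=[0..0], dog=[1..1], barked=[2..2]; the NP gets [0..1] and the S gets
+ * [0..2]. The return value is the number of leaf words under the node.
+ */
+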
1743
+ CNode * linkage_constituent_tree(Linkage linkage)
1744
+ {
1745
+ char *p, *q, *saveptr;
1746
+ int len;
1747
+ CNode * root;
1748
+
1749
+ p = print_flat_constituents(linkage);
1750
+
1751
+ len = strlen(p);
1752
+ q = strtok_r(p, " ", &saveptr);
1753
+ assert(token_type(q) == OPEN_TOK, "Illegal beginning of string");
1754
+ root = make_CNode(q+1);
1755
+ root = parse_string(root, &saveptr);
1756
+ assign_spans(root, 0);
1757
+ exfree(p, sizeof(char)*(len+1));
1758
+ return root;
1759
+ }
1760
+
1761
+ void linkage_free_constituent_tree(CNode * n)
1762
+ {
1763
+ CNode *m, *x;
1764
+ for (m=n->child; m!=NULL; m=x) {
1765
+ x=m->next;
1766
+ linkage_free_constituent_tree(m);
1767
+ }
1768
+ exfree(n->label, sizeof(char)*(strlen(n->label)+1));
1769
+ exfree(n, sizeof(CNode));
1770
+ }
1771
+
1772
+ /**
1773
+ * Print out the constituent tree.
1774
+ * mode 1: treebank-style constituent tree
1775
+ * mode 2: flat, bracketed tree [A like [B this B] A]
1776
+ * mode 3: flat, treebank-style tree (A like (B this) )
1777
+ */
1778
+ char * linkage_print_constituent_tree(Linkage linkage, int mode)
1779
+ {
1780
+ String * cs;
1781
+ CNode * root;
1782
+ char * p;
1783
+
1784
+ if ((mode == 0) || (linkage->sent->dict->constituent_pp == NULL))
1785
+ {
1786
+ return NULL;
1787
+ }
1788
+ else if (mode == 1 || mode == 3)
1789
+ {
1790
+ cs = string_new();
1791
+ root = linkage_constituent_tree(linkage);
1792
+ print_tree(cs, (mode==1), root, 0, 0);
1793
+ linkage_free_constituent_tree(root);
1794
+ append_string(cs, "\n");
1795
+ p = string_copy(cs);
1796
+ string_delete(cs);
1797
+ return p;
1798
+ }
1799
+ else if (mode == 2)
1800
+ {
1801
+ return print_flat_constituents(linkage);
1802
+ }
1803
+ assert(0, "Illegal mode in linkage_print_constituent_tree");
1804
+ return NULL;
1805
+ }
1806
+
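+ /* Illustrative, non-compiled sketch (hypothetical helper): given a Linkage
+ * obtained through the public API, mode 1 yields an indented treebank-style
+ * tree; the returned string must be released with
+ * linkage_free_constituent_tree_str().
+ */
+ #if 0
+ static void example_print_constituents(Linkage linkage)
+ {
+ char * tree = linkage_print_constituent_tree(linkage, 1);
+ if (tree != NULL)
+ {
+ printf("%s", tree);
+ linkage_free_constituent_tree_str(tree);
+ }
+ }
+ #endif
+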
1807
+ void linkage_free_constituent_tree_str(char * s)
1808
+ {
1809
+ exfree(s, strlen(s)+1);
1810
+ }
1811
+
1812
+ const char * linkage_constituent_node_get_label(const CNode *n)
1813
+ {
1814
+ return n->label;
1815
+ }
1816
+
1817
+
1818
+ CNode * linkage_constituent_node_get_child(const CNode *n)
1819
+ {
1820
+ return n->child;
1821
+ }
1822
+
1823
+ CNode * linkage_constituent_node_get_next(const CNode *n)
1824
+ {
1825
+ return n->next;
1826
+ }
1827
+
1828
+ int linkage_constituent_node_get_start(const CNode *n)
1829
+ {
1830
+ return n->start;
1831
+ }
1832
+
1833
+ int linkage_constituent_node_get_end(const CNode *n)
1834
+ {
1835
+ return n->end;
1836
+ }
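+
+ /* Illustrative, non-compiled sketch (hypothetical helper): walking a tree
+ * returned by linkage_constituent_tree() with the accessors above, printing
+ * each node's label and word span indented by depth. The caller still frees
+ * the tree with linkage_free_constituent_tree().
+ */
+ #if 0
+ static void example_walk(const CNode *n, int depth)
+ {
+ const CNode *child;
+ if (n == NULL) return;
+ printf("%*s%s [%d..%d]\n", 2*depth, "",
+ linkage_constituent_node_get_label(n),
+ linkage_constituent_node_get_start(n),
+ linkage_constituent_node_get_end(n));
+ for (child = linkage_constituent_node_get_child(n); child != NULL;
+ child = linkage_constituent_node_get_next(child))
+ {
+ example_walk(child, depth + 1);
+ }
+ }
+ #endif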