grammar_cop 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,1887 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* Copyright 2008, 2009 Linas Vepstas */
5
+ /* All rights reserved */
6
+ /* */
7
+ /* Use of the link grammar parsing system is subject to the terms of the */
8
+ /* license set forth in the LICENSE file included with this software, */
9
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
10
+ /* This license allows free redistribution and use in source and binary */
11
+ /* forms, with or without modification, subject to certain conditions. */
12
+ /* */
13
+ /*************************************************************************/
14
+
15
+ #ifndef API_C
16
+ #define API_C
17
+
18
+ #include <limits.h>
19
+ #include <math.h>
20
+ #include <string.h>
21
+
22
+ #include "api.h"
23
+ #include "disjuncts.h"
24
+ #include "error.h"
25
+ #include "preparation.h"
26
+ #include "read-regex.h"
27
+ #include "regex-morph.h"
28
+ #include "sat-solver/sat-encoder.h"
29
+ #include "corpus/corpus.h"
30
+ #include "spellcheck.h"
31
+
32
+ /***************************************************************
33
+ *
34
+ * Routines for setting Parse_Options
35
+ *
36
+ ****************************************************************/
37
+ static int VDAL_compare_parse(Linkage_info * p1, Linkage_info * p2)
38
+ {
39
+ /* for sorting the linkages in postprocessing */
40
+ if (p1->N_violations != p2->N_violations) {
41
+ return (p1->N_violations - p2->N_violations);
42
+ }
43
+ else if (p1->unused_word_cost != p2->unused_word_cost) {
44
+ return (p1->unused_word_cost - p2->unused_word_cost);
45
+ }
46
+ else if (p1->fat != p2->fat) {
47
+ return (p1->fat - p2->fat);
48
+ }
49
+ else if (p1->disjunct_cost != p2->disjunct_cost) {
50
+ return (p1->disjunct_cost - p2->disjunct_cost);
51
+ }
52
+ else if (p1->and_cost != p2->and_cost) {
53
+ return (p1->and_cost - p2->and_cost);
54
+ }
55
+ else {
56
+ return (p1->link_cost - p2->link_cost);
57
+ }
58
+ }
59
+
60
+ #ifdef USE_CORPUS
61
+ static int CORP_compare_parse(Linkage_info * p1, Linkage_info * p2)
62
+ {
63
+ double diff = p1->corpus_cost - p2->corpus_cost;
64
+ if (fabs(diff) < 1.0e-5)
65
+ return VDAL_compare_parse(p1, p2);
66
+ if (diff < 0.0f) return -1;
67
+ return 1;
68
+ }
69
+ #endif
70
+
71
+ /**
72
+ * Create and initialize a Parse_Options object
73
+ */
74
+ Parse_Options parse_options_create(void)
75
+ {
76
+ Parse_Options po;
77
+
78
+ init_memusage();
79
+ po = (Parse_Options) xalloc(sizeof(struct Parse_Options_s));
80
+
81
+ /* Here's where the values are initialized */
82
+ po->verbosity = 1;
83
+ po->linkage_limit = 100;
84
+ po->disjunct_cost = MAX_DISJUNCT_COST;
85
+ po->use_fat_links = FALSE;
86
+ po->min_null_count = 0;
87
+ po->max_null_count = 0;
88
+ po->null_block = 1;
89
+ po->islands_ok = FALSE;
90
+ po->use_spell_guess = TRUE;
91
+ po->use_sat_solver = FALSE;
92
+
93
+ #ifdef XXX_USE_CORPUS
94
+ /* Use the corpus cost model, if available.
95
+ * It really does a better job at parse ranking.
96
+ * Err .. sometimes ...
97
+ */
98
+ po->cost_model.compare_fn = &CORP_compare_parse;
99
+ po->cost_model.type = CORPUS;
100
+ #else /* USE_CORPUS */
101
+ po->cost_model.compare_fn = &VDAL_compare_parse;
102
+ po->cost_model.type = VDAL;
103
+ #endif /* USE_CORPUS */
104
+ po->short_length = 6;
105
+ po->all_short = FALSE;
106
+ po->twopass_length = 30;
107
+ po->max_sentence_length = 170;
108
+ po->resources = resources_create();
109
+ po->display_short = TRUE;
110
+ po->display_word_subscripts = TRUE;
111
+ po->display_link_subscripts = TRUE;
112
+ po->display_walls = FALSE;
113
+ po->display_union = FALSE;
114
+ po->allow_null = TRUE;
115
+ po->use_cluster_disjuncts = FALSE;
116
+ po->echo_on = FALSE;
117
+ po->batch_mode = FALSE;
118
+ po->panic_mode = FALSE;
119
+ po->screen_width = 79;
120
+ po->display_on = TRUE;
121
+ po->display_postscript = FALSE;
122
+ po->display_constituents = 0;
123
+ po->display_bad = FALSE;
124
+ po->display_disjuncts = FALSE;
125
+ po->display_links = FALSE;
126
+ po->display_senses = FALSE;
127
+
128
+ return po;
129
+ }
130
+
131
+ int parse_options_delete(Parse_Options opts)
132
+ {
133
+ resources_delete(opts->resources);
134
+ xfree(opts, sizeof(struct Parse_Options_s));
135
+ return 0;
136
+ }
137
+
138
+ void parse_options_set_cost_model_type(Parse_Options opts, int cm)
139
+ {
140
+ switch(cm) {
141
+ case VDAL:
142
+ opts->cost_model.type = VDAL;
143
+ opts->cost_model.compare_fn = &VDAL_compare_parse;
144
+ break;
145
+ case CORPUS:
146
+ #ifdef USE_CORPUS
147
+ opts->cost_model.type = CORPUS;
148
+ opts->cost_model.compare_fn = &CORP_compare_parse;
149
+ #else
150
+ prt_error("Error: Source code compiled with cost model 'CORPUS' disabled.\n");
151
+ #endif
152
+ break;
153
+ default:
154
+ prt_error("Error: Illegal cost model: %d\n", cm);
155
+ }
156
+ }
157
+
158
+ int parse_options_get_cost_model_type(Parse_Options opts)
159
+ {
160
+ return opts->cost_model.type;
161
+ }
162
+
163
+ void parse_options_set_verbosity(Parse_Options opts, int dummy)
164
+ {
165
+ opts->verbosity = dummy;
166
+ verbosity = opts->verbosity;
167
+ /* this is one of the only global variables. */
168
+ }
169
+
170
+ int parse_options_get_verbosity(Parse_Options opts) {
171
+ return opts->verbosity;
172
+ }
173
+
174
+ void parse_options_set_use_sat_parser(Parse_Options opts, int dummy) {
175
+ #ifdef USE_SAT_SOLVER
176
+ opts->use_sat_solver = dummy;
177
+ #else
178
+ prt_error("Error: cannot enable the Boolean SAT parser; this "
179
+ " library was built without SAT solver support.\n");
180
+ #endif
181
+ }
182
+ int parse_options_get_use_sat_parser(Parse_Options opts) {
183
+ return opts->use_sat_solver;
184
+ }
185
+
186
+ void parse_options_set_use_fat_links(Parse_Options opts, int dummy) {
187
+ opts->use_fat_links = dummy;
188
+ }
189
+ int parse_options_get_use_fat_links(Parse_Options opts) {
190
+ return opts->use_fat_links;
191
+ }
192
+
193
+ void parse_options_set_linkage_limit(Parse_Options opts, int dummy) {
194
+ opts->linkage_limit = dummy;
195
+ }
196
+ int parse_options_get_linkage_limit(Parse_Options opts) {
197
+ return opts->linkage_limit;
198
+ }
199
+
200
+ void parse_options_set_disjunct_cost(Parse_Options opts, int dummy) {
201
+ opts->disjunct_cost = dummy;
202
+ }
203
+ void parse_options_set_disjunct_costf(Parse_Options opts, float dummy) {
204
+ opts->disjunct_cost = dummy;
205
+ }
206
+ int parse_options_get_disjunct_cost(Parse_Options opts) {
207
+ return opts->disjunct_cost;
208
+ }
209
+ float parse_options_get_disjunct_costf(Parse_Options opts) {
210
+ return opts->disjunct_cost;
211
+ }
212
+
213
+ void parse_options_set_min_null_count(Parse_Options opts, int val) {
214
+ opts->min_null_count = val;
215
+ }
216
+ int parse_options_get_min_null_count(Parse_Options opts) {
217
+ return opts->min_null_count;
218
+ }
219
+
220
+ void parse_options_set_max_null_count(Parse_Options opts, int val) {
221
+ opts->max_null_count = val;
222
+ }
223
+ int parse_options_get_max_null_count(Parse_Options opts) {
224
+ return opts->max_null_count;
225
+ }
226
+
227
+
228
+ void parse_options_set_null_block(Parse_Options opts, int dummy) {
229
+ opts->null_block = dummy;
230
+ }
231
+ int parse_options_get_null_block(Parse_Options opts) {
232
+ return opts->null_block;
233
+ }
234
+
235
+ void parse_options_set_islands_ok(Parse_Options opts, int dummy) {
236
+ opts->islands_ok = dummy;
237
+ }
238
+
239
+ int parse_options_get_islands_ok(Parse_Options opts) {
240
+ return opts->islands_ok;
241
+ }
242
+
243
+ void parse_options_set_spell_guess(Parse_Options opts, int dummy) {
244
+ opts->use_spell_guess = dummy;
245
+ }
246
+
247
+ int parse_options_get_spell_guess(Parse_Options opts) {
248
+ return opts->use_spell_guess;
249
+ }
250
+
251
+ void parse_options_set_short_length(Parse_Options opts, int short_length) {
252
+ opts->short_length = short_length;
253
+ }
254
+
255
+ int parse_options_get_short_length(Parse_Options opts) {
256
+ return opts->short_length;
257
+ }
258
+
259
+ void parse_options_set_all_short_connectors(Parse_Options opts, int val) {
260
+ opts->all_short = val;
261
+ }
262
+
263
+ int parse_options_get_all_short_connectors(Parse_Options opts) {
264
+ return opts->all_short;
265
+ }
266
+
267
+ void parse_options_set_max_parse_time(Parse_Options opts, int dummy) {
268
+ opts->resources->max_parse_time = dummy;
269
+ }
270
+
271
+ int parse_options_get_max_parse_time(Parse_Options opts) {
272
+ return opts->resources->max_parse_time;
273
+ }
274
+
275
+ void parse_options_set_max_memory(Parse_Options opts, int dummy) {
276
+ opts->resources->max_memory = dummy;
277
+ }
278
+
279
+ int parse_options_get_max_memory(Parse_Options opts) {
280
+ return opts->resources->max_memory;
281
+ }
282
+
283
+ void parse_options_set_max_sentence_length(Parse_Options opts, int dummy) {
284
+ opts->max_sentence_length = dummy;
285
+ }
286
+
287
+ int parse_options_get_max_sentence_length(Parse_Options opts) {
288
+ return opts->max_sentence_length;
289
+ }
290
+
291
+ void parse_options_set_echo_on(Parse_Options opts, int dummy) {
292
+ opts->echo_on = dummy;
293
+ }
294
+
295
+ int parse_options_get_echo_on(Parse_Options opts) {
296
+ return opts->echo_on;
297
+ }
298
+
299
+ void parse_options_set_batch_mode(Parse_Options opts, int dummy) {
300
+ opts->batch_mode = dummy;
301
+ }
302
+
303
+ int parse_options_get_batch_mode(Parse_Options opts) {
304
+ return opts->batch_mode;
305
+ }
306
+
307
+ void parse_options_set_panic_mode(Parse_Options opts, int dummy) {
308
+ opts->panic_mode = dummy;
309
+ }
310
+
311
+ int parse_options_get_panic_mode(Parse_Options opts) {
312
+ return opts->panic_mode;
313
+ }
314
+
315
+ void parse_options_set_allow_null(Parse_Options opts, int dummy) {
316
+ opts->allow_null = dummy;
317
+ }
318
+
319
+ int parse_options_get_allow_null(Parse_Options opts) {
320
+ return opts->allow_null;
321
+ }
322
+
323
+ void parse_options_set_use_cluster_disjuncts(Parse_Options opts, int dummy) {
324
+ opts->use_cluster_disjuncts = dummy;
325
+ }
326
+
327
+ int parse_options_get_use_cluster_disjuncts(Parse_Options opts) {
328
+ return opts->use_cluster_disjuncts;
329
+ }
330
+
331
+ void parse_options_set_screen_width(Parse_Options opts, int dummy) {
332
+ opts->screen_width = dummy;
333
+ }
334
+
335
+ int parse_options_get_screen_width(Parse_Options opts) {
336
+ return opts->screen_width;
337
+ }
338
+
339
+
340
+ void parse_options_set_display_on(Parse_Options opts, int dummy) {
341
+ opts->display_on = dummy;
342
+ }
343
+
344
+ int parse_options_get_display_on(Parse_Options opts) {
345
+ return opts->display_on;
346
+ }
347
+
348
+ void parse_options_set_display_postscript(Parse_Options opts, int dummy) {
349
+ opts->display_postscript = dummy;
350
+ }
351
+
352
+ int parse_options_get_display_postscript(Parse_Options opts)
353
+ {
354
+ return opts->display_postscript;
355
+ }
356
+
357
+ void parse_options_set_display_constituents(Parse_Options opts, int dummy)
358
+ {
359
+ if ((dummy < 0) || (dummy > 3)) {
360
+ prt_error("Possible values for constituents: \n"
361
+ " 0 (no display)\n"
362
+ " 1 (treebank style, multi-line indented)\n"
363
+ " 2 (flat tree, square brackets)\n"
364
+ " 3 (flat treebank style)\n");
365
+ opts->display_constituents = 0;
366
+ }
367
+ else opts->display_constituents = dummy;
368
+ }
369
+
370
+ int parse_options_get_display_constituents(Parse_Options opts)
371
+ {
372
+ return opts->display_constituents;
373
+ }
374
+
375
+ void parse_options_set_display_bad(Parse_Options opts, int dummy) {
376
+ opts->display_bad = dummy;
377
+ }
378
+
379
+ int parse_options_get_display_bad(Parse_Options opts) {
380
+ return opts->display_bad;
381
+ }
382
+
383
+ void parse_options_set_display_disjuncts(Parse_Options opts, int dummy) {
384
+ opts->display_disjuncts = dummy;
385
+ }
386
+
387
+ int parse_options_get_display_disjuncts(Parse_Options opts) {
388
+ return opts->display_disjuncts;
389
+ }
390
+
391
+ void parse_options_set_display_links(Parse_Options opts, int dummy) {
392
+ opts->display_links = dummy;
393
+ }
394
+
395
+ int parse_options_get_display_links(Parse_Options opts) {
396
+ return opts->display_links;
397
+ }
398
+
399
+ void parse_options_set_display_senses(Parse_Options opts, int dummy) {
400
+ opts->display_senses = dummy;
401
+ }
402
+
403
+ int parse_options_get_display_senses(Parse_Options opts) {
404
+ return opts->display_senses;
405
+ }
406
+
407
+ void parse_options_set_display_walls(Parse_Options opts, int dummy) {
408
+ opts->display_walls = dummy;
409
+ }
410
+
411
+ int parse_options_get_display_walls(Parse_Options opts) {
412
+ return opts->display_walls;
413
+ }
414
+
415
+ int parse_options_get_display_union(Parse_Options opts) {
416
+ return opts->display_union;
417
+ }
418
+
419
+ void parse_options_set_display_union(Parse_Options opts, int dummy) {
420
+ opts->display_union = dummy;
421
+ }
422
+
423
+ int parse_options_timer_expired(Parse_Options opts) {
424
+ return resources_timer_expired(opts->resources);
425
+ }
426
+
427
+ int parse_options_memory_exhausted(Parse_Options opts) {
428
+ return resources_memory_exhausted(opts->resources);
429
+ }
430
+
431
+ int parse_options_resources_exhausted(Parse_Options opts) {
432
+ return (resources_timer_expired(opts->resources) || resources_memory_exhausted(opts->resources));
433
+ }
434
+
435
+ void parse_options_reset_resources(Parse_Options opts) {
436
+ resources_reset(opts->resources);
437
+ }
438
+
439
+
440
+ /***************************************************************
441
+ *
442
+ * Routines for manipulating Dictionary
443
+ *
444
+ ****************************************************************/
445
+
446
/* Units will typically have a ".u" at the end. Get rid of it, as
 * otherwise stripping is messed up.
 *
 * Returns a freshly strdup'ed copy of str, cut at its first '.'
 * unless that '.' is the very first character. Caller frees it. */
static inline char * deinflect(const char * str)
{
	char * copy = strdup(str);
	size_t stem_len = strcspn(copy, ".");

	/* stem_len == 0 means the word starts with '.' (or is empty):
	 * leave it alone. Otherwise terminate at the dot; when there is
	 * no dot this rewrites the existing terminator. */
	if (stem_len > 0) copy[stem_len] = '\0';
	return copy;
}
455
+
456
+ static void affix_list_create(Dictionary dict)
457
+ {
458
+ int i, j, k, l, m;
459
+ int r_strippable=0, l_strippable=0, u_strippable=0;
460
+ int s_strippable=0, p_strippable=0;
461
+ Dict_node * dn, * dn2, * start_dn;
462
+
463
+ const char * rpunc_con = "RPUNC";
464
+ const char * lpunc_con = "LPUNC";
465
+ const char * units_con = "UNITS";
466
+
467
+ /* Hmm SUF and PRE do not seem to be used at this time ... */
468
+ const char * suf_con = "SUF";
469
+ const char * pre_con = "PRE";
470
+
471
+ dict->strip_left = NULL;
472
+ dict->strip_right = NULL;
473
+ dict->strip_units = NULL;
474
+ dict->prefix = NULL;
475
+ dict->suffix = NULL;
476
+
477
+ /* Load affixes from the affix table.
478
+ */
479
+ start_dn = list_whole_dictionary(dict->root, NULL);
480
+ for (dn = start_dn; dn != NULL; dn = dn->right)
481
+ {
482
+ if (word_has_connector(dn, rpunc_con, 0)) r_strippable++;
483
+ if (word_has_connector(dn, lpunc_con, 0)) l_strippable++;
484
+ if (word_has_connector(dn, units_con, 0)) u_strippable++;
485
+ if (word_has_connector(dn, suf_con, 0)) s_strippable++;
486
+ if (word_has_connector(dn, pre_con, 0)) p_strippable++;
487
+ }
488
+ dict->strip_right = (const char **) xalloc(r_strippable * sizeof(char *));
489
+ dict->strip_left = (const char **) xalloc(l_strippable * sizeof(char *));
490
+ dict->strip_units = (const char **) xalloc(u_strippable * sizeof(char *));
491
+ dict->suffix = (const char **) xalloc(s_strippable * sizeof(char *));
492
+ dict->prefix = (const char **) xalloc(p_strippable * sizeof(char *));
493
+
494
+ dict->r_strippable = r_strippable;
495
+ dict->l_strippable = l_strippable;
496
+ dict->u_strippable = u_strippable;
497
+ dict->p_strippable = p_strippable;
498
+ dict->s_strippable = s_strippable;
499
+
500
+ i = 0;
501
+ j = 0;
502
+ k = 0;
503
+ l = 0;
504
+ m = 0;
505
+ dn = start_dn;
506
+
507
+ while (dn != NULL)
508
+ {
509
+ if (word_has_connector(dn, rpunc_con, 0))
510
+ {
511
+ dict->strip_right[i] = deinflect(dn->string);
512
+ i++;
513
+ }
514
+ if (word_has_connector(dn, lpunc_con, 0))
515
+ {
516
+ dict->strip_left[j] = deinflect(dn->string);
517
+ j++;
518
+ }
519
+ if (word_has_connector(dn, units_con, 0))
520
+ {
521
+ dict->strip_units[m] = deinflect(dn->string);
522
+ m++;
523
+ }
524
+ if (word_has_connector(dn, suf_con, 0))
525
+ {
526
+ dict->suffix[k] = dn->string;
527
+ k++;
528
+ }
529
+ if (word_has_connector(dn, pre_con, 0))
530
+ {
531
+ dict->prefix[l] = dn->string;
532
+ l++;
533
+ }
534
+ dn2 = dn->right;
535
+ dn->right = NULL;
536
+ xfree(dn, sizeof(Dict_node));
537
+ dn = dn2;
538
+ }
539
+ }
540
+
541
+ static void affix_list_delete(Dictionary dict)
542
+ {
543
+ int i;
544
+ for (i=0; i<dict->l_strippable; i++)
545
+ {
546
+ free((char *)dict->strip_left[i]);
547
+ }
548
+ for (i=0; i<dict->r_strippable; i++)
549
+ {
550
+ free((char *)dict->strip_right[i]);
551
+ }
552
+ for (i=0; i<dict->u_strippable; i++)
553
+ {
554
+ free((char *)dict->strip_units[i]);
555
+ }
556
+ xfree(dict->strip_right, dict->r_strippable * sizeof(char *));
557
+ xfree(dict->strip_left, dict->l_strippable * sizeof(char *));
558
+ xfree(dict->strip_units, dict->u_strippable * sizeof(char *));
559
+ xfree(dict->suffix, dict->s_strippable * sizeof(char *));
560
+ xfree(dict->prefix, dict->p_strippable * sizeof(char *));
561
+ }
562
+
563
+ /**
564
+ * The following function is dictionary_create with an extra
565
+ * paramater called "path". If this is non-null, then the path
566
+ * used to find the file is taken from that path. Otherwise,
567
+ * the path is taken from the dict_name. This is only needed
568
+ * because an affix_file is opened by a recursive call to this
569
+ * function.
570
+ */
571
+ static Dictionary
572
+ dictionary_six(const char * lang, const char * dict_name,
573
+ const char * pp_name, const char * cons_name,
574
+ const char * affix_name, const char * regex_name)
575
+ {
576
+ const char * t;
577
+ Dictionary dict;
578
+ Dict_node *dict_node;
579
+
580
+ init_memusage();
581
+
582
+ dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
583
+ memset(dict, 0, sizeof(struct Dictionary_s));
584
+
585
+ dict->string_set = string_set_create();
586
+
587
+ dict->lang = lang;
588
+ t = strrchr (lang, '/');
589
+ if (t) dict->lang = string_set_add(t+1, dict->string_set);
590
+ dict->name = string_set_add(dict_name, dict->string_set);
591
+
592
+ dict->max_cost = 1000;
593
+ dict->num_entries = 0;
594
+ dict->is_special = FALSE;
595
+ dict->already_got_it = '\0';
596
+ dict->line_number = 1;
597
+ dict->root = NULL;
598
+ dict->word_file_header = NULL;
599
+ dict->exp_list = NULL;
600
+ dict->affix_table = NULL;
601
+ dict->recursive_error = FALSE;
602
+
603
+ /* To disable spell-checking, just set the cheker to NULL */
604
+ dict->spell_checker = spellcheck_create(dict->lang);
605
+
606
+ dict->fp = dictopen(dict->name, "r");
607
+ if (dict->fp == NULL)
608
+ {
609
+ prt_error("Error: Could not open dictionary %s\n", dict_name);
610
+ goto failure;
611
+ }
612
+
613
+ if (!read_dictionary(dict))
614
+ {
615
+ fclose(dict->fp);
616
+ goto failure;
617
+ }
618
+ fclose(dict->fp);
619
+
620
+ dict->affix_table = NULL;
621
+ if (affix_name != NULL)
622
+ {
623
+ dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
624
+ if (dict->affix_table == NULL)
625
+ {
626
+ goto failure;
627
+ }
628
+ affix_list_create(dict->affix_table);
629
+ }
630
+
631
+ dict->regex_root = NULL;
632
+ if (regex_name != NULL)
633
+ {
634
+ int rc;
635
+ rc = read_regex_file(dict, regex_name);
636
+ if (rc) goto failure;
637
+ rc = compile_regexs(dict);
638
+ if (rc) goto failure;
639
+ }
640
+
641
+ #if USE_CORPUS
642
+ dict->corpus = NULL;
643
+ if (affix_name != NULL) /* Don't do this for the second time */
644
+ {
645
+ dict->corpus = lg_corpus_new();
646
+ }
647
+ #endif
648
+
649
+ dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
650
+ dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);
651
+ dict->postprocessor = post_process_open(pp_name);
652
+ dict->constituent_pp = post_process_open(cons_name);
653
+
654
+ dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
655
+ dict->use_unknown_word = TRUE;
656
+
657
+ #if DONT_USE_REGEX_GUESSING
658
+ dict->capitalized_word_defined = boolean_dictionary_lookup(dict, PROPER_WORD);
659
+ dict->pl_capitalized_word_defined = boolean_dictionary_lookup(dict, PL_PROPER_WORD);
660
+
661
+ dict->hyphenated_word_defined = boolean_dictionary_lookup(dict, HYPHENATED_WORD);
662
+ dict->number_word_defined = boolean_dictionary_lookup(dict, NUMBER_WORD);
663
+
664
+ dict->ing_word_defined = boolean_dictionary_lookup(dict, ING_WORD);
665
+ dict->s_word_defined = boolean_dictionary_lookup(dict, S_WORD);
666
+ dict->ed_word_defined = boolean_dictionary_lookup(dict, ED_WORD);
667
+ dict->ly_word_defined = boolean_dictionary_lookup(dict, LY_WORD);
668
+ #endif /* DONT_USE_REGEX_GUESSING */
669
+
670
+ if ((dict_node = dictionary_lookup_list(dict, ANDABLE_CONNECTORS_WORD)) != NULL) {
671
+ dict->andable_connector_set = connector_set_create(dict_node->exp);
672
+ } else {
673
+ dict->andable_connector_set = NULL;
674
+ }
675
+ free_lookup_list(dict_node);
676
+
677
+ if ((dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD)) != NULL) {
678
+ dict->unlimited_connector_set = connector_set_create(dict_node->exp);
679
+ } else {
680
+ dict->unlimited_connector_set = NULL;
681
+ }
682
+ free_lookup_list(dict_node);
683
+
684
+ return dict;
685
+
686
+ failure:
687
+ string_set_delete(dict->string_set);
688
+ xfree(dict, sizeof(struct Dictionary_s));
689
+ return NULL;
690
+ }
691
+
692
+ Dictionary
693
+ dictionary_create(const char * dict_name, const char * pp_name,
694
+ const char * cons_name, const char * affix_name)
695
+ {
696
+ return dictionary_six("en", dict_name, pp_name, cons_name, affix_name, NULL);
697
+ }
698
+
699
+ Dictionary dictionary_create_lang(const char * lang)
700
+ {
701
+ Dictionary dictionary;
702
+
703
+ if(lang && *lang)
704
+ {
705
+ char * dict_name;
706
+ char * pp_name;
707
+ char * cons_name;
708
+ char * affix_name;
709
+ char * regex_name;
710
+
711
+ dict_name = join_path(lang, "4.0.dict");
712
+ pp_name = join_path(lang, "4.0.knowledge");
713
+ cons_name = join_path(lang, "4.0.constituent-knowledge");
714
+ affix_name = join_path(lang, "4.0.affix");
715
+ regex_name = join_path(lang, "4.0.regex");
716
+
717
+ dictionary = dictionary_six(lang, dict_name, pp_name, cons_name,
718
+ affix_name, regex_name);
719
+
720
+ free(regex_name);
721
+ free(affix_name);
722
+ free(cons_name);
723
+ free(pp_name);
724
+ free(dict_name);
725
+ }
726
+ else
727
+ {
728
+ prt_error("Error: No language specified!\n");
729
+ dictionary = NULL;
730
+ }
731
+
732
+ return dictionary;
733
+ }
734
+
735
+ Dictionary dictionary_create_default_lang(void)
736
+ {
737
+ Dictionary dictionary;
738
+ char * lang;
739
+
740
+ lang = get_default_locale();
741
+ if(lang && *lang) {
742
+ dictionary = dictionary_create_lang(lang);
743
+ free(lang);
744
+ } else {
745
+ /* Default to en when locales are broken (e.g. WIN32) */
746
+ dictionary = dictionary_create_lang("en");
747
+ }
748
+
749
+ return dictionary;
750
+ }
751
+
752
+ int dictionary_delete(Dictionary dict)
753
+ {
754
+ if (verbosity > 0) {
755
+ prt_error("Info: Freeing dictionary %s\n", dict->name);
756
+ }
757
+
758
+ #if USE_CORPUS
759
+ lg_corpus_delete(dict->corpus);
760
+ #endif
761
+
762
+ if (dict->affix_table != NULL) {
763
+ affix_list_delete(dict->affix_table);
764
+ dictionary_delete(dict->affix_table);
765
+ }
766
+ spellcheck_destroy(dict->spell_checker);
767
+
768
+ connector_set_delete(dict->andable_connector_set);
769
+ connector_set_delete(dict->unlimited_connector_set);
770
+
771
+ post_process_close(dict->postprocessor);
772
+ post_process_close(dict->constituent_pp);
773
+ string_set_delete(dict->string_set);
774
+ free_regexs(dict);
775
+ free_dictionary(dict);
776
+ xfree(dict, sizeof(struct Dictionary_s));
777
+
778
+ return 0;
779
+ }
780
+
781
+ int dictionary_get_max_cost(Dictionary dict)
782
+ {
783
+ return dict->max_cost;
784
+ }
785
+
786
+ /***************************************************************
787
+ *
788
+ * Routines for postprocessing
789
+ *
790
+ ****************************************************************/
791
+
792
+ static Linkage_info * linkage_info_new(int num_to_alloc)
793
+ {
794
+ Linkage_info *link_info;
795
+ link_info = (Linkage_info *) xalloc(num_to_alloc * sizeof(Linkage_info));
796
+ memset(link_info, 0, num_to_alloc * sizeof(Linkage_info));
797
+ return link_info;
798
+ }
799
+
800
+ static void linkage_info_delete(Linkage_info *link_info, int sz)
801
+ {
802
+ int i,j;
803
+
804
+ for (i=0; i<sz; i++)
805
+ {
806
+ Linkage_info *lifo = &link_info[i];
807
+ int nwords = lifo->nwords;
808
+ for (j=0; j<nwords; j++)
809
+ {
810
+ if (lifo->disjunct_list_str[j])
811
+ free(lifo->disjunct_list_str[j]);
812
+ }
813
+ free(lifo->disjunct_list_str);
814
+ #ifdef USE_CORPUS
815
+ lg_sense_delete(lifo);
816
+ #endif
817
+ }
818
+ xfree(link_info, sz);
819
+ }
820
+
821
+ static void free_andlists(Sentence sent)
822
+ {
823
+ int L;
824
+ Andlist * andlist, * next;
825
+ for(L=0; L<sent->num_linkages_post_processed; L++) {
826
+ /* printf("%d ", sent->link_info[L].canonical); */
827
+ /* if (sent->link_info[L].canonical==0) continue; */
828
+ andlist = sent->link_info[L].andlist;
829
+ while(1) {
830
+ if(andlist == NULL) break;
831
+ next = andlist->next;
832
+ xfree((char *) andlist, sizeof(Andlist));
833
+ andlist = next;
834
+ }
835
+ }
836
+ /* printf("\n"); */
837
+ }
838
+
839
+ static void free_post_processing(Sentence sent)
840
+ {
841
+ if (sent->link_info != NULL) {
842
+ /* postprocessing must have been done */
843
+ free_andlists(sent);
844
+ linkage_info_delete(sent->link_info, sent->num_linkages_alloced);
845
+ sent->link_info = NULL;
846
+ }
847
+ }
848
+
849
+ static void post_process_linkages(Sentence sent, Parse_Options opts)
850
+ {
851
+ int *indices;
852
+ int in, block_bottom, block_top;
853
+ int N_linkages_found, N_linkages_alloced;
854
+ int N_linkages_post_processed, N_valid_linkages;
855
+ int N_thin_linkages;
856
+ int overflowed, only_canonical_allowed;
857
+ Linkage_info *link_info;
858
+ int canonical;
859
+
860
+ free_post_processing(sent);
861
+
862
+ overflowed = build_parse_set(sent, sent->null_count, opts);
863
+ print_time(opts, "Built parse set");
864
+
865
+ if (overflowed && (1 < opts->verbosity))
866
+ {
867
+ err_ctxt ec;
868
+ ec.sent = sent;
869
+ err_msg(&ec, Warn, "Warning: Count overflow.\n"
870
+ "Considering a random subset of %d of an unknown and large number of linkages\n",
871
+ opts->linkage_limit);
872
+ }
873
+ N_linkages_found = sent->num_linkages_found;
874
+
875
+ if (sent->num_linkages_found == 0)
876
+ {
877
+ sent->num_linkages_alloced = 0;
878
+ sent->num_linkages_post_processed = 0;
879
+ sent->num_valid_linkages = 0;
880
+ sent->num_thin_linkages = 0;
881
+ sent->link_info = NULL;
882
+ return;
883
+ }
884
+
885
+ if (N_linkages_found > opts->linkage_limit)
886
+ {
887
+ N_linkages_alloced = opts->linkage_limit;
888
+ if (opts->verbosity > 1)
889
+ {
890
+ err_ctxt ec;
891
+ ec.sent = sent;
892
+ err_msg(&ec, Warn, "Warning: Considering a random subset of %d of %d linkages\n",
893
+ N_linkages_alloced, N_linkages_found);
894
+ }
895
+ }
896
+ else
897
+ {
898
+ N_linkages_alloced = N_linkages_found;
899
+ }
900
+
901
+ link_info = linkage_info_new(N_linkages_alloced);
902
+ N_valid_linkages = 0;
903
+
904
+ /* Generate an array of linkage indices to examine */
905
+ indices = (int *) xalloc(N_linkages_alloced * sizeof(int));
906
+ if (overflowed)
907
+ {
908
+ for (in=0; in < N_linkages_alloced; in++)
909
+ {
910
+ indices[in] = -(in+1);
911
+ }
912
+ }
913
+ else
914
+ {
915
+ sent->rand_state = N_linkages_found + sent->length;
916
+ for (in=0; in<N_linkages_alloced; in++)
917
+ {
918
+ double frac = (double) N_linkages_found;
919
+ frac /= (double) N_linkages_alloced;
920
+ block_bottom = (int) (((double) in) * frac);
921
+ block_top = (int) (((double) (in+1)) * frac);
922
+ indices[in] = block_bottom +
923
+ (rand_r(&sent->rand_state) % (block_top-block_bottom));
924
+ }
925
+ }
926
+
927
+ only_canonical_allowed = !(overflowed || (N_linkages_found > 2*opts->linkage_limit));
928
+ /* When we're processing only a small subset of the linkages,
929
+ * don't worry about restricting the set we consider to be
930
+ * canonical ones. In the extreme case where we are only
931
+ * generating 1 in a million linkages, it's very unlikely
932
+ * that we'll hit two symmetric variants of the same linkage
933
+ * anyway.
934
+ */
935
+ /* (optional) first pass: just visit the linkages */
936
+ /* The purpose of these two passes is to make the post-processing
937
+ * more efficient. Because (hopefully) by the time you do the
938
+ * real work in the 2nd pass you've pruned the relevant rule set
939
+ * in the first pass.
940
+ */
941
+ if (sent->length >= opts->twopass_length)
942
+ {
943
+ for (in=0; (in < N_linkages_alloced) &&
944
+ (!resources_exhausted(opts->resources)); in++)
945
+ {
946
+ extract_links(indices[in], sent->null_count, sent->parse_info);
947
+ if (set_has_fat_down(sent))
948
+ {
949
+ if (only_canonical_allowed && !is_canonical_linkage(sent)) continue;
950
+ analyze_fat_linkage(sent, opts, PP_FIRST_PASS);
951
+ }
952
+ else
953
+ {
954
+ analyze_thin_linkage(sent, opts, PP_FIRST_PASS);
955
+ }
956
+ }
957
+ }
958
+
959
+ /* second pass: actually perform post-processing */
960
+ N_linkages_post_processed = 0;
961
+ N_thin_linkages = 0;
962
+ for (in=0; (in < N_linkages_alloced) &&
963
+ (!resources_exhausted(opts->resources)); in++)
964
+ {
965
+ Linkage_info *lifo = &link_info[N_linkages_post_processed];
966
+ extract_links(indices[in], sent->null_count, sent->parse_info);
967
+ if (set_has_fat_down(sent))
968
+ {
969
+ canonical = is_canonical_linkage(sent);
970
+ if (only_canonical_allowed && !canonical) continue;
971
+ *lifo = analyze_fat_linkage(sent, opts, PP_SECOND_PASS);
972
+ lifo->fat = TRUE;
973
+ lifo->canonical = canonical;
974
+ }
975
+ else
976
+ {
977
+ *lifo = analyze_thin_linkage(sent, opts, PP_SECOND_PASS);
978
+ lifo->fat = FALSE;
979
+ lifo->canonical = TRUE;
980
+ }
981
+ if (0 == lifo->N_violations)
982
+ {
983
+ N_valid_linkages++;
984
+ if (FALSE == lifo->fat) N_thin_linkages++;
985
+ }
986
+ lifo->index = indices[in];
987
+ lg_corpus_score(sent, lifo);
988
+ N_linkages_post_processed++;
989
+ }
990
+
991
+ print_time(opts, "Postprocessed all linkages");
992
+ qsort((void *)link_info, N_linkages_post_processed, sizeof(Linkage_info),
993
+ (int (*)(const void *, const void *)) opts->cost_model.compare_fn);
994
+
995
+ if (!resources_exhausted(opts->resources))
996
+ {
997
+ if ((N_linkages_post_processed == 0) &&
998
+ (N_linkages_found > 0) &&
999
+ (N_linkages_found < opts->linkage_limit))
1000
+ {
1001
+ /* With the current parser, the following sentence will elicit
1002
+ * this error:
1003
+ *
1004
+ * Well, say, Joe, you can be Friar Tuck or Much the miller's
1005
+ * son, and lam me with a quarter-staff; or I'll be the Sheriff
1006
+ * of Nottingham and you be Robin Hood a little while and kill
1007
+ * me.
1008
+ */
1009
+ err_ctxt ec;
1010
+ ec.sent = sent;
1011
+ err_msg(&ec, Error, "Error: None of the linkages is canonical\n"
1012
+ "\tN_linkages_post_processed=%d "
1013
+ "N_linkages_found=%d\n",
1014
+ N_linkages_post_processed,
1015
+ N_linkages_found);
1016
+ }
1017
+ }
1018
+
1019
+ if (opts->verbosity > 1)
1020
+ {
1021
+ err_ctxt ec;
1022
+ ec.sent = sent;
1023
+ err_msg(&ec, Info, "Info: %d of %d linkages with no P.P. violations\n",
1024
+ N_valid_linkages, N_linkages_post_processed);
1025
+ }
1026
+
1027
+ print_time(opts, "Sorted all linkages");
1028
+
1029
+ sent->num_linkages_alloced = N_linkages_alloced;
1030
+ sent->num_linkages_post_processed = N_linkages_post_processed;
1031
+ sent->num_valid_linkages = N_valid_linkages;
1032
+ sent->num_thin_linkages = N_thin_linkages;
1033
+ sent->link_info = link_info;
1034
+
1035
+ xfree(indices, N_linkages_alloced * sizeof(int));
1036
+ /*if(N_valid_linkages == 0) free_andlists(sent); */
1037
+ }
1038
+
1039
+ /***************************************************************
1040
+ *
1041
+ * Routines for creating and destroying processing Sentences
1042
+ *
1043
+ ****************************************************************/
1044
+
1045
+ Sentence sentence_create(const char *input_string, Dictionary dict)
1046
+ {
1047
+ Sentence sent;
1048
+
1049
+ sent = (Sentence) xalloc(sizeof(struct Sentence_s));
1050
+ memset(sent, 0, sizeof(struct Sentence_s));
1051
+ sent->dict = dict;
1052
+ sent->length = 0;
1053
+ sent->num_linkages_found = 0;
1054
+ sent->num_linkages_alloced = 0;
1055
+ sent->num_linkages_post_processed = 0;
1056
+ sent->num_valid_linkages = 0;
1057
+ sent->link_info = NULL;
1058
+ sent->deletable = NULL;
1059
+ sent->effective_dist = NULL;
1060
+ sent->num_valid_linkages = 0;
1061
+ sent->null_count = 0;
1062
+ sent->parse_info = NULL;
1063
+ sent->string_set = string_set_create();
1064
+
1065
+ sent->q_pruned_rules = FALSE;
1066
+ sent->is_conjunction = NULL;
1067
+
1068
+ sent->dptr = NULL;
1069
+ sent->deletable = NULL;
1070
+
1071
+ /* Make a copy of the input */
1072
+ sent->orig_sentence = string_set_add (input_string, sent->string_set);
1073
+
1074
+ return sent;
1075
+ }
1076
+
1077
+ /* XXX Extreme hack alert -- English-language words are used
1078
+ * completely naked in the C source code!!! FIXME !!!!
1079
+ */
1080
+ static void set_is_conjunction(Sentence sent)
1081
+ {
1082
+ int w;
1083
+ char * s;
1084
+ for (w=0; w<sent->length; w++) {
1085
+ s = sent->word[w].string;
1086
+ sent->is_conjunction[w] =
1087
+ (strcmp(s, "and")==0) ||
1088
+ (strcmp(s, "or" )==0) ||
1089
+ (strcmp(s, "but")==0) ||
1090
+ (strcmp(s, "nor")==0);
1091
+ }
1092
+ }
1093
+
1094
+ int sentence_split(Sentence sent, Parse_Options opts)
1095
+ {
1096
+ int i;
1097
+ Dictionary dict = sent->dict;
1098
+
1099
+ /* Cleanup stuff previously allocated. This is because some free
1100
+ * routines depend on sent-length, which might change in different
1101
+ * parse-opts settings.
1102
+ */
1103
+ free_deletable(sent);
1104
+
1105
+ /* Tokenize */
1106
+ if (!separate_sentence(sent, opts))
1107
+ {
1108
+ return -1;
1109
+ }
1110
+
1111
+ sent->q_pruned_rules = FALSE; /* for post processing */
1112
+ sent->is_conjunction = (char *) xalloc(sizeof(char)*sent->length);
1113
+ set_is_conjunction(sent);
1114
+ initialize_conjunction_tables(sent);
1115
+
1116
+ for (i=0; i<sent->length; i++)
1117
+ {
1118
+ /* in case we free these before they set to anything else */
1119
+ sent->word[i].x = NULL;
1120
+ sent->word[i].d = NULL;
1121
+ }
1122
+
1123
+ if (!(dict->unknown_word_defined && dict->use_unknown_word))
1124
+ {
1125
+ if (!sentence_in_dictionary(sent)) {
1126
+ return -2;
1127
+ }
1128
+ }
1129
+
1130
+ /* Look up each word in the dictionary, collect up all
1131
+ * plausible disjunct expressions for each word.
1132
+ */
1133
+ if (!build_sentence_expressions(sent, opts))
1134
+ {
1135
+ sent->num_valid_linkages = 0;
1136
+ return -3;
1137
+ }
1138
+
1139
+ return 0;
1140
+ }
1141
+
1142
+ void sentence_delete(Sentence sent)
1143
+ {
1144
+ if (!sent) return;
1145
+ sat_sentence_delete(sent);
1146
+ /* free_andlists(sent); */
1147
+ free_sentence_disjuncts(sent);
1148
+ free_sentence_expressions(sent);
1149
+ string_set_delete(sent->string_set);
1150
+ if (sent->parse_info) free_parse_info(sent->parse_info);
1151
+ free_post_processing(sent);
1152
+ post_process_close_sentence(sent->dict->postprocessor);
1153
+ free_deletable(sent);
1154
+ free_effective_dist(sent);
1155
+ free_count(sent);
1156
+ free_analyze(sent);
1157
+ if (sent->is_conjunction) xfree(sent->is_conjunction, sizeof(char)*sent->length);
1158
+ xfree((char *) sent, sizeof(struct Sentence_s));
1159
+ }
1160
+
1161
+ int sentence_length(Sentence sent)
1162
+ {
1163
+ if (!sent) return 0;
1164
+ return sent->length;
1165
+ }
1166
+
1167
+ const char * sentence_get_word(Sentence sent, int index)
1168
+ {
1169
+ if (!sent) return NULL;
1170
+ return sent->word[index].string;
1171
+ }
1172
+
1173
+ const char * sentence_get_nth_word(Sentence sent, int index)
1174
+ {
1175
+ if (!sent) return NULL;
1176
+ return sent->word[index].string;
1177
+ }
1178
+
1179
+ int sentence_null_count(Sentence sent) {
1180
+ if (!sent) return 0;
1181
+ return sent->null_count;
1182
+ }
1183
+
1184
+ int sentence_num_thin_linkages(Sentence sent) {
1185
+ if (!sent) return 0;
1186
+ return sent->num_thin_linkages;
1187
+ }
1188
+
1189
+ int sentence_num_linkages_found(Sentence sent) {
1190
+ if (!sent) return 0;
1191
+ return sent->num_linkages_found;
1192
+ }
1193
+
1194
+ int sentence_num_valid_linkages(Sentence sent) {
1195
+ if (!sent) return 0;
1196
+ return sent->num_valid_linkages;
1197
+ }
1198
+
1199
+ int sentence_num_linkages_post_processed(Sentence sent) {
1200
+ if (!sent) return 0;
1201
+ return sent->num_linkages_post_processed;
1202
+ }
1203
+
1204
+ int sentence_num_violations(Sentence sent, int i) {
1205
+ if (!sent) return 0;
1206
+
1207
+ /* The sat solver (currently) fails to fill in link_info */
1208
+ if (!sent->link_info) return 0;
1209
+ return sent->link_info[i].N_violations;
1210
+ }
1211
+
1212
+ int sentence_and_cost(Sentence sent, int i) {
1213
+ if (!sent) return 0;
1214
+
1215
+ /* The sat solver (currently) fails to fill in link_info */
1216
+ if (!sent->link_info) return 0;
1217
+ return sent->link_info[i].and_cost;
1218
+ }
1219
+
1220
+ int sentence_disjunct_cost(Sentence sent, int i) {
1221
+ if (!sent) return 0;
1222
+
1223
+ /* The sat solver (currently) fails to fill in link_info */
1224
+ if (!sent->link_info) return 0;
1225
+ return sent->link_info[i].disjunct_cost;
1226
+ }
1227
+
1228
+ int sentence_link_cost(Sentence sent, int i) {
1229
+ if (!sent) return 0;
1230
+
1231
+ /* The sat solver (currently) fails to fill in link_info */
1232
+ if (!sent->link_info) return 0;
1233
+ return sent->link_info[i].link_cost;
1234
+ }
1235
+
1236
+ int sentence_nth_word_has_disjunction(Sentence sent, int i)
1237
+ {
1238
+ if (!sent) return 0;
1239
+ prt_error("Warning: sentence_nth_word_has_disjunction() is deprecated!\n");
1240
+ return (sent->parse_info->chosen_disjuncts[i] != NULL);
1241
+ }
1242
+
1243
+ static void chart_parse(Sentence sent, Parse_Options opts)
1244
+ {
1245
+ int nl;
1246
+
1247
+ /* Build lists of disjuncts */
1248
+ prepare_to_parse(sent, opts);
1249
+
1250
+ init_fast_matcher(sent);
1251
+ init_count(sent);
1252
+
1253
+ /* A parse set may have been already been built for this sentence,
1254
+ * if it was previously parsed. If so we free it up before
1255
+ * building another. */
1256
+ if (sent->parse_info) free_parse_info(sent->parse_info);
1257
+ sent->parse_info = parse_info_new(sent->length);
1258
+
1259
+ for (nl = opts->min_null_count; nl<=opts->max_null_count ; ++nl)
1260
+ {
1261
+ s64 total;
1262
+ if (resources_exhausted(opts->resources)) break;
1263
+ sent->null_count = nl;
1264
+ total = do_parse(sent, sent->null_count, opts);
1265
+
1266
+ if (verbosity > 1)
1267
+ {
1268
+ prt_error("Info: Total count with %d null links: %lld\n",
1269
+ sent->null_count, total);
1270
+ }
1271
+
1272
+ /* Give up if the parse count is overflowing */
1273
+ if (PARSE_NUM_OVERFLOW < total)
1274
+ {
1275
+ if (verbosity > 0)
1276
+ {
1277
+ prt_error("WARNING: Combinatorial explosion! nulls=%d cnt=%lld\n"
1278
+ "Consider retrying the parse with the max allowed disjunct cost set lower.\n",
1279
+ sent->null_count, total);
1280
+ }
1281
+ total = (total>INT_MAX) ? INT_MAX : total;
1282
+ }
1283
+
1284
+ sent->num_linkages_found = (int) total;
1285
+ print_time(opts, "Counted parses");
1286
+
1287
+ post_process_linkages(sent, opts);
1288
+ if (sent->num_valid_linkages > 0) break;
1289
+
1290
+ /* If we are here, then no valid linakges were found.
1291
+ * If there was a parse overflow, give up now. */
1292
+ if (PARSE_NUM_OVERFLOW < total) break;
1293
+ }
1294
+
1295
+ free_count(sent);
1296
+ free_fast_matcher(sent);
1297
+ }
1298
+
1299
+ int sentence_parse(Sentence sent, Parse_Options opts)
1300
+ {
1301
+ int rc;
1302
+
1303
+ verbosity = opts->verbosity;
1304
+
1305
+ /* If the sentence has not yet been split, do so now.
1306
+ * This is for backwards compatibility, for existing programs
1307
+ * that do not explicitly call the splitter.
1308
+ */
1309
+ if (0 == sent->length)
1310
+ {
1311
+ rc = sentence_split(sent, opts);
1312
+ if (rc) return -1;
1313
+ }
1314
+
1315
+ /* Check for bad sentence length */
1316
+ if (MAX_SENTENCE <= sent->length)
1317
+ {
1318
+ prt_error("Error: sentence too long, contains more than %d words\n",
1319
+ MAX_SENTENCE);
1320
+ return -2;
1321
+ }
1322
+
1323
+ /* Initialize/free any leftover garbage */
1324
+ free_sentence_disjuncts(sent);
1325
+ resources_reset_space(opts->resources);
1326
+
1327
+ if (resources_exhausted(opts->resources)) {
1328
+ sent->num_valid_linkages = 0;
1329
+ return 0;
1330
+ }
1331
+
1332
+ init_analyze(sent);
1333
+
1334
+ /* Expressions were previously set up during the tokenize stage. */
1335
+ expression_prune(sent);
1336
+ print_time(opts, "Finished expression pruning");
1337
+ if (opts->use_sat_solver)
1338
+ {
1339
+ sat_parse(sent, opts);
1340
+ }
1341
+ else
1342
+ {
1343
+ chart_parse(sent, opts);
1344
+ }
1345
+ print_time(opts, "Finished parse");
1346
+
1347
+ return sent->num_valid_linkages;
1348
+ }
1349
+
1350
+ /***************************************************************
1351
+ *
1352
+ * Routines which allow user access to Linkages.
1353
+ *
1354
+ ****************************************************************/
1355
+
1356
+ Linkage linkage_create(int k, Sentence sent, Parse_Options opts)
1357
+ {
1358
+ Linkage linkage;
1359
+
1360
+ if (opts->use_sat_solver)
1361
+ {
1362
+ //return sat_create_linkage(k, sent, opts);
1363
+ }
1364
+
1365
+ if ((k >= sent->num_linkages_post_processed) || (k < 0)) return NULL;
1366
+
1367
+ /* Using exalloc since this is external to the parser itself. */
1368
+ linkage = (Linkage) exalloc(sizeof(struct Linkage_s));
1369
+
1370
+ linkage->num_words = sent->length;
1371
+ linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));
1372
+ linkage->current = 0;
1373
+ linkage->num_sublinkages=0;
1374
+ linkage->sublinkage = NULL;
1375
+ linkage->unionized = FALSE;
1376
+ linkage->sent = sent;
1377
+ linkage->opts = opts;
1378
+ linkage->info = &sent->link_info[k];
1379
+ linkage->dis_con_tree = NULL;
1380
+
1381
+ extract_links(sent->link_info[k].index, sent->null_count, sent->parse_info);
1382
+ compute_chosen_words(sent, linkage);
1383
+
1384
+ if (set_has_fat_down(sent))
1385
+ {
1386
+ extract_fat_linkage(sent, opts, linkage);
1387
+ }
1388
+ else
1389
+ {
1390
+ extract_thin_linkage(sent, opts, linkage);
1391
+ }
1392
+
1393
+ if (sent->dict->postprocessor != NULL)
1394
+ {
1395
+ linkage_post_process(linkage, sent->dict->postprocessor);
1396
+ }
1397
+
1398
+ return linkage;
1399
+ }
1400
+
1401
+ int linkage_get_current_sublinkage(Linkage linkage) {
1402
+ return linkage->current;
1403
+ }
1404
+
1405
+ int linkage_set_current_sublinkage(Linkage linkage, int index)
1406
+ {
1407
+ if ((index < 0) ||
1408
+ (index >= linkage->num_sublinkages))
1409
+ {
1410
+ return 0;
1411
+ }
1412
+ linkage->current = index;
1413
+ return 1;
1414
+ }
1415
+
1416
+ static void exfree_pp_info(PP_info *ppi)
1417
+ {
1418
+ if (ppi->num_domains > 0)
1419
+ exfree(ppi->domain_name, sizeof(const char *)*ppi->num_domains);
1420
+ ppi->domain_name = NULL;
1421
+ ppi->num_domains = 0;
1422
+ }
1423
+
1424
+ void linkage_delete(Linkage linkage)
1425
+ {
1426
+ int i, j;
1427
+ Sublinkage *s;
1428
+
1429
+ /* Can happen on panic timeout or user error */
1430
+ if (NULL == linkage) return;
1431
+
1432
+ for (i=0; i<linkage->num_words; ++i)
1433
+ {
1434
+ exfree((void *) linkage->word[i], strlen(linkage->word[i])+1);
1435
+ }
1436
+ exfree(linkage->word, sizeof(char *)*linkage->num_words);
1437
+
1438
+ for (i=0; i<linkage->num_sublinkages; ++i)
1439
+ {
1440
+ s = &(linkage->sublinkage[i]);
1441
+ for (j=0; j<s->num_links; ++j) {
1442
+ exfree_link(s->link[j]);
1443
+ }
1444
+ exfree(s->link, sizeof(Link)*s->num_links);
1445
+ if (s->pp_info != NULL) {
1446
+ for (j=0; j<s->num_links; ++j) {
1447
+ exfree_pp_info(&s->pp_info[j]);
1448
+ }
1449
+ exfree(s->pp_info, sizeof(PP_info)*s->num_links);
1450
+ s->pp_info = NULL;
1451
+ post_process_free_data(&s->pp_data);
1452
+ }
1453
+ if (s->violation != NULL) {
1454
+ exfree((void *) s->violation, sizeof(char)*(strlen(s->violation)+1));
1455
+ }
1456
+ }
1457
+ exfree(linkage->sublinkage, sizeof(Sublinkage)*linkage->num_sublinkages);
1458
+ if (linkage->dis_con_tree)
1459
+ free_DIS_tree(linkage->dis_con_tree);
1460
+ exfree(linkage, sizeof(struct Linkage_s));
1461
+ }
1462
+
1463
+ static int links_are_equal(Link *l, Link *m)
1464
+ {
1465
+ return ((l->l == m->l) && (l->r == m->r) && (strcmp(l->name, m->name)==0));
1466
+ }
1467
+
1468
+ static int link_already_appears(Linkage linkage, Link *link, int a)
1469
+ {
1470
+ int i, j;
1471
+
1472
+ for (i=0; i<a; ++i) {
1473
+ for (j=0; j<linkage->sublinkage[i].num_links; ++j) {
1474
+ if (links_are_equal(linkage->sublinkage[i].link[j], link)) return TRUE;
1475
+ }
1476
+ }
1477
+ return FALSE;
1478
+ }
1479
+
1480
+ static PP_info excopy_pp_info(PP_info ppi)
1481
+ {
1482
+ PP_info newppi;
1483
+ int i;
1484
+
1485
+ newppi.num_domains = ppi.num_domains;
1486
+ newppi.domain_name = (const char **) exalloc(sizeof(const char *)*ppi.num_domains);
1487
+ for (i=0; i<newppi.num_domains; ++i)
1488
+ {
1489
+ newppi.domain_name[i] = ppi.domain_name[i];
1490
+ }
1491
+ return newppi;
1492
+ }
1493
+
1494
+
1495
+ static Sublinkage unionize_linkage(Linkage linkage)
1496
+ {
1497
+ int i, j, num_in_union=0;
1498
+ Sublinkage u;
1499
+ Link *link;
1500
+ const char *p;
1501
+
1502
+ for (i=0; i<linkage->num_sublinkages; ++i) {
1503
+ for (j=0; j<linkage->sublinkage[i].num_links; ++j) {
1504
+ link = linkage->sublinkage[i].link[j];
1505
+ if (!link_already_appears(linkage, link, i)) num_in_union++;
1506
+ }
1507
+ }
1508
+
1509
+ u.link = (Link **) exalloc(sizeof(Link *)*num_in_union);
1510
+ u.num_links = num_in_union;
1511
+ zero_sublinkage(&u);
1512
+
1513
+ u.pp_info = (PP_info *) exalloc(sizeof(PP_info)*num_in_union);
1514
+ u.violation = NULL;
1515
+ u.num_links = num_in_union;
1516
+
1517
+ num_in_union = 0;
1518
+
1519
+ for (i=0; i<linkage->num_sublinkages; ++i) {
1520
+ for (j=0; j<linkage->sublinkage[i].num_links; ++j) {
1521
+ link = linkage->sublinkage[i].link[j];
1522
+ if (!link_already_appears(linkage, link, i)) {
1523
+ u.link[num_in_union] = excopy_link(link);
1524
+ u.pp_info[num_in_union] = excopy_pp_info(linkage->sublinkage[i].pp_info[j]);
1525
+ if (((p=linkage->sublinkage[i].violation) != NULL) &&
1526
+ (u.violation == NULL)) {
1527
+ char *s = (char *) exalloc((strlen(p)+1)*sizeof(char));
1528
+ strcpy(s, p);
1529
+ u.violation = s;
1530
+ }
1531
+ num_in_union++;
1532
+ }
1533
+ }
1534
+ }
1535
+
1536
+ return u;
1537
+ }
1538
+
1539
+ int linkage_compute_union(Linkage linkage)
1540
+ {
1541
+ int i, num_subs=linkage->num_sublinkages;
1542
+ Sublinkage * new_sublinkage, *s;
1543
+
1544
+ if (linkage->unionized) {
1545
+ linkage->current = linkage->num_sublinkages-1;
1546
+ return 0;
1547
+ }
1548
+ if (num_subs == 1) {
1549
+ linkage->unionized = TRUE;
1550
+ return 1;
1551
+ }
1552
+
1553
+ new_sublinkage =
1554
+ (Sublinkage *) exalloc(sizeof(Sublinkage)*(num_subs+1));
1555
+
1556
+ for (i=0; i<num_subs; ++i) {
1557
+ new_sublinkage[i] = linkage->sublinkage[i];
1558
+ }
1559
+ exfree(linkage->sublinkage, sizeof(Sublinkage)*num_subs);
1560
+ linkage->sublinkage = new_sublinkage;
1561
+
1562
+ /* Zero out the new sublinkage, then unionize it. */
1563
+ s = &new_sublinkage[num_subs];
1564
+ s->link = NULL;
1565
+ s->num_links = 0;
1566
+ zero_sublinkage(s);
1567
+ linkage->sublinkage[num_subs] = unionize_linkage(linkage);
1568
+
1569
+ linkage->num_sublinkages++;
1570
+
1571
+ linkage->unionized = TRUE;
1572
+ linkage->current = linkage->num_sublinkages-1;
1573
+ return 1;
1574
+ }
1575
+
1576
+ int linkage_get_num_sublinkages(Linkage linkage) {
1577
+ return linkage->num_sublinkages;
1578
+ }
1579
+
1580
+ int linkage_get_num_words(Linkage linkage)
1581
+ {
1582
+ return linkage->num_words;
1583
+ }
1584
+
1585
+ int linkage_get_num_links(Linkage linkage)
1586
+ {
1587
+ int current = linkage->current;
1588
+ return linkage->sublinkage[current].num_links;
1589
+ }
1590
+
1591
+ static inline int verify_link_index(Linkage linkage, int index)
1592
+ {
1593
+ if ((index < 0) ||
1594
+ (index >= linkage->sublinkage[linkage->current].num_links))
1595
+ {
1596
+ return 0;
1597
+ }
1598
+ return 1;
1599
+ }
1600
+
1601
+ int linkage_get_link_length(Linkage linkage, int index)
1602
+ {
1603
+ Link *link;
1604
+ int word_has_link[MAX_SENTENCE];
1605
+ int i, length;
1606
+ int current = linkage->current;
1607
+
1608
+ if (!verify_link_index(linkage, index)) return -1;
1609
+
1610
+ for (i=0; i<linkage->num_words+1; ++i) {
1611
+ word_has_link[i] = FALSE;
1612
+ }
1613
+
1614
+ for (i=0; i<linkage->sublinkage[current].num_links; ++i) {
1615
+ link = linkage->sublinkage[current].link[i];
1616
+ word_has_link[link->l] = TRUE;
1617
+ word_has_link[link->r] = TRUE;
1618
+ }
1619
+
1620
+ link = linkage->sublinkage[current].link[index];
1621
+ length = link->r - link->l;
1622
+ for (i= link->l+1; i < link->r; ++i) {
1623
+ if (!word_has_link[i]) length--;
1624
+ }
1625
+ return length;
1626
+ }
1627
+
1628
+ int linkage_get_link_lword(Linkage linkage, int index)
1629
+ {
1630
+ Link *link;
1631
+ if (!verify_link_index(linkage, index)) return -1;
1632
+ link = linkage->sublinkage[linkage->current].link[index];
1633
+ return link->l;
1634
+ }
1635
+
1636
+ int linkage_get_link_rword(Linkage linkage, int index)
1637
+ {
1638
+ Link *link;
1639
+ if (!verify_link_index(linkage, index)) return -1;
1640
+ link = linkage->sublinkage[linkage->current].link[index];
1641
+ return link->r;
1642
+ }
1643
+
1644
+ const char * linkage_get_link_label(Linkage linkage, int index)
1645
+ {
1646
+ Link *link;
1647
+ if (!verify_link_index(linkage, index)) return NULL;
1648
+ link = linkage->sublinkage[linkage->current].link[index];
1649
+ return link->name;
1650
+ }
1651
+
1652
+ const char * linkage_get_link_llabel(Linkage linkage, int index)
1653
+ {
1654
+ Link *link;
1655
+ if (!verify_link_index(linkage, index)) return NULL;
1656
+ link = linkage->sublinkage[linkage->current].link[index];
1657
+ return link->lc->string;
1658
+ }
1659
+
1660
+ const char * linkage_get_link_rlabel(Linkage linkage, int index)
1661
+ {
1662
+ Link *link;
1663
+ if (!verify_link_index(linkage, index)) return NULL;
1664
+ link = linkage->sublinkage[linkage->current].link[index];
1665
+ return link->rc->string;
1666
+ }
1667
+
1668
+ const char ** linkage_get_words(Linkage linkage)
1669
+ {
1670
+ return linkage->word;
1671
+ }
1672
+
1673
+ Sentence linkage_get_sentence(Linkage linkage)
1674
+ {
1675
+ return linkage->sent;
1676
+ }
1677
+
1678
+ const char * linkage_get_disjunct_str(Linkage linkage, int w)
1679
+ {
1680
+ Disjunct *dj;
1681
+
1682
+ if (NULL == linkage->info->disjunct_list_str)
1683
+ {
1684
+ lg_compute_disjunct_strings(linkage->sent, linkage->info);
1685
+ }
1686
+
1687
+ /* dj will be null if the word wasn't used in the parse. */
1688
+ dj = linkage->sent->parse_info->chosen_disjuncts[w];
1689
+ if (NULL == dj) return "";
1690
+
1691
+ return linkage->info->disjunct_list_str[w];
1692
+ }
1693
+
1694
+ double linkage_get_disjunct_cost(Linkage linkage, int w)
1695
+ {
1696
+ Disjunct *dj = linkage->sent->parse_info->chosen_disjuncts[w];
1697
+
1698
+ /* dj may be null, if the word didn't participate in the parse. */
1699
+ if (dj) return dj->cost;
1700
+ return 0.0;
1701
+ }
1702
+
1703
+ double linkage_get_disjunct_corpus_score(Linkage linkage, int w)
1704
+ {
1705
+ Disjunct *dj = linkage->sent->parse_info->chosen_disjuncts[w];
1706
+
1707
+ /* dj may be null, if the word didn't participate in the parse. */
1708
+ if (NULL == dj) return 99.999;
1709
+
1710
+ return lg_corpus_disjunct_score(linkage, w);
1711
+ }
1712
+
1713
+ const char * linkage_get_word(Linkage linkage, int w)
1714
+ {
1715
+ return linkage->word[w];
1716
+ }
1717
+
1718
+ int linkage_unused_word_cost(Linkage linkage)
1719
+ {
1720
+ /* The sat solver (currently) fails to fill in info */
1721
+ if (!linkage->info) return 0;
1722
+ return linkage->info->unused_word_cost;
1723
+ }
1724
+
1725
+ int linkage_disjunct_cost(Linkage linkage)
1726
+ {
1727
+ /* The sat solver (currently) fails to fill in info */
1728
+ if (!linkage->info) return 0;
1729
+ return (int) floorf(linkage->info->disjunct_cost);
1730
+ }
1731
+
1732
+ int linkage_is_fat(Linkage linkage)
1733
+ {
1734
+ /* The sat solver (currently) fails to fill in info */
1735
+ if (!linkage->info) return 0;
1736
+ return linkage->info->fat;
1737
+ }
1738
+
1739
+ int linkage_and_cost(Linkage linkage)
1740
+ {
1741
+ /* The sat solver (currently) fails to fill in info */
1742
+ if (!linkage->info) return 0;
1743
+ return linkage->info->and_cost;
1744
+ }
1745
+
1746
+ int linkage_link_cost(Linkage linkage)
1747
+ {
1748
+ /* The sat solver (currently) fails to fill in info */
1749
+ if (!linkage->info) return 0;
1750
+ return linkage->info->link_cost;
1751
+ }
1752
+
1753
+ double linkage_corpus_cost(Linkage linkage)
1754
+ {
1755
+ /* The sat solver (currently) fails to fill in info */
1756
+ if (!linkage->info) return 0.0;
1757
+ return linkage->info->corpus_cost;
1758
+ }
1759
+
1760
+ int linkage_get_link_num_domains(Linkage linkage, int index)
1761
+ {
1762
+ PP_info *pp_info;
1763
+ if (!verify_link_index(linkage, index)) return -1;
1764
+ pp_info = &linkage->sublinkage[linkage->current].pp_info[index];
1765
+ return pp_info->num_domains;
1766
+ }
1767
+
1768
+ const char ** linkage_get_link_domain_names(Linkage linkage, int index)
1769
+ {
1770
+ PP_info *pp_info;
1771
+ if (!verify_link_index(linkage, index)) return NULL;
1772
+ pp_info = &linkage->sublinkage[linkage->current].pp_info[index];
1773
+ return pp_info->domain_name;
1774
+ }
1775
+
1776
+ const char * linkage_get_violation_name(Linkage linkage)
1777
+ {
1778
+ return linkage->sublinkage[linkage->current].violation;
1779
+ }
1780
+
1781
+ int linkage_is_canonical(Linkage linkage)
1782
+ {
1783
+ /* The sat solver (currently) fails to fill in info */
1784
+ if (!linkage->info) return TRUE;
1785
+ return linkage->info->canonical;
1786
+ }
1787
+
1788
+ int linkage_is_improper(Linkage linkage)
1789
+ {
1790
+ /* The sat solver (currently) fails to fill in info */
1791
+ if (!linkage->info) return FALSE;
1792
+ return linkage->info->improper_fat_linkage;
1793
+ }
1794
+
1795
+ int linkage_has_inconsistent_domains(Linkage linkage)
1796
+ {
1797
+ /* The sat solver (currently) fails to fill in info */
1798
+ if (!linkage->info) return FALSE;
1799
+ return linkage->info->inconsistent_domains;
1800
+ }
1801
+
1802
+ void linkage_post_process(Linkage linkage, Postprocessor * postprocessor)
1803
+ {
1804
+ int N_sublinkages = linkage_get_num_sublinkages(linkage);
1805
+ Parse_Options opts = linkage->opts;
1806
+ Sentence sent = linkage->sent;
1807
+ Sublinkage * subl;
1808
+ PP_node * pp;
1809
+ int i, j, k;
1810
+ D_type_list * d;
1811
+
1812
+ for (i = 0; i < N_sublinkages; ++i)
1813
+ {
1814
+ subl = &linkage->sublinkage[i];
1815
+ if (subl->pp_info != NULL)
1816
+ {
1817
+ for (j = 0; j < subl->num_links; ++j)
1818
+ {
1819
+ exfree_pp_info(&subl->pp_info[j]);
1820
+ }
1821
+ post_process_free_data(&subl->pp_data);
1822
+ exfree(subl->pp_info, sizeof(PP_info)*subl->num_links);
1823
+ }
1824
+ subl->pp_info = (PP_info *) exalloc(sizeof(PP_info)*subl->num_links);
1825
+ for (j = 0; j < subl->num_links; ++j)
1826
+ {
1827
+ subl->pp_info[j].num_domains = 0;
1828
+ subl->pp_info[j].domain_name = NULL;
1829
+ }
1830
+ if (subl->violation != NULL)
1831
+ {
1832
+ exfree((void *)subl->violation, sizeof(char)*(strlen(subl->violation)+1));
1833
+ subl->violation = NULL;
1834
+ }
1835
+
1836
+ if (linkage->info->improper_fat_linkage)
1837
+ {
1838
+ pp = NULL;
1839
+ }
1840
+ else
1841
+ {
1842
+ pp = post_process(postprocessor, opts, sent, subl, FALSE);
1843
+ /* This can return NULL, for example if there is no
1844
+ post-processor */
1845
+ }
1846
+
1847
+ if (pp == NULL)
1848
+ {
1849
+ for (j = 0; j < subl->num_links; ++j)
1850
+ {
1851
+ subl->pp_info[j].num_domains = 0;
1852
+ subl->pp_info[j].domain_name = NULL;
1853
+ }
1854
+ }
1855
+ else
1856
+ {
1857
+ for (j = 0; j < subl->num_links; ++j)
1858
+ {
1859
+ k = 0;
1860
+ for (d = pp->d_type_array[j]; d != NULL; d = d->next) k++;
1861
+ subl->pp_info[j].num_domains = k;
1862
+ if (k > 0)
1863
+ {
1864
+ subl->pp_info[j].domain_name = (const char **) exalloc(sizeof(const char *)*k);
1865
+ }
1866
+ k = 0;
1867
+ for (d = pp->d_type_array[j]; d != NULL; d = d->next)
1868
+ {
1869
+ char buff[5];
1870
+ sprintf(buff, "%c", d->type);
1871
+ subl->pp_info[j].domain_name[k] = string_set_add (buff, sent->string_set);
1872
+
1873
+ k++;
1874
+ }
1875
+ }
1876
+ subl->pp_data = postprocessor->pp_data;
1877
+ if (pp->violation != NULL)
1878
+ {
1879
+ char * s = (char *) exalloc(sizeof(char)*(strlen(pp->violation)+1));
1880
+ strcpy(s, pp->violation);
1881
+ subl->violation = s;
1882
+ }
1883
+ }
1884
+ }
1885
+ post_process_close_sentence(postprocessor);
1886
+ }
1887
+ #endif