grammar_cop 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,1049 @@
1
+ /*************************************************************************/
2
+ /* Copyright (c) 2004 */
3
+ /* Daniel Sleator, David Temperley, and John Lafferty */
4
+ /* Copyright (c) 2009 Linas Vepstas */
5
+ /* All rights reserved */
6
+ /* */
7
+ /* Use of the link grammar parsing system is subject to the terms of the */
8
+ /* license set forth in the LICENSE file included with this software, */
9
+ /* and also available at http://www.link.cs.cmu.edu/link/license.html */
10
+ /* This license allows free redistribution and use in source and binary */
11
+ /* forms, with or without modification, subject to certain conditions. */
12
+ /* */
13
+ /*************************************************************************/
14
+
15
+ #ifndef _WIN32
16
+ #include <langinfo.h>
17
+ #endif
18
+ #include <limits.h>
19
+
20
+ #include "build-disjuncts.h"
21
+ #include "error.h"
22
+ #include "externs.h"
23
+ #include "read-dict.h"
24
+ #include "regex-morph.h"
25
+ #include "spellcheck.h"
26
+ #include "string-set.h"
27
+ #include "structures.h"
28
+ #include "tokenize.h"
29
+ #include "utilities.h"
30
+ #include "word-utils.h"
31
+
32
+ #define MAX_STRIP 10
33
+
34
+ /* These are no longer in use, but are read from the 4.0.affix file */
35
+ /* I've left these here, as an example of what to expect. */
36
+ /*static char * strip_left[] = {"(", "$", "``", NULL}; */
37
+ /*static char * strip_right[] = {")", "%", ",", ".", ":", ";", "?", "!", "''", "'", "'s", NULL};*/
38
+
39
+ #define ENTITY_MARKER "<marker-entity>"
40
+ #define COMMON_ENTITY_MARKER "<marker-common-entity>"
41
+
42
+ /**
43
+ * is_common_entity - Return true if word is a common noun or adjective
44
+ * Common nouns and adjectives are typically used in corporate entity
45
+ * names -- e.g. "Sun State Bank" -- "sun", "state" and "bank" are all
46
+ * common nouns.
47
+ */
48
+ static int is_common_entity(Dictionary dict, const char * str)
49
+ {
50
+ if (word_contains(dict, str, COMMON_ENTITY_MARKER) == 1)
51
+ return TRUE;
52
+ return FALSE;
53
+ }
54
+
55
+ static int is_entity(Dictionary dict, const char * str)
56
+ {
57
+ const char * regex_name;
58
+ if (word_contains(dict, str, ENTITY_MARKER) == 1)
59
+ return TRUE;
60
+ regex_name = match_regex(dict, str);
61
+ if (NULL == regex_name) return FALSE;
62
+ return word_contains(dict, regex_name, ENTITY_MARKER);
63
+ }
64
+
65
+
66
/**
 * Return TRUE if word is a proper name.
 * XXX This is a cheap hack that works only in English, and is
 * broken for German! We need to replace this with something
 * language-specific.
 *
 * Basically, if the word starts with an upper-case letter, we
 * assume it's a proper name, and that's that.
 */
static int is_proper_name(const char * word)
{
	return is_utf8_upper(word);
}
79
+
80
/* Create a string containing anything that can be construed to
 * be a quotation mark. This works, because link-grammar is more
 * or less ignorant of quotes at this time.
 *
 * Returns a pointer to a static, always NUL-terminated wide string.
 * Uses a static buffer, so it is not thread-safe; the sole caller
 * (is_quote) caches the result once, matching that constraint.
 */
static const wchar_t *list_of_quotes(void)
{
#define QUSZ 50
	static wchar_t wqs[QUSZ];
	mbstate_t mbs;
	size_t ncvt;
	/* Single-quotes are used for abbreviations, don't mess with them */
	/* const char * qs = "\"\'«»《》【】『』‘’`„“"; */
	const char * qs = "\"«»《》【】『』`„“";

	const char *pqs = qs;

	memset(&mbs, 0, sizeof(mbs));

	/* Bug fix: the return value of mbsrtowcs() was previously ignored.
	 * In a locale that cannot represent the multi-byte quote characters
	 * (e.g. the "C" locale), it returns (size_t)-1 and may leave wqs
	 * unterminated, so the later wcschr() in is_quote() would walk off
	 * the end of the buffer (undefined behavior).  Convert into at most
	 * QUSZ-1 slots, fall back to an empty list on failure, and always
	 * NUL-terminate. */
	ncvt = mbsrtowcs(wqs, &pqs, QUSZ - 1, &mbs);
	if (ncvt == (size_t)-1)
		wqs[0] = L'\0';
	else
		wqs[ncvt] = L'\0';

	return wqs;
}
101
+
102
+ /**
103
+ * Return TRUE if the character is a quotation character.
104
+ */
105
+ static int is_quote(wchar_t wc)
106
+ {
107
+ static const wchar_t *quotes = NULL;
108
+ if (NULL == quotes) quotes = list_of_quotes();
109
+
110
+ if (NULL != wcschr(quotes, wc)) return TRUE;
111
+ return FALSE;
112
+ }
113
+
114
+ /**
115
+ * Returns true if the word can be interpreted as a number.
116
+ * The ":" is included here so we allow "10:30" to be a number.
117
+ * We also allow U+00A0 "no-break space"
118
+ */
119
+ static int is_number(const char * s)
120
+ {
121
+ mbstate_t mbs;
122
+ int nb = 1;
123
+ wchar_t c;
124
+ if (!is_utf8_digit(s)) return FALSE;
125
+
126
+ memset(&mbs, 0, sizeof(mbs));
127
+ while ((*s != 0) && (0 < nb))
128
+ {
129
+ nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
130
+ if (iswdigit(c)) { s += nb; }
131
+
132
+ /* U+00A0 no break space */
133
+ else if (0xa0 == c) { s += nb; }
134
+
135
+ else if ((*s == '.') || (*s == ',') || (*s == ':')) { s++; }
136
+ else return FALSE;
137
+ }
138
+ return TRUE;
139
+ }
140
+
141
+ /**
142
+ * Returns true if the word contains digits.
143
+ */
144
+ static int contains_digits(const char * s)
145
+ {
146
+ mbstate_t mbs;
147
+ int nb = 1;
148
+ wchar_t c;
149
+
150
+ memset(&mbs, 0, sizeof(mbs));
151
+ while ((*s != 0) && (0 < nb))
152
+ {
153
+ nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
154
+ if (iswdigit(c)) return TRUE;
155
+ s += nb;
156
+ }
157
+ return FALSE;
158
+ }
159
+
160
+ /**
161
+ * The string s is the next word of the sentence.
162
+ * Do not issue the empty string.
163
+ * Return false if too many words or the word is too long.
164
+ */
165
+ static int issue_sentence_word(Sentence sent, const char * s)
166
+ {
167
+ if (*s == '\0') return TRUE;
168
+ if (strlen(s) > MAX_WORD)
169
+ {
170
+ err_ctxt ec;
171
+ ec.sent = sent;
172
+ err_msg(&ec, Error,
173
+ "Error separating sentence. The word \"%s\" is too long.\n"
174
+ "A word can have a maximum of %d characters.\n", s, MAX_WORD);
175
+ return FALSE;
176
+ }
177
+
178
+ if (sent->length >= MAX_SENTENCE)
179
+ {
180
+ err_ctxt ec;
181
+ ec.sent = sent;
182
+ err_msg(&ec, Error,
183
+ "Error separating sentence. The sentence has too many words.\n");
184
+ return FALSE;
185
+ }
186
+
187
+ strcpy(sent->word[sent->length].string, s);
188
+
189
+ /* Now we record whether the first character of the word is upper-case.
190
+ (The first character may be made lower-case
191
+ later, but we may want to get at the original version) */
192
+ if (is_utf8_upper(s)) sent->word[sent->length].firstupper=1;
193
+ else sent->word[sent->length].firstupper = 0;
194
+ sent->length++;
195
+ return TRUE;
196
+ }
197
+
198
+ /*
199
+ Here's a summary of how subscripts are handled:
200
+
201
+ Reading the dictionary:
202
+
203
+ If the last "." in a string is followed by a non-digit character,
204
+ then the "." and everything after it is considered to be the subscript
205
+ of the word.
206
+
207
+ The dictionary reader does not allow you to have two words that
208
+ match according to the criterion below. (so you can't have
209
+ "dog.n" and "dog")
210
+
211
+ Quote marks are used to allow you to define words in the dictionary
212
+ which would otherwise be considered part of the dictionary, as in
213
+
214
+ ";": {@Xca-} & Xx- & (W+ or Qd+) & {Xx+};
215
+ "%" : (ND- & {DD-} & <noun-sub-x> &
216
+ (<noun-main-x> or B*x+)) or (ND- & (OD- or AN+));
217
+
218
+ Rules for chopping words from the input sentence:
219
+
220
+ First the prefix chars are stripped off of the word. These
221
+ characters are "(" and "$" (and now "``")
222
+
223
+ Now, repeat the following as long as necessary:
224
+
225
+ Look up the word in the dictionary.
226
+ If it's there, the process terminates.
227
+
228
+ If it's not there and it ends in one of the right strippable
229
+ strings (see "strip_right") then remove the strippable string
230
+ and make it into a separate word.
231
+
232
+ If there is no strippable string, then the process terminates.
233
+
234
+ Rule for defining subscripts in input words:
235
+
236
+ The subscript rule is followed just as when reading the dictionary.
237
+
238
+ When does a word in the sentence match a word in the dictionary?
239
+
240
+ Matching is done as follows: Two words with subscripts must match
241
+ exactly. If neither has a subscript they must match exactly. If one
242
+ does and one doesn't then they must match when the subscript is
243
+ removed. Notice that this is symmetric.
244
+
245
+ So, under this system, the dictionary could have the words "Ill" and
246
+ also the word "Ill." It could also have the word "i.e.", which could be
247
+ used in a sentence.
248
+ */
249
+
250
+ #undef MIN
251
+ #define MIN(a, b) (((a) < (b)) ? (a) : (b))
252
+
253
+ static int boolean_reg_dict_lookup(Dictionary dict, const char * word)
254
+ {
255
+ const char * regex_name;
256
+ if (boolean_dictionary_lookup(dict, word)) return TRUE;
257
+
258
+ regex_name = match_regex(dict, word);
259
+ if (NULL == regex_name) return FALSE;
260
+
261
+ return boolean_dictionary_lookup(dict, regex_name);
262
+ }
263
+
264
+ static int downcase_is_in_dict(Dictionary dict, char * word)
265
+ {
266
+ int i, rc;
267
+ char low[MB_LEN_MAX];
268
+ char save[MB_LEN_MAX];
269
+ wchar_t c;
270
+ int nbl, nbh;
271
+ mbstate_t mbs, mbss;
272
+
273
+ if (!is_utf8_upper(word)) return FALSE;
274
+
275
+ memset(&mbs, 0, sizeof(mbs));
276
+ memset(&mbss, 0, sizeof(mbss));
277
+
278
+ nbh = mbrtowc (&c, word, MB_CUR_MAX, &mbs);
279
+ c = towlower(c);
280
+ nbl = wctomb_check(low, c, &mbss);
281
+ if (nbh != nbl)
282
+ {
283
+ prt_error("Warning: can't downcase multi-byte string: %s\n", word);
284
+ return FALSE;
285
+ }
286
+
287
+ /* Downcase */
288
+ for (i=0; i<nbl; i++) { save[i] = word[i]; word[i] = low[i]; }
289
+
290
+ /* Look it up, then restore old value */
291
+ rc = boolean_reg_dict_lookup(dict, word);
292
+ for (i=0; i<nbh; i++) { word[i] = save[i]; }
293
+
294
+ return rc;
295
+ }
296
+
297
+ /**
298
+ * w points to a string, wend points to the char one after the end. The
299
+ * "word" w contains no blanks. This function splits up the word if
300
+ * necessary, and calls "issue_sentence_word()" on each of the resulting
301
+ * parts. The process is described above. Returns TRUE if OK, FALSE if
302
+ * too many punctuation marks or other separation error.
303
+ */
304
+ static int separate_word(Sentence sent, Parse_Options opts,
305
+ const char *w, const char *wend,
306
+ int is_first_word, int quote_found)
307
+ {
308
+ size_t sz;
309
+ int i, j, len;
310
+ int r_strippable=0, l_strippable=0, u_strippable=0;
311
+ int s_strippable=0, p_strippable=0;
312
+ int n_r_stripped, s_stripped;
313
+ int word_is_in_dict, s_ok;
314
+ int issued = FALSE;
315
+
316
+ int found_number = 0;
317
+ int n_r_stripped_save;
318
+ const char * wend_save;
319
+
320
+ const char ** strip_left = NULL;
321
+ const char ** strip_right = NULL;
322
+ const char ** strip_units = NULL;
323
+ const char ** prefix = NULL;
324
+ const char ** suffix = NULL;
325
+ char word[MAX_WORD+1];
326
+ char newword[MAX_WORD+1];
327
+
328
+ const char *r_stripped[MAX_STRIP]; /* these were stripped from the right */
329
+
330
+ /* First, see if we can already recognize the word as-is. If
331
+ * so, then we are done. Else we'll try stripping prefixes, suffixes.
332
+ */
333
+ sz = MIN(wend-w, MAX_WORD);
334
+ strncpy(word, w, sz);
335
+ word[sz] = '\0';
336
+ word_is_in_dict = FALSE;
337
+
338
+ if (boolean_reg_dict_lookup(sent->dict, word))
339
+ word_is_in_dict = TRUE;
340
+ else if (is_first_word && downcase_is_in_dict (sent->dict,word))
341
+ word_is_in_dict = TRUE;
342
+
343
+ if (word_is_in_dict)
344
+ {
345
+ return issue_sentence_word(sent, word);
346
+ }
347
+
348
+ /* Set up affix tables. */
349
+ if (sent->dict->affix_table != NULL)
350
+ {
351
+ Dictionary dict = sent->dict->affix_table;
352
+ r_strippable = dict->r_strippable;
353
+ l_strippable = dict->l_strippable;
354
+ u_strippable = dict->u_strippable;
355
+ p_strippable = dict->p_strippable;
356
+ s_strippable = dict->s_strippable;
357
+
358
+ strip_left = dict->strip_left;
359
+ strip_right = dict->strip_right;
360
+ strip_units = dict->strip_units;
361
+ prefix = dict->prefix;
362
+ suffix = dict->suffix;
363
+ }
364
+
365
+ /* Strip off punctuation, etc. on the left-hand side. */
366
+ /* XXX FIXME: this fails in certain cases: e.g.
367
+ * "By the '50s, he was very prosperous."
368
+ * where the leading quote is striped, and then "50s," cannot be
369
+ * found in the dict. Next, the comma is removed, and "50s" is still
370
+ * not in the dict ... the trick was that the comma should be
371
+ * right-stripped first, then the possible quotes.
372
+ * More generally, link-grammar does not support multiple possible
373
+ * tokenizations.
374
+ */
375
+ for (;;)
376
+ {
377
+ for (i=0; i<l_strippable; i++)
378
+ {
379
+ /* This is UTF8-safe, I beleive ... */
380
+ sz = strlen(strip_left[i]);
381
+ if (strncmp(w, strip_left[i], sz) == 0)
382
+ {
383
+ if (!issue_sentence_word(sent, strip_left[i])) return FALSE;
384
+ w += sz;
385
+ break;
386
+ }
387
+ }
388
+ if (i == l_strippable) break;
389
+ }
390
+
391
+ /* Its possible that the token consisted entirely of
392
+ * left-punctuation, in which case, it has all been issued.
393
+ * So -- we're done, return.
394
+ */
395
+ if (w >= wend) return TRUE;
396
+
397
+ /* Now w points to the string starting just to the right of
398
+ * any left-stripped characters.
399
+ * stripped[] is an array of numbers, indicating the index
400
+ * numbers (in the strip_right array) of any strings stripped off;
401
+ * stripped[0] is the number of the first string stripped off, etc.
402
+ * When it breaks out of this loop, n_stripped will be the number
403
+ * of strings stripped off.
404
+ */
405
+ for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++)
406
+ {
407
+ sz = MIN(wend-w, MAX_WORD);
408
+ strncpy(word, w, sz);
409
+ word[sz] = '\0';
410
+ if (wend == w) break; /* it will work without this */
411
+
412
+ if (boolean_reg_dict_lookup(sent->dict, word))
413
+ {
414
+ word_is_in_dict = TRUE;
415
+ break;
416
+ }
417
+
418
+ /* This could happen if it's a word after a colon, also! */
419
+ if (is_first_word && downcase_is_in_dict (sent->dict, word))
420
+ {
421
+ word_is_in_dict = TRUE;
422
+ break;
423
+ }
424
+
425
+ for (i=0; i < r_strippable; i++)
426
+ {
427
+ len = strlen(strip_right[i]);
428
+
429
+ /* the remaining w is too short for a possible match */
430
+ if ((wend-w) < len) continue;
431
+ if (strncmp(wend-len, strip_right[i], len) == 0)
432
+ {
433
+ r_stripped[n_r_stripped] = strip_right[i];
434
+ wend -= len;
435
+ break;
436
+ }
437
+ }
438
+ if (i == r_strippable) break;
439
+ }
440
+
441
+ /* Is there a number in the word? If so, then search for
442
+ * trailing units suffixes.
443
+ */
444
+ if ((FALSE == word_is_in_dict) && contains_digits(word))
445
+ {
446
+ /* Same as above, but with a twist: the only thing that can
447
+ * preceed a units suffix is a number. This is so that we can
448
+ * split up things like "12ft" (twelve feet) but not split up
449
+ * things like "Delft blue". Multiple passes allow for
450
+ * constructions such as 12sq.ft.
451
+ */
452
+ n_r_stripped_save = n_r_stripped;
453
+ wend_save = wend;
454
+ for (; n_r_stripped < MAX_STRIP; n_r_stripped++)
455
+ {
456
+ size_t sz = MIN(wend-w, MAX_WORD);
457
+ strncpy(word, w, sz);
458
+ word[sz] = '\0';
459
+ if (wend == w) break; /* it will work without this */
460
+
461
+ /* Number */
462
+ if (is_number(word))
463
+ {
464
+ found_number = 1;
465
+ break;
466
+ }
467
+
468
+ for (i=0; i < u_strippable; i++)
469
+ {
470
+ len = strlen(strip_units[i]);
471
+
472
+ /* the remaining w is too short for a possible match */
473
+ if ((wend-w) < len) continue;
474
+ if (strncmp(wend-len, strip_units[i], len) == 0)
475
+ {
476
+ r_stripped[n_r_stripped] = strip_units[i];
477
+ wend -= len;
478
+ break;
479
+ }
480
+ }
481
+ if (i == u_strippable) break;
482
+ }
483
+
484
+ /* The root *must* be a number! */
485
+ if (0 == found_number)
486
+ {
487
+ wend = wend_save;
488
+ n_r_stripped = n_r_stripped_save;
489
+ }
490
+ }
491
+
492
+ /* Now we strip off suffixes...w points to the remaining word,
493
+ * "wend" to the end of the word. */
494
+
495
+ s_stripped = -1;
496
+ strncpy(word, w, MIN(wend-w, MAX_WORD));
497
+ word[MIN(wend-w, MAX_WORD)] = '\0';
498
+
499
+ /* Umm, double-check, if need be ... !?? */
500
+ if (FALSE == word_is_in_dict)
501
+ {
502
+ if (boolean_reg_dict_lookup(sent->dict, word))
503
+ word_is_in_dict = TRUE;
504
+ else if (is_first_word && downcase_is_in_dict (sent->dict,word))
505
+ word_is_in_dict = TRUE;
506
+ }
507
+
508
+ if (FALSE == word_is_in_dict)
509
+ {
510
+ j=0;
511
+ for (i=0; i <= s_strippable; i++)
512
+ {
513
+ s_ok = 0;
514
+ /* Go through once for each suffix; then go through one
515
+ * final time for the no-suffix case */
516
+ if (i < s_strippable)
517
+ {
518
+ len = strlen(suffix[i]);
519
+
520
+ /* The remaining w is too short for a possible match */
521
+ if ((wend-w) < len) continue;
522
+ if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1;
523
+ }
524
+ else
525
+ len = 0;
526
+
527
+ if (s_ok || i == s_strippable)
528
+ {
529
+ strncpy(newword, w, MIN((wend-len)-w, MAX_WORD));
530
+ newword[MIN((wend-len)-w, MAX_WORD)] = '\0';
531
+
532
+ /* Check if the remainder is in the dictionary;
533
+ * for the no-suffix case, it won't be */
534
+ if (boolean_reg_dict_lookup(sent->dict, newword))
535
+ {
536
+ if ((verbosity>1) && (i < s_strippable))
537
+ printf("Splitting word into two: %s-%s\n", newword, suffix[i]);
538
+ s_stripped = i;
539
+ wend -= len;
540
+ strncpy(word, w, MIN(wend-w, MAX_WORD));
541
+ word[MIN(wend-w, MAX_WORD)] = '\0';
542
+ word_is_in_dict = TRUE;
543
+ break;
544
+ }
545
+
546
+ /* If the remainder isn't in the dictionary,
547
+ * try stripping off prefixes */
548
+ else
549
+ {
550
+ for (j=0; j<p_strippable; j++)
551
+ {
552
+ if (strncmp(w, prefix[j], strlen(prefix[j])) == 0)
553
+ {
554
+ int sz = MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD);
555
+ strncpy(newword, w+strlen(prefix[j]), sz);
556
+ newword[sz] = '\0';
557
+ if (boolean_reg_dict_lookup(sent->dict, newword))
558
+ {
559
+ if ((verbosity>1) && (i < s_strippable))
560
+ printf("Splitting word into three: %s-%s-%s\n",
561
+ prefix[j], newword, suffix[i]);
562
+ if (!issue_sentence_word(sent, prefix[j])) return FALSE;
563
+ if (i < s_strippable) s_stripped = i;
564
+ wend -= len;
565
+ w += strlen(prefix[j]);
566
+ sz = MIN(wend-w, MAX_WORD);
567
+ strncpy(word, w, sz);
568
+ word[sz] = '\0';
569
+ word_is_in_dict = TRUE;
570
+ break;
571
+ }
572
+ }
573
+ }
574
+ }
575
+ if (j != p_strippable) break;
576
+ }
577
+ }
578
+ }
579
+
580
+ /* word is now what remains after all the stripping has been done */
581
+ issued = FALSE;
582
+
583
+ /* If n_r_stripped exceed max, the "word" is most likely a long
584
+ * sequence of periods. Just accept it as an unknown "word",
585
+ * and move on.
586
+ */
587
+ if (n_r_stripped >= MAX_STRIP)
588
+ {
589
+ n_r_stripped = 0;
590
+ word_is_in_dict = TRUE;
591
+ }
592
+
593
+ if (quote_found == TRUE) sent->post_quote[sent->length] = 1;
594
+
595
+ #if defined HAVE_HUNSPELL || defined HAVE_ASPELL
596
+ /* If the word is still not being found, then it might be
597
+ * a run-on of two words. Ask the spell-checker to split
598
+ * the word in two, if possible. Do this only if the word
599
+ * is not a proper name, and if spell-checking is enabled.
600
+ */
601
+ if ((FALSE == word_is_in_dict) &&
602
+ TRUE == opts->use_spell_guess &&
603
+ sent->dict->spell_checker &&
604
+ (FALSE == is_proper_name(word)))
605
+ {
606
+ char **alternates = NULL;
607
+ char *sp = NULL;
608
+ char *wp;
609
+ int j, n;
610
+ n = spellcheck_suggest(sent->dict->spell_checker, &alternates, word);
611
+ for (j=0; j<n; j++)
612
+ {
613
+ /* Uhh, XXX this is not utf8 safe! */
614
+ sp = strchr(alternates[j], ' ');
615
+ if (sp) break;
616
+ }
617
+
618
+ if (sp) issued = TRUE;
619
+
620
+ wp = alternates[j];
621
+ while (sp)
622
+ {
623
+ *sp = 0x0;
624
+ if (!issue_sentence_word(sent, wp)) return FALSE;
625
+ wp = sp+1;
626
+ sp = strchr(wp, ' ');
627
+ if (NULL == sp)
628
+ {
629
+ if (!issue_sentence_word(sent, wp)) return FALSE;
630
+ }
631
+ }
632
+ if (alternates) spellcheck_free_suggest(alternates, n);
633
+ }
634
+ #endif /* HAVE_HUNSPELL */
635
+
636
+ if (FALSE == issued)
637
+ {
638
+ if (!issue_sentence_word(sent, word)) return FALSE;
639
+ }
640
+
641
+ if (s_stripped != -1)
642
+ {
643
+ if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE;
644
+ }
645
+
646
+ for (i = n_r_stripped-1; i>=0; i--)
647
+ {
648
+ if (!issue_sentence_word(sent, r_stripped[i])) return FALSE;
649
+ }
650
+
651
+ return TRUE;
652
+ }
653
+
654
/**
 * The string s has just been read in from standard input.
 * This function breaks it up into words and stores these words in
 * the sent->word[] array. Returns TRUE if all is well, FALSE otherwise.
 * Quote marks are treated just like blanks.
 *
 * Iteration alternates between skipping whitespace/quotes and scanning
 * one whitespace-free token [s, t), which is handed to separate_word()
 * for affix stripping and word issuance. Decoding is done with
 * mbrtowc() so the scan is multibyte (UTF-8) aware.
 */
int separate_sentence(Sentence sent, Parse_Options opts)
{
	const char *t;
	int is_first, quote_found;
	Dictionary dict = sent->dict;
	mbstate_t mbs;
	const char * s = sent->orig_sentence;

	memset(sent->post_quote, 0, MAX_SENTENCE*sizeof(int));
	sent->length = 0;

	/* The left wall, when defined, is always word zero. */
	if (dict->left_wall_defined)
		if (!issue_sentence_word(sent, LEFT_WALL_WORD)) return FALSE;

	/* Reset the multibyte shift state to the initial state */
	memset(&mbs, 0, sizeof(mbs));

	is_first = TRUE;
	for(;;)
	{
		int isq;
		wchar_t c;
		/* nb is the byte length of the decoded character;
		 * negative means an invalid multibyte sequence. */
		int nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
		quote_found = FALSE;

		if (0 > nb) goto failure;

		/* Skip all whitespace. Also, ignore *all* quotation marks.
		 * XXX This is sort-of a hack, but that is because LG does
		 * not have any intelligent support for quoted character
		 * strings at this time.
		 */
		isq = is_quote (c);
		if (isq) quote_found = TRUE;
		while (iswspace(c) || isq)
		{
			s += nb;
			nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
			if (0 == nb) break;
			if (0 > nb) goto failure;
			isq = is_quote (c);
			if (isq) quote_found = TRUE;
		}

		/* End of input reached while skipping separators. */
		if (*s == '\0') break;

		/* Scan forward to the end of the current token. */
		t = s;
		nb = mbrtowc(&c, t, MB_CUR_MAX, &mbs);
		if (0 > nb) goto failure;
		while (!iswspace(c) && !is_quote(c) && (c != 0) && (nb != 0))
		{
			t += nb;
			nb = mbrtowc(&c, t, MB_CUR_MAX, &mbs);
			if (0 > nb) goto failure;
		}

		/* quote_found tags the *following* word as post-quote. */
		if (!separate_word(sent, opts, s, t, is_first, quote_found)) return FALSE;
		is_first = FALSE;
		s = t;
		if (*s == '\0') break;
	}

	if (dict->right_wall_defined)
		if (!issue_sentence_word(sent, RIGHT_WALL_WORD)) return FALSE;

	/* TRUE only if at least one real (non-wall) word was issued. */
	return (sent->length > dict->left_wall_defined + dict->right_wall_defined);

failure:
	prt_error("Unable to process UTF8 input string in current locale %s\n",
		nl_langinfo(CODESET));
	return FALSE;
}
732
+
733
+ /**
734
+ * Build the word expressions, and add a tag to the word to indicate
735
+ * that it was guessed by means of regular-expression matching.
736
+ * Also, add a subscript to the resulting word to indicate the
737
+ * rule origin.
738
+ */
739
+ static void tag_regex_string(Sentence sent, int i, const char * type)
740
+ {
741
+ char str[MAX_WORD+1];
742
+ char * t;
743
+ X_node * e;
744
+ sent->word[i].x = build_word_expressions(sent->dict, type);
745
+ for (e = sent->word[i].x; e != NULL; e = e->next)
746
+ {
747
+ t = strchr(e->string, '.');
748
+ e->string = sent->word[i].string;
749
+ if (NULL != t)
750
+ {
751
+ snprintf(str, MAX_WORD, "%.50s[!].%.5s", e->string, t+1);
752
+ }
753
+ else
754
+ {
755
+ snprintf(str, MAX_WORD, "%.50s", e->string);
756
+ }
757
+ e->string = string_set_add(str, sent->string_set);
758
+ }
759
+ }
760
+
761
+ /**
762
+ * Puts into word[i].x the expression for the unknown word
763
+ * the parameter s is the word that was not in the dictionary
764
+ * it massages the names to have the corresponding subscripts
765
+ * to those of the unknown words
766
+ * so "grok" becomes "grok[?].v"
767
+ */
768
+ static void handle_unknown_word(Sentence sent, int i, char * s)
769
+ {
770
+ char *t;
771
+ X_node *d;
772
+ char str[MAX_WORD+1];
773
+
774
+ sent->word[i].x = build_word_expressions(sent->dict, UNKNOWN_WORD);
775
+ if (sent->word[i].x == NULL)
776
+ assert(FALSE, "UNKNOWN_WORD should have been there");
777
+
778
+ for (d = sent->word[i].x; d != NULL; d = d->next)
779
+ {
780
+ t = strchr(d->string, '.');
781
+ if (t != NULL)
782
+ {
783
+ snprintf(str, MAX_WORD, "%.50s[?].%.5s", s, t+1);
784
+ }
785
+ else
786
+ {
787
+ snprintf(str, MAX_WORD, "%.50s[?]", s);
788
+ }
789
+ d->string = string_set_add(str, sent->string_set);
790
+ }
791
+ }
792
+
793
+ /**
794
+ * If a word appears to be mis-spelled, then add alternate
795
+ * spellings. Maybe one of those will do ...
796
+ */
797
+ static void guess_misspelled_word(Sentence sent, int i, char * s)
798
+ {
799
+ int spelling_ok;
800
+ char str[MAX_WORD+1];
801
+ Dictionary dict = sent->dict;
802
+ X_node *d, *head = NULL;
803
+ int j, n;
804
+ char **alternates = NULL;
805
+
806
+ /* Spell-guessing is disabled if no spell-checker is speficified */
807
+ if (NULL == dict->spell_checker)
808
+ {
809
+ handle_unknown_word(sent, i, s);
810
+ return;
811
+ }
812
+
813
+ /* If the spell-checker knows about this word, and we don't ...
814
+ * Dang. We should fix it someday. Accept it as such. */
815
+ spelling_ok = spellcheck_test(dict->spell_checker, s);
816
+ if (spelling_ok)
817
+ {
818
+ handle_unknown_word(sent, i, s);
819
+ return;
820
+ }
821
+
822
+ /* Else, ask the spell-checker for alternate spellings
823
+ * and see if these are in the dict. */
824
+ n = spellcheck_suggest(dict->spell_checker, &alternates, s);
825
+ for (j=0; j<n; j++)
826
+ {
827
+ if (boolean_reg_dict_lookup(sent->dict, alternates[j]))
828
+ {
829
+ X_node *x = build_word_expressions(sent->dict, alternates[j]);
830
+ head = catenate_X_nodes(x, head);
831
+ }
832
+ }
833
+ sent->word[i].x = head;
834
+ if (alternates) spellcheck_free_suggest(alternates, n);
835
+
836
+ /* Add a [~] to the output to signify that its the result of
837
+ * guessing. */
838
+ for (d = sent->word[i].x; d != NULL; d = d->next)
839
+ {
840
+ const char * t = strchr(d->string, '.');
841
+ if (t != NULL)
842
+ {
843
+ size_t off = t - d->string;
844
+ strncpy(str, d->string, off);
845
+ str[off] = 0;
846
+ strcat(str, "[~]");
847
+ strcat(str, t);
848
+ }
849
+ else
850
+ {
851
+ snprintf(str, MAX_WORD, "%.50s[~]", s);
852
+ }
853
+ d->string = string_set_add(str, sent->string_set);
854
+ }
855
+
856
+ /* If nothing found at all... */
857
+ if (NULL == head)
858
+ {
859
+ handle_unknown_word(sent, i, s);
860
+ }
861
+ }
862
+
863
/**
 * Corrects case of first word, fills in other proper nouns, and
 * builds the expression lists for the resulting words.
 *
 * Algorithm:
 * Apply the following step to all words w:
 * If w is in the dictionary, use it.
 * Else if w is identified by regex matching, use the
 * appropriately matched disjunct collection.
 *
 * Now, we correct the first word, w.
 * If w is upper case, let w' be the lower case version of w.
 * If both w and w' are in the dict, concatenate these disjncts.
 * Else if just w' is in dict, use disjuncts of w', together with
 * the CAPITALIZED-WORDS rule.
 * Else leave the disjuncts alone.
 *
 * Returns TRUE (asserts on a word that cannot be resolved at all).
 */
int build_sentence_expressions(Sentence sent, Parse_Options opts)
{
	int i, first_word; /* the index of the first word after the wall */
	char *s, temp_word[MAX_WORD+1];
	const char * regex_name;
	X_node * e;
	Dictionary dict = sent->dict;

	if (dict->left_wall_defined) {
		first_word = 1;
	} else {
		first_word = 0;
	}

	/* The following loop treats all words the same
	 * (nothing special for 1st word) */
	for (i=0; i<sent->length; i++)
	{
		s = sent->word[i].string;
		if (boolean_dictionary_lookup(sent->dict, s))
		{
			/* Ordinary dictionary word. */
			sent->word[i].x = build_word_expressions(sent->dict, s);
		}
		else if ((NULL != (regex_name = match_regex(sent->dict, s))) &&
			boolean_dictionary_lookup(sent->dict, regex_name))
		{
			/* Word matched a regex class that itself has a dict entry. */
			tag_regex_string(sent, i, regex_name);
		}
		else if (dict->unknown_word_defined && dict->use_unknown_word)
		{
			if (opts->use_spell_guess)
			{
				guess_misspelled_word(sent, i, s);
			}
			else
			{
				handle_unknown_word(sent, i, s);
			}
		}
		else
		{
			/* The reason I can assert this is that the word
			 * should have been looked up already if we get here.
			 */
			assert(FALSE, "I should have found that word.");
		}
	}

	/* Under certain cases--if it's the first word of the sentence,
	 * or if it follows a colon or a quotation mark--a word that's
	 * capitalized has to be looked up as an uncapitalized word
	 * (as well as a capitalized word).
	 * XXX This rule is English-language-oriented, and should be
	 * abstracted.
	 */
	for (i=0; i<sent->length; i++)
	{
		/* Only the first word, post-colon words and post-quote
		 * words are candidates for down-casing. */
		if (! (i == first_word ||
			(i > 0 && strcmp(":", sent->word[i-1].string)==0) ||
			sent->post_quote[i] == 1)) continue;
		s = sent->word[i].string;

		/* If the lower-case version of this word is in the dictionary,
		 * then add the disjuncts for the lower-case version. The upper
		 * case version disjuncts had previously come from matching the
		 * CAPITALIZED-WORDS regex.
		 *
		 * Err .. add the lower-case version only if the lower-case word
		 * is a common noun or adjective; otherwise, *replace* the
		 * upper-case word with the lower-case one. This allows common
		 * nouns and adjectives to be used for entity names: e.g.
		 * "Great Southern Union declares bankruptcy", allowing Great
		 * to be capitalized, while preventing an upper-case "She" being
		 * used as a proper name in "She declared bankruptcy".
		 *
		 * Arghh. This is still messed up. The capitalized-regex runs
		 * too early, I think. We need to *add* Sue.f (female name Sue)
		 * even though sue.v (the verb "to sue") is in the dict. So
		 * test for capitalized entity names. Glurg. Too much complexity
		 * here, it seems to me.
		 *
		 * This is actually a great example of a combo of an algorithm
		 * together with a list of words used to determine grammatical
		 * function.
		 */
		if (is_utf8_upper(s))
		{
			const char * lc;
			downcase_utf8_str(temp_word, s, MAX_WORD);
			lc = string_set_add(temp_word, sent->string_set);

			/* The lower-case dict lookup might trigger regex
			 * matches in the dictionary. We want to avoid these.
			 * e.g. "Cornwallis" triggers both PL-CAPITALIZED_WORDS
			 * and S-WORDS. Since its not an entity, the regex
			 * matches will erroneously discard the upper-case version.
			 */
			if (boolean_dictionary_lookup(sent->dict, lc))
			{
				if (is_entity(sent->dict,s) ||
				    is_common_entity(sent->dict,lc))
				{
					if (1 < verbosity)
					{
						printf ("Info: First word: %s entity=%d common=%d\n",
							s, is_entity(sent->dict,s),
							is_common_entity(sent->dict,lc));
					}
					/* Keep both: upper-case disjuncts plus the
					 * lower-case ones appended. */
					e = build_word_expressions(sent->dict, lc);
					sent->word[i].x =
						catenate_X_nodes(sent->word[i].x, e);
				}
				else
				{
					if (1 < verbosity)
					{
						printf("Info: First word: %s downcase only\n", lc);
					}
					/* Replace: overwrite the word string in place with
					 * the lower-case form and discard the old X_nodes. */
					safe_strcpy(s, lc, MAX_WORD);
					e = build_word_expressions(sent->dict, s);
					free_X_nodes(sent->word[i].x);
					sent->word[i].x = e;
				}
			}
		}
	}

	return TRUE;
}
1009
+
1010
+
1011
+ /**
1012
+ * This just looks up all the words in the sentence, and builds
1013
+ * up an appropriate error message in case some are not there.
1014
+ * It has no side effect on the sentence. Returns TRUE if all
1015
+ * went well.
1016
+ *
1017
+ * This code is called only is the 'unkown-words' flag is set.
1018
+ */
1019
+ int sentence_in_dictionary(Sentence sent)
1020
+ {
1021
+ int w, ok_so_far;
1022
+ char * s;
1023
+ Dictionary dict = sent->dict;
1024
+ char temp[1024];
1025
+
1026
+ ok_so_far = TRUE;
1027
+ for (w=0; w<sent->length; w++)
1028
+ {
1029
+ s = sent->word[w].string;
1030
+ if (!boolean_reg_dict_lookup(dict, s))
1031
+ {
1032
+ if (ok_so_far)
1033
+ {
1034
+ safe_strcpy(temp, "The following words are not in the dictionary:", sizeof(temp));
1035
+ ok_so_far = FALSE;
1036
+ }
1037
+ safe_strcat(temp, " \"", sizeof(temp));
1038
+ safe_strcat(temp, sent->word[w].string, sizeof(temp));
1039
+ safe_strcat(temp, "\"", sizeof(temp));
1040
+ }
1041
+ }
1042
+ if (!ok_so_far)
1043
+ {
1044
+ err_ctxt ec;
1045
+ ec.sent = sent;
1046
+ err_msg(&ec, Error, "Error: Sentence not in dictionary\n%s\n", temp);
1047
+ }
1048
+ return ok_so_far;
1049
+ }