grammar_cop 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,306 @@
1
+ ; Post-processing knowledge file
2
+ ; 6/96
3
+
4
+ ; ----------------------------------------------------------------------------
5
+ ; This file contains the knowledge related to post-processing, in the
6
+ ; form of lists and rules. This file is read by post-process.c at run-time.
7
+ ; Syntax of file:
8
+ ; line starting with ";" is a comment
9
+ ; commas are field delimiters
10
+ ; any token beginning with the character @ is expanded to the set
11
+ ; of symbols it defined. e.g. one could write
12
+ ; FOO: blah1 blah2 blah3
13
+ ; thus defining a set FOO containing three strings. Then one could later write
14
+ ; BAR: blah5 @FOO blah8
15
+ ; which defines a set BAR containing 5 strings.
16
+ ;
17
+ ; Capitalized tokens are *required*, though if you feel like providing an
18
+ ; empty list afterwards, that's your right.
19
+ ; ----------------------------------------------------------------------------
20
+
21
+
22
+ ; The following links start a domain. Each must be given a name in the
23
+ ; table below (STARTING_LINK_TYPE_TABLE)
24
+
25
+ DOMAIN_STARTER_LINKS:
26
+ W Ce Cs Ca Cc Ci R* Rn Re RSe Mr QI#d Mv* Jr Mj Qd
27
+ TOn TOi Mg* MVi Ss#d Bsd ER Z Ma#* SIs#g BIqx MX#p MX#a
28
+ MX#r MX#j MV#o MV#p Eq COq CCq AFd PFc
29
+
30
+
31
+
32
+ ; ----------------------------------------------------------------------
33
+ ; The following links start a urfl domain. They are also included in the
34
+ ; domain, as opposed to regular starter links (above), which are not. A
35
+ ; urfl domain includes links accessible from the root word, tracing to
36
+ ; the right (as well as everything accessible from the left end of the
37
+ ; starter link).
38
+
39
+ URFL_DOMAIN_STARTER_LINKS: TOo I#j Pa##j CP
40
+
41
+
42
+
43
+ ; ----------------------------------------------------------------------
44
+ ; The following start a urfl_only domain. These include _only_ links
45
+ ; reachable from the root word, tracing to the right. They aren't
46
+ ; included in the domain
47
+
48
+ URFL_ONLY_DOMAIN_STARTER_LINKS: SFsx Ss#g COp
49
+
50
+
51
+
52
+ ; ----------------------------------------------------------------------
53
+ ; Links which start a domain and are also part of the domain. This must be
54
+ ; a sublist of the domain_starter_list
55
+
56
+ DOMAIN_CONTAINS_LINKS:
57
+ Mg* Mx Bsd MX#a Ma#* Mv* MX#r Ss#d Ws Wq Qd Mj Wj
58
+ Wi MX#j AFd PFc Jr Wd Mr
59
+
60
+
61
+
62
+ ; ----------------------------------------------------------------------
63
+ ; These links are not put in the word/link graph. They also cannot be the
64
+ ; starter links for a domain. (These links may also only be used in cycles.)
65
+
66
+ IGNORE_THESE_LINKS: Xca
67
+
68
+
69
+
70
+ ; ----------------------------------------------------------------------
71
+ ; These links may only be used in cycles.
72
+
73
+ MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jr JQ Xca
74
+
75
+
76
+ ; ----------------------------------------------------------------------
77
+ ; These links are not traced further if they point back before the root word.
78
+ ; The creation of Rw necessitated making B#m a restricted link, to
79
+ ; prevent the (e) domain, started by Ce, from extending around through
80
+ ; the Rw link.
81
+ ; Reverted.
82
+ ; This breaks parsing of
83
+ ; How fast a program does he think it is
84
+ ; I wonder how fast a program he thinks it is
85
+ ; I wonder how much money you earned
86
+ ; I wonder how many people you saw
87
+ ; I wonder how big a department it is
88
+ ; I wonder how much oil they spilled
89
+ ; This is the man whose dog I bought
90
+ ; I wonder which dog he said you chased
91
+ ; How efficient a program is it
92
+ ; Meanwhile, I can't find the Ce problem mentioned ... this needs more
93
+ ; documentation!
94
+
95
+ RESTRICTED_LINKS:
96
+ B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh
97
+ H HA PFc B#j Wd PF Z
98
+
99
+ ; H HA PFc B#j Wd PF Z B#m
100
+
101
+
102
+ ; ----------------------------------------------------------------------
103
+ ; ---------------------- LINK TYPE TABLE-------------------------------
104
+ ; ----------------------------------------------------------------------
105
+ ; The following table associates a domain type with each possible
106
+ ; starting link. It contains pairs: the first of each pair is a link
107
+ ; type, and the second is the domain to which that link type belongs.
108
+
109
+ STARTING_LINK_TYPE_TABLE:
110
+ Ce e
111
+ R* r
112
+ Rn r
113
+ Re r
114
+ W m
115
+ RSe e
116
+ Cs s
117
+ Ca s
118
+ Jr e
119
+ Mr r
120
+ Cc s
121
+ Mv* e
122
+ QI#d s
123
+ BIqx s
124
+ TOn e
125
+ TOi e
126
+ MVi e
127
+ MV#o s
128
+ MV#p s
129
+ AFd s
130
+ PFc s
131
+ Mg* e
132
+ Mj j
133
+ Qd m
134
+ MX#j j
135
+ TOo x
136
+ I#j x
137
+ Pa##j x
138
+ CP x
139
+ COp d
140
+ SFsx d
141
+ Ss#g d
142
+ SIs#g s
143
+ Ss#d s
144
+ Bsd s
145
+ ER s
146
+ Z s
147
+ Ma#* e
148
+ MX#p e
149
+ Ci e
150
+ MX#a e
151
+ Eq e
152
+ COq e
153
+ CCq s
154
+ MX#r r
155
+
156
+
157
+ ; ----------------------------------------------------------------------
158
+ ; ----------------------- LINK SETS ------------------------------------
159
+ ; ----------------------------------------------------------------------
160
+ ; (Not in use at present; see comment at beginning of file)
161
+
162
+ ; ----------------------------------------------------------------------
163
+ ; ----------------- RULES ----------------------------------------------
164
+ ; ----------------------------------------------------------------------
165
+ ; Explanation of syntax: as usual, each stanza begins with a label
166
+ ; terminated by a colon. The interpretation of the rule depends on
167
+ ; the label, as specified in each stanza.
168
+
169
+ ; The following rule asserts that the linkage must *still* be connected
170
+ ; when the specified set(s) of links are removed from the linkage.
171
+
172
+ FORM_A_CYCLE_RULES:
173
+ @MUST_FORM_A_CYCLE_LINKS , "'must form a cycle' violation0"
174
+
175
+
176
+ ; For the following rules, if a domain contains a link matching the 1st
177
+ ; column, it must also contain a linkage matching one of the members of the
178
+ ; set in the 2nd column. The individual rules are demarcated by semicolons and
179
+ ; the fields within a rule are demarcated by commas.
180
+
181
+ CONTAINS_ONE_RULES:
182
+ SI#* , Wq Qd CQ PFc , "Bad use of s-v inversion1" ,
183
+ SI#x , Wq Qd CQ PFc , "Bad use of s-v inversion2" ,
184
+ SFI##* , Wq Qd CQ PFc , "Bad use of s-v inversion3",
185
+ SXI , Wq Qd CQ PFc , "Bad use of s-v inversion4" ,
186
+ Ws , D##w S##w H , "S-V inversion required5",
187
+ I#a , B#m B#w , "incorrect use of 'to'6" ,
188
+ Wq , SI SFI SXI , "S-V inversion required7" ,
189
+ Qd , SI SFI SXI , "S-V inversion required8" ,
190
+ PFc , SI SFI SXI , "S-V inversion required9" ,
191
+ Mj , Jw JQ , "Incorrect relative10" ,
192
+ MX#j , Jw JQ , "Incorrect relative11" ,
193
+ Wj , Jw JQ , "Misuse of preposition12" ,
194
+ JQ , Mj Wj MX#j , "Misuse of preposition13" ,
195
+ Jw , Mj Wj MX#j , "Misuse of preposition14" ,
196
+ B#j , Jr , "Incorrect relative15" ,
197
+ Jr , B#j , "Incorrect relative16" ,
198
+ EAh , AF Bsm B*m Qe Ca AFm
199
+ , "Incorrect use of 'how'17" ,
200
+ EEh , AF Bsm B*m Qe Ca AFm
201
+ , "Incorrect use of 'how'18" ,
202
+ Qe , EEh , "Incorrect use of adverb19" ,
203
+ THi , SFsi SFIsi OXi , "Complement requires 'it'20" ,
204
+ TSi , SFsi SFIsi OXi , "Complement requires 'it'21" ,
205
+ QIi , SFsi SFIsi OXi , "Complement requires 'it'22" ,
206
+ TOi , SFsi SFIsi OXi , "Complement requires 'it'23" ,
207
+ Ci , SFsi SFIsi OXi , "Complement requires 'it'24" ,
208
+ COqi , SFsi SFIsi OXi , "Complement requires 'it'25" ,
209
+ CPi , SFsi SFIsi OXi , "Complement requires 'it'26" ,
210
+ Eqi , SFsi SFIsi OXi , "Complement requires 'it'27" ,
211
+ LEi , SFsi SFIsi OXi , "Complement requires 'it'28" ,
212
+ MVti , SFsi SFIsi OXi , "Complement requires 'it'29" ,
213
+ AFdi , SFsi SFIsi OXi , "Complement requires 'it'30" ,
214
+ O#i , SFsi SFIsi OXi , "Complement requires 'it'31" ,
215
+ SFst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'32" ,
216
+ SFIst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'33" ,
217
+ SFp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34" ,
218
+ ;
219
+ ; This SFu rule forces subject-object agreement for uncountable noun objects
220
+ SFu , Out Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34a" ,
221
+ SFIp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'35" ,
222
+ OXt , O#t B##t , "Bad use of 'there'36" ,
223
+ SFsi* , TOi THi QIi TSi O#i Ci THb CPi
224
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'37" ,
225
+ SFIsi , TOi THi QIi TSi O#i Ci THb CPi
226
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'38" ,
227
+ OXi , TOi THi QIi TSi O#i Ci THb CPi
228
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'39" ,
229
+ THb , S##t SI##t SFsi SFIsi , "Bad use of predicate40" ,
230
+ BIh , Ss#b SIs#b SFsi SFIsi , "Bad use of predicate41" ,
231
+ BIq , S##q SI##q SFsi Ss#b SFIsi SIs#b
232
+ , "Bad use of predicate42" ,
233
+ MVt , Dm#m EAm EEm MVm Pam Pafm AFm EB#m MVb AJrc
234
+ Om Mam Am Jm Ds*m MX#m , "Bad comparative43" ,
235
+ MVz , D##y EAy EEy MVy EB#y , "Bad comparative44" ,
236
+ MV#a , Pam Pafm EAm Ds*m EAy AFm Mam Am
237
+ , "Bad comparative45" ,
238
+ MV#i , Pam Pafm EAm Ds*m EAy AFm Mam Am
239
+ , "Bad comparative46" ,
240
+ MV#o , D##m D##y Om Oy Jm Jy Am MX#m
241
+ , "Bad comparative47" ,
242
+ MV#p , EEm MVb Dm#m EEy D##y MVm Om Oy
243
+ Jm Jy Am MX#m
244
+ , "Bad comparative48" ,
245
+ Pafc , EB#m EB#y , "Bad comparative49" ,
246
+ Pafc , Pa* Paf* , "Bad comparative50" ,
247
+ MVat , MVm , "Bad comparative51" ,
248
+ MVpt , MVm , "Bad comparative52" ,
249
+ MVat , MVa MVp , "Bad comparative53" ,
250
+ MVpt , MVa MVp , "Bad comparative54" ,
251
+ U#t , D##m D##y Om Oy Jm Jy Am MX#m
252
+ , "Bad comparative55" ,
253
+ Cc , EEm EEy MVm MVb MVy
254
+ , "Bad comparative56" ,
255
+ Sp#c , Dmcm Dmcy Om Oy Jm Jy MX#m
256
+ , "Bad comparative57" ,
257
+ Ss#c , Dmum Dmuy Om Oy Jm Jy Ds*y MX#m
258
+ , "Bad comparative58" ,
259
+ S##c , Dm#m D##y Om Oy Jm Jy MX#m
260
+ , "Bad comparative59" ,
261
+ THc , TH , "Bad comparative60" ,
262
+ TOc , TO** TOf* TOi* , "Bad comparative61" ,
263
+ TOtc , TOt , "Bad comparative62" ,
264
+ Ma** , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
265
+ , "Bad use of adjective63" ,
266
+ Mam , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
267
+ , "Bad use of adjective64" ,
268
+ MX#a , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya MJ
269
+ , "Bad use of adjective65" ,
270
+
271
+ ; There's no ZZZ connector, which means that Ixd and Oxn
272
+ ; are prohibited from ever occurring. 4.0.batch covers this.
273
+ Ixd , ZZZ , "Can't use 'do' with that verb" ,
274
+ Oxn , ZZZ , "Bad use of pronoun66" ,
275
+ MVh , EExk EAxk D##k , "Incorrect use of that67" ,
276
+
277
+ ; The Rw link necessitated commenting out 68, because we had to make B#m
278
+ ; a restricted link(see above) xxx reverted .. this is needed ...
279
+ ;
280
+ B#m , D##w H HA , "Bad use of gerund68"
281
+
282
+ CONTAINS_NONE_RULES:
283
+ S , Spxi , "Bad n-v agreement69" ,
284
+ SI , SIpxi , "Bad n-v agreement70" ,
285
+ Ws , B#m Ca BT , "Question inversion violated71" ,
286
+ SF , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
287
+ , "Bad use of 'filler' subject72" ,
288
+ SFI , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
289
+ , "Bad use of 'filler' subject73" ,
290
+ OX , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
291
+ , "Bad use of 'filler' subject74" ,
292
+ MXsr , Sp#w , "Bad n-v agreement75" ,
293
+ MXpr , Ss#w S#iw , "Bad n-v agreement76" ,
294
+ Mr , B#* , "Bad use of 'whose'77"
295
+
296
+
297
+ ; ----------------------------------------------------------------------
298
+ ; The following rule asserts that all specified domains must have the
299
+ ; property that all of the words that touch a link in the domain are
300
+ ; not to the left of the root word of the domain. These rules are
301
+ ; different from the above in that the first field is a *domain name*,
302
+ ; rather than a set of links.
303
+
304
+ BOUNDED_RULES:
305
+ s , "Unbounded s domain78" ,
306
+ r , "Unbounded r domain79"
data/data/en/4.0.regex ADDED
@@ -0,0 +1,225 @@
1
+ %***************************************************************************%
2
+ % %
3
+ % Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin %
4
+ % See file "LICENSE" for information about commercial use of this system %
5
+ % %
6
+ %***************************************************************************%
7
+
8
+ % This file contains regular expressions that are used to match
9
+ % tokens not found in the dictionary. Each regex is given a name which
10
+ % determines the disjuncts assigned when the regex matches; this name
11
+ % must be defined in the dictionary along with the appropriate disjuncts.
12
+ % Note that the order of the regular expressions matters: matches will
13
+ % be attempted in the order in which the regexs appear in this file,
14
+ % and only the first match will be used.
15
+
16
+ % Numbers.
17
+ % XXX, we need to add utf8 U+00A0 "no-break space"
18
+ %
19
+ % Allows at most two colons in hour-minute-second HH:MM:SS expressions
20
+ % Allows at most two digits between colons
21
+ HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/
22
+
23
+ % e.g. 1950's leading number can be higher, for science fiction.
24
+ % Must be four digits, or possibly three. Must end in s, 's or ’s
25
+ DECADE-TIME: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/
26
+
27
+ % Day-of-month names; this regex will match before the one below.
28
+ DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/
29
+
30
+ % Ordinal numbers; everything except 1st through 13th
31
+ % is handled by regex.
32
+ ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/
33
+
34
+ % Allows any number of commas or periods
35
+ % Be careful not to match the period at the end of a sentence;
36
+ % for example: "It happened in 1942."
37
+ NUMBERS: /^[0-9,.]*[0-9]$/
38
+ % This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
39
+ NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
40
+ % Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
41
+ FRACTION: /^[0-9]+\/[0-9]+$/
42
+ % "10(3)" exponent (used in PubMed)
43
+ NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/
44
+
45
+ % Roman numerals
46
+ % The first expr has the potential(?) problem that it matches an empty
47
+ % string. Thus, the next three rules specify that at least one section
48
+ % is non-empty.
49
+ ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
50
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD){1}(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
51
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL){1}(IX|V?I{0,3}|IV)$/
52
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV){1}$/
53
+
54
+ % Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
55
+ INITIALS: /^([A-Z]\.)+$/
56
+
57
+ % Greek letters with numbers
58
+ GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
59
+ PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/
60
+
61
+ % Some "safe" derived units. Simple units are in dictionary.
62
+ % The idea here is for the regex to match something that is almost
63
+ % certainly part of a derived unit, and allow the rest to be
64
+ % anything; this way we can capture difficult derived units such
65
+ % as "mg/kg/day" and even oddities such as "micrograms/mouse/day"
66
+ % without listing them explicitly.
67
+ % TODO: add more.
68
+ % Some (real) misses from these:
69
+ % micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr
70
+ % m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?)
71
+ % cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year
72
+ % microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse
73
+ % mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group
74
+ % h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1
75
+ % sec-1 ms-1 ml.min.-1kg-1 ml.hr-1
76
+ % also, both kilometer and kilometers seem to be absent(!)
77
+ % remember "mm"!
78
+
79
+ UNITS: /^([npmk]|nano|pico|milli|micro|kilo)?(g|grams?)\// % grams/anything
80
+ UNITS: /^([fnmp]|femto|nano|micro|pico|mu)?mol(es)?\// % mol/anything
81
+ UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|kg|mol|min|day|h)$/ % common endings
82
+ % common endings, except in the style "mg.kg-1" instead of "mg/kg".
83
+ UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|kg|mol|min|day|h)(-1|\(-1\))$/
84
+
85
+ % combinations of numbers and units, e.g. "50-kDa", "1-2h"
86
+ % TODO: Clean up and check that these are up-to-date wrt the
87
+ % dictionary-recognized units; this is quite a mess currently.
88
+ % TODO: Extend the "number" part of the regex to allow anything
89
+ % that the NUMBER regex matches.
90
+ % One problem here is a failure to split up the expression ...
91
+ % e.g. "2hr" becomes 2 - ND - hr with the ND link. But 2-hr is treated
92
+ % as a single word ('It is a 2-hr wait')
93
+ % NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/
94
+ % Comment out above, it screws up handling of unit suffixes, for
95
+ % example: "Zangbert stock fell 30% to $2.50 yesterday."
96
+
97
+
98
+ % fold-words. Matches NUMBER-fold, where NUMBER can be either numeric
99
+ % or a spelled-out number, and the hyphen is optional. Note that for
100
+ % spelled-out numbers, anything is allowed between the "initial" number
101
+ % and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent
102
+ % as the prefix "four" is sufficient to match).
103
+ FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/
104
+ FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/
105
+
106
+ % Plural proper nouns.
107
+ % Make sure that apostrophe-s is split out correctly.
108
+ PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/
109
+
110
+ % Other proper nouns.
111
+ % We demand that these end with an alphanumeric, i.e. explicitly
112
+ % reject punctuation. We don't want this regex to "swallow" any trailing
113
+ % commas, colons, or periods/question-marks at the end of sentences.
114
+ % In addition, this must not swallow words ending in 's 'll etc.
115
+ % (... any affix, for that matter ...) and so no embedded apostrophe
116
+ CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/
117
+
118
+ % SUFFIX GUESSING
119
+ % For all suffix-guessing patterns, we insist that the pattern start
120
+ % with an alphanumeric. This is needed to guarantee that the
121
+ % prefix-stripping code works correctly, as otherwise, the regex will
122
+ % gobble the prefix. So for example: "We left (carrying the dog) and
123
+ % Fred followed." Since "(carrying" is not in the dict, we need to be
124
+ % sure to not match the leading paren so that it will get stripped.
125
+ %
126
+ ING-WORDS: /^\w.+ing$/
127
+
128
+ % Plurals or verb-s. Make sure that apostrophe-s is split out correctly.
129
+ % e.g. "The subject's name is John Doe." should be
130
+ % +--Ds--+---YS--+--Ds-+
131
+ % | | | |
132
+ % the subject.n 's.p name.n
133
+ S-WORDS: /^\w.+[^iuoys'’]s$/
134
+
135
+ % Verbs ending -ed.
136
+ ED-WORDS: /^\w.+ed$/
137
+
138
+ % Adverbs ending -ly.
139
+ LY-WORDS: /^\w.+ly$/
140
+
141
+ % Nouns ending in -ism, -asm (chiliasm .. ) Usually mass nouns
142
+ % Stubbed out for now; I'm not convinced this improves accuracy.
143
+ % ISM-WORDS: /^\w.+asm$/
144
+ % ISM-WORDS: /^\w.+ism$/
145
+
146
+ % Corresponding count noun version of above (chiliast...)
147
+ % AST-WORDS: /^\w.+ast$/
148
+ % AST-WORDS: /^\w.+ist$/
149
+
150
+ % Corresponding adjectival form of above
151
+ ADJ-WORDS: /^\w.+astic$/
152
+ ADJ-WORDS: /^\w.+istic$/
153
+
154
+ % Nouns ending -ation stubbed out in BioLG, stub out here ...
155
+ %ATION-WORDS: /^\w.+ation$/
156
+
157
+ % Extension by LIPN 11/10/2005
158
+ % nouns -- typically seen in (bio-)chemistry texts
159
+ % synthetase, kinase
160
+ % 5-(hydroxymethyl)-2’-deoxyuridine
161
+ % hydroxyethyl, hydroxymethyl
162
+ % septation, reguion
163
+ % isomaltotetraose, isomaltotriose
164
+ % glycosylphosphatidylinositol
165
+ % iodide, oligodeoxynucleotide
166
+ % chronicity, hypochromicity
167
+ MC-NOUN-WORDS: /^\w.+ase$/
168
+ MC-NOUN-WORDS: /^\w.+ine?$/
169
+ MC-NOUN-WORDS: /^\w.+yl$/
170
+ MC-NOUN-WORDS: /^\w.+ion$/
171
+ MC-NOUN-WORDS: /^\w.+ose$/
172
+ MC-NOUN-WORDS: /^\w.+ol$/
173
+ MC-NOUN-WORDS: /^\w.+ide$/
174
+ MC-NOUN-WORDS: /^\w.+ity$/
175
+
176
+ % replicon, intron
177
+ C-NOUN-WORDS: /^\w.+o[rn]$/
178
+
179
+ % adjectives
180
+ % exogenous, heterologous
181
+ % intermolecular, intramolecular
182
+ % glycolytic, ribonucleic, uronic
183
+ % ribosomal, ribsosomal
184
+ % nonpermissive, thermosensitive
185
+ % inducible, metastable
186
+ ADJ-WORDS: /^\w.+ous$/
187
+ ADJ-WORDS: /^\w.+ar$/
188
+ ADJ-WORDS: /^\w.+ic$/
189
+ ADJ-WORDS: /^\w.+al$/
190
+ ADJ-WORDS: /^\w.+ive$/
191
+ ADJ-WORDS: /^\w.+ble$/
192
+
193
+ % latin (postposed) adjectives
194
+ % influenzae, tarentolae
195
+ % pentosaceus, luteus, carnosus
196
+ LATIN-ADJ-WORDS: /^\w.+ae$/
197
+ LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file
198
+
199
+ % latin (postposed) adjectives or latin plural noun
200
+ % brevis, israelensis
201
+ % japonicum, tabacum, xylinum
202
+ LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/
203
+ LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/
204
+
205
+
206
+ % Hyphenated words. In the original LG morpho-guessing system that
207
+ % predated the regex-based system, hyphenated words were detected
208
+ % before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
209
+ % treated as a HYPHENATED-WORD (a generic adjective/noun), and
210
+ % never a verb. To return to this ordering, move this regex just
211
+ % after the CAPITALIZED-WORDS regex.
212
+ HYPHENATED-WORDS: /^[[:alpha:][:digit:],.][[:alpha:][:digit:],.-]*-[[:alpha:][:digit:],.-]*[[:alpha:][:digit:],.]$/
213
+
214
+ % proteins often end "ase", so we'll assume those things are names.
215
+ % removed, too many false positives.
216
+ % NAME: /ase$/
217
+
218
+ % Sequence of punctuation marks. If some mark appears in the affix table
219
+ % such as a period, comma, dash or underscore, and there's a sequence of
220
+ % these, then treat it as a "fill-in-the-blank" placeholder.
221
+ % This matters only for punc. appearing in the affix table, since the
222
+ % tokenizer explicitly mangles based on these punctuation marks.
223
+ %
224
+ % Look for at least four in a row.
225
+ UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/