grammar_cop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (344) hide show
  1. data/.DS_Store +0 -0
  2. data/.gitignore +4 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +8 -0
  5. data/data/.DS_Store +0 -0
  6. data/data/Makefile +511 -0
  7. data/data/Makefile.am +4 -0
  8. data/data/Makefile.in +511 -0
  9. data/data/de/.DS_Store +0 -0
  10. data/data/de/4.0.affix +7 -0
  11. data/data/de/4.0.dict +474 -0
  12. data/data/de/Makefile +387 -0
  13. data/data/de/Makefile.am +9 -0
  14. data/data/de/Makefile.in +387 -0
  15. data/data/en/.DS_Store +0 -0
  16. data/data/en/4.0.affix +26 -0
  17. data/data/en/4.0.batch +1002 -0
  18. data/data/en/4.0.biolg.batch +411 -0
  19. data/data/en/4.0.constituent-knowledge +127 -0
  20. data/data/en/4.0.dict +8759 -0
  21. data/data/en/4.0.dict.m4 +6928 -0
  22. data/data/en/4.0.enwiki.batch +14 -0
  23. data/data/en/4.0.fixes.batch +2776 -0
  24. data/data/en/4.0.knowledge +306 -0
  25. data/data/en/4.0.regex +225 -0
  26. data/data/en/4.0.voa.batch +114 -0
  27. data/data/en/Makefile +554 -0
  28. data/data/en/Makefile.am +19 -0
  29. data/data/en/Makefile.in +554 -0
  30. data/data/en/README +173 -0
  31. data/data/en/tiny.dict +157 -0
  32. data/data/en/words/.DS_Store +0 -0
  33. data/data/en/words/Makefile +456 -0
  34. data/data/en/words/Makefile.am +78 -0
  35. data/data/en/words/Makefile.in +456 -0
  36. data/data/en/words/currency +205 -0
  37. data/data/en/words/currency.p +28 -0
  38. data/data/en/words/entities.given-bisex.sing +39 -0
  39. data/data/en/words/entities.given-female.sing +4141 -0
  40. data/data/en/words/entities.given-male.sing +1633 -0
  41. data/data/en/words/entities.locations.sing +68 -0
  42. data/data/en/words/entities.national.sing +253 -0
  43. data/data/en/words/entities.organizations.sing +7 -0
  44. data/data/en/words/entities.us-states.sing +11 -0
  45. data/data/en/words/units.1 +45 -0
  46. data/data/en/words/units.1.dot +4 -0
  47. data/data/en/words/units.3 +2 -0
  48. data/data/en/words/units.4 +5 -0
  49. data/data/en/words/units.4.dot +1 -0
  50. data/data/en/words/words-medical.adv.1 +1191 -0
  51. data/data/en/words/words-medical.prep.1 +67 -0
  52. data/data/en/words/words-medical.v.4.1 +2835 -0
  53. data/data/en/words/words-medical.v.4.2 +2848 -0
  54. data/data/en/words/words-medical.v.4.3 +3011 -0
  55. data/data/en/words/words-medical.v.4.4 +3036 -0
  56. data/data/en/words/words-medical.v.4.5 +3050 -0
  57. data/data/en/words/words.adj.1 +6794 -0
  58. data/data/en/words/words.adj.2 +638 -0
  59. data/data/en/words/words.adj.3 +667 -0
  60. data/data/en/words/words.adv.1 +1573 -0
  61. data/data/en/words/words.adv.2 +67 -0
  62. data/data/en/words/words.adv.3 +157 -0
  63. data/data/en/words/words.adv.4 +80 -0
  64. data/data/en/words/words.n.1 +11464 -0
  65. data/data/en/words/words.n.1.wiki +264 -0
  66. data/data/en/words/words.n.2.s +2017 -0
  67. data/data/en/words/words.n.2.s.biolg +1 -0
  68. data/data/en/words/words.n.2.s.wiki +298 -0
  69. data/data/en/words/words.n.2.x +65 -0
  70. data/data/en/words/words.n.2.x.wiki +10 -0
  71. data/data/en/words/words.n.3 +5717 -0
  72. data/data/en/words/words.n.t +23 -0
  73. data/data/en/words/words.v.1.1 +1038 -0
  74. data/data/en/words/words.v.1.2 +1043 -0
  75. data/data/en/words/words.v.1.3 +1052 -0
  76. data/data/en/words/words.v.1.4 +1023 -0
  77. data/data/en/words/words.v.1.p +17 -0
  78. data/data/en/words/words.v.10.1 +14 -0
  79. data/data/en/words/words.v.10.2 +15 -0
  80. data/data/en/words/words.v.10.3 +88 -0
  81. data/data/en/words/words.v.10.4 +17 -0
  82. data/data/en/words/words.v.2.1 +1253 -0
  83. data/data/en/words/words.v.2.2 +1304 -0
  84. data/data/en/words/words.v.2.3 +1280 -0
  85. data/data/en/words/words.v.2.4 +1285 -0
  86. data/data/en/words/words.v.2.5 +1287 -0
  87. data/data/en/words/words.v.4.1 +2472 -0
  88. data/data/en/words/words.v.4.2 +2487 -0
  89. data/data/en/words/words.v.4.3 +2441 -0
  90. data/data/en/words/words.v.4.4 +2478 -0
  91. data/data/en/words/words.v.4.5 +2483 -0
  92. data/data/en/words/words.v.5.1 +98 -0
  93. data/data/en/words/words.v.5.2 +98 -0
  94. data/data/en/words/words.v.5.3 +103 -0
  95. data/data/en/words/words.v.5.4 +102 -0
  96. data/data/en/words/words.v.6.1 +388 -0
  97. data/data/en/words/words.v.6.2 +401 -0
  98. data/data/en/words/words.v.6.3 +397 -0
  99. data/data/en/words/words.v.6.4 +405 -0
  100. data/data/en/words/words.v.6.5 +401 -0
  101. data/data/en/words/words.v.8.1 +117 -0
  102. data/data/en/words/words.v.8.2 +118 -0
  103. data/data/en/words/words.v.8.3 +118 -0
  104. data/data/en/words/words.v.8.4 +119 -0
  105. data/data/en/words/words.v.8.5 +119 -0
  106. data/data/en/words/words.y +104 -0
  107. data/data/lt/.DS_Store +0 -0
  108. data/data/lt/4.0.affix +6 -0
  109. data/data/lt/4.0.constituent-knowledge +24 -0
  110. data/data/lt/4.0.dict +135 -0
  111. data/data/lt/4.0.knowledge +38 -0
  112. data/data/lt/Makefile +389 -0
  113. data/data/lt/Makefile.am +11 -0
  114. data/data/lt/Makefile.in +389 -0
  115. data/ext/.DS_Store +0 -0
  116. data/ext/link_grammar/.DS_Store +0 -0
  117. data/ext/link_grammar/extconf.rb +2 -0
  118. data/ext/link_grammar/link-grammar/.DS_Store +0 -0
  119. data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
  120. data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
  121. data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
  122. data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
  123. data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
  124. data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
  125. data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
  126. data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
  127. data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
  128. data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
  129. data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
  130. data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
  131. data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
  132. data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
  133. data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
  134. data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
  135. data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
  136. data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
  137. data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
  138. data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
  139. data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
  140. data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
  141. data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
  142. data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
  143. data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
  144. data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
  145. data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
  146. data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
  147. data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
  148. data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
  149. data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
  150. data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
  151. data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
  152. data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
  153. data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
  154. data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
  155. data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
  156. data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
  157. data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
  158. data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
  159. data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
  160. data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
  161. data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
  162. data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
  163. data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
  164. data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
  165. data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
  166. data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
  167. data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
  168. data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
  169. data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
  170. data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
  171. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
  172. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
  173. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
  174. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
  175. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
  176. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
  177. data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
  178. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
  179. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
  180. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
  181. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
  182. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
  183. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
  184. data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
  185. data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
  186. data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
  187. data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
  188. data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
  189. data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
  190. data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
  191. data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
  192. data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
  193. data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
  194. data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
  195. data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
  196. data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
  197. data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
  198. data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
  199. data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
  200. data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
  201. data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
  202. data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
  203. data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
  204. data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
  205. data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
  206. data/ext/link_grammar/link-grammar/Makefile +900 -0
  207. data/ext/link_grammar/link-grammar/Makefile.am +202 -0
  208. data/ext/link_grammar/link-grammar/Makefile.in +900 -0
  209. data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
  210. data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
  211. data/ext/link_grammar/link-grammar/and.c +1603 -0
  212. data/ext/link_grammar/link-grammar/and.h +27 -0
  213. data/ext/link_grammar/link-grammar/api-structures.h +362 -0
  214. data/ext/link_grammar/link-grammar/api-types.h +72 -0
  215. data/ext/link_grammar/link-grammar/api.c +1887 -0
  216. data/ext/link_grammar/link-grammar/api.h +96 -0
  217. data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
  218. data/ext/link_grammar/link-grammar/autoit/README +10 -0
  219. data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
  220. data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
  221. data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
  222. data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
  223. data/ext/link_grammar/link-grammar/command-line.c +458 -0
  224. data/ext/link_grammar/link-grammar/command-line.h +15 -0
  225. data/ext/link_grammar/link-grammar/constituents.c +1836 -0
  226. data/ext/link_grammar/link-grammar/constituents.h +26 -0
  227. data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
  228. data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
  229. data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
  230. data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
  231. data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
  232. data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
  233. data/ext/link_grammar/link-grammar/corpus/README +17 -0
  234. data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
  235. data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
  236. data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
  237. data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
  238. data/ext/link_grammar/link-grammar/count.c +828 -0
  239. data/ext/link_grammar/link-grammar/count.h +25 -0
  240. data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
  241. data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
  242. data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
  243. data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
  244. data/ext/link_grammar/link-grammar/error.c +92 -0
  245. data/ext/link_grammar/link-grammar/error.h +35 -0
  246. data/ext/link_grammar/link-grammar/expand.c +67 -0
  247. data/ext/link_grammar/link-grammar/expand.h +13 -0
  248. data/ext/link_grammar/link-grammar/externs.h +22 -0
  249. data/ext/link_grammar/link-grammar/extract-links.c +625 -0
  250. data/ext/link_grammar/link-grammar/extract-links.h +16 -0
  251. data/ext/link_grammar/link-grammar/fast-match.c +309 -0
  252. data/ext/link_grammar/link-grammar/fast-match.h +17 -0
  253. data/ext/link_grammar/link-grammar/idiom.c +373 -0
  254. data/ext/link_grammar/link-grammar/idiom.h +15 -0
  255. data/ext/link_grammar/link-grammar/jni-client.c +779 -0
  256. data/ext/link_grammar/link-grammar/jni-client.h +236 -0
  257. data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
  258. data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
  259. data/ext/link_grammar/link-grammar/link-features.h +37 -0
  260. data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
  261. data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
  262. data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
  263. data/ext/link_grammar/link-grammar/link-includes.h +465 -0
  264. data/ext/link_grammar/link-grammar/link-parser.c +849 -0
  265. data/ext/link_grammar/link-grammar/massage.c +329 -0
  266. data/ext/link_grammar/link-grammar/massage.h +13 -0
  267. data/ext/link_grammar/link-grammar/post-process.c +1113 -0
  268. data/ext/link_grammar/link-grammar/post-process.h +45 -0
  269. data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
  270. data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
  271. data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
  272. data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
  273. data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
  274. data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
  275. data/ext/link_grammar/link-grammar/prefix.c +482 -0
  276. data/ext/link_grammar/link-grammar/prefix.h +139 -0
  277. data/ext/link_grammar/link-grammar/preparation.c +412 -0
  278. data/ext/link_grammar/link-grammar/preparation.h +20 -0
  279. data/ext/link_grammar/link-grammar/print-util.c +87 -0
  280. data/ext/link_grammar/link-grammar/print-util.h +32 -0
  281. data/ext/link_grammar/link-grammar/print.c +1085 -0
  282. data/ext/link_grammar/link-grammar/print.h +16 -0
  283. data/ext/link_grammar/link-grammar/prune.c +1864 -0
  284. data/ext/link_grammar/link-grammar/prune.h +17 -0
  285. data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
  286. data/ext/link_grammar/link-grammar/read-dict.h +29 -0
  287. data/ext/link_grammar/link-grammar/read-regex.c +161 -0
  288. data/ext/link_grammar/link-grammar/read-regex.h +12 -0
  289. data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
  290. data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
  291. data/ext/link_grammar/link-grammar/resources.c +180 -0
  292. data/ext/link_grammar/link-grammar/resources.h +23 -0
  293. data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
  294. data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
  295. data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
  296. data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
  297. data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
  298. data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
  299. data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
  300. data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
  301. data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
  302. data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
  303. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
  304. data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
  305. data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
  306. data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
  307. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
  308. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
  309. data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
  310. data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
  311. data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
  312. data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
  313. data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
  314. data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
  315. data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
  316. data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
  317. data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
  318. data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
  319. data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
  320. data/ext/link_grammar/link-grammar/string-set.c +169 -0
  321. data/ext/link_grammar/link-grammar/string-set.h +16 -0
  322. data/ext/link_grammar/link-grammar/structures.h +498 -0
  323. data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
  324. data/ext/link_grammar/link-grammar/tokenize.h +15 -0
  325. data/ext/link_grammar/link-grammar/utilities.c +847 -0
  326. data/ext/link_grammar/link-grammar/utilities.h +281 -0
  327. data/ext/link_grammar/link-grammar/word-file.c +124 -0
  328. data/ext/link_grammar/link-grammar/word-file.h +15 -0
  329. data/ext/link_grammar/link-grammar/word-utils.c +526 -0
  330. data/ext/link_grammar/link-grammar/word-utils.h +152 -0
  331. data/ext/link_grammar/link_grammar.c +202 -0
  332. data/ext/link_grammar/link_grammar.h +99 -0
  333. data/grammar_cop.gemspec +24 -0
  334. data/lib/.DS_Store +0 -0
  335. data/lib/grammar_cop.rb +9 -0
  336. data/lib/grammar_cop/.DS_Store +0 -0
  337. data/lib/grammar_cop/dictionary.rb +19 -0
  338. data/lib/grammar_cop/linkage.rb +30 -0
  339. data/lib/grammar_cop/parse_options.rb +32 -0
  340. data/lib/grammar_cop/sentence.rb +36 -0
  341. data/lib/grammar_cop/version.rb +3 -0
  342. data/test/.DS_Store +0 -0
  343. data/test/grammar_cop_test.rb +27 -0
  344. metadata +407 -0
@@ -0,0 +1,306 @@
1
+ ; Post-processing knowledge file
2
+ ; 6/96
3
+
4
+ ; ----------------------------------------------------------------------------
5
+ ; This file contains the knowledge related to post-processing, in the
6
+ ; form of lists and rules. This file is read by post-process.c at run-time.
7
+ ; Syntax of file:
8
+ ; line starting with ";" is a comment
9
+ ; commas are field delimiters
10
+ ; any token beginning with the character @ is expanded to the set
11
+ ; of symbols it defines; e.g. one could write
12
+ ; FOO: blah1 blah2 blah3
13
+ ; thus defining a set FOO containing three strings. Then one could later write
14
+ ; BAR: blah5 @FOO blah8
15
+ ; which defines a set BAR containing 5 strings.
16
+ ;
17
+ ; Capitalized tokens are *required*, though if you feel like providing an
18
+ ; empty list afterwards, that's your right.
19
+ ; ----------------------------------------------------------------------------
20
+
21
+
22
+ ; The following links start a domain. Each must be given a name in the
23
+ ; table below (STARTING_LINK_TYPE_TABLE)
24
+
25
+ DOMAIN_STARTER_LINKS:
26
+ W Ce Cs Ca Cc Ci R* Rn Re RSe Mr QI#d Mv* Jr Mj Qd
27
+ TOn TOi Mg* MVi Ss#d Bsd ER Z Ma#* SIs#g BIqx MX#p MX#a
28
+ MX#r MX#j MV#o MV#p Eq COq CCq AFd PFc
29
+
30
+
31
+
32
+ ; ----------------------------------------------------------------------
33
+ ; The following links start a urfl domain. They are also included in the
34
+ ; domain, as opposed to regular starter links (above), which are not. A
35
+ ; urfl domain includes links accessible from the root word, tracing to
36
+ ; the right (as well as everything accessible from the left end of the
37
+ ; starter link).
38
+
39
+ URFL_DOMAIN_STARTER_LINKS: TOo I#j Pa##j CP
40
+
41
+
42
+
43
+ ; ----------------------------------------------------------------------
44
+ ; The following start a urfl_only domain. These include _only_ links
45
+ ; reachable from the root word, tracing to the right. They aren't
46
+ ; included in the domain
47
+
48
+ URFL_ONLY_DOMAIN_STARTER_LINKS: SFsx Ss#g COp
49
+
50
+
51
+
52
+ ; ----------------------------------------------------------------------
53
+ ; Links which start a domain and are also part of the domain. This must be
54
+ ; a sublist of the domain_starter_list
55
+
56
+ DOMAIN_CONTAINS_LINKS:
57
+ Mg* Mx Bsd MX#a Ma#* Mv* MX#r Ss#d Ws Wq Qd Mj Wj
58
+ Wi MX#j AFd PFc Jr Wd Mr
59
+
60
+
61
+
62
+ ; ----------------------------------------------------------------------
63
+ ; These links are not put in the word/link graph. They also cannot be the
64
+ ; starter links for a domain. (These links may also only be used in cycles.)
65
+
66
+ IGNORE_THESE_LINKS: Xca
67
+
68
+
69
+
70
+ ; ----------------------------------------------------------------------
71
+ ; These links may only be used in cycles.
72
+
73
+ MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jr JQ Xca
74
+
75
+
76
+ ; ----------------------------------------------------------------------
77
+ ; These links are not traced further if they point back before the root word.
78
+ ; The creation of Rw necessitated making B#m a restricted link, to
79
+ ; prevent the (e) domain, started by Ce, from extending around through
80
+ ; the Rw link.
81
+ ; Reverted.
82
+ ; This breaks parsing of
83
+ ; How fast a program does he think it is
84
+ ; I wonder how fast a program he thinks it is
85
+ ; I wonder how much money you earned
86
+ ; I wonder how many people you saw
87
+ ; I wonder how big a department it is
88
+ ; I wonder how much oil they spilled
89
+ ; This is the man whose dog I bought
90
+ ; I wonder which dog he said you chased
91
+ ; How efficient a program is it
92
+ ; Meanwhile, I can't find the Ce problem mentioned ... this needs more
93
+ ; documentation!
94
+
95
+ RESTRICTED_LINKS:
96
+ B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh
97
+ H HA PFc B#j Wd PF Z
98
+
99
+ ; H HA PFc B#j Wd PF Z B#m
100
+
101
+
102
+ ; ----------------------------------------------------------------------
103
+ ; ---------------------- LINK TYPE TABLE-------------------------------
104
+ ; ----------------------------------------------------------------------
105
+ ; The following table associates a domain type with each possible
106
+ ; starting link. It contains pairs: the first of each pair is a link
107
+ ; type, and the second is the domain to which that link type belongs.
108
+
109
+ STARTING_LINK_TYPE_TABLE:
110
+ Ce e
111
+ R* r
112
+ Rn r
113
+ Re r
114
+ W m
115
+ RSe e
116
+ Cs s
117
+ Ca s
118
+ Jr e
119
+ Mr r
120
+ Cc s
121
+ Mv* e
122
+ QI#d s
123
+ BIqx s
124
+ TOn e
125
+ TOi e
126
+ MVi e
127
+ MV#o s
128
+ MV#p s
129
+ AFd s
130
+ PFc s
131
+ Mg* e
132
+ Mj j
133
+ Qd m
134
+ MX#j j
135
+ TOo x
136
+ I#j x
137
+ Pa##j x
138
+ CP x
139
+ COp d
140
+ SFsx d
141
+ Ss#g d
142
+ SIs#g s
143
+ Ss#d s
144
+ Bsd s
145
+ ER s
146
+ Z s
147
+ Ma#* e
148
+ MX#p e
149
+ Ci e
150
+ MX#a e
151
+ Eq e
152
+ COq e
153
+ CCq s
154
+ MX#r r
155
+
156
+
157
+ ; ----------------------------------------------------------------------
158
+ ; ----------------------- LINK SETS ------------------------------------
159
+ ; ----------------------------------------------------------------------
160
+ ; (Not in use at present; see comment at beginning of file)
161
+
162
+ ; ----------------------------------------------------------------------
163
+ ; ----------------- RULES ----------------------------------------------
164
+ ; ----------------------------------------------------------------------
165
+ ; Explanation of syntax: as usual, each stanza begins with a label
166
+ ; terminated by a colon. The interpretation of the rule depends on
167
+ ; the label, as specified in each stanza.
168
+
169
+ ; The following rule asserts that the linkage must *still* be connected
170
+ ; when the specified set(s) of links are removed from the linkage.
171
+
172
+ FORM_A_CYCLE_RULES:
173
+ @MUST_FORM_A_CYCLE_LINKS , "'must form a cycle' violation0"
174
+
175
+
176
+ ; For the following rules, if a domain contains a link matching the 1st
177
+ ; column, it must also contain a link matching one of the members of the
178
+ ; set in the 2nd column. The individual rules are demarcated by semicolons and
179
+ ; the fields within a rule are demarcated by commas.
180
+
181
+ CONTAINS_ONE_RULES:
182
+ SI#* , Wq Qd CQ PFc , "Bad use of s-v inversion1" ,
183
+ SI#x , Wq Qd CQ PFc , "Bad use of s-v inversion2" ,
184
+ SFI##* , Wq Qd CQ PFc , "Bad use of s-v inversion3",
185
+ SXI , Wq Qd CQ PFc , "Bad use of s-v inversion4" ,
186
+ Ws , D##w S##w H , "S-V inversion required5",
187
+ I#a , B#m B#w , "incorrect use of 'to'6" ,
188
+ Wq , SI SFI SXI , "S-V inversion required7" ,
189
+ Qd , SI SFI SXI , "S-V inversion required8" ,
190
+ PFc , SI SFI SXI , "S-V inversion required9" ,
191
+ Mj , Jw JQ , "Incorrect relative10" ,
192
+ MX#j , Jw JQ , "Incorrect relative11" ,
193
+ Wj , Jw JQ , "Misuse of preposition12" ,
194
+ JQ , Mj Wj MX#j , "Misuse of preposition13" ,
195
+ Jw , Mj Wj MX#j , "Misuse of preposition14" ,
196
+ B#j , Jr , "Incorrect relative15" ,
197
+ Jr , B#j , "Incorrect relative16" ,
198
+ EAh , AF Bsm B*m Qe Ca AFm
199
+ , "Incorrect use of 'how'17" ,
200
+ EEh , AF Bsm B*m Qe Ca AFm
201
+ , "Incorrect use of 'how'18" ,
202
+ Qe , EEh , "Incorrect use of adverb19" ,
203
+ THi , SFsi SFIsi OXi , "Complement requires 'it'20" ,
204
+ TSi , SFsi SFIsi OXi , "Complement requires 'it'21" ,
205
+ QIi , SFsi SFIsi OXi , "Complement requires 'it'22" ,
206
+ TOi , SFsi SFIsi OXi , "Complement requires 'it'23" ,
207
+ Ci , SFsi SFIsi OXi , "Complement requires 'it'24" ,
208
+ COqi , SFsi SFIsi OXi , "Complement requires 'it'25" ,
209
+ CPi , SFsi SFIsi OXi , "Complement requires 'it'26" ,
210
+ Eqi , SFsi SFIsi OXi , "Complement requires 'it'27" ,
211
+ LEi , SFsi SFIsi OXi , "Complement requires 'it'28" ,
212
+ MVti , SFsi SFIsi OXi , "Complement requires 'it'29" ,
213
+ AFdi , SFsi SFIsi OXi , "Complement requires 'it'30" ,
214
+ O#i , SFsi SFIsi OXi , "Complement requires 'it'31" ,
215
+ SFst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'32" ,
216
+ SFIst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'33" ,
217
+ SFp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34" ,
218
+ ;
219
+ ; This SFu rule forces subject-object agreement for uncountable noun objects
220
+ SFu , Out Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34a" ,
221
+ SFIp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'35" ,
222
+ OXt , O#t B##t , "Bad use of 'there'36" ,
223
+ SFsi* , TOi THi QIi TSi O#i Ci THb CPi
224
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'37" ,
225
+ SFIsi , TOi THi QIi TSi O#i Ci THb CPi
226
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'38" ,
227
+ OXi , TOi THi QIi TSi O#i Ci THb CPi
228
+ COqi CPi Eqi AFdi BIh , "Bad use of 'it'39" ,
229
+ THb , S##t SI##t SFsi SFIsi , "Bad use of predicate40" ,
230
+ BIh , Ss#b SIs#b SFsi SFIsi , "Bad use of predicate41" ,
231
+ BIq , S##q SI##q SFsi Ss#b SFIsi SIs#b
232
+ , "Bad use of predicate42" ,
233
+ MVt , Dm#m EAm EEm MVm Pam Pafm AFm EB#m MVb AJrc
234
+ Om Mam Am Jm Ds*m MX#m , "Bad comparative43" ,
235
+ MVz , D##y EAy EEy MVy EB#y , "Bad comparative44" ,
236
+ MV#a , Pam Pafm EAm Ds*m EAy AFm Mam Am
237
+ , "Bad comparative45" ,
238
+ MV#i , Pam Pafm EAm Ds*m EAy AFm Mam Am
239
+ , "Bad comparative46" ,
240
+ MV#o , D##m D##y Om Oy Jm Jy Am MX#m
241
+ , "Bad comparative47" ,
242
+ MV#p , EEm MVb Dm#m EEy D##y MVm Om Oy
243
+ Jm Jy Am MX#m
244
+ , "Bad comparative48" ,
245
+ Pafc , EB#m EB#y , "Bad comparative49" ,
246
+ Pafc , Pa* Paf* , "Bad comparative50" ,
247
+ MVat , MVm , "Bad comparative51" ,
248
+ MVpt , MVm , "Bad comparative52" ,
249
+ MVat , MVa MVp , "Bad comparative53" ,
250
+ MVpt , MVa MVp , "Bad comparative54" ,
251
+ U#t , D##m D##y Om Oy Jm Jy Am MX#m
252
+ , "Bad comparative55" ,
253
+ Cc , EEm EEy MVm MVb MVy
254
+ , "Bad comparative56" ,
255
+ Sp#c , Dmcm Dmcy Om Oy Jm Jy MX#m
256
+ , "Bad comparative57" ,
257
+ Ss#c , Dmum Dmuy Om Oy Jm Jy Ds*y MX#m
258
+ , "Bad comparative58" ,
259
+ S##c , Dm#m D##y Om Oy Jm Jy MX#m
260
+ , "Bad comparative59" ,
261
+ THc , TH , "Bad comparative60" ,
262
+ TOc , TO** TOf* TOi* , "Bad comparative61" ,
263
+ TOtc , TOt , "Bad comparative62" ,
264
+ Ma** , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
265
+ , "Bad use of adjective63" ,
266
+ Mam , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
267
+ , "Bad use of adjective64" ,
268
+ MX#a , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya MJ
269
+ , "Bad use of adjective65" ,
270
+
271
+ ; There's no ZZZ connector, which means that Ixd and Oxn
272
+ ; are prohibited from ever occurring. 4.0.batch covers this.
273
+ Ixd , ZZZ , "Can't use 'do' with that verb" ,
274
+ Oxn , ZZZ , "Bad use of pronoun66" ,
275
+ MVh , EExk EAxk D##k , "Incorrect use of that67" ,
276
+
277
+ ; The Rw link necessitated commenting out 68, because we had to make B#m
278
+ ; a restricted link (see above) xxx reverted .. this is needed ...
279
+ ;
280
+ B#m , D##w H HA , "Bad use of gerund68"
281
+
282
+ CONTAINS_NONE_RULES:
283
+ S , Spxi , "Bad n-v agreement69" ,
284
+ SI , SIpxi , "Bad n-v agreement70" ,
285
+ Ws , B#m Ca BT , "Question inversion violated71" ,
286
+ SF , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
287
+ , "Bad use of 'filler' subject72" ,
288
+ SFI , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
289
+ , "Bad use of 'filler' subject73" ,
290
+ OX , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
291
+ , "Bad use of 'filler' subject74" ,
292
+ MXsr , Sp#w , "Bad n-v agreement75" ,
293
+ MXpr , Ss#w S#iw , "Bad n-v agreement76" ,
294
+ Mr , B#* , "Bad use of 'whose'77"
295
+
296
+
297
+ ; ----------------------------------------------------------------------
298
+ ; The following rule asserts that all specified domains must have the
299
+ ; property that all of the words that touch a link in the domain are
300
+ ; not to the left of the root word of the domain. These rules are
301
+ ; different from the above in that the first field is a *domain name*,
302
+ ; rather than a set of links.
303
+
304
+ BOUNDED_RULES:
305
+ s , "Unbounded s domain78" ,
306
+ r , "Unbounded r domain79"
data/data/en/4.0.regex ADDED
@@ -0,0 +1,225 @@
1
+ %***************************************************************************%
2
+ % %
3
+ % Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin %
4
+ % See file "LICENSE" for information about commercial use of this system %
5
+ % %
6
+ %***************************************************************************%
7
+
8
+ % This file contains regular expressions that are used to match
9
+ % tokens not found in the dictionary. Each regex is given a name which
10
+ % determines the disjuncts assigned when the regex matches; this name
11
+ % must be defined in the dictionary along with the appropriate disjuncts.
12
+ % Note that the order of the regular expressions matters: matches will
13
+ % be attempted in the order in which the regexes appear in this file,
14
+ % and only the first match will be used.
15
+
16
+ % Numbers.
17
+ % XXX, we need to add utf8 U+00A0 "no-break space"
18
+ %
19
+ % Allows at most two colons in hour-minute-second HH:MM:SS expressions
20
+ % Allows at most two digits between colons
21
+ HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/
22
+
23
+ % e.g. 1950's leading number can be higher, for science fiction.
24
+ % Must be four digits, or possibly three. Must end in s, 's ’s
25
+ DECADE-TIME: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/
26
+
27
+ % Day-of-month names; this regex will match before the one below.
28
+ DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/
29
+
30
+ % Ordinal numbers; everything except 1st through 13th
31
+ % is handled by regex.
32
+ ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/
33
+
34
+ % Allows any number of commas or periods
35
+ % Be careful not to match the period at the end of a sentence;
36
+ % for example: "It happened in 1942."
37
+ NUMBERS: /^[0-9,.]*[0-9]$/
38
+ % This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
39
+ NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
40
+ % Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
41
+ FRACTION: /^[0-9]+\/[0-9]+$/
42
+ % "10(3)" exponent (used in PubMed)
43
+ NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/
44
+
45
+ % Roman numerals
46
+ % The first expr has the potential(?) problem that it matches an empty
47
+ % string. The three commented-out variants below were meant to require
47
+ % that at least one section is non-empty.
49
+ ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
50
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD){1}(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
51
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL){1}(IX|V?I{0,3}|IV)$/
52
+ % ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV){1}$/
53
+
54
+ % Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
55
+ INITIALS: /^([A-Z]\.)+$/
56
+
57
+ % Greek letters with numbers
58
+ GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
59
+ PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/
60
+
61
+ % Some "safe" derived units. Simple units are in dictionary.
62
+ % The idea here is for the regex to match something that is almost
63
+ % certainly part of a derived unit, and allow the rest to be
64
+ % anything; this way we can capture difficult derived units such
65
+ % as "mg/kg/day" and even oddities such as "micrograms/mouse/day"
66
+ % without listing them explicitly.
67
+ % TODO: add more.
68
+ % Some (real) misses from these:
69
+ % micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr
70
+ % m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?)
71
+ % cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year
72
+ % microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse
73
+ % mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group
74
+ % h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1
75
+ % sec-1 ms-1 ml.min.-1kg-1 ml.hr-1
76
+ % also, both kilometer and kilometers seem to be absent(!)
77
+ % remember "mm"!
78
+
79
+ UNITS: /^([npmk]|nano|pico|milli|micro|kilo)?(g|grams?)\// % grams/anything
80
+ UNITS: /^([fnmp]|femto|nano|micro|pico|mu)?mol(es)?\// % mol/anything
81
+ UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|kg|mol|min|day|h)$/ % common endings
82
+ % common endings, except in the style "mg.kg-1" instead of "mg/kg".
83
+ UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|kg|mol|min|day|h)(-1|\(-1\))$/
84
+
85
+ % combinations of numbers and units, e.g. "50-kDa", "1-2h"
86
+ % TODO: Clean up and check that these are up-to-date wrt the
87
+ % dictionary-recognized units; this is quite a mess currently.
88
+ % TODO: Extend the "number" part of the regex to allow anything
89
+ % that the NUMBER regex matches.
90
+ % One problem here is a failure to split up the expression ...
91
+ % e.g. "2hr" becomes 2 - ND - hr with the ND link. But 2-hr is treated
92
+ % as a single word ('It is a 2-hr wait')
93
+ % NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/
94
+ % Comment out above, it screws up handling of unit suffixes, for
95
+ % example: "Zangbert stock fell 30% to $2.50 yesterday."
96
+
97
+
98
+ % fold-words. Matches NUMBER-fold, where NUMBER can be either numeric
99
+ % or a spelled-out number, and the hyphen is optional. Note that for
100
+ % spelled-out numbers, anything is allowed between the "initial" number
101
+ % and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent
102
+ % as the prefix "four" is sufficient to match).
103
+ FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/
104
+ FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/
105
+
106
+ % Plural proper nouns.
107
+ % Make sure that apostrophe-s is split out correctly.
108
+ PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/
109
+
110
+ % Other proper nouns.
111
+ % We demand that these end with an alphanumeric, i.e. explicitly
112
+ % reject punctuation. We don't want this regex to "swallow" any trailing
113
+ % commas, colons, or periods/question-marks at the end of sentences.
114
+ % In addition, this must not swallow words ending in 's 'll etc.
115
+ % (... any affix, for that matter ...) and so no embedded apostrophe
116
+ CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/
117
+
118
+ % SUFFIX GUESSING
119
+ % For all suffix-guessing patterns, we insist that the pattern start
120
+ % with an alphanumeric. This is needed to guarantee that the
121
+ % prefix-stripping code works correctly, as otherwise, the regex will
122
+ % gobble the prefix. So for example: "We left (carrying the dog) and
123
+ % Fred followed." Since "(carrying" is not in the dict, we need to be
124
+ % sure to not match the leading paren so that it will get stripped.
125
+ %
126
+ ING-WORDS: /^\w.+ing$/
127
+
128
+ % Plurals or verb-s. Make sure that apostrophe-s is split out correctly.
129
+ % e.g. "The subject's name is John Doe." should be
130
+ % +--Ds--+---YS--+--Ds-+
131
+ % | | | |
132
+ % the subject.n 's.p name.n
133
+ S-WORDS: /^\w.+[^iuoys'’]s$/
134
+
135
+ % Verbs ending -ed.
136
+ ED-WORDS: /^\w.+ed$/
137
+
138
+ % Adverbs ending -ly.
139
+ LY-WORDS: /^\w.+ly$/
140
+
141
+ % Nouns ending in -ism, -asm (chiliasm .. ) Usually mass nouns
142
+ % Stubbed out for now; I'm not convinced this improves accuracy.
143
+ % ISM-WORDS: /^\w.+asm$/
144
+ % ISM-WORDS: /^\w.+ism$/
145
+
146
+ % Corresponding count noun version of above (chiliast...)
147
+ % AST-WORDS: /^\w.+ast$/
148
+ % AST-WORDS: /^\w.+ist$/
149
+
150
+ % Corresponding adjectival form of above
151
+ ADJ-WORDS: /^\w.+astic$/
152
+ ADJ-WORDS: /^\w.+istic$/
153
+
154
+ % Nouns ending -ation stubbed out in BioLG, stub out here ...
155
+ %ATION-WORDS: /^\w.+ation$/
156
+
157
+ % Extension by LIPN 11/10/2005
158
+ % nouns -- typically seen in (bio-)chemistry texts
159
+ % synthetase, kinase
160
+ % 5-(hydroxymethyl)-2’-deoxyuridine
161
+ % hydroxyethyl, hydroxymethyl
162
+ % septation, region
163
+ % isomaltotetraose, isomaltotriose
164
+ % glycosylphosphatidylinositol
165
+ % iodide, oligodeoxynucleotide
166
+ % chronicity, hypochromicity
167
+ MC-NOUN-WORDS: /^\w.+ase$/
168
+ MC-NOUN-WORDS: /^\w.+ine?$/
169
+ MC-NOUN-WORDS: /^\w.+yl$/
170
+ MC-NOUN-WORDS: /^\w.+ion$/
171
+ MC-NOUN-WORDS: /^\w.+ose$/
172
+ MC-NOUN-WORDS: /^\w.+ol$/
173
+ MC-NOUN-WORDS: /^\w.+ide$/
174
+ MC-NOUN-WORDS: /^\w.+ity$/
175
+
176
+ % replicon, intron
177
+ C-NOUN-WORDS: /^\w.+o[rn]$/
178
+
179
+ % adjectives
180
+ % exogenous, heterologous
181
+ % intermolecular, intramolecular
182
+ % glycolytic, ribonucleic, uronic
183
+ % ribosomal, ribsosomal
184
+ % nonpermissive, thermosensitive
185
+ % inducible, metastable
186
+ ADJ-WORDS: /^\w.+ous$/
187
+ ADJ-WORDS: /^\w.+ar$/
188
+ ADJ-WORDS: /^\w.+ic$/
189
+ ADJ-WORDS: /^\w.+al$/
190
+ ADJ-WORDS: /^\w.+ive$/
191
+ ADJ-WORDS: /^\w.+ble$/
192
+
193
+ % latin (postposed) adjectives
194
+ % influenzae, tarentolae
195
+ % pentosaceus, luteus, carnosus
196
+ LATIN-ADJ-WORDS: /^\w.+ae$/
197
+ LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file
198
+
199
+ % latin (postposed) adjectives or latin plural noun
200
+ % brevis, israelensis
201
+ % japonicum, tabacum, xylinum
202
+ LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/
203
+ LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/
204
+
205
+
206
+ % Hyphenated words. In the original LG morpho-guessing system that
207
+ % predated the regex-based system, hyphenated words were detected
208
+ % before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
209
+ % treated as a HYPHENATED-WORD (a generic adjective/noun), and
210
+ % never a verb. To return to this ordering, move this regex just
211
+ % after the CAPITALIZED-WORDS regex.
212
+ HYPHENATED-WORDS: /^[[:alpha:][:digit:],.][[:alpha:][:digit:],.-]*-[[:alpha:][:digit:],.-]*[[:alpha:][:digit:],.]$/
213
+
214
+ % proteins often end "ase", so we'll assume those things are names.
215
+ % removed, too many false positives.
216
+ % NAME: /ase$/
217
+
218
+ % Sequence of punctuation marks. If some mark appears in the affix table
219
+ % such as a period, comma, dash or underscore, and there's a sequence of
220
+ % these, then treat it as a "fill-in-the-blank" placeholder.
221
+ % This matters only for punc. appearing in the affix table, since the
222
+ % tokenizer explicitly mangles based on these punctuation marks.
223
+ %
224
+ % Look for at least four in a row.
225
+ UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/