grammar_cop 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +8 -0
- data/data/.DS_Store +0 -0
- data/data/Makefile +511 -0
- data/data/Makefile.am +4 -0
- data/data/Makefile.in +511 -0
- data/data/de/.DS_Store +0 -0
- data/data/de/4.0.affix +7 -0
- data/data/de/4.0.dict +474 -0
- data/data/de/Makefile +387 -0
- data/data/de/Makefile.am +9 -0
- data/data/de/Makefile.in +387 -0
- data/data/en/.DS_Store +0 -0
- data/data/en/4.0.affix +26 -0
- data/data/en/4.0.batch +1002 -0
- data/data/en/4.0.biolg.batch +411 -0
- data/data/en/4.0.constituent-knowledge +127 -0
- data/data/en/4.0.dict +8759 -0
- data/data/en/4.0.dict.m4 +6928 -0
- data/data/en/4.0.enwiki.batch +14 -0
- data/data/en/4.0.fixes.batch +2776 -0
- data/data/en/4.0.knowledge +306 -0
- data/data/en/4.0.regex +225 -0
- data/data/en/4.0.voa.batch +114 -0
- data/data/en/Makefile +554 -0
- data/data/en/Makefile.am +19 -0
- data/data/en/Makefile.in +554 -0
- data/data/en/README +173 -0
- data/data/en/tiny.dict +157 -0
- data/data/en/words/.DS_Store +0 -0
- data/data/en/words/Makefile +456 -0
- data/data/en/words/Makefile.am +78 -0
- data/data/en/words/Makefile.in +456 -0
- data/data/en/words/currency +205 -0
- data/data/en/words/currency.p +28 -0
- data/data/en/words/entities.given-bisex.sing +39 -0
- data/data/en/words/entities.given-female.sing +4141 -0
- data/data/en/words/entities.given-male.sing +1633 -0
- data/data/en/words/entities.locations.sing +68 -0
- data/data/en/words/entities.national.sing +253 -0
- data/data/en/words/entities.organizations.sing +7 -0
- data/data/en/words/entities.us-states.sing +11 -0
- data/data/en/words/units.1 +45 -0
- data/data/en/words/units.1.dot +4 -0
- data/data/en/words/units.3 +2 -0
- data/data/en/words/units.4 +5 -0
- data/data/en/words/units.4.dot +1 -0
- data/data/en/words/words-medical.adv.1 +1191 -0
- data/data/en/words/words-medical.prep.1 +67 -0
- data/data/en/words/words-medical.v.4.1 +2835 -0
- data/data/en/words/words-medical.v.4.2 +2848 -0
- data/data/en/words/words-medical.v.4.3 +3011 -0
- data/data/en/words/words-medical.v.4.4 +3036 -0
- data/data/en/words/words-medical.v.4.5 +3050 -0
- data/data/en/words/words.adj.1 +6794 -0
- data/data/en/words/words.adj.2 +638 -0
- data/data/en/words/words.adj.3 +667 -0
- data/data/en/words/words.adv.1 +1573 -0
- data/data/en/words/words.adv.2 +67 -0
- data/data/en/words/words.adv.3 +157 -0
- data/data/en/words/words.adv.4 +80 -0
- data/data/en/words/words.n.1 +11464 -0
- data/data/en/words/words.n.1.wiki +264 -0
- data/data/en/words/words.n.2.s +2017 -0
- data/data/en/words/words.n.2.s.biolg +1 -0
- data/data/en/words/words.n.2.s.wiki +298 -0
- data/data/en/words/words.n.2.x +65 -0
- data/data/en/words/words.n.2.x.wiki +10 -0
- data/data/en/words/words.n.3 +5717 -0
- data/data/en/words/words.n.t +23 -0
- data/data/en/words/words.v.1.1 +1038 -0
- data/data/en/words/words.v.1.2 +1043 -0
- data/data/en/words/words.v.1.3 +1052 -0
- data/data/en/words/words.v.1.4 +1023 -0
- data/data/en/words/words.v.1.p +17 -0
- data/data/en/words/words.v.10.1 +14 -0
- data/data/en/words/words.v.10.2 +15 -0
- data/data/en/words/words.v.10.3 +88 -0
- data/data/en/words/words.v.10.4 +17 -0
- data/data/en/words/words.v.2.1 +1253 -0
- data/data/en/words/words.v.2.2 +1304 -0
- data/data/en/words/words.v.2.3 +1280 -0
- data/data/en/words/words.v.2.4 +1285 -0
- data/data/en/words/words.v.2.5 +1287 -0
- data/data/en/words/words.v.4.1 +2472 -0
- data/data/en/words/words.v.4.2 +2487 -0
- data/data/en/words/words.v.4.3 +2441 -0
- data/data/en/words/words.v.4.4 +2478 -0
- data/data/en/words/words.v.4.5 +2483 -0
- data/data/en/words/words.v.5.1 +98 -0
- data/data/en/words/words.v.5.2 +98 -0
- data/data/en/words/words.v.5.3 +103 -0
- data/data/en/words/words.v.5.4 +102 -0
- data/data/en/words/words.v.6.1 +388 -0
- data/data/en/words/words.v.6.2 +401 -0
- data/data/en/words/words.v.6.3 +397 -0
- data/data/en/words/words.v.6.4 +405 -0
- data/data/en/words/words.v.6.5 +401 -0
- data/data/en/words/words.v.8.1 +117 -0
- data/data/en/words/words.v.8.2 +118 -0
- data/data/en/words/words.v.8.3 +118 -0
- data/data/en/words/words.v.8.4 +119 -0
- data/data/en/words/words.v.8.5 +119 -0
- data/data/en/words/words.y +104 -0
- data/data/lt/.DS_Store +0 -0
- data/data/lt/4.0.affix +6 -0
- data/data/lt/4.0.constituent-knowledge +24 -0
- data/data/lt/4.0.dict +135 -0
- data/data/lt/4.0.knowledge +38 -0
- data/data/lt/Makefile +389 -0
- data/data/lt/Makefile.am +11 -0
- data/data/lt/Makefile.in +389 -0
- data/ext/.DS_Store +0 -0
- data/ext/link_grammar/.DS_Store +0 -0
- data/ext/link_grammar/extconf.rb +2 -0
- data/ext/link_grammar/link-grammar/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
- data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
- data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
- data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
- data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
- data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
- data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
- data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
- data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
- data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
- data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
- data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
- data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
- data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
- data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
- data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
- data/ext/link_grammar/link-grammar/Makefile +900 -0
- data/ext/link_grammar/link-grammar/Makefile.am +202 -0
- data/ext/link_grammar/link-grammar/Makefile.in +900 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
- data/ext/link_grammar/link-grammar/and.c +1603 -0
- data/ext/link_grammar/link-grammar/and.h +27 -0
- data/ext/link_grammar/link-grammar/api-structures.h +362 -0
- data/ext/link_grammar/link-grammar/api-types.h +72 -0
- data/ext/link_grammar/link-grammar/api.c +1887 -0
- data/ext/link_grammar/link-grammar/api.h +96 -0
- data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/autoit/README +10 -0
- data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
- data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
- data/ext/link_grammar/link-grammar/command-line.c +458 -0
- data/ext/link_grammar/link-grammar/command-line.h +15 -0
- data/ext/link_grammar/link-grammar/constituents.c +1836 -0
- data/ext/link_grammar/link-grammar/constituents.h +26 -0
- data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/corpus/README +17 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
- data/ext/link_grammar/link-grammar/count.c +828 -0
- data/ext/link_grammar/link-grammar/count.h +25 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
- data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
- data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
- data/ext/link_grammar/link-grammar/error.c +92 -0
- data/ext/link_grammar/link-grammar/error.h +35 -0
- data/ext/link_grammar/link-grammar/expand.c +67 -0
- data/ext/link_grammar/link-grammar/expand.h +13 -0
- data/ext/link_grammar/link-grammar/externs.h +22 -0
- data/ext/link_grammar/link-grammar/extract-links.c +625 -0
- data/ext/link_grammar/link-grammar/extract-links.h +16 -0
- data/ext/link_grammar/link-grammar/fast-match.c +309 -0
- data/ext/link_grammar/link-grammar/fast-match.h +17 -0
- data/ext/link_grammar/link-grammar/idiom.c +373 -0
- data/ext/link_grammar/link-grammar/idiom.h +15 -0
- data/ext/link_grammar/link-grammar/jni-client.c +779 -0
- data/ext/link_grammar/link-grammar/jni-client.h +236 -0
- data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
- data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/link-features.h +37 -0
- data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
- data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
- data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
- data/ext/link_grammar/link-grammar/link-includes.h +465 -0
- data/ext/link_grammar/link-grammar/link-parser.c +849 -0
- data/ext/link_grammar/link-grammar/massage.c +329 -0
- data/ext/link_grammar/link-grammar/massage.h +13 -0
- data/ext/link_grammar/link-grammar/post-process.c +1113 -0
- data/ext/link_grammar/link-grammar/post-process.h +45 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
- data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
- data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
- data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
- data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
- data/ext/link_grammar/link-grammar/prefix.c +482 -0
- data/ext/link_grammar/link-grammar/prefix.h +139 -0
- data/ext/link_grammar/link-grammar/preparation.c +412 -0
- data/ext/link_grammar/link-grammar/preparation.h +20 -0
- data/ext/link_grammar/link-grammar/print-util.c +87 -0
- data/ext/link_grammar/link-grammar/print-util.h +32 -0
- data/ext/link_grammar/link-grammar/print.c +1085 -0
- data/ext/link_grammar/link-grammar/print.h +16 -0
- data/ext/link_grammar/link-grammar/prune.c +1864 -0
- data/ext/link_grammar/link-grammar/prune.h +17 -0
- data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
- data/ext/link_grammar/link-grammar/read-dict.h +29 -0
- data/ext/link_grammar/link-grammar/read-regex.c +161 -0
- data/ext/link_grammar/link-grammar/read-regex.h +12 -0
- data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
- data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
- data/ext/link_grammar/link-grammar/resources.c +180 -0
- data/ext/link_grammar/link-grammar/resources.h +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
- data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
- data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
- data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
- data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
- data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
- data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
- data/ext/link_grammar/link-grammar/string-set.c +169 -0
- data/ext/link_grammar/link-grammar/string-set.h +16 -0
- data/ext/link_grammar/link-grammar/structures.h +498 -0
- data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
- data/ext/link_grammar/link-grammar/tokenize.h +15 -0
- data/ext/link_grammar/link-grammar/utilities.c +847 -0
- data/ext/link_grammar/link-grammar/utilities.h +281 -0
- data/ext/link_grammar/link-grammar/word-file.c +124 -0
- data/ext/link_grammar/link-grammar/word-file.h +15 -0
- data/ext/link_grammar/link-grammar/word-utils.c +526 -0
- data/ext/link_grammar/link-grammar/word-utils.h +152 -0
- data/ext/link_grammar/link_grammar.c +202 -0
- data/ext/link_grammar/link_grammar.h +99 -0
- data/grammar_cop.gemspec +24 -0
- data/lib/.DS_Store +0 -0
- data/lib/grammar_cop.rb +9 -0
- data/lib/grammar_cop/.DS_Store +0 -0
- data/lib/grammar_cop/dictionary.rb +19 -0
- data/lib/grammar_cop/linkage.rb +30 -0
- data/lib/grammar_cop/parse_options.rb +32 -0
- data/lib/grammar_cop/sentence.rb +36 -0
- data/lib/grammar_cop/version.rb +3 -0
- data/test/.DS_Store +0 -0
- data/test/grammar_cop_test.rb +27 -0
- metadata +407 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
; Post-processing knowledge file
|
2
|
+
; 6/96
|
3
|
+
|
4
|
+
; ----------------------------------------------------------------------------
|
5
|
+
; This file contains the knowledge related to post-processing, in the
|
6
|
+
; form of lists and rules. This file is read by post-process.c at run-time.
|
7
|
+
; Syntax of file:
|
8
|
+
; line starting with ";" is a comment
|
9
|
+
; commas are field delimiters
|
10
|
+
; any token beginning with the character @ is expanded to the set
|
11
|
+
; of symbols it defines, e.g. one could write
|
12
|
+
; FOO: blah1 blah2 blah3
|
13
|
+
; thus defining a set FOO containing three strings. Then one could later write
|
14
|
+
; BAR: blah5 @FOO blah8
|
15
|
+
; which defines a set BAR containing 5 strings.
|
16
|
+
;
|
17
|
+
; Capitalized tokens are *required*, though if you feel like providing an
|
18
|
+
; empty list afterwards, that's your right.
|
19
|
+
; ----------------------------------------------------------------------------
|
20
|
+
|
21
|
+
|
22
|
+
; The following links start a domain. Each must be given a name in the
|
23
|
+
; table below (STARTING_LINK_TYPE_TABLE)
|
24
|
+
|
25
|
+
DOMAIN_STARTER_LINKS:
|
26
|
+
W Ce Cs Ca Cc Ci R* Rn Re RSe Mr QI#d Mv* Jr Mj Qd
|
27
|
+
TOn TOi Mg* MVi Ss#d Bsd ER Z Ma#* SIs#g BIqx MX#p MX#a
|
28
|
+
MX#r MX#j MV#o MV#p Eq COq CCq AFd PFc
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
; ----------------------------------------------------------------------
|
33
|
+
; The following links start a urfl domain. They are also included in the
|
34
|
+
; domain, as opposed to regular starter links (above), which are not. A
|
35
|
+
; urfl domain includes links accessible from the root word, tracing to
|
36
|
+
; the right (as well as everything accessible from the left end of the
|
37
|
+
; starter link).
|
38
|
+
|
39
|
+
URFL_DOMAIN_STARTER_LINKS: TOo I#j Pa##j CP
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
; ----------------------------------------------------------------------
|
44
|
+
; The following start a urfl_only domain. These include _only_ links :
|
45
|
+
; reachable from the root word, tracing to the right. They aren't
|
46
|
+
; included in the domain
|
47
|
+
|
48
|
+
URFL_ONLY_DOMAIN_STARTER_LINKS: SFsx Ss#g COp
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
; ----------------------------------------------------------------------
|
53
|
+
; Links which start a domain and are also part of the domain. This must be
|
54
|
+
; a sublist of the domain_starter_list
|
55
|
+
|
56
|
+
DOMAIN_CONTAINS_LINKS:
|
57
|
+
Mg* Mx Bsd MX#a Ma#* Mv* MX#r Ss#d Ws Wq Qd Mj Wj
|
58
|
+
Wi MX#j AFd PFc Jr Wd Mr
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
; ----------------------------------------------------------------------
|
63
|
+
; These links are not put in the word/link graph. They also cannot be the
|
64
|
+
; starter links for a domain. (These links may also only be used in cycles.)
|
65
|
+
|
66
|
+
IGNORE_THESE_LINKS: Xca
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
; ----------------------------------------------------------------------
|
71
|
+
; These links may only be used in cycles.
|
72
|
+
|
73
|
+
MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jr JQ Xca
|
74
|
+
|
75
|
+
|
76
|
+
; ----------------------------------------------------------------------
|
77
|
+
; These links are not traced further if they point back before the root word.
|
78
|
+
; The creation of Rw necessitated making B#m a restricted link, to
|
79
|
+
; prevent the (e) domain, started by Ce, from extending around through
|
80
|
+
; the Rw link.
|
81
|
+
; Reverted.
|
82
|
+
; This breaks parsing of
|
83
|
+
; How fast a program does he think it is
|
84
|
+
; I wonder how fast a program he thinks it is
|
85
|
+
; I wonder how much money you earned
|
86
|
+
; I wonder how many people you saw
|
87
|
+
; I wonder how big a department it is
|
88
|
+
; I wonder how much oil they spilled
|
89
|
+
; This is the man whose dog I bought
|
90
|
+
; I wonder which dog he said you chased
|
91
|
+
; How efficient a program is it
|
92
|
+
; Meanwhile, I can't find the Ce problem mentioned ... this needs more
|
93
|
+
; documentation!
|
94
|
+
|
95
|
+
RESTRICTED_LINKS:
|
96
|
+
B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh
|
97
|
+
H HA PFc B#j Wd PF Z
|
98
|
+
|
99
|
+
; H HA PFc B#j Wd PF Z B#m
|
100
|
+
|
101
|
+
|
102
|
+
; ----------------------------------------------------------------------
|
103
|
+
; ---------------------- LINK TYPE TABLE-------------------------------
|
104
|
+
; ----------------------------------------------------------------------
|
105
|
+
; The following table associates a domain type with each possible
|
106
|
+
; starting link. It contains pairs: the first of each pair is a link
|
107
|
+
; type, and the second is the domain to which that link type belongs.
|
108
|
+
|
109
|
+
STARTING_LINK_TYPE_TABLE:
|
110
|
+
Ce e
|
111
|
+
R* r
|
112
|
+
Rn r
|
113
|
+
Re r
|
114
|
+
W m
|
115
|
+
RSe e
|
116
|
+
Cs s
|
117
|
+
Ca s
|
118
|
+
Jr e
|
119
|
+
Mr r
|
120
|
+
Cc s
|
121
|
+
Mv* e
|
122
|
+
QI#d s
|
123
|
+
BIqx s
|
124
|
+
TOn e
|
125
|
+
TOi e
|
126
|
+
MVi e
|
127
|
+
MV#o s
|
128
|
+
MV#p s
|
129
|
+
AFd s
|
130
|
+
PFc s
|
131
|
+
Mg* e
|
132
|
+
Mj j
|
133
|
+
Qd m
|
134
|
+
MX#j j
|
135
|
+
TOo x
|
136
|
+
I#j x
|
137
|
+
Pa##j x
|
138
|
+
CP x
|
139
|
+
COp d
|
140
|
+
SFsx d
|
141
|
+
Ss#g d
|
142
|
+
SIs#g s
|
143
|
+
Ss#d s
|
144
|
+
Bsd s
|
145
|
+
ER s
|
146
|
+
Z s
|
147
|
+
Ma#* e
|
148
|
+
MX#p e
|
149
|
+
Ci e
|
150
|
+
MX#a e
|
151
|
+
Eq e
|
152
|
+
COq e
|
153
|
+
CCq s
|
154
|
+
MX#r r
|
155
|
+
|
156
|
+
|
157
|
+
; ----------------------------------------------------------------------
|
158
|
+
; ----------------------- LINK SETS ------------------------------------
|
159
|
+
; ----------------------------------------------------------------------
|
160
|
+
; (Not in use at present; see comment at beginning of file)
|
161
|
+
|
162
|
+
; ----------------------------------------------------------------------
|
163
|
+
; ----------------- RULES ----------------------------------------------
|
164
|
+
; ----------------------------------------------------------------------
|
165
|
+
; Explanation of syntax: as usual, each stanza begins with a label
|
166
|
+
; terminated by a colon. The interpretation of the rule depends on
|
167
|
+
; the label, as specified in each stanza.
|
168
|
+
|
169
|
+
; The following rule asserts that the linkage must *still* be connected
|
170
|
+
; when the specified set(s) of links are removed from the linkage.
|
171
|
+
|
172
|
+
FORM_A_CYCLE_RULES:
|
173
|
+
@MUST_FORM_A_CYCLE_LINKS , "'must form a cycle' violation0"
|
174
|
+
|
175
|
+
|
176
|
+
; For the following rules, if a domain contains a link matching the 1st
|
177
|
+
; column, it must also contain a linkage matching one of the members of the
|
178
|
+
; set in the 2nd column. The individual rules are demarcated by semicolons and
|
179
|
+
; the fields within a rule are demarcated by commas.
|
180
|
+
|
181
|
+
CONTAINS_ONE_RULES:
|
182
|
+
SI#* , Wq Qd CQ PFc , "Bad use of s-v inversion1" ,
|
183
|
+
SI#x , Wq Qd CQ PFc , "Bad use of s-v inversion2" ,
|
184
|
+
SFI##* , Wq Qd CQ PFc , "Bad use of s-v inversion3",
|
185
|
+
SXI , Wq Qd CQ PFc , "Bad use of s-v inversion4" ,
|
186
|
+
Ws , D##w S##w H , "S-V inversion required5",
|
187
|
+
I#a , B#m B#w , "incorrect use of 'to'6" ,
|
188
|
+
Wq , SI SFI SXI , "S-V inversion required7" ,
|
189
|
+
Qd , SI SFI SXI , "S-V inversion required8" ,
|
190
|
+
PFc , SI SFI SXI , "S-V inversion required9" ,
|
191
|
+
Mj , Jw JQ , "Incorrect relative10" ,
|
192
|
+
MX#j , Jw JQ , "Incorrect relative11" ,
|
193
|
+
Wj , Jw JQ , "Misuse of preposition12" ,
|
194
|
+
JQ , Mj Wj MX#j , "Misuse of preposition13" ,
|
195
|
+
Jw , Mj Wj MX#j , "Misuse of preposition14" ,
|
196
|
+
B#j , Jr , "Incorrect relative15" ,
|
197
|
+
Jr , B#j , "Incorrect relative16" ,
|
198
|
+
EAh , AF Bsm B*m Qe Ca AFm
|
199
|
+
, "Incorrect use of 'how'17" ,
|
200
|
+
EEh , AF Bsm B*m Qe Ca AFm
|
201
|
+
, "Incorrect use of 'how'18" ,
|
202
|
+
Qe , EEh , "Incorrect use of adverb19" ,
|
203
|
+
THi , SFsi SFIsi OXi , "Complement requires 'it'20" ,
|
204
|
+
TSi , SFsi SFIsi OXi , "Complement requires 'it'21" ,
|
205
|
+
QIi , SFsi SFIsi OXi , "Complement requires 'it'22" ,
|
206
|
+
TOi , SFsi SFIsi OXi , "Complement requires 'it'23" ,
|
207
|
+
Ci , SFsi SFIsi OXi , "Complement requires 'it'24" ,
|
208
|
+
COqi , SFsi SFIsi OXi , "Complement requires 'it'25" ,
|
209
|
+
CPi , SFsi SFIsi OXi , "Complement requires 'it'26" ,
|
210
|
+
Eqi , SFsi SFIsi OXi , "Complement requires 'it'27" ,
|
211
|
+
LEi , SFsi SFIsi OXi , "Complement requires 'it'28" ,
|
212
|
+
MVti , SFsi SFIsi OXi , "Complement requires 'it'29" ,
|
213
|
+
AFdi , SFsi SFIsi OXi , "Complement requires 'it'30" ,
|
214
|
+
O#i , SFsi SFIsi OXi , "Complement requires 'it'31" ,
|
215
|
+
SFst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'32" ,
|
216
|
+
SFIst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'33" ,
|
217
|
+
SFp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34" ,
|
218
|
+
;
|
219
|
+
; This SFu rule forces subject-object agreement for uncountable noun objects
|
220
|
+
SFu , Out Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34a" ,
|
221
|
+
SFIp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'35" ,
|
222
|
+
OXt , O#t B##t , "Bad use of 'there'36" ,
|
223
|
+
SFsi* , TOi THi QIi TSi O#i Ci THb CPi
|
224
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'37" ,
|
225
|
+
SFIsi , TOi THi QIi TSi O#i Ci THb CPi
|
226
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'38" ,
|
227
|
+
OXi , TOi THi QIi TSi O#i Ci THb CPi
|
228
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'39" ,
|
229
|
+
THb , S##t SI##t SFsi SFIsi , "Bad use of predicate40" ,
|
230
|
+
BIh , Ss#b SIs#b SFsi SFIsi , "Bad use of predicate41" ,
|
231
|
+
BIq , S##q SI##q SFsi Ss#b SFIsi SIs#b
|
232
|
+
, "Bad use of predicate42" ,
|
233
|
+
MVt , Dm#m EAm EEm MVm Pam Pafm AFm EB#m MVb AJrc
|
234
|
+
Om Mam Am Jm Ds*m MX#m , "Bad comparative43" ,
|
235
|
+
MVz , D##y EAy EEy MVy EB#y , "Bad comparative44" ,
|
236
|
+
MV#a , Pam Pafm EAm Ds*m EAy AFm Mam Am
|
237
|
+
, "Bad comparative45" ,
|
238
|
+
MV#i , Pam Pafm EAm Ds*m EAy AFm Mam Am
|
239
|
+
, "Bad comparative46" ,
|
240
|
+
MV#o , D##m D##y Om Oy Jm Jy Am MX#m
|
241
|
+
, "Bad comparative47" ,
|
242
|
+
MV#p , EEm MVb Dm#m EEy D##y MVm Om Oy
|
243
|
+
Jm Jy Am MX#m
|
244
|
+
, "Bad comparative48" ,
|
245
|
+
Pafc , EB#m EB#y , "Bad comparative49" ,
|
246
|
+
Pafc , Pa* Paf* , "Bad comparative50" ,
|
247
|
+
MVat , MVm , "Bad comparative51" ,
|
248
|
+
MVpt , MVm , "Bad comparative52" ,
|
249
|
+
MVat , MVa MVp , "Bad comparative53" ,
|
250
|
+
MVpt , MVa MVp , "Bad comparative54" ,
|
251
|
+
U#t , D##m D##y Om Oy Jm Jy Am MX#m
|
252
|
+
, "Bad comparative55" ,
|
253
|
+
Cc , EEm EEy MVm MVb MVy
|
254
|
+
, "Bad comparative56" ,
|
255
|
+
Sp#c , Dmcm Dmcy Om Oy Jm Jy MX#m
|
256
|
+
, "Bad comparative57" ,
|
257
|
+
Ss#c , Dmum Dmuy Om Oy Jm Jy Ds*y MX#m
|
258
|
+
, "Bad comparative58" ,
|
259
|
+
S##c , Dm#m D##y Om Oy Jm Jy MX#m
|
260
|
+
, "Bad comparative59" ,
|
261
|
+
THc , TH , "Bad comparative60" ,
|
262
|
+
TOc , TO** TOf* TOi* , "Bad comparative61" ,
|
263
|
+
TOtc , TOt , "Bad comparative62" ,
|
264
|
+
Ma** , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
|
265
|
+
, "Bad use of adjective63" ,
|
266
|
+
Mam , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
|
267
|
+
, "Bad use of adjective64" ,
|
268
|
+
MX#a , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya MJ
|
269
|
+
, "Bad use of adjective65" ,
|
270
|
+
|
271
|
+
; There's no ZZZ connector, which means that Ixd and Oxn
|
272
|
+
; are prohibited from ever occurring. 4.0.batch covers this.
|
273
|
+
Ixd , ZZZ , "Can't use 'do' with that verb" ,
|
274
|
+
Oxn , ZZZ , "Bad use of pronoun66" ,
|
275
|
+
MVh , EExk EAxk D##k , "Incorrect use of that67" ,
|
276
|
+
|
277
|
+
; The Rw link necessitated commenting out 68, because we had to make B#m
|
278
|
+
; a restricted link (see above) xxx reverted .. this is needed ...
|
279
|
+
;
|
280
|
+
B#m , D##w H HA , "Bad use of gerund68"
|
281
|
+
|
282
|
+
CONTAINS_NONE_RULES:
|
283
|
+
S , Spxi , "Bad n-v agreement69" ,
|
284
|
+
SI , SIpxi , "Bad n-v agreement70" ,
|
285
|
+
Ws , B#m Ca BT , "Question inversion violated71" ,
|
286
|
+
SF , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
287
|
+
, "Bad use of 'filler' subject72" ,
|
288
|
+
SFI , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
289
|
+
, "Bad use of 'filler' subject73" ,
|
290
|
+
OX , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
291
|
+
, "Bad use of 'filler' subject74" ,
|
292
|
+
MXsr , Sp#w , "Bad n-v agreement75" ,
|
293
|
+
MXpr , Ss#w S#iw , "Bad n-v agreement76" ,
|
294
|
+
Mr , B#* , "Bad use of 'whose'77"
|
295
|
+
|
296
|
+
|
297
|
+
; ----------------------------------------------------------------------
|
298
|
+
; The following rule asserts that all specified domains must have the
|
299
|
+
; property that all of the words that touch a link in the domain are
|
300
|
+
; not to the left of the root word of the domain. These rules are
|
301
|
+
; different from the above in that the first field is a *domain name*,
|
302
|
+
; rather than a set of links.
|
303
|
+
|
304
|
+
BOUNDED_RULES:
|
305
|
+
s , "Unbounded s domain78" ,
|
306
|
+
r , "Unbounded r domain79"
|
data/data/en/4.0.regex
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
%***************************************************************************%
|
2
|
+
% %
|
3
|
+
% Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin %
|
4
|
+
% See file "LICENSE" for information about commercial use of this system %
|
5
|
+
% %
|
6
|
+
%***************************************************************************%
|
7
|
+
|
8
|
+
% This file contains regular expressions that are used to match
|
9
|
+
% tokens not found in the dictionary. Each regex is given a name which
|
10
|
+
% determines the disjuncts assigned when the regex matches; this name
|
11
|
+
% must be defined in the dictionary along with the appropriate disjuncts.
|
12
|
+
% Note that the order of the regular expressions matters: matches will
|
13
|
+
% be attempted in the order in which the regexes appear in this file,
|
14
|
+
% and only the first match will be used.
|
15
|
+
|
16
|
+
% Numbers.
|
17
|
+
% XXX, we need to add utf8 U+00A0 "no-break space"
|
18
|
+
%
|
19
|
+
% Allows at most two colons in hour-minute-second HH:MM:SS expressions
|
20
|
+
% Allows at most two digits between colons
|
21
|
+
HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/
|
22
|
+
|
23
|
+
% e.g. 1950's leading number can be higher, for science fiction.
|
24
|
+
% Must be four digits, or possibly three. Must end in s, 's ’s
|
25
|
+
DECADE-TIME: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/
|
26
|
+
|
27
|
+
% Day-of-month names; this regex will match before the one below.
|
28
|
+
DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/
|
29
|
+
|
30
|
+
% Ordinal numbers; everything except 1st through 13th
|
31
|
+
% is handled by regex.
|
32
|
+
ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/
|
33
|
+
|
34
|
+
% Allows any number of commas or periods
|
35
|
+
% Be careful not to match the period at the end of a sentence;
|
36
|
+
% for example: "It happened in 1942."
|
37
|
+
NUMBERS: /^[0-9,.]*[0-9]$/
|
38
|
+
% This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
|
39
|
+
NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
|
40
|
+
% Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
|
41
|
+
FRACTION: /^[0-9]+\/[0-9]+$/
|
42
|
+
% "10(3)" exponent (used in PubMed)
|
43
|
+
NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/
|
44
|
+
|
45
|
+
% Roman numerals
|
46
|
+
% The first expr has the potential(?) problem that it matches an empty
|
47
|
+
% string. Thus, the next three rules specify that at least one section
|
48
|
+
% is non-empty.
|
49
|
+
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
|
50
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD){1}(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
|
51
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL){1}(IX|V?I{0,3}|IV)$/
|
52
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV){1}$/
|
53
|
+
|
54
|
+
% Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
|
55
|
+
INITIALS: /^([A-Z]\.)+$/
|
56
|
+
|
57
|
+
% Greek letters with numbers
|
58
|
+
GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
|
59
|
+
PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/
|
60
|
+
|
61
|
+
% Some "safe" derived units. Simple units are in dictionary.
|
62
|
+
% The idea here is for the regex to match something that is almost
|
63
|
+
% certainly part of a derived unit, and allow the rest to be
|
64
|
+
% anything; this way we can capture difficult derived units such
|
65
|
+
% as "mg/kg/day" and even oddities such as "micrograms/mouse/day"
|
66
|
+
% without listing them explicitly.
|
67
|
+
% TODO: add more.
|
68
|
+
% Some (real) misses from these:
|
69
|
+
% micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr
|
70
|
+
% m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?)
|
71
|
+
% cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year
|
72
|
+
% microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse
|
73
|
+
% mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group
|
74
|
+
% h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1
|
75
|
+
% sec-1 ms-1 ml.min.-1kg-1 ml.hr-1
|
76
|
+
% also, both kilometer and kilometers seem to be absent(!)
|
77
|
+
% remember "mm"!
|
78
|
+
|
79
|
+
UNITS: /^([npmk]|nano|pico|milli|micro|kilo)?(g|grams?)\// % grams/anything
|
80
|
+
UNITS: /^([fnmp]|femto|nano|micro|pico|mu)?mol(es)?\// % mol/anything
|
81
|
+
UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|kg|mol|min|day|h)$/ % common endings
|
82
|
+
% common endings, except in the style "mg.kg-1" instead of "mg/kg".
|
83
|
+
UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|kg|mol|min|day|h)(-1|\(-1\))$/
|
84
|
+
|
85
|
+
% combinations of numbers and units, e.g. "50-kDa", "1-2h"
|
86
|
+
% TODO: Clean up and check that these are up-to-date wrt the
|
87
|
+
% dictionary-recognized units; this is quite a mess currently.
|
88
|
+
% TODO: Extend the "number" part of the regex to allow anything
|
89
|
+
% that the NUMBER regex matches.
|
90
|
+
% One problem here is a failure to split up the expression ...
|
91
|
+
% e.g. "2hr" becomes 2 - ND - hr with the ND link. But 2-hr is treated
|
92
|
+
% as a single word ('It is a 2-hr wait')
|
93
|
+
% NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/
|
94
|
+
% Comment out above, it screws up handling of unit suffixes, for
|
95
|
+
% example: "Zangbert stock fell 30% to $2.50 yesterday."
|
96
|
+
|
97
|
+
|
98
|
+
% fold-words. Matches NUMBER-fold, where NUMBER can be either numeric
|
99
|
+
% or a spelled-out number, and the hyphen is optional. Note that for
|
100
|
+
% spelled-out numbers, anything is allowed between the "initial" number
|
101
|
+
% and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent
|
102
|
+
% as the prefix "four" is sufficient to match).
|
103
|
+
FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/
|
104
|
+
FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/
|
105
|
+
|
106
|
+
% Plural proper nouns.
|
107
|
+
% Make sure that apostrophe-s is split out correctly.
|
108
|
+
PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/
|
109
|
+
|
110
|
+
% Other proper nouns.
|
111
|
+
% We demand that these end with an alphanumeric, i.e. explicitly
|
112
|
+
% reject punctuation. We don't want this regex to "swallow" any trailing
|
113
|
+
% commas, colons, or periods/question-marks at the end of sentences.
|
114
|
+
% In addition, this must not swallow words ending in 's 'll etc.
|
115
|
+
% (... any affix, for that matter ...) and so no embedded apostrophe
|
116
|
+
CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/
|
117
|
+
|
118
|
+
% SUFFIX GUESSING
|
119
|
+
% For all suffix-guessing patterns, we insist that the pattern start
|
120
|
+
% with an alphanumeric. This is needed to guarantee that the
|
121
|
+
% prefix-stripping code works correctly, as otherwise, the regex will
|
122
|
+
% gobble the prefix. So for example: "We left (carrying the dog) and
|
123
|
+
% Fred followed." Since "(carrying" is not in the dict, we need to be
|
124
|
+
% sure to not match the leading paren so that it will get stripped.
|
125
|
+
%
|
126
|
+
ING-WORDS: /^\w.+ing$/
|
127
|
+
|
128
|
+
% Plurals or verb-s. Make sure that apostrophe-s is split out correctly.
|
129
|
+
% e.g. "The subject's name is John Doe." should be
|
130
|
+
% +--Ds--+---YS--+--Ds-+
|
131
|
+
% | | | |
|
132
|
+
% the subject.n 's.p name.n
|
133
|
+
S-WORDS: /^\w.+[^iuoys'’]s$/
|
134
|
+
|
135
|
+
% Verbs ending -ed.
|
136
|
+
ED-WORDS: /^\w.+ed$/
|
137
|
+
|
138
|
+
% Adverbs ending -ly.
|
139
|
+
LY-WORDS: /^\w.+ly$/
|
140
|
+
|
141
|
+
% Nouns ending in -ism, -asm (chiliasm .. ) Usually mass nouns
|
142
|
+
% Stubbed out for now; I'm not convinced this improves accuracy.
|
143
|
+
% ISM-WORDS: /^\w.+asm$/
|
144
|
+
% ISM-WORDS: /^\w.+ism$/
|
145
|
+
|
146
|
+
% Corresponding count noun version of above (chiliast...)
|
147
|
+
% AST-WORDS: /^\w.+ast$/
|
148
|
+
% AST-WORDS: /^\w.+ist$/
|
149
|
+
|
150
|
+
% Corresponding adjectival form of above
|
151
|
+
ADJ-WORDS: /^\w.+astic$/
|
152
|
+
ADJ-WORDS: /^\w.+istic$/
|
153
|
+
|
154
|
+
% Nouns ending -ation stubbed out in BioLG, stub out here ...
|
155
|
+
%ATION-WORDS: /^\w.+ation$/
|
156
|
+
|
157
|
+
% Extension by LIPN 11/10/2005
|
158
|
+
% nouns -- typically seen in (bio-)chemistry texts
|
159
|
+
% synthetase, kinase
|
160
|
+
% 5-(hydroxymethyl)-2’-deoxyuridine
|
161
|
+
% hydroxyethyl, hydroxymethyl
|
162
|
+
% septation, reguion
|
163
|
+
% isomaltotetraose, isomaltotriose
|
164
|
+
% glycosylphosphatidylinositol
|
165
|
+
% iodide, oligodeoxynucleotide
|
166
|
+
% chronicity, hypochromicity
|
167
|
+
MC-NOUN-WORDS: /^\w.+ase$/
|
168
|
+
MC-NOUN-WORDS: /^\w.+ine?$/
|
169
|
+
MC-NOUN-WORDS: /^\w.+yl$/
|
170
|
+
MC-NOUN-WORDS: /^\w.+ion$/
|
171
|
+
MC-NOUN-WORDS: /^\w.+ose$/
|
172
|
+
MC-NOUN-WORDS: /^\w.+ol$/
|
173
|
+
MC-NOUN-WORDS: /^\w.+ide$/
|
174
|
+
MC-NOUN-WORDS: /^\w.+ity$/
|
175
|
+
|
176
|
+
% replicon, intron
|
177
|
+
C-NOUN-WORDS: /^\w.+o[rn]$/
|
178
|
+
|
179
|
+
% adjectives
|
180
|
+
% exogenous, heterologous
|
181
|
+
% intermolecular, intramolecular
|
182
|
+
% glycolytic, ribonucleic, uronic
|
183
|
+
% ribosomal, ribsosomal
|
184
|
+
% nonpermissive, thermosensitive
|
185
|
+
% inducible, metastable
|
186
|
+
ADJ-WORDS: /^\w.+ous$/
|
187
|
+
ADJ-WORDS: /^\w.+ar$/
|
188
|
+
ADJ-WORDS: /^\w.+ic$/
|
189
|
+
ADJ-WORDS: /^\w.+al$/
|
190
|
+
ADJ-WORDS: /^\w.+ive$/
|
191
|
+
ADJ-WORDS: /^\w.+ble$/
|
192
|
+
|
193
|
+
% latin (postposed) adjectives
|
194
|
+
% influenzae, tarentolae
|
195
|
+
% pentosaceus, luteus, carnosus
|
196
|
+
LATIN-ADJ-WORDS: /^\w.+ae$/
|
197
|
+
LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file
|
198
|
+
|
199
|
+
% latin (postposed) adjectives or latin plural noun
|
200
|
+
% brevis, israelensis
|
201
|
+
% japonicum, tabacum, xylinum
|
202
|
+
LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/
|
203
|
+
LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/
|
204
|
+
|
205
|
+
|
206
|
+
% Hyphenated words. In the original LG morpho-guessing system that
|
207
|
+
% predated the regex-based system, hyphenated words were detected
|
208
|
+
% before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
|
209
|
+
% treated as a HYPHENATED-WORD (a generic adjective/noun), and
|
210
|
+
% never a verb. To return to this ordering, move this regex just
|
211
|
+
% after the CAPITALIZED-WORDS regex.
|
212
|
+
HYPHENATED-WORDS: /^[[:alpha:][:digit:],.][[:alpha:][:digit:],.-]*-[[:alpha:][:digit:],.-]*[[:alpha:][:digit:],.]$/
|
213
|
+
|
214
|
+
% proteins often end "ase", so we'll assume those things are names.
|
215
|
+
% removed, too many false positives.
|
216
|
+
% NAME: /ase$/
|
217
|
+
|
218
|
+
% Sequence of punctuation marks. If some mark appears in the affix table
|
219
|
+
% such as a period, comma, dash or underscore, and there's a sequence of
|
220
|
+
% these, then treat it as a "fill-in-the-blank" placeholder.
|
221
|
+
% This matters only for punc. appearing in the affix table, since the
|
222
|
+
% tokenizer explicitly mangles based on these punctuation marks.
|
223
|
+
%
|
224
|
+
% Look for at least four in a row.
|
225
|
+
UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/
|