grammar_cop 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +8 -0
- data/data/.DS_Store +0 -0
- data/data/Makefile +511 -0
- data/data/Makefile.am +4 -0
- data/data/Makefile.in +511 -0
- data/data/de/.DS_Store +0 -0
- data/data/de/4.0.affix +7 -0
- data/data/de/4.0.dict +474 -0
- data/data/de/Makefile +387 -0
- data/data/de/Makefile.am +9 -0
- data/data/de/Makefile.in +387 -0
- data/data/en/.DS_Store +0 -0
- data/data/en/4.0.affix +26 -0
- data/data/en/4.0.batch +1002 -0
- data/data/en/4.0.biolg.batch +411 -0
- data/data/en/4.0.constituent-knowledge +127 -0
- data/data/en/4.0.dict +8759 -0
- data/data/en/4.0.dict.m4 +6928 -0
- data/data/en/4.0.enwiki.batch +14 -0
- data/data/en/4.0.fixes.batch +2776 -0
- data/data/en/4.0.knowledge +306 -0
- data/data/en/4.0.regex +225 -0
- data/data/en/4.0.voa.batch +114 -0
- data/data/en/Makefile +554 -0
- data/data/en/Makefile.am +19 -0
- data/data/en/Makefile.in +554 -0
- data/data/en/README +173 -0
- data/data/en/tiny.dict +157 -0
- data/data/en/words/.DS_Store +0 -0
- data/data/en/words/Makefile +456 -0
- data/data/en/words/Makefile.am +78 -0
- data/data/en/words/Makefile.in +456 -0
- data/data/en/words/currency +205 -0
- data/data/en/words/currency.p +28 -0
- data/data/en/words/entities.given-bisex.sing +39 -0
- data/data/en/words/entities.given-female.sing +4141 -0
- data/data/en/words/entities.given-male.sing +1633 -0
- data/data/en/words/entities.locations.sing +68 -0
- data/data/en/words/entities.national.sing +253 -0
- data/data/en/words/entities.organizations.sing +7 -0
- data/data/en/words/entities.us-states.sing +11 -0
- data/data/en/words/units.1 +45 -0
- data/data/en/words/units.1.dot +4 -0
- data/data/en/words/units.3 +2 -0
- data/data/en/words/units.4 +5 -0
- data/data/en/words/units.4.dot +1 -0
- data/data/en/words/words-medical.adv.1 +1191 -0
- data/data/en/words/words-medical.prep.1 +67 -0
- data/data/en/words/words-medical.v.4.1 +2835 -0
- data/data/en/words/words-medical.v.4.2 +2848 -0
- data/data/en/words/words-medical.v.4.3 +3011 -0
- data/data/en/words/words-medical.v.4.4 +3036 -0
- data/data/en/words/words-medical.v.4.5 +3050 -0
- data/data/en/words/words.adj.1 +6794 -0
- data/data/en/words/words.adj.2 +638 -0
- data/data/en/words/words.adj.3 +667 -0
- data/data/en/words/words.adv.1 +1573 -0
- data/data/en/words/words.adv.2 +67 -0
- data/data/en/words/words.adv.3 +157 -0
- data/data/en/words/words.adv.4 +80 -0
- data/data/en/words/words.n.1 +11464 -0
- data/data/en/words/words.n.1.wiki +264 -0
- data/data/en/words/words.n.2.s +2017 -0
- data/data/en/words/words.n.2.s.biolg +1 -0
- data/data/en/words/words.n.2.s.wiki +298 -0
- data/data/en/words/words.n.2.x +65 -0
- data/data/en/words/words.n.2.x.wiki +10 -0
- data/data/en/words/words.n.3 +5717 -0
- data/data/en/words/words.n.t +23 -0
- data/data/en/words/words.v.1.1 +1038 -0
- data/data/en/words/words.v.1.2 +1043 -0
- data/data/en/words/words.v.1.3 +1052 -0
- data/data/en/words/words.v.1.4 +1023 -0
- data/data/en/words/words.v.1.p +17 -0
- data/data/en/words/words.v.10.1 +14 -0
- data/data/en/words/words.v.10.2 +15 -0
- data/data/en/words/words.v.10.3 +88 -0
- data/data/en/words/words.v.10.4 +17 -0
- data/data/en/words/words.v.2.1 +1253 -0
- data/data/en/words/words.v.2.2 +1304 -0
- data/data/en/words/words.v.2.3 +1280 -0
- data/data/en/words/words.v.2.4 +1285 -0
- data/data/en/words/words.v.2.5 +1287 -0
- data/data/en/words/words.v.4.1 +2472 -0
- data/data/en/words/words.v.4.2 +2487 -0
- data/data/en/words/words.v.4.3 +2441 -0
- data/data/en/words/words.v.4.4 +2478 -0
- data/data/en/words/words.v.4.5 +2483 -0
- data/data/en/words/words.v.5.1 +98 -0
- data/data/en/words/words.v.5.2 +98 -0
- data/data/en/words/words.v.5.3 +103 -0
- data/data/en/words/words.v.5.4 +102 -0
- data/data/en/words/words.v.6.1 +388 -0
- data/data/en/words/words.v.6.2 +401 -0
- data/data/en/words/words.v.6.3 +397 -0
- data/data/en/words/words.v.6.4 +405 -0
- data/data/en/words/words.v.6.5 +401 -0
- data/data/en/words/words.v.8.1 +117 -0
- data/data/en/words/words.v.8.2 +118 -0
- data/data/en/words/words.v.8.3 +118 -0
- data/data/en/words/words.v.8.4 +119 -0
- data/data/en/words/words.v.8.5 +119 -0
- data/data/en/words/words.y +104 -0
- data/data/lt/.DS_Store +0 -0
- data/data/lt/4.0.affix +6 -0
- data/data/lt/4.0.constituent-knowledge +24 -0
- data/data/lt/4.0.dict +135 -0
- data/data/lt/4.0.knowledge +38 -0
- data/data/lt/Makefile +389 -0
- data/data/lt/Makefile.am +11 -0
- data/data/lt/Makefile.in +389 -0
- data/ext/.DS_Store +0 -0
- data/ext/link_grammar/.DS_Store +0 -0
- data/ext/link_grammar/extconf.rb +2 -0
- data/ext/link_grammar/link-grammar/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
- data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
- data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
- data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
- data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
- data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
- data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
- data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
- data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
- data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
- data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
- data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
- data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
- data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
- data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
- data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
- data/ext/link_grammar/link-grammar/Makefile +900 -0
- data/ext/link_grammar/link-grammar/Makefile.am +202 -0
- data/ext/link_grammar/link-grammar/Makefile.in +900 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
- data/ext/link_grammar/link-grammar/and.c +1603 -0
- data/ext/link_grammar/link-grammar/and.h +27 -0
- data/ext/link_grammar/link-grammar/api-structures.h +362 -0
- data/ext/link_grammar/link-grammar/api-types.h +72 -0
- data/ext/link_grammar/link-grammar/api.c +1887 -0
- data/ext/link_grammar/link-grammar/api.h +96 -0
- data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/autoit/README +10 -0
- data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
- data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
- data/ext/link_grammar/link-grammar/command-line.c +458 -0
- data/ext/link_grammar/link-grammar/command-line.h +15 -0
- data/ext/link_grammar/link-grammar/constituents.c +1836 -0
- data/ext/link_grammar/link-grammar/constituents.h +26 -0
- data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/corpus/README +17 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
- data/ext/link_grammar/link-grammar/count.c +828 -0
- data/ext/link_grammar/link-grammar/count.h +25 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
- data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
- data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
- data/ext/link_grammar/link-grammar/error.c +92 -0
- data/ext/link_grammar/link-grammar/error.h +35 -0
- data/ext/link_grammar/link-grammar/expand.c +67 -0
- data/ext/link_grammar/link-grammar/expand.h +13 -0
- data/ext/link_grammar/link-grammar/externs.h +22 -0
- data/ext/link_grammar/link-grammar/extract-links.c +625 -0
- data/ext/link_grammar/link-grammar/extract-links.h +16 -0
- data/ext/link_grammar/link-grammar/fast-match.c +309 -0
- data/ext/link_grammar/link-grammar/fast-match.h +17 -0
- data/ext/link_grammar/link-grammar/idiom.c +373 -0
- data/ext/link_grammar/link-grammar/idiom.h +15 -0
- data/ext/link_grammar/link-grammar/jni-client.c +779 -0
- data/ext/link_grammar/link-grammar/jni-client.h +236 -0
- data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
- data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/link-features.h +37 -0
- data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
- data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
- data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
- data/ext/link_grammar/link-grammar/link-includes.h +465 -0
- data/ext/link_grammar/link-grammar/link-parser.c +849 -0
- data/ext/link_grammar/link-grammar/massage.c +329 -0
- data/ext/link_grammar/link-grammar/massage.h +13 -0
- data/ext/link_grammar/link-grammar/post-process.c +1113 -0
- data/ext/link_grammar/link-grammar/post-process.h +45 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
- data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
- data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
- data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
- data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
- data/ext/link_grammar/link-grammar/prefix.c +482 -0
- data/ext/link_grammar/link-grammar/prefix.h +139 -0
- data/ext/link_grammar/link-grammar/preparation.c +412 -0
- data/ext/link_grammar/link-grammar/preparation.h +20 -0
- data/ext/link_grammar/link-grammar/print-util.c +87 -0
- data/ext/link_grammar/link-grammar/print-util.h +32 -0
- data/ext/link_grammar/link-grammar/print.c +1085 -0
- data/ext/link_grammar/link-grammar/print.h +16 -0
- data/ext/link_grammar/link-grammar/prune.c +1864 -0
- data/ext/link_grammar/link-grammar/prune.h +17 -0
- data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
- data/ext/link_grammar/link-grammar/read-dict.h +29 -0
- data/ext/link_grammar/link-grammar/read-regex.c +161 -0
- data/ext/link_grammar/link-grammar/read-regex.h +12 -0
- data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
- data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
- data/ext/link_grammar/link-grammar/resources.c +180 -0
- data/ext/link_grammar/link-grammar/resources.h +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
- data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
- data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
- data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
- data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
- data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
- data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
- data/ext/link_grammar/link-grammar/string-set.c +169 -0
- data/ext/link_grammar/link-grammar/string-set.h +16 -0
- data/ext/link_grammar/link-grammar/structures.h +498 -0
- data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
- data/ext/link_grammar/link-grammar/tokenize.h +15 -0
- data/ext/link_grammar/link-grammar/utilities.c +847 -0
- data/ext/link_grammar/link-grammar/utilities.h +281 -0
- data/ext/link_grammar/link-grammar/word-file.c +124 -0
- data/ext/link_grammar/link-grammar/word-file.h +15 -0
- data/ext/link_grammar/link-grammar/word-utils.c +526 -0
- data/ext/link_grammar/link-grammar/word-utils.h +152 -0
- data/ext/link_grammar/link_grammar.c +202 -0
- data/ext/link_grammar/link_grammar.h +99 -0
- data/grammar_cop.gemspec +24 -0
- data/lib/.DS_Store +0 -0
- data/lib/grammar_cop.rb +9 -0
- data/lib/grammar_cop/.DS_Store +0 -0
- data/lib/grammar_cop/dictionary.rb +19 -0
- data/lib/grammar_cop/linkage.rb +30 -0
- data/lib/grammar_cop/parse_options.rb +32 -0
- data/lib/grammar_cop/sentence.rb +36 -0
- data/lib/grammar_cop/version.rb +3 -0
- data/test/.DS_Store +0 -0
- data/test/grammar_cop_test.rb +27 -0
- metadata +407 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
; Post-processing knowledge file
|
2
|
+
; 6/96
|
3
|
+
|
4
|
+
; ----------------------------------------------------------------------------
|
5
|
+
; This file contains the knowledge related to post-processing, in the
|
6
|
+
; form of lists and rules. This file is read by post-process.c at run-time.
|
7
|
+
; Syntax of file:
|
8
|
+
; line starting with ";" is a comment
|
9
|
+
; commas are field delimiters
|
10
|
+
; any token beginning with the character @ is expanded to the set
|
11
|
+
; of symbols it defines. E.g. one could write
|
12
|
+
; FOO: blah1 blah2 blah3
|
13
|
+
; thus defining a set FOO containing three strings. Then one could later write
|
14
|
+
; BAR: blah5 @FOO blah8
|
15
|
+
; which defines a set BAR containing 5 strings.
|
16
|
+
;
|
17
|
+
; Capitalized tokens are *required*, though if you feel like providing an
|
18
|
+
; empty list afterwards, that's your right.
|
19
|
+
; ----------------------------------------------------------------------------
|
20
|
+
|
21
|
+
|
22
|
+
; The following links start a domain. Each must be given a name in the
|
23
|
+
; table below (STARTING_LINK_TYPE_TABLE)
|
24
|
+
|
25
|
+
DOMAIN_STARTER_LINKS:
|
26
|
+
W Ce Cs Ca Cc Ci R* Rn Re RSe Mr QI#d Mv* Jr Mj Qd
|
27
|
+
TOn TOi Mg* MVi Ss#d Bsd ER Z Ma#* SIs#g BIqx MX#p MX#a
|
28
|
+
MX#r MX#j MV#o MV#p Eq COq CCq AFd PFc
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
; ----------------------------------------------------------------------
|
33
|
+
; The following links start a urfl domain. They are also included in the
|
34
|
+
; domain, as opposed to regular starter links (above), which are not. A
|
35
|
+
; urfl domain includes links accessible from the root word, tracing to
|
36
|
+
; the right (as well as everything accessible from the left end of the
|
37
|
+
; starter link).
|
38
|
+
|
39
|
+
URFL_DOMAIN_STARTER_LINKS: TOo I#j Pa##j CP
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
; ----------------------------------------------------------------------
|
44
|
+
; The following start a urfl_only domain. These include _only_ links :
|
45
|
+
; reachable from the root word, tracing to the right. They aren't
|
46
|
+
; included in the domain
|
47
|
+
|
48
|
+
URFL_ONLY_DOMAIN_STARTER_LINKS: SFsx Ss#g COp
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
; ----------------------------------------------------------------------
|
53
|
+
; Links which start a domain and are also part of the domain. This must be
|
54
|
+
; a sublist of the domain_starter_list
|
55
|
+
|
56
|
+
DOMAIN_CONTAINS_LINKS:
|
57
|
+
Mg* Mx Bsd MX#a Ma#* Mv* MX#r Ss#d Ws Wq Qd Mj Wj
|
58
|
+
Wi MX#j AFd PFc Jr Wd Mr
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
; ----------------------------------------------------------------------
|
63
|
+
; These links are not put in the word/link graph. They also cannot be the
|
64
|
+
; starter links for a domain. (These links may also only be used in cycles.)
|
65
|
+
|
66
|
+
IGNORE_THESE_LINKS: Xca
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
; ----------------------------------------------------------------------
|
71
|
+
; These links may only be used in cycles.
|
72
|
+
|
73
|
+
MUST_FORM_A_CYCLE_LINKS: R#* TOt EXx HA SFsic Jr JQ Xca
|
74
|
+
|
75
|
+
|
76
|
+
; ----------------------------------------------------------------------
|
77
|
+
; These links are not traced further if they point back before the root word.
|
78
|
+
; The creation of Rw necessitated making B#m a restricted link, to
|
79
|
+
; prevent the (e) domain, started by Ce, from extending around through
|
80
|
+
; the Rw link.
|
81
|
+
; Reverted.
|
82
|
+
; This breaks parsing of
|
83
|
+
; How fast a program does he think it is
|
84
|
+
; I wonder how fast a program he thinks it is
|
85
|
+
; I wonder how much money you earned
|
86
|
+
; I wonder how many people you saw
|
87
|
+
; I wonder how big a department it is
|
88
|
+
; I wonder how much oil they spilled
|
89
|
+
; This is the man whose dog I bought
|
90
|
+
; I wonder which dog he said you chased
|
91
|
+
; How efficient a program is it
|
92
|
+
; Meanwhile, I can't find the Ce problem mentioned ... this needs more
|
93
|
+
; documentation!
|
94
|
+
|
95
|
+
RESTRICTED_LINKS:
|
96
|
+
B#* D##w B#w B#d AFh MVt Xx HL SFsic AFd Bc CX EAh
|
97
|
+
H HA PFc B#j Wd PF Z
|
98
|
+
|
99
|
+
; H HA PFc B#j Wd PF Z B#m
|
100
|
+
|
101
|
+
|
102
|
+
; ----------------------------------------------------------------------
|
103
|
+
; ---------------------- LINK TYPE TABLE-------------------------------
|
104
|
+
; ----------------------------------------------------------------------
|
105
|
+
; The following table associates a domain type with each possible
|
106
|
+
; starting link. It contains pairs: the first of each pair is a link
|
107
|
+
; type, and the second is the domain to which that link type belongs.
|
108
|
+
|
109
|
+
STARTING_LINK_TYPE_TABLE:
|
110
|
+
Ce e
|
111
|
+
R* r
|
112
|
+
Rn r
|
113
|
+
Re r
|
114
|
+
W m
|
115
|
+
RSe e
|
116
|
+
Cs s
|
117
|
+
Ca s
|
118
|
+
Jr e
|
119
|
+
Mr r
|
120
|
+
Cc s
|
121
|
+
Mv* e
|
122
|
+
QI#d s
|
123
|
+
BIqx s
|
124
|
+
TOn e
|
125
|
+
TOi e
|
126
|
+
MVi e
|
127
|
+
MV#o s
|
128
|
+
MV#p s
|
129
|
+
AFd s
|
130
|
+
PFc s
|
131
|
+
Mg* e
|
132
|
+
Mj j
|
133
|
+
Qd m
|
134
|
+
MX#j j
|
135
|
+
TOo x
|
136
|
+
I#j x
|
137
|
+
Pa##j x
|
138
|
+
CP x
|
139
|
+
COp d
|
140
|
+
SFsx d
|
141
|
+
Ss#g d
|
142
|
+
SIs#g s
|
143
|
+
Ss#d s
|
144
|
+
Bsd s
|
145
|
+
ER s
|
146
|
+
Z s
|
147
|
+
Ma#* e
|
148
|
+
MX#p e
|
149
|
+
Ci e
|
150
|
+
MX#a e
|
151
|
+
Eq e
|
152
|
+
COq e
|
153
|
+
CCq s
|
154
|
+
MX#r r
|
155
|
+
|
156
|
+
|
157
|
+
; ----------------------------------------------------------------------
|
158
|
+
; ----------------------- LINK SETS ------------------------------------
|
159
|
+
; ----------------------------------------------------------------------
|
160
|
+
; (Not in use at present; see comment at beginning of file)
|
161
|
+
|
162
|
+
; ----------------------------------------------------------------------
|
163
|
+
; ----------------- RULES ----------------------------------------------
|
164
|
+
; ----------------------------------------------------------------------
|
165
|
+
; Explanation of syntax: as usual, each stanza begins with a label
|
166
|
+
; terminated by a colon. The interpretation of the rule depends on
|
167
|
+
; the label, as specified in each stanza.
|
168
|
+
|
169
|
+
; The following rule asserts that the linkage must *still* be connected
|
170
|
+
; when the specified set(s) of links are removed from the linkage.
|
171
|
+
|
172
|
+
FORM_A_CYCLE_RULES:
|
173
|
+
@MUST_FORM_A_CYCLE_LINKS , "'must form a cycle' violation0"
|
174
|
+
|
175
|
+
|
176
|
+
; For the following rules, if a domain contains a link matching the 1st
|
177
|
+
; column, it must also contain a linkage matching one of the members of the
|
178
|
+
; set in the 2nd column. The individual rules are demarcated by semicolons and
|
179
|
+
; the fields within a rule are demarcated by commas.
|
180
|
+
|
181
|
+
CONTAINS_ONE_RULES:
|
182
|
+
SI#* , Wq Qd CQ PFc , "Bad use of s-v inversion1" ,
|
183
|
+
SI#x , Wq Qd CQ PFc , "Bad use of s-v inversion2" ,
|
184
|
+
SFI##* , Wq Qd CQ PFc , "Bad use of s-v inversion3",
|
185
|
+
SXI , Wq Qd CQ PFc , "Bad use of s-v inversion4" ,
|
186
|
+
Ws , D##w S##w H , "S-V inversion required5",
|
187
|
+
I#a , B#m B#w , "incorrect use of 'to'6" ,
|
188
|
+
Wq , SI SFI SXI , "S-V inversion required7" ,
|
189
|
+
Qd , SI SFI SXI , "S-V inversion required8" ,
|
190
|
+
PFc , SI SFI SXI , "S-V inversion required9" ,
|
191
|
+
Mj , Jw JQ , "Incorrect relative10" ,
|
192
|
+
MX#j , Jw JQ , "Incorrect relative11" ,
|
193
|
+
Wj , Jw JQ , "Misuse of preposition12" ,
|
194
|
+
JQ , Mj Wj MX#j , "Misuse of preposition13" ,
|
195
|
+
Jw , Mj Wj MX#j , "Misuse of preposition14" ,
|
196
|
+
B#j , Jr , "Incorrect relative15" ,
|
197
|
+
Jr , B#j , "Incorrect relative16" ,
|
198
|
+
EAh , AF Bsm B*m Qe Ca AFm
|
199
|
+
, "Incorrect use of 'how'17" ,
|
200
|
+
EEh , AF Bsm B*m Qe Ca AFm
|
201
|
+
, "Incorrect use of 'how'18" ,
|
202
|
+
Qe , EEh , "Incorrect use of adverb19" ,
|
203
|
+
THi , SFsi SFIsi OXi , "Complement requires 'it'20" ,
|
204
|
+
TSi , SFsi SFIsi OXi , "Complement requires 'it'21" ,
|
205
|
+
QIi , SFsi SFIsi OXi , "Complement requires 'it'22" ,
|
206
|
+
TOi , SFsi SFIsi OXi , "Complement requires 'it'23" ,
|
207
|
+
Ci , SFsi SFIsi OXi , "Complement requires 'it'24" ,
|
208
|
+
COqi , SFsi SFIsi OXi , "Complement requires 'it'25" ,
|
209
|
+
CPi , SFsi SFIsi OXi , "Complement requires 'it'26" ,
|
210
|
+
Eqi , SFsi SFIsi OXi , "Complement requires 'it'27" ,
|
211
|
+
LEi , SFsi SFIsi OXi , "Complement requires 'it'28" ,
|
212
|
+
MVti , SFsi SFIsi OXi , "Complement requires 'it'29" ,
|
213
|
+
AFdi , SFsi SFIsi OXi , "Complement requires 'it'30" ,
|
214
|
+
O#i , SFsi SFIsi OXi , "Complement requires 'it'31" ,
|
215
|
+
SFst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'32" ,
|
216
|
+
SFIst , O*t Ost Omt Bs#t B*#t Bc#t , "Bad use of 'there'33" ,
|
217
|
+
SFp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34" ,
|
218
|
+
;
|
219
|
+
; This SFu rule forces subject-object agreement for uncountable noun objects
|
220
|
+
SFu , Out Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'34a" ,
|
221
|
+
SFIp , Opt Omt O*t Bp#t B*#t Bc#t , "Bad use of 'there'35" ,
|
222
|
+
OXt , O#t B##t , "Bad use of 'there'36" ,
|
223
|
+
SFsi* , TOi THi QIi TSi O#i Ci THb CPi
|
224
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'37" ,
|
225
|
+
SFIsi , TOi THi QIi TSi O#i Ci THb CPi
|
226
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'38" ,
|
227
|
+
OXi , TOi THi QIi TSi O#i Ci THb CPi
|
228
|
+
COqi CPi Eqi AFdi BIh , "Bad use of 'it'39" ,
|
229
|
+
THb , S##t SI##t SFsi SFIsi , "Bad use of predicate40" ,
|
230
|
+
BIh , Ss#b SIs#b SFsi SFIsi , "Bad use of predicate41" ,
|
231
|
+
BIq , S##q SI##q SFsi Ss#b SFIsi SIs#b
|
232
|
+
, "Bad use of predicate42" ,
|
233
|
+
MVt , Dm#m EAm EEm MVm Pam Pafm AFm EB#m MVb AJrc
|
234
|
+
Om Mam Am Jm Ds*m MX#m , "Bad comparative43" ,
|
235
|
+
MVz , D##y EAy EEy MVy EB#y , "Bad comparative44" ,
|
236
|
+
MV#a , Pam Pafm EAm Ds*m EAy AFm Mam Am
|
237
|
+
, "Bad comparative45" ,
|
238
|
+
MV#i , Pam Pafm EAm Ds*m EAy AFm Mam Am
|
239
|
+
, "Bad comparative46" ,
|
240
|
+
MV#o , D##m D##y Om Oy Jm Jy Am MX#m
|
241
|
+
, "Bad comparative47" ,
|
242
|
+
MV#p , EEm MVb Dm#m EEy D##y MVm Om Oy
|
243
|
+
Jm Jy Am MX#m
|
244
|
+
, "Bad comparative48" ,
|
245
|
+
Pafc , EB#m EB#y , "Bad comparative49" ,
|
246
|
+
Pafc , Pa* Paf* , "Bad comparative50" ,
|
247
|
+
MVat , MVm , "Bad comparative51" ,
|
248
|
+
MVpt , MVm , "Bad comparative52" ,
|
249
|
+
MVat , MVa MVp , "Bad comparative53" ,
|
250
|
+
MVpt , MVa MVp , "Bad comparative54" ,
|
251
|
+
U#t , D##m D##y Om Oy Jm Jy Am MX#m
|
252
|
+
, "Bad comparative55" ,
|
253
|
+
Cc , EEm EEy MVm MVb MVy
|
254
|
+
, "Bad comparative56" ,
|
255
|
+
Sp#c , Dmcm Dmcy Om Oy Jm Jy MX#m
|
256
|
+
, "Bad comparative57" ,
|
257
|
+
Ss#c , Dmum Dmuy Om Oy Jm Jy Ds*y MX#m
|
258
|
+
, "Bad comparative58" ,
|
259
|
+
S##c , Dm#m D##y Om Oy Jm Jy MX#m
|
260
|
+
, "Bad comparative59" ,
|
261
|
+
THc , TH , "Bad comparative60" ,
|
262
|
+
TOc , TO** TOf* TOi* , "Bad comparative61" ,
|
263
|
+
TOtc , TOt , "Bad comparative62" ,
|
264
|
+
Ma** , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
|
265
|
+
, "Bad use of adjective63" ,
|
266
|
+
Mam , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya
|
267
|
+
, "Bad use of adjective64" ,
|
268
|
+
MX#a , TO TOf TH MVp TOt QI OF MVt MVz MVh Ytm Ya MJ
|
269
|
+
, "Bad use of adjective65" ,
|
270
|
+
|
271
|
+
; There's no ZZZ connector, which means that Ixd and Oxn
|
272
|
+
; are prohibited from ever occurring. 4.0.batch covers this.
|
273
|
+
Ixd , ZZZ , "Can't use 'do' with that verb" ,
|
274
|
+
Oxn , ZZZ , "Bad use of pronoun66" ,
|
275
|
+
MVh , EExk EAxk D##k , "Incorrect use of that67" ,
|
276
|
+
|
277
|
+
; The Rw link necessitated commenting out 68, because we had to make B#m
|
278
|
+
; a restricted link (see above) xxx reverted .. this is needed ...
|
279
|
+
;
|
280
|
+
B#m , D##w H HA , "Bad use of gerund68"
|
281
|
+
|
282
|
+
CONTAINS_NONE_RULES:
|
283
|
+
S , Spxi , "Bad n-v agreement69" ,
|
284
|
+
SI , SIpxi , "Bad n-v agreement70" ,
|
285
|
+
Ws , B#m Ca BT , "Question inversion violated71" ,
|
286
|
+
SF , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
287
|
+
, "Bad use of 'filler' subject72" ,
|
288
|
+
SFI , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
289
|
+
, "Bad use of 'filler' subject73" ,
|
290
|
+
OX , I* PP* TO* Pa* Pam Pg* Pv* LE* AFd* MVta
|
291
|
+
, "Bad use of 'filler' subject74" ,
|
292
|
+
MXsr , Sp#w , "Bad n-v agreement75" ,
|
293
|
+
MXpr , Ss#w S#iw , "Bad n-v agreement76" ,
|
294
|
+
Mr , B#* , "Bad use of 'whose'77"
|
295
|
+
|
296
|
+
|
297
|
+
; ----------------------------------------------------------------------
|
298
|
+
; The following rule asserts that all specified domains must have the
|
299
|
+
; property that all of the words that touch a link in the domain are
|
300
|
+
; not to the left of the root word of the domain. These rules are
|
301
|
+
; different from the above in that the first field is a *domain name*,
|
302
|
+
; rather than a set of links.
|
303
|
+
|
304
|
+
BOUNDED_RULES:
|
305
|
+
s , "Unbounded s domain78" ,
|
306
|
+
r , "Unbounded r domain79"
|
data/data/en/4.0.regex
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
%***************************************************************************%
|
2
|
+
% %
|
3
|
+
% Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin %
|
4
|
+
% See file "LICENSE" for information about commercial use of this system %
|
5
|
+
% %
|
6
|
+
%***************************************************************************%
|
7
|
+
|
8
|
+
% This file contains regular expressions that are used to match
|
9
|
+
% tokens not found in the dictionary. Each regex is given a name which
|
10
|
+
% determines the disjuncts assigned when the regex matches; this name
|
11
|
+
% must be defined in the dictionary along with the appropriate disjuncts.
|
12
|
+
% Note that the order of the regular expressions matters: matches will
|
13
|
+
% be attempted in the order in which the regexes appear in this file,
|
14
|
+
% and only the first match will be used.
|
15
|
+
|
16
|
+
% Numbers.
|
17
|
+
% XXX, we need to add utf8 U+00A0 "no-break space"
|
18
|
+
%
|
19
|
+
% Allows at most two colons in hour-minute-second HH:MM:SS expressions
|
20
|
+
% Allows at most two digits between colons
|
21
|
+
HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/
|
22
|
+
|
23
|
+
% e.g. 1950's leading number can be higher, for science fiction.
|
24
|
+
% Must be four digits, or possibly three. Must end in s, 's or ’s
|
25
|
+
DECADE-TIME: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/
|
26
|
+
|
27
|
+
% Day-of-month names; this regex will match before the one below.
|
28
|
+
DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/
|
29
|
+
|
30
|
+
% Ordinal numbers; everything except 1st through 13th
|
31
|
+
% is handled by regex.
|
32
|
+
ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/
|
33
|
+
|
34
|
+
% Allows any number of commas or periods
|
35
|
+
% Be careful not to match the period at the end of a sentence;
|
36
|
+
% for example: "It happened in 1942."
|
37
|
+
NUMBERS: /^[0-9,.]*[0-9]$/
|
38
|
+
% This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
|
39
|
+
NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
|
40
|
+
% Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
|
41
|
+
FRACTION: /^[0-9]+\/[0-9]+$/
|
42
|
+
% "10(3)" exponent (used in PubMed)
|
43
|
+
NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/
|
44
|
+
|
45
|
+
% Roman numerals
|
46
|
+
% The first expr has the potential(?) problem that it matches an empty
|
47
|
+
% string. Thus, the next three rules specify that at least one section
|
48
|
+
% is non-empty.
|
49
|
+
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
|
50
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD){1}(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
|
51
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL){1}(IX|V?I{0,3}|IV)$/
|
52
|
+
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV){1}$/
|
53
|
+
|
54
|
+
% Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
|
55
|
+
INITIALS: /^([A-Z]\.)+$/
|
56
|
+
|
57
|
+
% Greek letters with numbers
|
58
|
+
GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
|
59
|
+
PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/
|
60
|
+
|
61
|
+
% Some "safe" derived units. Simple units are in dictionary.
|
62
|
+
% The idea here is for the regex to match something that is almost
|
63
|
+
% certainly part of a derived unit, and allow the rest to be
|
64
|
+
% anything; this way we can capture difficult derived units such
|
65
|
+
% as "mg/kg/day" and even oddities such as "micrograms/mouse/day"
|
66
|
+
% without listing them explicitly.
|
67
|
+
% TODO: add more.
|
68
|
+
% Some (real) misses from these:
|
69
|
+
% micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr
|
70
|
+
% m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?)
|
71
|
+
% cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year
|
72
|
+
% microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse
|
73
|
+
% mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group
|
74
|
+
% h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1
|
75
|
+
% sec-1 ms-1 ml.min.-1kg-1 ml.hr-1
|
76
|
+
% also, both kilometer and kilometers seem to be absent(!)
|
77
|
+
% remember "mm"!
|
78
|
+
|
79
|
+
UNITS: /^([npmk]|nano|pico|milli|micro|kilo)?(g|grams?)\// % grams/anything
|
80
|
+
UNITS: /^([fnmp]|femto|nano|micro|pico|mu)?mol(es)?\// % mol/anything
|
81
|
+
UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|kg|mol|min|day|h)$/ % common endings
|
82
|
+
% common endings, except in the style "mg.kg-1" instead of "mg/kg".
|
83
|
+
UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|kg|mol|min|day|h)(-1|\(-1\))$/
|
84
|
+
|
85
|
+
% combinations of numbers and units, e.g. "50-kDa", "1-2h"
|
86
|
+
% TODO: Clean up and check that these are up-to-date wrt the
|
87
|
+
% dictionary-recognized units; this is quite a mess currently.
|
88
|
+
% TODO: Extend the "number" part of the regex to allow anything
|
89
|
+
% that the NUMBER regex matches.
|
90
|
+
% One problem here is a failure to split up the expression ...
|
91
|
+
% e.g. "2hr" becomes 2 - ND - hr with the ND link. But 2-hr is treated
|
92
|
+
% as a single word ('It is a 2-hr wait')
|
93
|
+
% NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/
|
94
|
+
% Comment out above, it screws up handling of unit suffixes, for
|
95
|
+
% example: "Zangbert stock fell 30% to $2.50 yesterday."
|
96
|
+
|
97
|
+
|
98
|
+
% fold-words. Matches NUMBER-fold, where NUMBER can be either numeric
|
99
|
+
% or a spelled-out number, and the hyphen is optional. Note that for
|
100
|
+
% spelled-out numbers, anything is allowed between the "initial" number
|
101
|
+
% and "fold" to catch e.g. "two-to-three fold" ("fourteen" etc. are absent
|
102
|
+
% as the prefix "four" is sufficient to match).
|
103
|
+
FOLD-WORDS: /^[0-9.,:-]*[0-9]([0-9.,:-]|\([0-9.,:-]*[0-9][0-9.,:-]*\)|\+\/-)*-?fold$/
|
104
|
+
FOLD-WORDS: /^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fifteen|twenty|thirty|fifty|hundred|thousand|million).*fold$/
|
105
|
+
|
106
|
+
% Plural proper nouns.
|
107
|
+
% Make sure that apostrophe-s is split out correctly.
|
108
|
+
PL-CAPITALIZED-WORDS: /^[[:upper:]].*[^iuoys'’]s$/
|
109
|
+
|
110
|
+
% Other proper nouns.
|
111
|
+
% We demand that these end with an alphanumeric, i.e. explicitly
|
112
|
+
% reject punctuation. We don't want this regex to "swallow" any trailing
|
113
|
+
% commas, colons, or periods/question-marks at the end of sentences.
|
114
|
+
% In addition, this must not swallow words ending in 's 'll etc.
|
115
|
+
% (... any affix, for that matter ...) and so no embedded apostrophe
|
116
|
+
CAPITALIZED-WORDS: /^[[:upper:]][^'’]*[^[:punct:]]$/
|
117
|
+
|
118
|
+
% SUFFIX GUESSING
|
119
|
+
% For all suffix-guessing patterns, we insist that the pattern start
|
120
|
+
% with an alphanumeric. This is needed to guarantee that the
|
121
|
+
% prefix-stripping code works correctly, as otherwise, the regex will
|
122
|
+
% gobble the prefix. So for example: "We left (carrying the dog) and
|
123
|
+
% Fred followed." Since "(carrying" is not in the dict, we need to be
|
124
|
+
% sure to not match the leading paren so that it will get stripped.
|
125
|
+
%
|
126
|
+
ING-WORDS: /^\w.+ing$/
|
127
|
+
|
128
|
+
% Plurals or verb-s. Make sure that apostrophe-s is split out correctly.
|
129
|
+
% e.g. "The subject's name is John Doe." should be
|
130
|
+
% +--Ds--+---YS--+--Ds-+
|
131
|
+
% | | | |
|
132
|
+
% the subject.n 's.p name.n
|
133
|
+
S-WORDS: /^\w.+[^iuoys'’]s$/
|
134
|
+
|
135
|
+
% Verbs ending -ed.
|
136
|
+
ED-WORDS: /^\w.+ed$/
|
137
|
+
|
138
|
+
% Adverbs ending -ly.
|
139
|
+
LY-WORDS: /^\w.+ly$/
|
140
|
+
|
141
|
+
% Nouns ending in -ism, -asm (chiliasm .. ) Usually mass nouns
|
142
|
+
% Stubbed out for now; I'm not convinced this improves accuracy.
|
143
|
+
% ISM-WORDS: /^\w.+asm$/
|
144
|
+
% ISM-WORDS: /^\w.+ism$/
|
145
|
+
|
146
|
+
% Corresponding count noun version of above (chiliast...)
|
147
|
+
% AST-WORDS: /^\w.+ast$/
|
148
|
+
% AST-WORDS: /^\w.+ist$/
|
149
|
+
|
150
|
+
% Corresponding adjectival form of above
|
151
|
+
ADJ-WORDS: /^\w.+astic$/
|
152
|
+
ADJ-WORDS: /^\w.+istic$/
|
153
|
+
|
154
|
+
% Nouns ending -ation stubbed out in BioLG, stub out here ...
|
155
|
+
%ATION-WORDS: /^\w.+ation$/
|
156
|
+
|
157
|
+
% Extension by LIPN 11/10/2005
|
158
|
+
% nouns -- typically seen in (bio-)chemistry texts
|
159
|
+
% synthetase, kinase
|
160
|
+
% 5-(hydroxymethyl)-2’-deoxyuridine
|
161
|
+
% hydroxyethyl, hydroxymethyl
|
162
|
+
% septation, reguion
|
163
|
+
% isomaltotetraose, isomaltotriose
|
164
|
+
% glycosylphosphatidylinositol
|
165
|
+
% iodide, oligodeoxynucleotide
|
166
|
+
% chronicity, hypochromicity
|
167
|
+
MC-NOUN-WORDS: /^\w.+ase$/
|
168
|
+
MC-NOUN-WORDS: /^\w.+ine?$/
|
169
|
+
MC-NOUN-WORDS: /^\w.+yl$/
|
170
|
+
MC-NOUN-WORDS: /^\w.+ion$/
|
171
|
+
MC-NOUN-WORDS: /^\w.+ose$/
|
172
|
+
MC-NOUN-WORDS: /^\w.+ol$/
|
173
|
+
MC-NOUN-WORDS: /^\w.+ide$/
|
174
|
+
MC-NOUN-WORDS: /^\w.+ity$/
|
175
|
+
|
176
|
+
% replicon, intron
|
177
|
+
C-NOUN-WORDS: /^\w.+o[rn]$/
|
178
|
+
|
179
|
+
% adjectives
|
180
|
+
% exogenous, heterologous
|
181
|
+
% intermolecular, intramolecular
|
182
|
+
% glycolytic, ribonucleic, uronic
|
183
|
+
% ribosomal, ribsosomal
|
184
|
+
% nonpermissive, thermosensitive
|
185
|
+
% inducible, metastable
|
186
|
+
ADJ-WORDS: /^\w.+ous$/
|
187
|
+
ADJ-WORDS: /^\w.+ar$/
|
188
|
+
ADJ-WORDS: /^\w.+ic$/
|
189
|
+
ADJ-WORDS: /^\w.+al$/
|
190
|
+
ADJ-WORDS: /^\w.+ive$/
|
191
|
+
ADJ-WORDS: /^\w.+ble$/
|
192
|
+
|
193
|
+
% latin (postposed) adjectives
|
194
|
+
% influenzae, tarentolae
|
195
|
+
% pentosaceus, luteus, carnosus
|
196
|
+
LATIN-ADJ-WORDS: /^\w.+ae$/
|
197
|
+
LATIN-ADJ-WORDS: /^\w.+us$/ % must appear after -ous in this file
|
198
|
+
|
199
|
+
% latin (postposed) adjectives or latin plural noun
|
200
|
+
% brevis, israelensis
|
201
|
+
% japonicum, tabacum, xylinum
|
202
|
+
LATIN-ADJ-P-NOUN-WORDS: /^\w.+is?$/
|
203
|
+
LATIN-ADJ-S-NOUN-WORDS: /^\w.+um$/
|
204
|
+
|
205
|
+
|
206
|
+
% Hyphenated words. In the original LG morpho-guessing system that
|
207
|
+
% predated the regex-based system, hyphenated words were detected
|
208
|
+
% before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
|
209
|
+
% treated as a HYPHENATED-WORD (a generic adjective/noun), and
|
210
|
+
% never a verb. To return to this ordering, move this regex just
|
211
|
+
% after the CAPITALIZED-WORDS regex.
|
212
|
+
HYPHENATED-WORDS: /^[[:alpha:][:digit:],.][[:alpha:][:digit:],.-]*-[[:alpha:][:digit:],.-]*[[:alpha:][:digit:],.]$/
|
213
|
+
|
214
|
+
% proteins often end "ase", so we'll assume those things are names.
|
215
|
+
% removed, too many false positives.
|
216
|
+
% NAME: /ase$/
|
217
|
+
|
218
|
+
% Sequence of punctuation marks. If some mark appears in the affix table
|
219
|
+
% such as a period, comma, dash or underscore, and there's a sequence of
|
220
|
+
% these, then treat it as a "fill-in-the-blank" placeholder.
|
221
|
+
% This matters only for punc. appearing in the affix table, since the
|
222
|
+
% tokenizer explicitly mangles based on these punctuation marks.
|
223
|
+
%
|
224
|
+
% Look for at least four in a row.
|
225
|
+
UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/
|