grammar_cop 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +8 -0
- data/data/.DS_Store +0 -0
- data/data/Makefile +511 -0
- data/data/Makefile.am +4 -0
- data/data/Makefile.in +511 -0
- data/data/de/.DS_Store +0 -0
- data/data/de/4.0.affix +7 -0
- data/data/de/4.0.dict +474 -0
- data/data/de/Makefile +387 -0
- data/data/de/Makefile.am +9 -0
- data/data/de/Makefile.in +387 -0
- data/data/en/.DS_Store +0 -0
- data/data/en/4.0.affix +26 -0
- data/data/en/4.0.batch +1002 -0
- data/data/en/4.0.biolg.batch +411 -0
- data/data/en/4.0.constituent-knowledge +127 -0
- data/data/en/4.0.dict +8759 -0
- data/data/en/4.0.dict.m4 +6928 -0
- data/data/en/4.0.enwiki.batch +14 -0
- data/data/en/4.0.fixes.batch +2776 -0
- data/data/en/4.0.knowledge +306 -0
- data/data/en/4.0.regex +225 -0
- data/data/en/4.0.voa.batch +114 -0
- data/data/en/Makefile +554 -0
- data/data/en/Makefile.am +19 -0
- data/data/en/Makefile.in +554 -0
- data/data/en/README +173 -0
- data/data/en/tiny.dict +157 -0
- data/data/en/words/.DS_Store +0 -0
- data/data/en/words/Makefile +456 -0
- data/data/en/words/Makefile.am +78 -0
- data/data/en/words/Makefile.in +456 -0
- data/data/en/words/currency +205 -0
- data/data/en/words/currency.p +28 -0
- data/data/en/words/entities.given-bisex.sing +39 -0
- data/data/en/words/entities.given-female.sing +4141 -0
- data/data/en/words/entities.given-male.sing +1633 -0
- data/data/en/words/entities.locations.sing +68 -0
- data/data/en/words/entities.national.sing +253 -0
- data/data/en/words/entities.organizations.sing +7 -0
- data/data/en/words/entities.us-states.sing +11 -0
- data/data/en/words/units.1 +45 -0
- data/data/en/words/units.1.dot +4 -0
- data/data/en/words/units.3 +2 -0
- data/data/en/words/units.4 +5 -0
- data/data/en/words/units.4.dot +1 -0
- data/data/en/words/words-medical.adv.1 +1191 -0
- data/data/en/words/words-medical.prep.1 +67 -0
- data/data/en/words/words-medical.v.4.1 +2835 -0
- data/data/en/words/words-medical.v.4.2 +2848 -0
- data/data/en/words/words-medical.v.4.3 +3011 -0
- data/data/en/words/words-medical.v.4.4 +3036 -0
- data/data/en/words/words-medical.v.4.5 +3050 -0
- data/data/en/words/words.adj.1 +6794 -0
- data/data/en/words/words.adj.2 +638 -0
- data/data/en/words/words.adj.3 +667 -0
- data/data/en/words/words.adv.1 +1573 -0
- data/data/en/words/words.adv.2 +67 -0
- data/data/en/words/words.adv.3 +157 -0
- data/data/en/words/words.adv.4 +80 -0
- data/data/en/words/words.n.1 +11464 -0
- data/data/en/words/words.n.1.wiki +264 -0
- data/data/en/words/words.n.2.s +2017 -0
- data/data/en/words/words.n.2.s.biolg +1 -0
- data/data/en/words/words.n.2.s.wiki +298 -0
- data/data/en/words/words.n.2.x +65 -0
- data/data/en/words/words.n.2.x.wiki +10 -0
- data/data/en/words/words.n.3 +5717 -0
- data/data/en/words/words.n.t +23 -0
- data/data/en/words/words.v.1.1 +1038 -0
- data/data/en/words/words.v.1.2 +1043 -0
- data/data/en/words/words.v.1.3 +1052 -0
- data/data/en/words/words.v.1.4 +1023 -0
- data/data/en/words/words.v.1.p +17 -0
- data/data/en/words/words.v.10.1 +14 -0
- data/data/en/words/words.v.10.2 +15 -0
- data/data/en/words/words.v.10.3 +88 -0
- data/data/en/words/words.v.10.4 +17 -0
- data/data/en/words/words.v.2.1 +1253 -0
- data/data/en/words/words.v.2.2 +1304 -0
- data/data/en/words/words.v.2.3 +1280 -0
- data/data/en/words/words.v.2.4 +1285 -0
- data/data/en/words/words.v.2.5 +1287 -0
- data/data/en/words/words.v.4.1 +2472 -0
- data/data/en/words/words.v.4.2 +2487 -0
- data/data/en/words/words.v.4.3 +2441 -0
- data/data/en/words/words.v.4.4 +2478 -0
- data/data/en/words/words.v.4.5 +2483 -0
- data/data/en/words/words.v.5.1 +98 -0
- data/data/en/words/words.v.5.2 +98 -0
- data/data/en/words/words.v.5.3 +103 -0
- data/data/en/words/words.v.5.4 +102 -0
- data/data/en/words/words.v.6.1 +388 -0
- data/data/en/words/words.v.6.2 +401 -0
- data/data/en/words/words.v.6.3 +397 -0
- data/data/en/words/words.v.6.4 +405 -0
- data/data/en/words/words.v.6.5 +401 -0
- data/data/en/words/words.v.8.1 +117 -0
- data/data/en/words/words.v.8.2 +118 -0
- data/data/en/words/words.v.8.3 +118 -0
- data/data/en/words/words.v.8.4 +119 -0
- data/data/en/words/words.v.8.5 +119 -0
- data/data/en/words/words.y +104 -0
- data/data/lt/.DS_Store +0 -0
- data/data/lt/4.0.affix +6 -0
- data/data/lt/4.0.constituent-knowledge +24 -0
- data/data/lt/4.0.dict +135 -0
- data/data/lt/4.0.knowledge +38 -0
- data/data/lt/Makefile +389 -0
- data/data/lt/Makefile.am +11 -0
- data/data/lt/Makefile.in +389 -0
- data/ext/.DS_Store +0 -0
- data/ext/link_grammar/.DS_Store +0 -0
- data/ext/link_grammar/extconf.rb +2 -0
- data/ext/link_grammar/link-grammar/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
- data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
- data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
- data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
- data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
- data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
- data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
- data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
- data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
- data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
- data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
- data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
- data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
- data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
- data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
- data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
- data/ext/link_grammar/link-grammar/Makefile +900 -0
- data/ext/link_grammar/link-grammar/Makefile.am +202 -0
- data/ext/link_grammar/link-grammar/Makefile.in +900 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
- data/ext/link_grammar/link-grammar/and.c +1603 -0
- data/ext/link_grammar/link-grammar/and.h +27 -0
- data/ext/link_grammar/link-grammar/api-structures.h +362 -0
- data/ext/link_grammar/link-grammar/api-types.h +72 -0
- data/ext/link_grammar/link-grammar/api.c +1887 -0
- data/ext/link_grammar/link-grammar/api.h +96 -0
- data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/autoit/README +10 -0
- data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
- data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
- data/ext/link_grammar/link-grammar/command-line.c +458 -0
- data/ext/link_grammar/link-grammar/command-line.h +15 -0
- data/ext/link_grammar/link-grammar/constituents.c +1836 -0
- data/ext/link_grammar/link-grammar/constituents.h +26 -0
- data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/corpus/README +17 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
- data/ext/link_grammar/link-grammar/count.c +828 -0
- data/ext/link_grammar/link-grammar/count.h +25 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
- data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
- data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
- data/ext/link_grammar/link-grammar/error.c +92 -0
- data/ext/link_grammar/link-grammar/error.h +35 -0
- data/ext/link_grammar/link-grammar/expand.c +67 -0
- data/ext/link_grammar/link-grammar/expand.h +13 -0
- data/ext/link_grammar/link-grammar/externs.h +22 -0
- data/ext/link_grammar/link-grammar/extract-links.c +625 -0
- data/ext/link_grammar/link-grammar/extract-links.h +16 -0
- data/ext/link_grammar/link-grammar/fast-match.c +309 -0
- data/ext/link_grammar/link-grammar/fast-match.h +17 -0
- data/ext/link_grammar/link-grammar/idiom.c +373 -0
- data/ext/link_grammar/link-grammar/idiom.h +15 -0
- data/ext/link_grammar/link-grammar/jni-client.c +779 -0
- data/ext/link_grammar/link-grammar/jni-client.h +236 -0
- data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
- data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/link-features.h +37 -0
- data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
- data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
- data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
- data/ext/link_grammar/link-grammar/link-includes.h +465 -0
- data/ext/link_grammar/link-grammar/link-parser.c +849 -0
- data/ext/link_grammar/link-grammar/massage.c +329 -0
- data/ext/link_grammar/link-grammar/massage.h +13 -0
- data/ext/link_grammar/link-grammar/post-process.c +1113 -0
- data/ext/link_grammar/link-grammar/post-process.h +45 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
- data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
- data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
- data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
- data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
- data/ext/link_grammar/link-grammar/prefix.c +482 -0
- data/ext/link_grammar/link-grammar/prefix.h +139 -0
- data/ext/link_grammar/link-grammar/preparation.c +412 -0
- data/ext/link_grammar/link-grammar/preparation.h +20 -0
- data/ext/link_grammar/link-grammar/print-util.c +87 -0
- data/ext/link_grammar/link-grammar/print-util.h +32 -0
- data/ext/link_grammar/link-grammar/print.c +1085 -0
- data/ext/link_grammar/link-grammar/print.h +16 -0
- data/ext/link_grammar/link-grammar/prune.c +1864 -0
- data/ext/link_grammar/link-grammar/prune.h +17 -0
- data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
- data/ext/link_grammar/link-grammar/read-dict.h +29 -0
- data/ext/link_grammar/link-grammar/read-regex.c +161 -0
- data/ext/link_grammar/link-grammar/read-regex.h +12 -0
- data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
- data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
- data/ext/link_grammar/link-grammar/resources.c +180 -0
- data/ext/link_grammar/link-grammar/resources.h +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
- data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
- data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
- data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
- data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
- data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
- data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
- data/ext/link_grammar/link-grammar/string-set.c +169 -0
- data/ext/link_grammar/link-grammar/string-set.h +16 -0
- data/ext/link_grammar/link-grammar/structures.h +498 -0
- data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
- data/ext/link_grammar/link-grammar/tokenize.h +15 -0
- data/ext/link_grammar/link-grammar/utilities.c +847 -0
- data/ext/link_grammar/link-grammar/utilities.h +281 -0
- data/ext/link_grammar/link-grammar/word-file.c +124 -0
- data/ext/link_grammar/link-grammar/word-file.h +15 -0
- data/ext/link_grammar/link-grammar/word-utils.c +526 -0
- data/ext/link_grammar/link-grammar/word-utils.h +152 -0
- data/ext/link_grammar/link_grammar.c +202 -0
- data/ext/link_grammar/link_grammar.h +99 -0
- data/grammar_cop.gemspec +24 -0
- data/lib/.DS_Store +0 -0
- data/lib/grammar_cop.rb +9 -0
- data/lib/grammar_cop/.DS_Store +0 -0
- data/lib/grammar_cop/dictionary.rb +19 -0
- data/lib/grammar_cop/linkage.rb +30 -0
- data/lib/grammar_cop/parse_options.rb +32 -0
- data/lib/grammar_cop/sentence.rb +36 -0
- data/lib/grammar_cop/version.rb +3 -0
- data/test/.DS_Store +0 -0
- data/test/grammar_cop_test.rb +27 -0
- metadata +407 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* Copyright (c) 2004 */
|
3
|
+
/* Daniel Sleator, David Temperley, and John Lafferty */
|
4
|
+
/* All rights reserved */
|
5
|
+
/* */
|
6
|
+
/* Use of the link grammar parsing system is subject to the terms of the */
|
7
|
+
/* license set forth in the LICENSE file included with this software, */
|
8
|
+
/* and also available at http://www.link.cs.cmu.edu/link/license.html */
|
9
|
+
/* This license allows free redistribution and use in source and binary */
|
10
|
+
/* forms, with or without modification, subject to certain conditions. */
|
11
|
+
/* */
|
12
|
+
/*************************************************************************/
|
13
|
+
|
14
|
+
|
15
|
+
|
@@ -0,0 +1,1836 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* Copyright (c) 2004 */
|
3
|
+
/* Daniel Sleator, David Temperley, and John Lafferty */
|
4
|
+
/* All rights reserved */
|
5
|
+
/* */
|
6
|
+
/* Use of the link grammar parsing system is subject to the terms of the */
|
7
|
+
/* license set forth in the LICENSE file included with this software, */
|
8
|
+
/* and also available at http://www.link.cs.cmu.edu/link/license.html */
|
9
|
+
/* This license allows free redistribution and use in source and binary */
|
10
|
+
/* forms, with or without modification, subject to certain conditions. */
|
11
|
+
/* */
|
12
|
+
/*************************************************************************/
|
13
|
+
|
14
|
+
#include <stdarg.h>
|
15
|
+
#include <string.h>
|
16
|
+
#include <link-grammar/api.h>
|
17
|
+
#include "error.h"
|
18
|
+
#include "constituents.h"
|
19
|
+
|
20
|
+
#define MAXCONSTITUENTS 8192
|
21
|
+
#define MAXSUBL 16
|
22
|
+
#define OPEN_BRACKET '['
|
23
|
+
#define CLOSE_BRACKET ']'
|
24
|
+
|
25
|
+
/* Token kinds. NOTE(review): presumably open bracket / close bracket / word
 * tokens of a bracketed constituent string -- confirm against the users. */
typedef enum {OPEN_TOK, CLOSE_TOK, WORD_TOK} CType;
|
26
|
+
/* Per-word classification; used as the element type of
 * con_context_t.wordtype[]. Meaning of the individual values is not
 * visible here -- TODO confirm. */
typedef enum {NONE, STYPE, PTYPE, QTYPE, QDTYPE} WType;
|
27
|
+
|
28
|
+
/* Description of one constituent (phrase) found in a linkage. */
typedef struct
|
29
|
+
{
|
30
|
+
int left; /* index into linkage->word[] of the leftmost word */
|
31
|
+
int right; /* index into linkage->word[] of the rightmost word (inclusive) */
|
32
|
+
const char * type; /* phrase label string, compared against e.g. "S", "NP" */
|
33
|
+
char domain_type; /* one-character domain code, e.g. 'v', 'a', 'f', 'g', 'x' */
|
34
|
+
const char * start_link; /* link name; tested with post_process_match() */
|
35
|
+
int start_num;
|
36
|
+
int subl;
|
37
|
+
int canon;
|
38
|
+
int valid;
|
39
|
+
#ifdef AUX_CODE_IS_DEAD
|
40
|
+
/* The only code that actually sets aux to a non-zero value is code
|
41
|
+
* followed by code that sets it to zero -- it's dead code, and so
|
42
|
+
* aux is never actually used. Comment this code out.
|
43
|
+
*/
|
44
|
+
int aux;
|
45
|
+
/* 0: it's an ordinary VP (or other type);
|
46
|
+
* 1: it's an AUX, don't print it;
|
47
|
+
* 2: it's an AUX, and print it
|
48
|
+
*/
|
49
|
+
#endif /* AUX_CODE_IS_DEAD */
|
50
|
+
} constituent_t;
|
51
|
+
|
52
|
+
/* XXX it seems like the old code worked fine with MAX_ELTS=10 */
|
53
|
+
#define MAX_ELTS 100
|
54
|
+
/* A list of up to MAX_ELTS integer elements.
 * NOTE(review): presumably the constituent indices joined by one
 * conjunction -- confirm against the code that fills it. */
typedef struct
|
55
|
+
{
|
56
|
+
int num; /* presumably the number of entries of e[] in use -- TODO confirm */
|
57
|
+
int e[MAX_ELTS];
|
58
|
+
int valid;
|
59
|
+
} andlist_t;
|
60
|
+
|
61
|
+
/*
|
62
|
+
* Context used to store assorted intermediate data
|
63
|
+
* when the constituent string is being generated.
|
64
|
+
*/
|
65
|
+
#define MAX_ANDS 1024
|
66
|
+
/* Working storage shared by the constituent-generation routines. */
typedef struct
|
67
|
+
{
|
68
|
+
String_set * phrase_ss; /* string set that type / start_link strings are added to */
|
69
|
+
WType wordtype[MAX_SENTENCE]; /* one WType per word */
|
70
|
+
int word_used[MAXSUBL][MAX_SENTENCE]; /* [sublinkage][word]; tested for == 1 when looking for a word inside the current sublinkage */
|
71
|
+
int templist[MAX_ELTS]; /* scratch list holding a conjectural andlist */
|
72
|
+
constituent_t constituent[MAXCONSTITUENTS]; /* table of all constituents */
|
73
|
+
andlist_t andlist[MAX_ANDS];
|
74
|
+
} con_context_t;
|
75
|
+
|
76
|
+
/* ================================================================ */
|
77
|
+
|
78
|
+
static inline int uppercompare(const char * s, const char * t)
|
79
|
+
{
|
80
|
+
return (FALSE == utf8_upper_match(s,t));
|
81
|
+
}
|
82
|
+
|
83
|
+
/**
|
84
|
+
* If a constituent c has a comma at either end, we exclude the
|
85
|
+
* comma. (We continue to shift the boundary until we get to
|
86
|
+
* something inside the current sublinkage)
|
87
|
+
*/
|
88
|
+
/*
 * Left-edge variant: if constituent c starts on a "," word, move its left
 * boundary rightwards to the next word whose word_used[] entry for the
 * current sublinkage is 1.
 *
 * The search is bounded by linkage->num_words. If no such word exists,
 * the boundary is left where it was rather than scanning past the end
 * of the word_used[] row.
 */
static void adjust_for_left_comma(con_context_t * ctxt, Linkage linkage, int c)
{
	int w = ctxt->constituent[c].left;

	if (strcmp(linkage->word[w], ",") == 0)
	{
		int next = w + 1;

		/* Advance to the first word used by the current sublinkage. */
		while ((next < linkage->num_words) &&
		       (ctxt->word_used[linkage->current][next] != 1))
		{
			next++;
		}

		/* Only move the boundary when a used word was actually found. */
		if (next < linkage->num_words) w = next;
	}
	ctxt->constituent[c].left = w;
}
|
102
|
+
|
103
|
+
/*
 * Right-edge variant: if constituent c ends on a "," or on the
 * "RIGHT-WALL" word, move its right boundary leftwards to the nearest
 * word whose word_used[] entry for the current sublinkage is 1.
 *
 * The search stops at word 0. If no such word exists, the boundary is
 * left where it was rather than indexing word_used[] with a negative
 * subscript.
 */
static void adjust_for_right_comma(con_context_t *ctxt, Linkage linkage, int c)
{
	int w = ctxt->constituent[c].right;

	if ((strcmp(linkage->word[w], ",") == 0) ||
	    (strcmp(linkage->word[w], "RIGHT-WALL") == 0))
	{
		int prev = w - 1;

		/* Step back to the closest word used by the current sublinkage. */
		while ((prev >= 0) &&
		       (ctxt->word_used[linkage->current][prev] != 1))
		{
			prev--;
		}

		/* Only move the boundary when a used word was actually found. */
		if (prev >= 0) w = prev;
	}
	ctxt->constituent[c].right = w;
}
|
119
|
+
|
120
|
+
/**
 * Debug helper: when verbosity is at least 2, print constituent c
 * (index, type, domain type, word range) followed by the words it
 * spans. Prints nothing at lower verbosity.
 */
static void print_constituent(con_context_t *ctxt, Linkage linkage, int c)
{
	const constituent_t *con;
	int w;

	if (verbosity < 2) return;

	con = &ctxt->constituent[c];
	printf(" c %2d %4s [%c] (%2d-%2d): ",
	       c, con->type, con->domain_type, con->left, con->right);

	w = con->left;
	while (w <= con->right)
	{
		printf("%s ", linkage->word[w]); /**PV**/
		w++;
	}
	printf("\n");
}
|
133
|
+
|
134
|
+
/******************************************************
|
135
|
+
* These functions do the bulk of the actual
|
136
|
+
* constituent-generating; they're called once for each
|
137
|
+
* sublinkage
|
138
|
+
*********************************************************/
|
139
|
+
|
140
|
+
/**
|
141
|
+
* This function looks for constituents of type ctype1. Say it finds
|
142
|
+
* one, call it c1. It searches for the next larger constituent of
|
143
|
+
* type ctype2, call it c2. It then generates a new constituent of
|
144
|
+
* ctype3, containing all the words in c2 but not c1.
|
145
|
+
*/
|
146
|
+
/*
 * Parameters:
 *   ctxt          working context holding the constituent table
 *   linkage       linkage being processed
 *   numcon_total  index of the first constituent of this sublinkage
 *   numcon_subl   number of constituents of this sublinkage so far
 *   ctype1/2/3    phrase type strings: ctype1 is searched for, ctype2 is
 *                 the enclosing type, ctype3 is the type of the new
 *                 (complement) constituent
 *   x             selects which extra filter applies to c1 (see the
 *                 per-case comments below); for x == 5, 6 or 9 the new
 *                 constituent lies to the right of c1, otherwise to the
 *                 left
 *
 * Returns the updated number of constituents of this sublinkage.
 *
 * The scans over word_used[] are bounded by 0 and linkage->num_words.
 * Running off either end means there is no room for a complement, which
 * is handled by the existing "outside c2" checks.
 */
static int gen_comp(con_context_t *ctxt, Linkage linkage,
                    int numcon_total, int numcon_subl,
                    const char * ctype1, const char * ctype2,
                    const char * ctype3, int x)
{
	int w, w2, w3, c, c1, c2, done;
	c = numcon_total + numcon_subl;

	for (c1=numcon_total; c1<numcon_total + numcon_subl; c1++)
	{
		/* If ctype1 is NP, it has to be an appositive to continue */
		if ((x==4) && (post_process_match("MX#*", ctxt->constituent[c1].start_link)==0))
			continue;

		/* If ctype1 is X, and domain_type is t, it's an infinitive - skip it */
		if ((x==2) && (ctxt->constituent[c1].domain_type=='t'))
			continue;

		/* If it's domain-type z, it's a subject-relative clause;
		   the VP doesn't need an NP */
		if (ctxt->constituent[c1].domain_type=='z')
			continue;

		/* If ctype1 is X or VP, and it's not started by an S, don't generate an NP
		   (Neither of the two previous checks is necessary now, right?) */
		if ((x==1 || x==2) &&
		    (((post_process_match("S", ctxt->constituent[c1].start_link) == 0) &&
		      (post_process_match("SX", ctxt->constituent[c1].start_link) == 0) &&
		      (post_process_match("SF", ctxt->constituent[c1].start_link) == 0)) ||
		     (post_process_match("S##w", ctxt->constituent[c1].start_link) != 0)))
			continue;

		/* If it's an SBAR (relative clause case), it has to be a relative clause */
		if ((x==3) &&
		    ((post_process_match("Rn", ctxt->constituent[c1].start_link) == 0) &&
		     (post_process_match("R*", ctxt->constituent[c1].start_link) == 0) &&
		     (post_process_match("MX#r", ctxt->constituent[c1].start_link) == 0) &&
		     (post_process_match("Mr", ctxt->constituent[c1].start_link) == 0) &&
		     (post_process_match("MX#d", ctxt->constituent[c1].start_link) == 0)))
			continue;

		/* If ctype1 is SBAR (clause opener case), it has to be an f domain */
		if ((x==5) && (ctxt->constituent[c1].domain_type!='f'))
			continue;

		/* If ctype1 is SBAR (pp opener case), it has to be a g domain */
		if ((x==6) && (ctxt->constituent[c1].domain_type!='g'))
			continue;

		/* If ctype1 is NP (paraphrase case), it has to be started by an SI */
		if ((x==7) && (post_process_match("SI", ctxt->constituent[c1].start_link)==0))
			continue;

		/* If ctype1 is VP (participle modifier case), it has to be
		   started by an Mv or Mg */
		if ((x==8) && (post_process_match("M", ctxt->constituent[c1].start_link)==0))
			continue;

		/* If ctype1 is VP (participle opener case), it has
		   to be started by a COp */
		if ((x==9) && (post_process_match("COp", ctxt->constituent[c1].start_link)==0))
			continue;

		/* Now start at the bounds of c1, and work outwards until you
		   find a larger constituent of type ctype2 */
		if (!(strcmp(ctxt->constituent[c1].type, ctype1)==0))
			continue;

		if (verbosity >= 2)
			printf("Generating complement constituent for c %d of type %s\n",
			       c1, ctype1);
		done = 0;
		for (w2=ctxt->constituent[c1].left; (done==0) && (w2>=0); w2--) {
			for (w3=ctxt->constituent[c1].right; w3<linkage->num_words; w3++) {
				for (c2=numcon_total; (done==0) &&
				         (c2 < numcon_total + numcon_subl); c2++) {
					/* c2 must span exactly [w2, w3] and differ from c1 */
					if (!((ctxt->constituent[c2].left==w2) &&
					      (ctxt->constituent[c2].right==w3)) || (c2==c1))
						continue;
					if (!(strcmp(ctxt->constituent[c2].type, ctype2)==0))
						continue;

					/* if the new constituent (c) is to the left
					   of c1, its right edge should be adjacent to the
					   left edge of c1 - or as close as possible
					   without going outside the current sublinkage.
					   (Or substituting right and left as necessary.) */

					if ((x==5) || (x==6) || (x==9)) {
						/* This is the case where c is to the
						   RIGHT of c1 */
						w = ctxt->constituent[c1].right+1;
						/* Bounded scan: reaching num_words leaves
						   w > c2.right, handled just below. */
						while ((w < linkage->num_words) &&
						       (ctxt->word_used[linkage->current][w] != 1)) {
							w++;
						}
						if (w > ctxt->constituent[c2].right)
						{
							done=1;
							continue;
						}
						ctxt->constituent[c].left = w;
						ctxt->constituent[c].right = ctxt->constituent[c2].right;
					}
					else {
						w = ctxt->constituent[c1].left-1;
						/* Bounded scan: reaching -1 leaves
						   w < c2.left, handled just below. */
						while ((w >= 0) &&
						       (ctxt->word_used[linkage->current][w] != 1)) {
							w--;
						}
						if (w < ctxt->constituent[c2].left) {
							done=1;
							continue;
						}
						ctxt->constituent[c].right = w;
						ctxt->constituent[c].left = ctxt->constituent[c2].left;
					}

					/* NOTE(review): these trim commas off c1, not off the
					   newly created c -- confirm this is intended. */
					adjust_for_left_comma(ctxt, linkage, c1);
					adjust_for_right_comma(ctxt, linkage, c1);

					ctxt->constituent[c].type =
						string_set_add(ctype3, ctxt->phrase_ss);
					ctxt->constituent[c].domain_type = 'x';
					ctxt->constituent[c].start_link =
						string_set_add("XX", ctxt->phrase_ss);
					ctxt->constituent[c].start_num =
						ctxt->constituent[c1].start_num; /* bogus */
					if (verbosity >= 2)
					{
						printf("Larger c found: c %d (%s); ",
						       c2, ctype2);
						printf("Adding constituent:\n");
						print_constituent(ctxt, linkage, c);
					}
					c++;
					if (MAXCONSTITUENTS <= c)
					{
						/* Table full: report it and reuse the last slot. */
						err_ctxt ec;
						ec.sent = linkage->sent;
						err_msg(&ec, Error, "Error: Too many constituents (a).\n");
						c--;
					}
					done = 1;
				}
			}
		}
		if (verbosity >= 2)
		{
			if (done == 0)
				printf("No constituent added, because no larger %s " \
				       " was found\n", ctype2);
		}
	}
	numcon_subl = c - numcon_total;
	return numcon_subl;
}
|
305
|
+
|
306
|
+
/**
|
307
|
+
* Look for a constituent started by an MVs or MVg.
|
308
|
+
* Find any VP's or ADJP's that contain it (without going
|
309
|
+
* beyond a larger S or NP). Adjust them so that
|
310
|
+
* they end right before the m domain starts.
|
311
|
+
*/
|
312
|
+
/*
 * Parameters:
 *   ctxt          working context holding the constituent table
 *   linkage       linkage being processed
 *   numcon_total  index of the first constituent of this sublinkage
 *   numcon_subl   number of constituents of this sublinkage
 *
 * For every constituent c whose start_link matches "MVs" or "MVg", each
 * enclosing constituent with domain_type 'v' or 'a' (searching leftwards,
 * stopping at the first enclosing "S" or "NP") has its right edge pulled
 * back to the nearest used word before c. A leading "," on c is then
 * dropped.
 *
 * The backwards scan over word_used[] stops at word 0. If no used word
 * is found, the enclosing constituent is left unchanged.
 */
static void adjust_subordinate_clauses(con_context_t *ctxt, Linkage linkage,
                                       int numcon_total,
                                       int numcon_subl)
{
	int c, w, c2, w2, done;

	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		if ((post_process_match("MVs", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("MVg", ctxt->constituent[c].start_link)==1)) {
			done=0;
			for (w2=ctxt->constituent[c].left-1; (done==0) && w2>=0; w2--) {
				for (c2=numcon_total; c2<numcon_total + numcon_subl; c2++) {
					/* Only constituents starting at w2 that reach at
					   least as far right as c are of interest. */
					if (!((ctxt->constituent[c2].left==w2) &&
					      (ctxt->constituent[c2].right >= ctxt->constituent[c].right)))
						continue;
					if ((strcmp(ctxt->constituent[c2].type, "S") == 0) ||
					    (strcmp(ctxt->constituent[c2].type, "NP") == 0)) {
						done=1;
						break;
					}
					if ((ctxt->constituent[c2].domain_type == 'v') ||
					    (ctxt->constituent[c2].domain_type == 'a')) {
						w = ctxt->constituent[c].left-1;
						while ((w >= 0) &&
						       (ctxt->word_used[linkage->current][w] != 1)) {
							w--;
						}
						/* No used word before c: nothing sensible to
						   shrink c2 to, so leave it alone. */
						if (w < 0) continue;
						ctxt->constituent[c2].right = w;

						if (verbosity >= 2)
							printf("Adjusting constituent %d:\n", c2);
						print_constituent(ctxt, linkage, c2);
					}
				}
			}
			if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
				ctxt->constituent[c].left++;
		}
	}
}
|
352
|
+
|
353
|
+
/******************************************************
|
354
|
+
* These functions are called once, after constituents
|
355
|
+
* for each sublinkage have been generated, to merge them
|
356
|
+
* together and fix up some other things.
|
357
|
+
*
|
358
|
+
********************************************************/
|
359
|
+
|
360
|
+
/**
|
361
|
+
* Here we're looking for the next andlist element to add on
|
362
|
+
* to a conjectural andlist, stored in the array templist.
|
363
|
+
* We go through the constituents, starting at "start".
|
364
|
+
*/
|
365
|
+
static int find_next_element(con_context_t *ctxt,
|
366
|
+
Linkage linkage,
|
367
|
+
int start,
|
368
|
+
int numcon_total,
|
369
|
+
int num_elements,
|
370
|
+
int num_lists)
|
371
|
+
{
|
372
|
+
int c, a, ok, c2, c3, addedone=0, n;
|
373
|
+
|
374
|
+
assert(num_elements <= MAX_ELTS, "Constutent element array overflow!\n");
|
375
|
+
|
376
|
+
n = num_lists;
|
377
|
+
for (c=start+1; c<numcon_total; c++)
|
378
|
+
{
|
379
|
+
constituent_t *cc = &ctxt->constituent[c];
|
380
|
+
|
381
|
+
if (cc->valid == 0)
|
382
|
+
continue;
|
383
|
+
if (strcmp(ctxt->constituent[ctxt->templist[0]].type, cc->type)!=0)
|
384
|
+
continue;
|
385
|
+
ok = 1;
|
386
|
+
|
387
|
+
/* We're considering adding constituent c to the andlist.
|
388
|
+
If c is in the same sublinkage as one of the other andlist
|
389
|
+
elements, don't add it. If it overlaps with one of the other
|
390
|
+
constituents, don't add it. If there's a constituent
|
391
|
+
identical to c that occurs in a sublinkage in which one of
|
392
|
+
the other elements occurs, don't add it. */
|
393
|
+
|
394
|
+
for (a=0; a<num_elements; a++)
|
395
|
+
{
|
396
|
+
int t = ctxt->templist[a];
|
397
|
+
constituent_t *ct = &ctxt->constituent[t];
|
398
|
+
|
399
|
+
if (cc->subl == ct->subl)
|
400
|
+
ok=0;
|
401
|
+
if (((cc->left < ct->left) && (cc->right > ct->left))
|
402
|
+
||
|
403
|
+
((cc->right > ct->right) && (cc->left < ct->right))
|
404
|
+
||
|
405
|
+
((cc->right > ct->right) && (cc->left < ct->right))
|
406
|
+
||
|
407
|
+
((cc->left > ct->left) && (cc->right < ct->right)))
|
408
|
+
ok=0;
|
409
|
+
|
410
|
+
for (c2=0; c2<numcon_total; c2++)
|
411
|
+
{
|
412
|
+
if (ctxt->constituent[c2].canon != cc->canon)
|
413
|
+
continue;
|
414
|
+
for (c3=0; c3<numcon_total; c3++)
|
415
|
+
{
|
416
|
+
if ((ctxt->constituent[c3].canon == ct->canon)
|
417
|
+
&& (ctxt->constituent[c3].subl == ctxt->constituent[c2].subl))
|
418
|
+
ok=0;
|
419
|
+
}
|
420
|
+
}
|
421
|
+
}
|
422
|
+
if (ok == 0) continue;
|
423
|
+
|
424
|
+
ctxt->templist[num_elements] = c;
|
425
|
+
addedone = 1;
|
426
|
+
num_lists = find_next_element(ctxt, linkage, c, numcon_total,
|
427
|
+
num_elements+1, num_lists);
|
428
|
+
|
429
|
+
/* Test for overlow of the and-list.
|
430
|
+
* With the current parser, the following will cause an
|
431
|
+
* overflow:
|
432
|
+
*
|
433
|
+
* I have not seen the grysbok, or the suni, or the dibitag, or
|
434
|
+
* the lechwi, or the aoul, or the gerenuk, or the blaauwbok,
|
435
|
+
* or the chevrotain, or lots of others, but who in the world
|
436
|
+
* could guess what they were or what they looked like, judging
|
437
|
+
* only from the names?
|
438
|
+
*/
|
439
|
+
if (MAX_ANDS <= num_lists)
|
440
|
+
{
|
441
|
+
err_ctxt ec;
|
442
|
+
ec.sent = linkage->sent;
|
443
|
+
err_msg(&ec, Error, "Error: Constituent overflowed andlist!\n");
|
444
|
+
return MAX_ANDS;
|
445
|
+
}
|
446
|
+
}
|
447
|
+
|
448
|
+
if (addedone == 0 && num_elements > 1)
|
449
|
+
{
|
450
|
+
for (a=0; a<num_elements; a++) {
|
451
|
+
ctxt->andlist[num_lists].e[a] = ctxt->templist[a];
|
452
|
+
ctxt->andlist[num_lists].num = num_elements;
|
453
|
+
}
|
454
|
+
num_lists++;
|
455
|
+
}
|
456
|
+
return num_lists;
|
457
|
+
}
|
458
|
+
|
459
|
+
/**
 * Merge the constituents generated for the individual sublinkages and
 * append one spanning constituent for every surviving and-list.
 *
 * Steps, in order:
 *  1. mark every constituent valid, except those whose right end lies
 *     to the left of their left end;
 *  2. give every constituent a canonical number (the index of the
 *     lowest-numbered constituent with the same left, right and type);
 *  3. invalidate constituents that share one endpoint with a larger
 *     constituent of the same type from another sublinkage;
 *  4. invalidate all but the first constituent of each duplicate set;
 *  5. collect the and-lists with find_next_element() and prune them;
 *  6. append one new constituent per valid and-list, spanning all of
 *     its elements.
 *
 * @param ctxt          constituent context; constituent[], templist[]
 *                      and andlist[] are modified
 * @param linkage       linkage being processed
 * @param numcon_total  number of constituents on entry
 * @return the number of constituents after step 6
 */
static int merge_constituents(con_context_t *ctxt, Linkage linkage, int numcon_total)
{
	constituent_t *con = ctxt->constituent;
	int c1, c2=0, c3, ok, a, n, a2, n2, match, listmatch, a3;
	int num_lists, num_elements;
	int leftend, rightend;

	for (c1 = 0; c1 < numcon_total; c1++)
	{
		con[c1].valid = 1;

		/* Find and invalidate any constituents with negative length */
		if (con[c1].right < con[c1].left)
		{
			if (verbosity >= 2)
			{
				err_ctxt ec;
				ec.sent = linkage->sent;
				err_msg(&ec, Warn,
				  "Warning: Constituent %d has negative length. Deleting it.\n", c1);
			}
			con[c1].valid = 0;
		}
		con[c1].canon = c1;
	}

	/* First go through and give each constituent a canonical number
	   (the index number of the lowest-numbered constituent
	   identical to it) */

	for (c1 = 0; c1 < numcon_total; c1++)
	{
		if (con[c1].canon != c1) continue;
		for (c2 = c1 + 1; c2 < numcon_total; c2++)
		{
			if ((con[c1].left == con[c2].left) &&
			    (con[c1].right == con[c2].right) &&
			    (strcmp(con[c1].type, con[c2].type) == 0))
			{
				con[c2].canon = c1;
			}
		}
	}

	/* If constituents A and B in different sublinkages X and Y
	 * have one endpoint in common, but A is larger at the other end,
	 * and B has no duplicate in X, then declare B invalid. (Example:
	 * " [A [B We saw the cat B] and the dog A] "
	 */
	for (c1 = 0; c1 < numcon_total; c1++)
	{
		if (con[c1].valid == 0) continue;
		for (c2 = 0; c2 < numcon_total; c2++)
		{
			if (con[c2].subl == con[c1].subl) continue;
			ok = 1;
			/* Does c2 have a duplicate in the sublinkage containing c1?
			   If so, bag it */
			for (c3 = 0; c3 < numcon_total; c3++)
			{
				if ((con[c2].canon == con[c3].canon) &&
				    (con[c3].subl == con[c1].subl))
					ok = 0;
			}
			/* Likewise if c1 has a duplicate in the sublinkage of c2. */
			for (c3 = 0; c3 < numcon_total; c3++)
			{
				if ((con[c1].canon == con[c3].canon) &&
				    (con[c3].subl == con[c2].subl))
					ok = 0;
			}
			if (ok == 0) continue;

			/* Same left end, c1 reaches further right. */
			if ((con[c1].left == con[c2].left) &&
			    (con[c1].right > con[c2].right) &&
			    (strcmp(con[c1].type, con[c2].type) == 0))
			{
				con[c2].valid = 0;
			}

			/* Same right end, c1 reaches further left. */
			if ((con[c1].left < con[c2].left) &&
			    (con[c1].right == con[c2].right) &&
			    (strcmp(con[c1].type, con[c2].type) == 0))
			{
				con[c2].valid = 0;
			}
		}
	}

	/* Now go through and find duplicates; if a pair is found,
	 * mark one as invalid. (It doesn't matter if they're in the
	 * same sublinkage or not)
	 */
	for (c1 = 0; c1 < numcon_total; c1++)
	{
		if (con[c1].valid == 0) continue;
		for (c2 = c1 + 1; c2 < numcon_total; c2++)
		{
			if (con[c2].canon == con[c1].canon)
				con[c2].valid = 0;
		}
	}

	/* Now we generate the and-lists. An and-list is a set of mutually
	 * exclusive constituents. Each constituent in the list may not
	 * be present in the same sublinkage as any of the others.
	 */
	num_lists = 0;
	for (c1 = 0; c1 < numcon_total; c1++)
	{
		if (con[c1].valid == 0) continue;
		num_elements = 1;
		ctxt->templist[0] = c1;
		num_lists = find_next_element(ctxt, linkage, c1, numcon_total,
		                              num_elements, num_lists);

		/* If we're overflowing, then punt */
		if (MAX_ANDS <= num_lists)
			break;
	}

	if (verbosity >= 2)
	{
		printf("And-lists:\n");
		for (n = 0; n < num_lists; n++)
		{
			printf("  %d: ", n);
			for (a = 0; a < ctxt->andlist[n].num; a++)
			{
				printf("%d ", ctxt->andlist[n].e[a]);
			}
			printf("\n");
		}
	}

	/* Now we prune out any andlists that are subsumed by other
	 * andlists--e.g. if andlist X contains constituents A and B,
	 * and Y contains A B and C, we throw out X
	 */
	for (n = 0; n < num_lists; n++)
	{
		ctxt->andlist[n].valid = 1;
		for (n2 = 0; n2 < num_lists; n2++)
		{
			if (n2 == n) continue;
			if (ctxt->andlist[n2].num < ctxt->andlist[n].num)
				continue;

			listmatch = 1;
			for (a = 0; a < ctxt->andlist[n].num; a++)
			{
				match = 0;
				for (a2 = 0; a2 < ctxt->andlist[n2].num; a2++)
				{
					if (ctxt->andlist[n2].e[a2] == ctxt->andlist[n].e[a])
						match = 1;
				}
				/* At least one element was not matched by n2 */
				if (match == 0) listmatch = 0;
			}
			if (listmatch == 1) ctxt->andlist[n].valid = 0;
		}
	}

	/* If an element of an andlist contains an element of another
	 * andlist, it must contain the entire andlist.
	 */
	for (n = 0; n < num_lists; n++)
	{
		if (ctxt->andlist[n].valid == 0)
			continue;
		for (a = 0; (a < ctxt->andlist[n].num) && (ctxt->andlist[n].valid); a++)
		{
			for (n2 = 0; (n2 < num_lists) && (ctxt->andlist[n].valid); n2++)
			{
				if ((n2 == n) || (ctxt->andlist[n2].valid == 0))
					continue;
				for (a2 = 0; (a2 < ctxt->andlist[n2].num) && (ctxt->andlist[n].valid); a2++)
				{
					c1 = ctxt->andlist[n].e[a];
					c2 = ctxt->andlist[n2].e[a2];
					if (c1 == c2)
						continue;
					if (!((con[c2].left <= con[c1].left) &&
					      (con[c2].right >= con[c1].right)))
						continue;
					if (verbosity >= 2)
						printf("Found that c%d in list %d is bigger "
						       "than c%d in list %d\n", c2, n2, c1, n);
					ok = 1;

					/* An element of n2 contains an element of n.
					 * Now, we check to see if that element of n2
					 * contains ALL the elements of n.
					 * If not, n is invalid.
					 */
					for (a3 = 0; a3 < ctxt->andlist[n].num; a3++)
					{
						c3 = ctxt->andlist[n].e[a3];
						if ((con[c2].left > con[c3].left) ||
						    (con[c2].right < con[c3].right))
							ok = 0;
					}
					if (ok != 0)
						continue;
					ctxt->andlist[n].valid = 0;
					if (verbosity >= 2)
					{
						printf("Eliminating andlist, "
						       "n=%d, a=%d, n2=%d, a2=%d: ",
						       n, a, n2, a2);
						for (a3 = 0; a3 < ctxt->andlist[n].num; a3++)
						{
							printf("%d ", ctxt->andlist[n].e[a3]);
						}
						printf("\n");
					}
				}
			}
		}
	}

	if (verbosity >= 2)
	{
		printf("And-lists after pruning:\n");
		for (n = 0; n < num_lists; n++) {
			if (ctxt->andlist[n].valid == 0)
				continue;
			printf("  %d: ", n);
			for (a = 0; a < ctxt->andlist[n].num; a++) {
				printf("%d ", ctxt->andlist[n].e[a]);
			}
			printf("\n");
		}
	}

	/* Append one constituent per valid and-list, spanning from the
	 * leftmost to the rightmost word of its elements. */
	c1 = numcon_total;
	for (n = 0; n < num_lists; n++)
	{
		if (ctxt->andlist[n].valid == 0) continue;

		/* Never write past the end of the constituent array.
		 * (assumes MAXCONSTITUENTS is its capacity, as the overflow
		 * test used when adding constituents elsewhere in this file
		 * suggests -- TODO confirm) */
		if (MAXCONSTITUENTS <= c1)
		{
			err_ctxt ec;
			ec.sent = linkage->sent;
			err_msg(&ec, Error, "Error: Too many constituents (merge).\n");
			break;
		}

		/* NOTE(review): 256 acts as "larger than any word index";
		 * presumably tied to the maximum sentence length -- confirm. */
		leftend = 256;
		rightend = -1;
		for (a = 0; a < ctxt->andlist[n].num; a++)
		{
			c2 = ctxt->andlist[n].e[a];
			if (con[c2].left < leftend)
			{
				leftend = con[c2].left;
			}
			if (con[c2].right > rightend)
			{
				rightend = con[c2].right;
			}
		}

		/* c2 is now the last element of the list; its type and start
		 * link are borrowed for the new constituent. */
		con[c1].left = leftend;
		con[c1].right = rightend;
		con[c1].type = con[c2].type;
		con[c1].domain_type = 'x';
		con[c1].valid = 1;
		con[c1].start_link = con[c2].start_link;  /* bogus */
		con[c1].start_num = con[c2].start_num;    /* bogus */

#ifdef AUX_CODE_IS_DEAD /* See comments above */
		/* If a constituent within the andlist is an aux (aux==1),
		 * set aux for the whole-list constituent to 2, also set
		 * aux for the smaller constituent to 2, meaning they'll both
		 * be printed (as an "X"). (If aux is 2 for the smaller
		 * constituent going in, the same thing should be done,
		 * though I doubt this ever happens.)
		 */
		for (a = 0; a < ctxt->andlist[n].num; a++)
		{
			c2 = ctxt->andlist[n].e[a];
			if ((ctxt->constituent[c2].aux == 1) || (ctxt->constituent[c2].aux == 2))
			{
				ctxt->constituent[c1].aux = 2;
				ctxt->constituent[c2].aux = 2;
			}
		}
#endif /* AUX_CODE_IS_DEAD */

		if (verbosity >= 2)
			printf("Adding constituent:\n");
		/* NOTE(review): called regardless of the verbosity test above,
		 * as before; presumably print_constituent checks verbosity
		 * itself -- confirm. */
		print_constituent(ctxt, linkage, c1);
		c1++;
	}
	numcon_total = c1;
	return numcon_total;
}
|
746
|
+
|
747
|
+
/**
|
748
|
+
* Go through all the words. If a word is on the right end of
|
749
|
+
* an S (or SF or SX), wordtype[w]=STYPE. If it's also on the left end of a
|
750
|
+
* Pg*b, I, PP, or Pv, wordtype[w]=PTYPE. If it's a question-word
|
751
|
+
* used in an indirect question, wordtype[w]=QTYPE. If it's a
|
752
|
+
* question-word determiner, wordtype[w]=QDTYPE. Else wordtype[w]=NONE.
|
753
|
+
* (This function is called once for each sublinkage.)
|
754
|
+
*/
|
755
|
+
static void generate_misc_word_info(con_context_t * ctxt, Linkage linkage)
|
756
|
+
{
|
757
|
+
int l1, l2, w1, w2;
|
758
|
+
const char * label1, * label2;
|
759
|
+
|
760
|
+
for (w1=0; w1<linkage->num_words; w1++)
|
761
|
+
ctxt->wordtype[w1]=NONE;
|
762
|
+
|
763
|
+
for (l1=0; l1<linkage_get_num_links(linkage); l1++) {
|
764
|
+
w1=linkage_get_link_rword(linkage, l1);
|
765
|
+
label1 = linkage_get_link_label(linkage, l1);
|
766
|
+
if ((uppercompare(label1, "S")==0) ||
|
767
|
+
(uppercompare(label1, "SX")==0) ||
|
768
|
+
(uppercompare(label1, "SF")==0)) {
|
769
|
+
ctxt->wordtype[w1] = STYPE;
|
770
|
+
for (l2=0; l2<linkage_get_num_links(linkage); l2++) {
|
771
|
+
w2=linkage_get_link_lword(linkage, l2);
|
772
|
+
label2 = linkage_get_link_label(linkage, l2);
|
773
|
+
if ((w1==w2) &&
|
774
|
+
((post_process_match("Pg#b", label2)==1) ||
|
775
|
+
(uppercompare(label2, "I")==0) ||
|
776
|
+
(uppercompare(label2, "PP")==0) ||
|
777
|
+
(post_process_match("Pv", label2)==1))) {
|
778
|
+
/* Pvf, Pgf? */
|
779
|
+
ctxt->wordtype[w1] = PTYPE;
|
780
|
+
}
|
781
|
+
}
|
782
|
+
}
|
783
|
+
if (post_process_match("QI#d", label1)==1) {
|
784
|
+
ctxt->wordtype[w1] = QTYPE;
|
785
|
+
for (l2=0; l2<linkage_get_num_links(linkage); l2++) {
|
786
|
+
w2=linkage_get_link_lword(linkage, l2);
|
787
|
+
label2 = linkage_get_link_label(linkage, l2);
|
788
|
+
if ((w1==w2) && (post_process_match("D##w", label2)==1)) {
|
789
|
+
ctxt->wordtype[w1] = QDTYPE;
|
790
|
+
}
|
791
|
+
}
|
792
|
+
}
|
793
|
+
if (post_process_match("Mr", label1)==1) ctxt->wordtype[w1] = QDTYPE;
|
794
|
+
if (post_process_match("MX#d", label1)==1) ctxt->wordtype[w1] = QDTYPE;
|
795
|
+
}
|
796
|
+
}
|
797
|
+
|
798
|
+
/**
 * Apply the final corrections to the merged constituent list.
 *
 * Per constituent: invalidate those started by a CP link, extend those
 * started by YS/YP one word to the right, relabel MVpn/COn/Mpn as NP,
 * invalidate a Wdc-started constituent whose left end is word 2,
 * invalidate one-word constituents started by an A link or of domain
 * type 'd'/'h', and pull a preceding "$" into an 'h' constituent.
 *
 * Then: extend a global S over a final ".", create a global S if no
 * valid S touches both ends of the sentence, and finally repair
 * constituents that overlap without nesting.
 *
 * @param ctxt          constituent context; constituent[] is modified
 * @param linkage       linkage being processed
 * @param numcon_total  number of constituents on entry
 * @return the number of constituents on exit (one more than on entry
 *         if a global S was added)
 */
static int last_minute_fixes(con_context_t *ctxt, Linkage linkage, int numcon_total)
{
	int c, c2, global_leftend_found, adjustment_made,
	    global_rightend_found, lastword;
	Sentence sent;
	sent = linkage_get_sentence(linkage);

	for (c = 0; c < numcon_total; c++)
	{
		constituent_t *k = &ctxt->constituent[c];

		/* In a paraphrase construction ("John ran, he said"),
		   the paraphrasing clause doesn't get
		   an S. (This is true in Treebank II, not Treebank I) */

		if (uppercompare(k->start_link, "CP") == 0)
		{
			k->valid = 0;
		}

		/* If it's a possessive with an "'s", the NP on the left
		   should be extended to include the "'s". */
		if ((uppercompare(k->start_link, "YS") == 0) ||
		    (uppercompare(k->start_link, "YP") == 0))
		{
			k->right++;
		}

		/* If a constituent has starting link MVpn, it's a time
		   expression like "last week"; label it as a noun phrase
		   (incorrectly) */

		if (strcmp(k->start_link, "MVpn") == 0)
		{
			k->type = string_set_add("NP", ctxt->phrase_ss);
		}
		if (strcmp(k->start_link, "COn") == 0)
		{
			k->type = string_set_add("NP", ctxt->phrase_ss);
		}
		if (strcmp(k->start_link, "Mpn") == 0)
		{
			k->type = string_set_add("NP", ctxt->phrase_ss);
		}

		/* If the constituent is an S started by "but" or "and" at
		   the beginning of the sentence, it should be ignored. */

		if ((strcmp(k->start_link, "Wdc") == 0) &&
		    (k->left == 2))
		{
			k->valid = 0;
		}

		/* For prenominal adjectives, an ADJP constituent is assigned
		   if it's a hyphenated (Ah) or comparative (Am) adjective;
		   otherwise no ADJP is assigned, unless the phrase is more
		   than one word long (e.g. "very big"). The same with certain
		   types of adverbs. */
		/* That was for Treebank I. For Treebank II, the rule only
		   seems to apply to prenominal adjectives (of all kinds).
		   However, it also applies to number expressions ("QP"). */

		if ((post_process_match("A", k->start_link) == 1) ||
		    (k->domain_type == 'd') ||
		    (k->domain_type == 'h')) {
			if (k->right - k->left == 0)
			{
				k->valid = 0;
			}
		}

		/* Pull a preceding "$" into the constituent.  The left > 0
		 * test keeps word[left - 1] inside the word array. */
		if ((k->domain_type == 'h') && (k->left > 0) &&
		    (strcmp(linkage->word[k->left - 1], "$") == 0))
		{
			k->left--;
		}

#ifdef AUX_CODE_IS_DEAD /* See comments at top */
		/* If a constituent has type VP and its aux value is 2,
		   this means it's an aux that should be printed; change its
		   type to "X". If its aux value is 1, set "valid" to 0. (This
		   applies to Treebank I only) */

		if (ctxt->constituent[c].aux == 2)
		{
			ctxt->constituent[c].type = string_set_add("X", ctxt->phrase_ss);
		}
		if (ctxt->constituent[c].aux == 1)
		{
			ctxt->constituent[c].valid = 0;
		}
#endif /* AUX_CODE_IS_DEAD */
	}

	/* If there's a global S constituent that includes everything
	   except a final period, extend it by one word.
	   NOTE(review): only "." is tested here; a final question mark
	   is not handled -- confirm whether that is intended. */

	for (c = 0; c < numcon_total; c++)
	{
		if ((ctxt->constituent[c].right == linkage->num_words - 3) &&
		    (ctxt->constituent[c].left == 1) &&
		    (strcmp(ctxt->constituent[c].type, "S") == 0) &&
		    (strcmp(sent->word[linkage->num_words - 2].string, ".") == 0))
			ctxt->constituent[c].right++;
	}

	/* If there's no S boundary at the very left end of the sentence,
	   or the very right end, create a new S spanning the entire sentence */

	lastword = linkage->num_words - 2;
	global_leftend_found = 0;
	global_rightend_found = 0;
	for (c = 0; c < numcon_total; c++)
	{
		if ((ctxt->constituent[c].left == 1) &&
		    (strcmp(ctxt->constituent[c].type, "S") == 0) &&
		    (ctxt->constituent[c].valid == 1))
		{
			global_leftend_found = 1;
		}
	}
	for (c = 0; c < numcon_total; c++)
	{
		if ((ctxt->constituent[c].right >= lastword) &&
		    (strcmp(ctxt->constituent[c].type, "S") == 0) &&
		    (ctxt->constituent[c].valid == 1))
		{
			global_rightend_found = 1;
		}
	}
	if ((global_leftend_found == 0) || (global_rightend_found == 0))
	{
		/* Never write past the end of the constituent array.
		 * (assumes MAXCONSTITUENTS is its capacity, as the overflow
		 * test used when adding constituents elsewhere in this file
		 * suggests -- TODO confirm) */
		if (MAXCONSTITUENTS <= numcon_total)
		{
			err_ctxt ec;
			ec.sent = linkage->sent;
			err_msg(&ec, Error, "Error: Too many constituents (global S).\n");
		}
		else
		{
			c = numcon_total;
			ctxt->constituent[c].left = 1;
			ctxt->constituent[c].right = linkage->num_words - 1;
			ctxt->constituent[c].type = string_set_add("S", ctxt->phrase_ss);
			ctxt->constituent[c].valid = 1;
			ctxt->constituent[c].domain_type = 'x';
			numcon_total++;
			if (verbosity >= 2)
				printf("Adding global sentence constituent:\n");
			/* NOTE(review): called regardless of the verbosity test
			 * above, as before; presumably print_constituent checks
			 * verbosity itself -- confirm. */
			print_constituent(ctxt, linkage, c);
		}
	}

	/* Check once more to see if constituents are nested (checking BETWEEN sublinkages
	   this time).  Repeat until a full pass makes no adjustment: moving
	   one left edge can create a new crossing with a third constituent.
	   Each adjustment strictly increases a left edge up to an existing
	   left edge, so the loop terminates. */

	do
	{
		adjustment_made = 0;
		for (c = 0; c < numcon_total; c++)
		{
			if (ctxt->constituent[c].valid == 0) continue;
			for (c2 = 0; c2 < numcon_total; c2++)
			{
				if (ctxt->constituent[c2].valid == 0) continue;
				if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
				    (ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
				    (ctxt->constituent[c].right >= ctxt->constituent[c2].left))
				{
					if (verbosity >= 2)
					{
						err_ctxt ec;
						ec.sent = linkage->sent;
						err_msg(&ec, Warn, "Warning: the constituents aren't nested! "
						        "Adjusting them. (%d, %d)\n", c, c2);
					}
					ctxt->constituent[c].left = ctxt->constituent[c2].left;
					adjustment_made = 1;
				}
			}
		}
	} while (adjustment_made != 0);

	return numcon_total;
}
|
972
|
+
|
973
|
+
/**
|
974
|
+
* This function generates a table, word_used[i][w], showing
|
975
|
+
* whether each word w is used in each sublinkage i; if so,
|
976
|
+
* the value for that cell of the table is 1.
|
977
|
+
*/
|
978
|
+
static void count_words_used(con_context_t *ctxt, Linkage linkage)
|
979
|
+
{
|
980
|
+
int i, w, link, num_subl;
|
981
|
+
|
982
|
+
num_subl = linkage->num_sublinkages;
|
983
|
+
if(linkage->unionized == 1 && num_subl > 1) num_subl--;
|
984
|
+
|
985
|
+
if (verbosity >= 2)
|
986
|
+
printf("Number of sublinkages = %d\n", num_subl);
|
987
|
+
|
988
|
+
for (i=0; i<num_subl; i++)
|
989
|
+
{
|
990
|
+
for (w = 0; w < linkage->num_words; w++) ctxt->word_used[i][w] = 0;
|
991
|
+
linkage->current = i;
|
992
|
+
for (link = 0; link < linkage_get_num_links(linkage); link++)
|
993
|
+
{
|
994
|
+
ctxt->word_used[i][linkage_get_link_lword(linkage, link)] = 1;
|
995
|
+
ctxt->word_used[i][linkage_get_link_rword(linkage, link)] = 1;
|
996
|
+
}
|
997
|
+
if (verbosity >= 2)
|
998
|
+
{
|
999
|
+
printf("Sublinkage %d: ", i);
|
1000
|
+
for (w = 0; w < linkage->num_words; w++)
|
1001
|
+
{
|
1002
|
+
if (ctxt->word_used[i][w] == 0) printf("0 ");
|
1003
|
+
if (ctxt->word_used[i][w] == 1) printf("1 ");
|
1004
|
+
}
|
1005
|
+
printf("\n");
|
1006
|
+
}
|
1007
|
+
}
|
1008
|
+
}
|
1009
|
+
|
1010
|
+
/**
 * Store a new constituent in the slot after index c.
 *
 * The word range [l, r] is clamped to [1, num_words - 2] first.
 * The domain type, start link label and start link number are taken
 * from "domain"; the type name is interned in ctxt->phrase_ss.
 *
 * @param ctxt     constituent context; constituent[] is modified
 * @param c        index of the most recently written constituent
 * @param linkage  linkage being processed
 * @param domain   domain the constituent is derived from
 * @param l        leftmost word of the constituent
 * @param r        rightmost word of the constituent
 * @param name     constituent type name, e.g. "NP"
 * @return the index at which the constituent was stored: c + 1, or
 *         MAXCONSTITUENTS - 1 if the array is already full (the last
 *         slot is then overwritten)
 */
static int add_constituent(con_context_t *ctxt, int c, Linkage linkage, Domain domain,
                           int l, int r, const char * name)
{
	int nwords = linkage->num_words-2;
	c++;

	/* Never write past the end of the constituent array; clamp in the
	 * same way as the overflow test used elsewhere in this file.
	 * (assumes MAXCONSTITUENTS is the array capacity -- TODO confirm) */
	if (MAXCONSTITUENTS <= c)
	{
		err_ctxt ec;
		ec.sent = linkage->sent;
		err_msg(&ec, Error, "Error: Too many constituents (add_constituent).\n");
		c--;
	}

	/* Avoid running off end, to walls. */
	if (l < 1) l=1;
	if (r > nwords) r = nwords;
	if (l > nwords) l = nwords;
	assert(l <= r, "negative constituent length!" );

	ctxt->constituent[c].left = l;
	ctxt->constituent[c].right = r;
	ctxt->constituent[c].domain_type = domain.type;
	ctxt->constituent[c].start_link =
		linkage_get_link_label(linkage, domain.start_link);
	ctxt->constituent[c].start_num = domain.start_link;
	ctxt->constituent[c].type = string_set_add(name, ctxt->phrase_ss);
	return c;
}
|
1031
|
+
|
1032
|
+
/**
 * Map a domain type letter to the name of the constituent it produces.
 *
 * @param linkage      linkage being processed (used for error reporting)
 * @param domain_type  domain type letter
 * @return the constituent name; for an unknown letter an error is
 *         reported and the empty string is returned
 */
static const char * cons_of_domain(Linkage linkage, char domain_type)
{
	switch (domain_type) {
		case 'a': case 'u':
			return "ADJP";
		case 'b': case 'f':
			return "SBAR";
		case 'c': case 't': case 'v': case 'z':
			return "VP";
		case 'd': case 'h':
			return "QP";
		case 'e': case 'i':
			return "ADVP";
		case 'g': case 'p':
			return "PP";
		case 'k':
			return "PRT";
		case 'n': case 'y':
			return "NP";
		case 'q':
			return "SINV";
		case 's':
			return "S";
		default:
			break;
	}

	/* No known domain type matched. */
	{
		err_ctxt ec;
		ec.sent = linkage->sent;
		err_msg(&ec, Error, "Error: Illegal domain: %c\n", domain_type);
	}
	return "";
}
|
1082
|
+
|
1083
|
+
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
|
1084
|
+
int numcon_total, int s)
|
1085
|
+
{
|
1086
|
+
int d, c, leftlimit, l, leftmost, rightmost, w, c2, numcon_subl = 0, w2;
|
1087
|
+
List_o_links * dlink;
|
1088
|
+
int rootright, rootleft, adjustment_made;
|
1089
|
+
Sublinkage * subl;
|
1090
|
+
const char * name;
|
1091
|
+
Domain domain;
|
1092
|
+
|
1093
|
+
subl = &linkage->sublinkage[s];
|
1094
|
+
|
1095
|
+
for (d = 0, c = numcon_total; d < subl->pp_data.N_domains; d++, c++)
|
1096
|
+
{
|
1097
|
+
domain = subl->pp_data.domain_array[d];
|
1098
|
+
rootright = linkage_get_link_rword(linkage, domain.start_link);
|
1099
|
+
rootleft = linkage_get_link_lword(linkage, domain.start_link);
|
1100
|
+
|
1101
|
+
if ((domain.type=='c') ||
|
1102
|
+
(domain.type=='d') ||
|
1103
|
+
(domain.type=='e') ||
|
1104
|
+
(domain.type=='f') ||
|
1105
|
+
(domain.type=='g') ||
|
1106
|
+
(domain.type=='u') ||
|
1107
|
+
(domain.type=='y'))
|
1108
|
+
{
|
1109
|
+
leftlimit = 0;
|
1110
|
+
leftmost = linkage_get_link_lword(linkage, domain.start_link);
|
1111
|
+
rightmost = linkage_get_link_lword(linkage, domain.start_link);
|
1112
|
+
}
|
1113
|
+
else
|
1114
|
+
{
|
1115
|
+
leftlimit = linkage_get_link_lword(linkage, domain.start_link) + 1;
|
1116
|
+
leftmost = linkage_get_link_rword(linkage, domain.start_link);
|
1117
|
+
rightmost = linkage_get_link_rword(linkage, domain.start_link);
|
1118
|
+
}
|
1119
|
+
|
1120
|
+
/* Start by assigning both left and right limits to the
|
1121
|
+
* right word of the start link. This will always be contained
|
1122
|
+
* in the constituent. This will also handle the case
|
1123
|
+
* where the domain contains no links.
|
1124
|
+
*/
|
1125
|
+
for (dlink = domain.lol; dlink != NULL; dlink = dlink->next)
|
1126
|
+
{
|
1127
|
+
l = dlink->link;
|
1128
|
+
|
1129
|
+
if ((linkage_get_link_lword(linkage, l) < leftmost) &&
|
1130
|
+
(linkage_get_link_lword(linkage, l) >= leftlimit))
|
1131
|
+
{
|
1132
|
+
leftmost = linkage_get_link_lword(linkage, l);
|
1133
|
+
}
|
1134
|
+
|
1135
|
+
if (linkage_get_link_rword(linkage, l) > rightmost)
|
1136
|
+
{
|
1137
|
+
rightmost = linkage_get_link_rword(linkage, l);
|
1138
|
+
}
|
1139
|
+
}
|
1140
|
+
|
1141
|
+
c--;
|
1142
|
+
c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost,
|
1143
|
+
cons_of_domain(linkage, domain.type));
|
1144
|
+
|
1145
|
+
if (domain.type == 'z')
|
1146
|
+
{
|
1147
|
+
c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
|
1148
|
+
}
|
1149
|
+
if (domain.type=='c')
|
1150
|
+
{
|
1151
|
+
c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
|
1152
|
+
}
|
1153
|
+
if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
|
1154
|
+
(post_process_match("Rn", ctxt->constituent[c].start_link)==1))
|
1155
|
+
{
|
1156
|
+
c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "SBAR");
|
1157
|
+
}
|
1158
|
+
if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
|
1159
|
+
(post_process_match("MX#r", ctxt->constituent[c].start_link)==1))
|
1160
|
+
{
|
1161
|
+
w = leftmost;
|
1162
|
+
if (strcmp(linkage->word[w], ",") == 0) w++;
|
1163
|
+
c = add_constituent(ctxt, c, linkage, domain, w, w, "WHNP");
|
1164
|
+
}
|
1165
|
+
if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
|
1166
|
+
{
|
1167
|
+
w = leftmost;
|
1168
|
+
if (strcmp(linkage->word[w], ",") == 0) w++;
|
1169
|
+
c = add_constituent(ctxt, c, linkage, domain, w, w+1, "WHPP");
|
1170
|
+
c = add_constituent(ctxt, c, linkage, domain, w+1, w+1, "WHNP");
|
1171
|
+
}
|
1172
|
+
if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
|
1173
|
+
(post_process_match("B#d", ctxt->constituent[c].start_link)==1))
|
1174
|
+
{
|
1175
|
+
c = add_constituent(ctxt, c, linkage, domain, rootleft, rootleft, "WHNP");
|
1176
|
+
c = add_constituent(ctxt, c, linkage, domain,
|
1177
|
+
rootleft, ctxt->constituent[c-1].right, "SBAR");
|
1178
|
+
}
|
1179
|
+
if (post_process_match("CP", ctxt->constituent[c].start_link)==1)
|
1180
|
+
{
|
1181
|
+
if (strcmp(linkage->word[leftmost], ",") == 0)
|
1182
|
+
ctxt->constituent[c].left++;
|
1183
|
+
c = add_constituent(ctxt, c, linkage, domain, 1, linkage->num_words-1, "S");
|
1184
|
+
}
|
1185
|
+
if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
|
1186
|
+
(domain.type=='f'))
|
1187
|
+
{
|
1188
|
+
w = ctxt->constituent[c].left;
|
1189
|
+
if (strcmp(linkage->word[w], ",") == 0)
|
1190
|
+
w++;
|
1191
|
+
if (strcmp(linkage->word[w], "when") == 0)
|
1192
|
+
{
|
1193
|
+
c = add_constituent(ctxt, c, linkage, domain, w, w, "WHADVP");
|
1194
|
+
}
|
1195
|
+
}
|
1196
|
+
if (domain.type=='t')
|
1197
|
+
{
|
1198
|
+
c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
|
1199
|
+
}
|
1200
|
+
if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
|
1201
|
+
(post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
|
1202
|
+
(post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
|
1203
|
+
{
|
1204
|
+
w = leftmost;
|
1205
|
+
if (strcmp(linkage->word[w], ",") == 0) w++;
|
1206
|
+
if (ctxt->wordtype[w] == NONE)
|
1207
|
+
name = "WHADVP";
|
1208
|
+
else if (ctxt->wordtype[w] == QTYPE)
|
1209
|
+
name = "WHNP";
|
1210
|
+
else if (ctxt->wordtype[w] == QDTYPE)
|
1211
|
+
name = "WHNP";
|
1212
|
+
else
|
1213
|
+
assert(0, "Unexpected word type");
|
1214
|
+
c = add_constituent(ctxt, c, linkage, domain, w, w, name);
|
1215
|
+
|
1216
|
+
if (ctxt->wordtype[w] == QDTYPE)
|
1217
|
+
{
|
1218
|
+
/* Now find the finite verb to the right, start an S */
|
1219
|
+
/* Limit w2 to sentence length. */
|
1220
|
+
// for( w2=w+1; w2 < ctxt->r_limit-1; w2++ )
|
1221
|
+
for (w2 = w+1; w2 < rightmost; w2++)
|
1222
|
+
if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;
|
1223
|
+
|
1224
|
+
/* Adjust the right boundary of previous constituent */
|
1225
|
+
ctxt->constituent[c].right = w2 - 1;
|
1226
|
+
c = add_constituent(ctxt, c, linkage, domain, w2, rightmost, "S");
|
1227
|
+
}
|
1228
|
+
}
|
1229
|
+
|
1230
|
+
if (ctxt->constituent[c].domain_type == '\0')
|
1231
|
+
{
|
1232
|
+
err_ctxt ec;
|
1233
|
+
ec.sent = linkage->sent;
|
1234
|
+
err_msg(&ec, Error, "Error: no domain type assigned to constituent\n");
|
1235
|
+
}
|
1236
|
+
if (ctxt->constituent[c].start_link == NULL)
|
1237
|
+
{
|
1238
|
+
err_ctxt ec;
|
1239
|
+
ec.sent = linkage->sent;
|
1240
|
+
err_msg(&ec, Error, "Error: no type assigned to constituent\n");
|
1241
|
+
}
|
1242
|
+
}
|
1243
|
+
|
1244
|
+
numcon_subl = c - numcon_total;
|
1245
|
+
/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl); */
|
1246
|
+
|
1247
|
+
if (verbosity >= 2)
|
1248
|
+
printf("Constituents added at first stage for subl %d:\n",
|
1249
|
+
linkage->current);
|
1250
|
+
for (c = numcon_total; c < numcon_total + numcon_subl; c++)
|
1251
|
+
{
|
1252
|
+
print_constituent(ctxt, linkage, c);
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
/* Opener case - generates S around main clause.
|
1256
|
+
(This must be done first; the S generated will be needed for
|
1257
|
+
later cases.) */
|
1258
|
+
numcon_subl =
|
1259
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", 5);
|
1260
|
+
|
1261
|
+
/* pp opener case */
|
1262
|
+
numcon_subl =
|
1263
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", 6);
|
1264
|
+
|
1265
|
+
/* participle opener case */
|
1266
|
+
numcon_subl =
|
1267
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", 9);
|
1268
|
+
|
1269
|
+
/* Subject-phrase case; every main VP generates an S */
|
1270
|
+
numcon_subl =
|
1271
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", 1);
|
1272
|
+
|
1273
|
+
/* Relative clause case; an SBAR generates a complement NP */
|
1274
|
+
numcon_subl =
|
1275
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", 3);
|
1276
|
+
|
1277
|
+
/* Participle modifier case */
|
1278
|
+
numcon_subl =
|
1279
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", 8);
|
1280
|
+
|
1281
|
+
/* PP modifying NP */
|
1282
|
+
numcon_subl =
|
1283
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", 8);
|
1284
|
+
|
1285
|
+
/* Appositive case */
|
1286
|
+
numcon_subl =
|
1287
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", 4);
|
1288
|
+
|
1289
|
+
/* S-V inversion case; an NP generates a complement VP */
|
1290
|
+
numcon_subl =
|
1291
|
+
gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", 7);
|
1292
|
+
|
1293
|
+
adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
|
1294
|
+
for (c = numcon_total; c < numcon_total + numcon_subl; c++)
|
1295
|
+
{
|
1296
|
+
if ((ctxt->constituent[c].domain_type=='p') &&
|
1297
|
+
(strcmp(linkage->word[ctxt->constituent[c].left], ",")==0))
|
1298
|
+
{
|
1299
|
+
ctxt->constituent[c].left++;
|
1300
|
+
}
|
1301
|
+
}
|
1302
|
+
|
1303
|
+
/* Make sure the constituents are nested. If two constituents
|
1304
|
+
* are not nested: whichever constituent has the furthest left
|
1305
|
+
* boundary, shift that boundary rightwards to the left boundary
|
1306
|
+
* of the other one.
|
1307
|
+
*/
|
1308
|
+
while (1)
|
1309
|
+
{
|
1310
|
+
adjustment_made = 0;
|
1311
|
+
for (c = numcon_total; c < numcon_total + numcon_subl; c++)
|
1312
|
+
{
|
1313
|
+
for (c2 = numcon_total; c2 < numcon_total + numcon_subl; c2++)
|
1314
|
+
{
|
1315
|
+
if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
|
1316
|
+
(ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
|
1317
|
+
(ctxt->constituent[c].right >= ctxt->constituent[c2].left))
|
1318
|
+
{
|
1319
|
+
/* We've found two overlapping constituents.
|
1320
|
+
If one is larger, except the smaller one
|
1321
|
+
includes an extra comma, adjust the smaller one
|
1322
|
+
to exclude the comma */
|
1323
|
+
|
1324
|
+
if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
|
1325
|
+
(strcmp(linkage->word[ctxt->constituent[c2].right],
|
1326
|
+
"RIGHT-WALL") == 0))
|
1327
|
+
{
|
1328
|
+
if (verbosity >= 2)
|
1329
|
+
printf("Adjusting %d to fix comma overlap\n", c2);
|
1330
|
+
adjust_for_right_comma(ctxt, linkage, c2);
|
1331
|
+
adjustment_made = 1;
|
1332
|
+
}
|
1333
|
+
else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
|
1334
|
+
{
|
1335
|
+
if (verbosity >= 2)
|
1336
|
+
printf("Adjusting c %d to fix comma overlap\n", c);
|
1337
|
+
adjust_for_left_comma(ctxt, linkage, c);
|
1338
|
+
adjustment_made = 1;
|
1339
|
+
}
|
1340
|
+
else
|
1341
|
+
{
|
1342
|
+
if (verbosity >= 2)
|
1343
|
+
{
|
1344
|
+
err_ctxt ec;
|
1345
|
+
ec.sent = linkage->sent;
|
1346
|
+
err_msg(&ec, Warn,
|
1347
|
+
"Warning: the constituents aren't nested! "
|
1348
|
+
"Adjusting them. (%d, %d)\n", c, c2);
|
1349
|
+
}
|
1350
|
+
ctxt->constituent[c].left = ctxt->constituent[c2].left;
|
1351
|
+
}
|
1352
|
+
}
|
1353
|
+
}
|
1354
|
+
}
|
1355
|
+
if (adjustment_made == 0) break;
|
1356
|
+
}
|
1357
|
+
|
1358
|
+
#ifdef AUX_CODE_IS_DEAD
|
1359
|
+
/* The code here is ifdef-dead as it appears to be dead, as the computation it does
|
1360
|
+
* is immediately undone in the very next block.
|
1361
|
+
*/
|
1362
|
+
/* This labels certain words as auxiliaries (such as forms of "be"
|
1363
|
+
* with passives, forms of "have" wth past participles,
|
1364
|
+
* "to" with infinitives). These words start VP's which include
|
1365
|
+
* them. In Treebank I, these don't get printed unless they're part of an
|
1366
|
+
* andlist, in which case they get labeled "X". (this is why we need to
|
1367
|
+
* label them as "aux".) In Treebank II, however, they seem to be treated
|
1368
|
+
* just like other verbs, so the "aux" stuff isn't needed.
|
1369
|
+
*/
|
1370
|
+
for (c = numcon_total; c < numcon_total + numcon_subl; c++)
|
1371
|
+
{
|
1372
|
+
ctxt->constituent[c].subl = linkage->current;
|
1373
|
+
if (((ctxt->constituent[c].domain_type == 'v') &&
|
1374
|
+
(ctxt->wordtype[linkage_get_link_rword(linkage,
|
1375
|
+
ctxt->constituent[c].start_num)] == PTYPE))
|
1376
|
+
||
|
1377
|
+
((ctxt->constituent[c].domain_type == 't') &&
|
1378
|
+
(strcmp(ctxt->constituent[c].type, "VP") == 0)))
|
1379
|
+
{
|
1380
|
+
ctxt->constituent[c].aux = 1;
|
1381
|
+
}
|
1382
|
+
else
|
1383
|
+
{
|
1384
|
+
ctxt->constituent[c].aux = 0;
|
1385
|
+
}
|
1386
|
+
}
|
1387
|
+
#endif /* AUX_CODE_IS_DEAD */
|
1388
|
+
|
1389
|
+
if (MAXCONSTITUENTS <= numcon_total + numcon_subl)
|
1390
|
+
{
|
1391
|
+
err_ctxt ec;
|
1392
|
+
ec.sent = linkage->sent;
|
1393
|
+
err_msg(&ec, Error, "Error: Too many constituents (a2).\n");
|
1394
|
+
numcon_total = MAXCONSTITUENTS - numcon_subl;
|
1395
|
+
}
|
1396
|
+
for (c = numcon_total; c < numcon_total + numcon_subl; c++)
|
1397
|
+
{
|
1398
|
+
ctxt->constituent[c].subl = linkage->current;
|
1399
|
+
#ifdef AUX_CODE_IS_DEAD /* See comments at top */
|
1400
|
+
ctxt->constituent[c].aux = 0;
|
1401
|
+
#endif /* AUX_CODE_IS_DEAD */
|
1402
|
+
}
|
1403
|
+
|
1404
|
+
return numcon_subl;
|
1405
|
+
}
|
1406
|
+
|
1407
|
+
/**
 * Overwrite every occurrence of `from` in the NUL-terminated buffer
 * `str` with `to`, in place. `from` and `to` must differ.
 */
static void con_substitute_char(char *str, char from, char to)
{
	char *hit;

	for (hit = strchr(str, from); hit != NULL; hit = strchr(hit, from))
		*hit = to;
}

/**
 * Build the flat, bracketed rendering of the constituents of a linkage.
 *
 * For every word position (the left wall, index 0, is skipped) this
 * emits, in order:
 *   - an "<OPEN_BRACKET>TYPE " marker for each valid constituent whose
 *     left edge is at that word, widest constituent first;
 *   - the word itself (the last word, i.e. the right wall, is never
 *     printed, but its constituent boundaries still are);
 *   - a "TYPE<CLOSE_BRACKET> " marker for each valid constituent whose
 *     right edge is at that word, narrowest constituent first.
 * A trailing newline terminates the string.
 *
 * @param ctxt          constituent context holding the constituent table
 * @param linkage       linkage whose words are printed
 * @param numcon_total  number of entries of ctxt->constituent to consider;
 *                      must be smaller than MAXCONSTITUENTS
 * @return the result of string_copy() on the accumulated text.
 */
static char * exprint_constituent_structure(con_context_t *ctxt, Linkage linkage, int numcon_total)
{
	int have_opened = 1;
	int c, w;
	int leftdone[MAXCONSTITUENTS];
	int rightdone[MAXCONSTITUENTS];
	int best, bestright, bestleft;
	Sentence sent;
	char s[100], * p;
	String * cs = string_new();

	assert (numcon_total < MAXCONSTITUENTS, "Too many constituents (b)");
	sent = linkage_get_sentence(linkage);

	for (c = 0; c < numcon_total; c++)
	{
		leftdone[c] = 0;
		rightdone[c] = 0;
	}

	if (verbosity >= 2)
		printf("\n");

	for (w = 1; w < linkage->num_words; w++)
	{
		/* Skip left wall; don't skip right wall, since it may
		   have constituent boundaries */

		/* Open every constituent starting at w, outermost first.
		 * Ties on the right edge go to the highest index (>=). */
		while (1)
		{
			best = -1;
			bestright = -1;
			for (c = 0; c < numcon_total; c++)
			{
				if ((ctxt->constituent[c].left == w) &&
				    (leftdone[c] == 0) && (ctxt->constituent[c].valid == 1) &&
				    (ctxt->constituent[c].right >= bestright))
				{
					best = c;
					bestright = ctxt->constituent[c].right;
				}
			}
			if (best == -1)
				break;

			leftdone[best] = 1;
			/* have_opened is a hack to avoid printing anything until
			 * a bracket is opened */
			if (w == 1) have_opened = 0;
#ifdef AUX_CODE_IS_DEAD /* See comments at top */
			if (ctxt->constituent[best].aux == 1) continue;
#endif /* AUX_CODE_IS_DEAD */
			have_opened = 1;
			append_string(cs, "%c%s ", OPEN_BRACKET, ctxt->constituent[best].type);
		}

		/* Don't print out right wall */
		if (have_opened && (w < linkage->num_words - 1))
		{
			/* Bounded copy: an over-long word is truncated rather than
			 * written past the end of the local buffer. */
			snprintf(s, sizeof(s), "%s", sent->word[w].string);

			/* Constituent processing will crash if the sentence contains
			 * square brackets, so replace them with curly braces.
			 */
			con_substitute_char(s, OPEN_BRACKET, '{');
			con_substitute_char(s, CLOSE_BRACKET, '}');

			/* Now, if the first character of the word was
			   originally uppercase, we put it back that way.
			   NOTE(review): assumes MAX_WORD <= sizeof(s) -- confirm. */
			if (sent->word[w].firstupper == 1)
				upcase_utf8_str(s, s, MAX_WORD);
			append_string(cs, "%s ", s);
		}

		/* Close every constituent ending at w, innermost first.
		 * Ties on the left edge go to the lowest index (>). */
		while (1)
		{
			best = -1;
			bestleft = -1;
			for (c = 0; c < numcon_total; c++)
			{
				if ((ctxt->constituent[c].right == w) &&
				    (rightdone[c] == 0) && (ctxt->constituent[c].valid == 1) &&
				    (ctxt->constituent[c].left > bestleft))
				{
					best = c;
					bestleft = ctxt->constituent[c].left;
				}
			}
			if (best == -1)
				break;
			rightdone[best] = 1;
#ifdef AUX_CODE_IS_DEAD /* See comments at top */
			if (ctxt->constituent[best].aux == 1)
				continue;
#endif /* AUX_CODE_IS_DEAD */
			append_string(cs, "%s%c ", ctxt->constituent[best].type, CLOSE_BRACKET);
		}
	}

	append_string(cs, "\n");
	p = string_copy(cs);
	string_delete(cs);
	return p;
}
|
1522
|
+
|
1523
|
+
/**
 * Run the whole constituent pipeline for a linkage and return the flat,
 * bracketed constituent string.
 *
 * Each sublinkage (at most MAXSUBL; the last one is dropped when the
 * linkage is unionized and has more than one) is post-processed with the
 * dictionary's constituent post-processor and its constituents are read
 * into ctxt. The accumulated constituents are then merged, fixed up and
 * printed. Whenever the running count reaches MAXCONSTITUENTS an error
 * is reported and the count is clamped to MAXCONSTITUENTS-1.
 *
 * @param ctxt     scratch context; ctxt->phrase_ss is created here and
 *                 deleted (and reset to NULL) before returning
 * @param linkage  linkage to process; its current sublinkage is changed
 * @return the string produced by exprint_constituent_structure().
 */
static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage)
{
	Postprocessor *postproc;
	char *flat;
	int subl_limit;
	int total = 0;
	int idx;

	/* Result unused; the call is retained so behaviour is unchanged. */
	(void) linkage_get_sentence(linkage);
	ctxt->phrase_ss = string_set_create();
	postproc = linkage->sent->dict->constituent_pp;

	count_words_used(ctxt, linkage);

	subl_limit = linkage->num_sublinkages;
	if (MAXSUBL < subl_limit)
	{
		subl_limit = MAXSUBL;
		if (verbosity >= 2)
			printf("Number of sublinkages exceeds maximum: only considering first %d sublinkages\n", MAXSUBL);
	}

	if ((linkage->unionized == 1) && (subl_limit > 1))
		subl_limit -= 1;

	for (idx = 0; idx < subl_limit; idx++)
	{
		linkage_set_current_sublinkage(linkage, idx);
		linkage_post_process(linkage, postproc);
		/* Result unused; the call is retained so behaviour is unchanged. */
		(void) linkage_get_num_words(linkage);
		generate_misc_word_info(ctxt, linkage);

		total += read_constituents_from_domains(ctxt, linkage, total, idx);
		if (total >= MAXCONSTITUENTS)
		{
			err_ctxt ec;
			ec.sent = linkage->sent;
			err_msg(&ec, Error, "Error: Too many constituents (c).\n");
			total = MAXCONSTITUENTS-1;
			break;
		}
	}

	total = merge_constituents(ctxt, linkage, total);
	if (total >= MAXCONSTITUENTS)
	{
		err_ctxt ec;
		ec.sent = linkage->sent;
		err_msg(&ec, Error, "Error: Too many constituents (d).\n");
		total = MAXCONSTITUENTS-1;
	}

	total = last_minute_fixes(ctxt, linkage, total);
	if (total >= MAXCONSTITUENTS)
	{
		err_ctxt ec;
		ec.sent = linkage->sent;
		err_msg(&ec, Error, "Error: Too many constituents (e).\n");
		total = MAXCONSTITUENTS-1;
	}

	flat = exprint_constituent_structure(ctxt, linkage, total);

	string_set_delete(ctxt->phrase_ss);
	ctxt->phrase_ss = NULL;
	return flat;
}
|
1585
|
+
|
1586
|
+
/**
 * Allocate a zeroed constituent context, produce the flat bracketed
 * constituent string for `linkage` with it, and release the context.
 *
 * @param linkage  linkage to render
 * @return the string from do_print_flat_constituents(), or NULL if the
 *         context could not be allocated.
 */
static char * print_flat_constituents(Linkage linkage)
{
	/* In principle, the ctxt could be allocated on stack, instead of
	 * on the heap. However, the java6 jvm (and MS Windows jvm's)
	 * gives JNI clients only a small amount of stack space. Alloc'ing
	 * this (rather large) structure on stack will blow up the JVM.
	 */
	char *flat;
	con_context_t *ctxt = calloc(1, sizeof *ctxt);

	/* Previously a failed allocation was passed straight to memset(). */
	if (ctxt == NULL)
		return NULL;

	flat = do_print_flat_constituents(ctxt, linkage);
	free(ctxt);
	return flat;
}
|
1601
|
+
|
1602
|
+
static CType token_type (char *token)
|
1603
|
+
{
|
1604
|
+
if ((token[0] == OPEN_BRACKET) && (strlen(token) > 1))
|
1605
|
+
return OPEN_TOK;
|
1606
|
+
if ((strlen(token) > 1) && (token[strlen(token) - 1] == CLOSE_BRACKET))
|
1607
|
+
return CLOSE_TOK;
|
1608
|
+
return WORD_TOK;
|
1609
|
+
}
|
1610
|
+
|
1611
|
+
static CNode * make_CNode(char *q)
|
1612
|
+
{
|
1613
|
+
CNode * cn;
|
1614
|
+
cn = (CNode *) exalloc(sizeof(CNode));
|
1615
|
+
cn->label = (char *) exalloc(sizeof(char)*(strlen(q)+1));
|
1616
|
+
strcpy(cn->label, q);
|
1617
|
+
cn->child = cn->next = (CNode *) NULL;
|
1618
|
+
cn->next = (CNode *) NULL;
|
1619
|
+
cn->start = cn->end = -1;
|
1620
|
+
return cn;
|
1621
|
+
}
|
1622
|
+
|
1623
|
+
/**
 * Recursively parse the children of node `n` from a strtok_r() token
 * stream over the flat constituent string.
 *
 * Tokens are consumed until the closing marker of `n` is met; its label
 * (the token minus the trailing bracket) must equal n->label. An opening
 * marker starts a nested node that is parsed recursively; any other
 * token becomes a leaf. Children are linked in order through ->next.
 *
 * @param n        node whose children are being collected
 * @param saveptr  strtok_r() continuation state
 * @return `n` once its closing marker is consumed; if the tokens run out
 *         first, the "did not close" assertion fires and NULL is returned.
 */
static CNode * parse_string(CNode * n, char **saveptr)
{
	char *tok;
	CNode *child = NULL;
	CNode *tail = NULL;

	while ((tok = strtok_r(NULL, " ", saveptr)) != NULL)
	{
		CType kind = token_type(tok);

		if (kind == CLOSE_TOK)
		{
			/* Strip the closing bracket so only the label remains. */
			tok[strlen(tok)-1] = '\0';
			assert(strcmp(tok, n->label)==0,
			       "Constituent tree: Labels do not match.");
			return n;
		}

		if (kind == OPEN_TOK)
		{
			/* tok+1 skips the opening bracket. */
			child = parse_string(make_CNode(tok+1), saveptr);
		}
		else if (kind == WORD_TOK)
		{
			child = make_CNode(tok);
		}
		else
		{
			assert(0, "Constituent tree: Illegal token type");
		}

		/* Append to the end of n's child list. */
		if (n->child == NULL)
			n->child = child;
		else
			tail->next = child;
		tail = child;
	}

	assert(0, "Constituent tree: Constituent did not close");
	return NULL;
}
|
1657
|
+
|
1658
|
+
/** Append `count` single-space strings to `cs`. */
static void tree_pad(String * cs, int count)
{
	int k;

	for (k = 0; k < count; ++k)
		append_string(cs, " ");
}

/**
 * Overwrite every occurrence of `from` in `label` with `to`, in place.
 * `from` and `to` must differ.
 */
static void tree_substitute_char(char *label, char from, char to)
{
	char *hit = strchr(label, from);

	while (hit)
	{
		*hit = to;
		hit = strchr(hit, from);
	}
}

/**
 * Append a parenthesised rendering of the subtree rooted at `n` to `cs`.
 *
 * @param cs      output string accumulator
 * @param indent  non-zero: one phrase per line, padded with spaces;
 *                zero: everything on one line, separated by spaces
 * @param n       subtree root; NULL prints nothing
 * @param o1      number of spaces emitted before this node's "(" when
 *                indenting
 * @param o2      base offset from which the children's offset is
 *                derived (o2 + strlen(label) + 2)
 *
 * Leaf labels are modified in place: '(' and ')' become '{' and '}' so
 * that they cannot be confused with the tree's own parentheses.
 */
static void print_tree(String * cs, int indent, CNode * n, int o1, int o2)
{
	int child_offset;
	CNode *kid;

	if (n == NULL) return;

	if (indent)
		tree_pad(cs, o1);
	append_string(cs, "(%s ", n->label);
	child_offset = o2 + strlen(n->label) + 2;

	for (kid = n->child; kid != NULL; kid = kid->next)
	{
		/* True when the following sibling exists and is a leaf. */
		int leaf_follows = (kid->next != NULL) && (kid->next->child == NULL);

		if (kid->child == NULL)
		{
			/* Leaf: neutralise parentheses, then print the word. */
			tree_substitute_char(kid->label, '(', '{');
			tree_substitute_char(kid->label, ')', '}');

			append_string(cs, "%s", kid->label);
			if (leaf_follows)
				append_string(cs, " ");
			continue;
		}

		/* Phrase: the first child continues the current line, later
		 * ones start a new line (or are space-separated when flat). */
		if (kid == n->child)
		{
			print_tree(cs, indent, kid, 0, child_offset);
		}
		else
		{
			if (indent) append_string(cs, "\n");
			else append_string(cs, " ");
			print_tree(cs, indent, kid, child_offset, child_offset);
		}

		if (leaf_follows)
		{
			if (indent)
			{
				append_string(cs, "\n");
				tree_pad(cs, child_offset);
			}
			else append_string(cs, " ");
		}
	}
	append_string(cs, ")");
}
|
1724
|
+
|
1725
|
+
/**
 * Fill in the start/end word indices of every node in the subtree
 * rooted at `n`, numbering leaves consecutively from `start`.
 *
 * A leaf spans exactly [start, start]; an interior node spans from
 * `start` to the index of the last leaf beneath it.
 *
 * @return the number of leaves in the subtree (0 when `n` is NULL).
 */
static int assign_spans(CNode * n, int start) {
	CNode *kid;
	int cursor = start;   /* index the next leaf will receive */

	if (n == NULL)
		return 0;

	n->start = start;

	if (n->child == NULL) {
		n->end = start;
		return 1;
	}

	for (kid = n->child; kid != NULL; kid = kid->next)
		cursor += assign_spans(kid, cursor);

	n->end = cursor - 1;
	return cursor - start;
}
|
1742
|
+
|
1743
|
+
/**
 * Build the constituent tree of a linkage.
 *
 * The flat bracketed string is generated, tokenised on spaces and
 * parsed into CNode objects; word spans are then assigned starting at
 * index 0. The first token must be an opening marker.
 *
 * @return the root node as returned by parse_string(); release it with
 *         linkage_free_constituent_tree().
 */
CNode * linkage_constituent_tree(Linkage linkage)
{
	char *flat, *first_tok, *cursor;
	int flat_len;
	CNode *tree;

	flat = print_flat_constituents(linkage);

	/* Measure before tokenising: strtok_r() writes NULs into the
	 * buffer, and the full size is needed to release it. */
	flat_len = strlen(flat);

	first_tok = strtok_r(flat, " ", &cursor);
	assert(token_type(first_tok) == OPEN_TOK, "Illegal beginning of string");

	/* first_tok+1 skips the opening bracket, leaving the label. */
	tree = make_CNode(first_tok+1);
	tree = parse_string(tree, &cursor);
	assign_spans(tree, 0);

	exfree(flat, sizeof(char)*(flat_len+1));
	return tree;
}
|
1760
|
+
|
1761
|
+
/**
 * Release a constituent tree: every child subtree, then the node's
 * label, then the node itself.
 *
 * @param n  root of the (sub)tree to free; NULL is accepted and ignored,
 *           so a failed tree construction can be passed straight here.
 */
void linkage_free_constituent_tree(CNode * n)
{
	CNode *m, *x;

	if (n == NULL) return;

	for (m=n->child; m!=NULL; m=x) {
		/* Save the sibling link before m is freed. */
		x=m->next;
		linkage_free_constituent_tree(m);
	}
	exfree(n->label, sizeof(char)*(strlen(n->label)+1));
	exfree(n, sizeof(CNode));
}
|
1771
|
+
|
1772
|
+
/**
|
1773
|
+
* Print out the constituent tree.
|
1774
|
+
* mode 1: treebank-style constituent tree
|
1775
|
+
* mode 2: flat, bracketed tree [A like [B this B] A]
|
1776
|
+
* mode 3: flat, treebank-style tree (A like (B this) )
|
1777
|
+
*/
|
1778
|
+
char * linkage_print_constituent_tree(Linkage linkage, int mode)
|
1779
|
+
{
|
1780
|
+
String * cs;
|
1781
|
+
CNode * root;
|
1782
|
+
char * p;
|
1783
|
+
|
1784
|
+
if ((mode == 0) || (linkage->sent->dict->constituent_pp == NULL))
|
1785
|
+
{
|
1786
|
+
return NULL;
|
1787
|
+
}
|
1788
|
+
else if (mode == 1 || mode == 3)
|
1789
|
+
{
|
1790
|
+
cs = string_new();
|
1791
|
+
root = linkage_constituent_tree(linkage);
|
1792
|
+
print_tree(cs, (mode==1), root, 0, 0);
|
1793
|
+
linkage_free_constituent_tree(root);
|
1794
|
+
append_string(cs, "\n");
|
1795
|
+
p = string_copy(cs);
|
1796
|
+
string_delete(cs);
|
1797
|
+
return p;
|
1798
|
+
}
|
1799
|
+
else if (mode == 2)
|
1800
|
+
{
|
1801
|
+
return print_flat_constituents(linkage);
|
1802
|
+
}
|
1803
|
+
assert(0, "Illegal mode in linkage_print_constituent_tree");
|
1804
|
+
return NULL;
|
1805
|
+
}
|
1806
|
+
|
1807
|
+
/**
 * Release a string returned by linkage_print_constituent_tree().
 *
 * @param s  string to free; NULL is accepted and ignored, since
 *           linkage_print_constituent_tree() returns NULL for mode 0 or
 *           when no constituent post-processor is available.
 */
void linkage_free_constituent_tree_str(char * s)
{
	if (s == NULL) return;
	exfree(s, strlen(s)+1);
}
|
1811
|
+
|
1812
|
+
const char * linkage_constituent_node_get_label(const CNode *n)
|
1813
|
+
{
|
1814
|
+
return n->label;
|
1815
|
+
}
|
1816
|
+
|
1817
|
+
|
1818
|
+
CNode * linkage_constituent_node_get_child(const CNode *n)
|
1819
|
+
{
|
1820
|
+
return n->child;
|
1821
|
+
}
|
1822
|
+
|
1823
|
+
CNode * linkage_constituent_node_get_next(const CNode *n)
|
1824
|
+
{
|
1825
|
+
return n->next;
|
1826
|
+
}
|
1827
|
+
|
1828
|
+
int linkage_constituent_node_get_start(const CNode *n)
|
1829
|
+
{
|
1830
|
+
return n->start;
|
1831
|
+
}
|
1832
|
+
|
1833
|
+
int linkage_constituent_node_get_end(const CNode *n)
|
1834
|
+
{
|
1835
|
+
return n->end;
|
1836
|
+
}
|