grammar_cop 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +8 -0
- data/data/.DS_Store +0 -0
- data/data/Makefile +511 -0
- data/data/Makefile.am +4 -0
- data/data/Makefile.in +511 -0
- data/data/de/.DS_Store +0 -0
- data/data/de/4.0.affix +7 -0
- data/data/de/4.0.dict +474 -0
- data/data/de/Makefile +387 -0
- data/data/de/Makefile.am +9 -0
- data/data/de/Makefile.in +387 -0
- data/data/en/.DS_Store +0 -0
- data/data/en/4.0.affix +26 -0
- data/data/en/4.0.batch +1002 -0
- data/data/en/4.0.biolg.batch +411 -0
- data/data/en/4.0.constituent-knowledge +127 -0
- data/data/en/4.0.dict +8759 -0
- data/data/en/4.0.dict.m4 +6928 -0
- data/data/en/4.0.enwiki.batch +14 -0
- data/data/en/4.0.fixes.batch +2776 -0
- data/data/en/4.0.knowledge +306 -0
- data/data/en/4.0.regex +225 -0
- data/data/en/4.0.voa.batch +114 -0
- data/data/en/Makefile +554 -0
- data/data/en/Makefile.am +19 -0
- data/data/en/Makefile.in +554 -0
- data/data/en/README +173 -0
- data/data/en/tiny.dict +157 -0
- data/data/en/words/.DS_Store +0 -0
- data/data/en/words/Makefile +456 -0
- data/data/en/words/Makefile.am +78 -0
- data/data/en/words/Makefile.in +456 -0
- data/data/en/words/currency +205 -0
- data/data/en/words/currency.p +28 -0
- data/data/en/words/entities.given-bisex.sing +39 -0
- data/data/en/words/entities.given-female.sing +4141 -0
- data/data/en/words/entities.given-male.sing +1633 -0
- data/data/en/words/entities.locations.sing +68 -0
- data/data/en/words/entities.national.sing +253 -0
- data/data/en/words/entities.organizations.sing +7 -0
- data/data/en/words/entities.us-states.sing +11 -0
- data/data/en/words/units.1 +45 -0
- data/data/en/words/units.1.dot +4 -0
- data/data/en/words/units.3 +2 -0
- data/data/en/words/units.4 +5 -0
- data/data/en/words/units.4.dot +1 -0
- data/data/en/words/words-medical.adv.1 +1191 -0
- data/data/en/words/words-medical.prep.1 +67 -0
- data/data/en/words/words-medical.v.4.1 +2835 -0
- data/data/en/words/words-medical.v.4.2 +2848 -0
- data/data/en/words/words-medical.v.4.3 +3011 -0
- data/data/en/words/words-medical.v.4.4 +3036 -0
- data/data/en/words/words-medical.v.4.5 +3050 -0
- data/data/en/words/words.adj.1 +6794 -0
- data/data/en/words/words.adj.2 +638 -0
- data/data/en/words/words.adj.3 +667 -0
- data/data/en/words/words.adv.1 +1573 -0
- data/data/en/words/words.adv.2 +67 -0
- data/data/en/words/words.adv.3 +157 -0
- data/data/en/words/words.adv.4 +80 -0
- data/data/en/words/words.n.1 +11464 -0
- data/data/en/words/words.n.1.wiki +264 -0
- data/data/en/words/words.n.2.s +2017 -0
- data/data/en/words/words.n.2.s.biolg +1 -0
- data/data/en/words/words.n.2.s.wiki +298 -0
- data/data/en/words/words.n.2.x +65 -0
- data/data/en/words/words.n.2.x.wiki +10 -0
- data/data/en/words/words.n.3 +5717 -0
- data/data/en/words/words.n.t +23 -0
- data/data/en/words/words.v.1.1 +1038 -0
- data/data/en/words/words.v.1.2 +1043 -0
- data/data/en/words/words.v.1.3 +1052 -0
- data/data/en/words/words.v.1.4 +1023 -0
- data/data/en/words/words.v.1.p +17 -0
- data/data/en/words/words.v.10.1 +14 -0
- data/data/en/words/words.v.10.2 +15 -0
- data/data/en/words/words.v.10.3 +88 -0
- data/data/en/words/words.v.10.4 +17 -0
- data/data/en/words/words.v.2.1 +1253 -0
- data/data/en/words/words.v.2.2 +1304 -0
- data/data/en/words/words.v.2.3 +1280 -0
- data/data/en/words/words.v.2.4 +1285 -0
- data/data/en/words/words.v.2.5 +1287 -0
- data/data/en/words/words.v.4.1 +2472 -0
- data/data/en/words/words.v.4.2 +2487 -0
- data/data/en/words/words.v.4.3 +2441 -0
- data/data/en/words/words.v.4.4 +2478 -0
- data/data/en/words/words.v.4.5 +2483 -0
- data/data/en/words/words.v.5.1 +98 -0
- data/data/en/words/words.v.5.2 +98 -0
- data/data/en/words/words.v.5.3 +103 -0
- data/data/en/words/words.v.5.4 +102 -0
- data/data/en/words/words.v.6.1 +388 -0
- data/data/en/words/words.v.6.2 +401 -0
- data/data/en/words/words.v.6.3 +397 -0
- data/data/en/words/words.v.6.4 +405 -0
- data/data/en/words/words.v.6.5 +401 -0
- data/data/en/words/words.v.8.1 +117 -0
- data/data/en/words/words.v.8.2 +118 -0
- data/data/en/words/words.v.8.3 +118 -0
- data/data/en/words/words.v.8.4 +119 -0
- data/data/en/words/words.v.8.5 +119 -0
- data/data/en/words/words.y +104 -0
- data/data/lt/.DS_Store +0 -0
- data/data/lt/4.0.affix +6 -0
- data/data/lt/4.0.constituent-knowledge +24 -0
- data/data/lt/4.0.dict +135 -0
- data/data/lt/4.0.knowledge +38 -0
- data/data/lt/Makefile +389 -0
- data/data/lt/Makefile.am +11 -0
- data/data/lt/Makefile.in +389 -0
- data/ext/.DS_Store +0 -0
- data/ext/link_grammar/.DS_Store +0 -0
- data/ext/link_grammar/extconf.rb +2 -0
- data/ext/link_grammar/link-grammar/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/.deps/analyze-linkage.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/and.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/api.Plo +244 -0
- data/ext/link_grammar/link-grammar/.deps/build-disjuncts.Plo +212 -0
- data/ext/link_grammar/link-grammar/.deps/command-line.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/constituents.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/count.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/disjunct-utils.Plo +126 -0
- data/ext/link_grammar/link-grammar/.deps/disjuncts.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/error.Plo +121 -0
- data/ext/link_grammar/link-grammar/.deps/expand.Plo +133 -0
- data/ext/link_grammar/link-grammar/.deps/extract-links.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/fast-match.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/idiom.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/jni-client.Plo +217 -0
- data/ext/link_grammar/link-grammar/.deps/link-parser.Po +1 -0
- data/ext/link_grammar/link-grammar/.deps/massage.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/post-process.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_knowledge.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/pp_lexer.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/pp_linkset.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/prefix.Plo +102 -0
- data/ext/link_grammar/link-grammar/.deps/preparation.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/print-util.Plo +200 -0
- data/ext/link_grammar/link-grammar/.deps/print.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/prune.Plo +202 -0
- data/ext/link_grammar/link-grammar/.deps/read-dict.Plo +223 -0
- data/ext/link_grammar/link-grammar/.deps/read-regex.Plo +123 -0
- data/ext/link_grammar/link-grammar/.deps/regex-morph.Plo +131 -0
- data/ext/link_grammar/link-grammar/.deps/resources.Plo +203 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
- data/ext/link_grammar/link-grammar/.deps/spellcheck-hun.Plo +115 -0
- data/ext/link_grammar/link-grammar/.deps/string-set.Plo +198 -0
- data/ext/link_grammar/link-grammar/.deps/tokenize.Plo +160 -0
- data/ext/link_grammar/link-grammar/.deps/utilities.Plo +222 -0
- data/ext/link_grammar/link-grammar/.deps/word-file.Plo +201 -0
- data/ext/link_grammar/link-grammar/.deps/word-utils.Plo +212 -0
- data/ext/link_grammar/link-grammar/.libs/analyze-linkage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/and.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/api.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/build-disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/command-line.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/constituents.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/count.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjunct-utils.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/disjuncts.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/error.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/expand.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/extract-links.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/fast-match.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/idiom.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/jni-client.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.a +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.dylib +0 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/.libs/liblink-grammar.lai +41 -0
- data/ext/link_grammar/link-grammar/.libs/massage.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/post-process.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_knowledge.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_lexer.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/pp_linkset.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prefix.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/preparation.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print-util.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/print.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/prune.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-dict.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/read-regex.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/regex-morph.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/resources.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-aspell.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/spellcheck-hun.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/string-set.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/tokenize.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/utilities.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-file.o +0 -0
- data/ext/link_grammar/link-grammar/.libs/word-utils.o +0 -0
- data/ext/link_grammar/link-grammar/Makefile +900 -0
- data/ext/link_grammar/link-grammar/Makefile.am +202 -0
- data/ext/link_grammar/link-grammar/Makefile.in +900 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.c +1317 -0
- data/ext/link_grammar/link-grammar/analyze-linkage.h +24 -0
- data/ext/link_grammar/link-grammar/and.c +1603 -0
- data/ext/link_grammar/link-grammar/and.h +27 -0
- data/ext/link_grammar/link-grammar/api-structures.h +362 -0
- data/ext/link_grammar/link-grammar/api-types.h +72 -0
- data/ext/link_grammar/link-grammar/api.c +1887 -0
- data/ext/link_grammar/link-grammar/api.h +96 -0
- data/ext/link_grammar/link-grammar/autoit/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/autoit/README +10 -0
- data/ext/link_grammar/link-grammar/autoit/_LGTest.au3 +22 -0
- data/ext/link_grammar/link-grammar/autoit/_LinkGrammar.au3 +545 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.c +487 -0
- data/ext/link_grammar/link-grammar/build-disjuncts.h +21 -0
- data/ext/link_grammar/link-grammar/command-line.c +458 -0
- data/ext/link_grammar/link-grammar/command-line.h +15 -0
- data/ext/link_grammar/link-grammar/constituents.c +1836 -0
- data/ext/link_grammar/link-grammar/constituents.h +26 -0
- data/ext/link_grammar/link-grammar/corpus/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/cluster.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/.deps/corpus.Plo +1 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile +527 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.am +46 -0
- data/ext/link_grammar/link-grammar/corpus/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/corpus/README +17 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.c +286 -0
- data/ext/link_grammar/link-grammar/corpus/cluster.h +32 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.c +483 -0
- data/ext/link_grammar/link-grammar/corpus/corpus.h +46 -0
- data/ext/link_grammar/link-grammar/count.c +828 -0
- data/ext/link_grammar/link-grammar/count.h +25 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.c +261 -0
- data/ext/link_grammar/link-grammar/disjunct-utils.h +27 -0
- data/ext/link_grammar/link-grammar/disjuncts.c +138 -0
- data/ext/link_grammar/link-grammar/disjuncts.h +13 -0
- data/ext/link_grammar/link-grammar/error.c +92 -0
- data/ext/link_grammar/link-grammar/error.h +35 -0
- data/ext/link_grammar/link-grammar/expand.c +67 -0
- data/ext/link_grammar/link-grammar/expand.h +13 -0
- data/ext/link_grammar/link-grammar/externs.h +22 -0
- data/ext/link_grammar/link-grammar/extract-links.c +625 -0
- data/ext/link_grammar/link-grammar/extract-links.h +16 -0
- data/ext/link_grammar/link-grammar/fast-match.c +309 -0
- data/ext/link_grammar/link-grammar/fast-match.h +17 -0
- data/ext/link_grammar/link-grammar/idiom.c +373 -0
- data/ext/link_grammar/link-grammar/idiom.h +15 -0
- data/ext/link_grammar/link-grammar/jni-client.c +779 -0
- data/ext/link_grammar/link-grammar/jni-client.h +236 -0
- data/ext/link_grammar/link-grammar/liblink-grammar-java.la +42 -0
- data/ext/link_grammar/link-grammar/liblink-grammar.la +41 -0
- data/ext/link_grammar/link-grammar/link-features.h +37 -0
- data/ext/link_grammar/link-grammar/link-features.h.in +37 -0
- data/ext/link_grammar/link-grammar/link-grammar-java.def +31 -0
- data/ext/link_grammar/link-grammar/link-grammar.def +194 -0
- data/ext/link_grammar/link-grammar/link-includes.h +465 -0
- data/ext/link_grammar/link-grammar/link-parser.c +849 -0
- data/ext/link_grammar/link-grammar/massage.c +329 -0
- data/ext/link_grammar/link-grammar/massage.h +13 -0
- data/ext/link_grammar/link-grammar/post-process.c +1113 -0
- data/ext/link_grammar/link-grammar/post-process.h +45 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.c +376 -0
- data/ext/link_grammar/link-grammar/pp_knowledge.h +14 -0
- data/ext/link_grammar/link-grammar/pp_lexer.c +1920 -0
- data/ext/link_grammar/link-grammar/pp_lexer.h +19 -0
- data/ext/link_grammar/link-grammar/pp_linkset.c +158 -0
- data/ext/link_grammar/link-grammar/pp_linkset.h +20 -0
- data/ext/link_grammar/link-grammar/prefix.c +482 -0
- data/ext/link_grammar/link-grammar/prefix.h +139 -0
- data/ext/link_grammar/link-grammar/preparation.c +412 -0
- data/ext/link_grammar/link-grammar/preparation.h +20 -0
- data/ext/link_grammar/link-grammar/print-util.c +87 -0
- data/ext/link_grammar/link-grammar/print-util.h +32 -0
- data/ext/link_grammar/link-grammar/print.c +1085 -0
- data/ext/link_grammar/link-grammar/print.h +16 -0
- data/ext/link_grammar/link-grammar/prune.c +1864 -0
- data/ext/link_grammar/link-grammar/prune.h +17 -0
- data/ext/link_grammar/link-grammar/read-dict.c +1785 -0
- data/ext/link_grammar/link-grammar/read-dict.h +29 -0
- data/ext/link_grammar/link-grammar/read-regex.c +161 -0
- data/ext/link_grammar/link-grammar/read-regex.h +12 -0
- data/ext/link_grammar/link-grammar/regex-morph.c +126 -0
- data/ext/link_grammar/link-grammar/regex-morph.h +17 -0
- data/ext/link_grammar/link-grammar/resources.c +180 -0
- data/ext/link_grammar/link-grammar/resources.h +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/.DS_Store +0 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/util.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/variables.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.am +29 -0
- data/ext/link_grammar/link-grammar/sat-solver/Makefile.in +527 -0
- data/ext/link_grammar/link-grammar/sat-solver/clock.hpp +33 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
- data/ext/link_grammar/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
- data/ext/link_grammar/link-grammar/sat-solver/guiding.hpp +244 -0
- data/ext/link_grammar/link-grammar/sat-solver/matrix-ut.hpp +79 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.h +11 -0
- data/ext/link_grammar/link-grammar/sat-solver/sat-encoder.hpp +381 -0
- data/ext/link_grammar/link-grammar/sat-solver/trie.hpp +118 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.cpp +23 -0
- data/ext/link_grammar/link-grammar/sat-solver/util.hpp +14 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.cpp +5 -0
- data/ext/link_grammar/link-grammar/sat-solver/variables.hpp +829 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.cpp +159 -0
- data/ext/link_grammar/link-grammar/sat-solver/word-tag.hpp +162 -0
- data/ext/link_grammar/link-grammar/spellcheck-aspell.c +148 -0
- data/ext/link_grammar/link-grammar/spellcheck-hun.c +136 -0
- data/ext/link_grammar/link-grammar/spellcheck.h +34 -0
- data/ext/link_grammar/link-grammar/string-set.c +169 -0
- data/ext/link_grammar/link-grammar/string-set.h +16 -0
- data/ext/link_grammar/link-grammar/structures.h +498 -0
- data/ext/link_grammar/link-grammar/tokenize.c +1049 -0
- data/ext/link_grammar/link-grammar/tokenize.h +15 -0
- data/ext/link_grammar/link-grammar/utilities.c +847 -0
- data/ext/link_grammar/link-grammar/utilities.h +281 -0
- data/ext/link_grammar/link-grammar/word-file.c +124 -0
- data/ext/link_grammar/link-grammar/word-file.h +15 -0
- data/ext/link_grammar/link-grammar/word-utils.c +526 -0
- data/ext/link_grammar/link-grammar/word-utils.h +152 -0
- data/ext/link_grammar/link_grammar.c +202 -0
- data/ext/link_grammar/link_grammar.h +99 -0
- data/grammar_cop.gemspec +24 -0
- data/lib/.DS_Store +0 -0
- data/lib/grammar_cop.rb +9 -0
- data/lib/grammar_cop/.DS_Store +0 -0
- data/lib/grammar_cop/dictionary.rb +19 -0
- data/lib/grammar_cop/linkage.rb +30 -0
- data/lib/grammar_cop/parse_options.rb +32 -0
- data/lib/grammar_cop/sentence.rb +36 -0
- data/lib/grammar_cop/version.rb +3 -0
- data/test/.DS_Store +0 -0
- data/test/grammar_cop_test.rb +27 -0
- metadata +407 -0
@@ -0,0 +1,1049 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* Copyright (c) 2004 */
|
3
|
+
/* Daniel Sleator, David Temperley, and John Lafferty */
|
4
|
+
/* Copyright (c) 2009 Linas Vepstas */
|
5
|
+
/* All rights reserved */
|
6
|
+
/* */
|
7
|
+
/* Use of the link grammar parsing system is subject to the terms of the */
|
8
|
+
/* license set forth in the LICENSE file included with this software, */
|
9
|
+
/* and also available at http://www.link.cs.cmu.edu/link/license.html */
|
10
|
+
/* This license allows free redistribution and use in source and binary */
|
11
|
+
/* forms, with or without modification, subject to certain conditions. */
|
12
|
+
/* */
|
13
|
+
/*************************************************************************/
|
14
|
+
|
15
|
+
#ifndef _WIN32
|
16
|
+
#include <langinfo.h>
|
17
|
+
#endif
|
18
|
+
#include <limits.h>
|
19
|
+
|
20
|
+
#include "build-disjuncts.h"
|
21
|
+
#include "error.h"
|
22
|
+
#include "externs.h"
|
23
|
+
#include "read-dict.h"
|
24
|
+
#include "regex-morph.h"
|
25
|
+
#include "spellcheck.h"
|
26
|
+
#include "string-set.h"
|
27
|
+
#include "structures.h"
|
28
|
+
#include "tokenize.h"
|
29
|
+
#include "utilities.h"
|
30
|
+
#include "word-utils.h"
|
31
|
+
|
32
|
+
#define MAX_STRIP 10
|
33
|
+
|
34
|
+
/* These are no longer in use, but are read from the 4.0.affix file */
|
35
|
+
/* I've left these here, as an example of what to expect. */
|
36
|
+
/*static char * strip_left[] = {"(", "$", "``", NULL}; */
|
37
|
+
/*static char * strip_right[] = {")", "%", ",", ".", ":", ";", "?", "!", "''", "'", "'s", NULL};*/
|
38
|
+
|
39
|
+
#define ENTITY_MARKER "<marker-entity>"
|
40
|
+
#define COMMON_ENTITY_MARKER "<marker-common-entity>"
|
41
|
+
|
42
|
+
/**
|
43
|
+
* is_common_entity - Return true if word is a common noun or adjective
|
44
|
+
* Common nouns and adjectives are typically used in corporate entity
|
45
|
+
* names -- e.g. "Sun State Bank" -- "sun", "state" and "bank" are all
|
46
|
+
* common nouns.
|
47
|
+
*/
|
48
|
+
static int is_common_entity(Dictionary dict, const char * str)
|
49
|
+
{
|
50
|
+
if (word_contains(dict, str, COMMON_ENTITY_MARKER) == 1)
|
51
|
+
return TRUE;
|
52
|
+
return FALSE;
|
53
|
+
}
|
54
|
+
|
55
|
+
/**
 * is_entity - tell whether str is tagged with the ENTITY_MARKER.
 *
 * The word itself is checked first. If it is not tagged, the name
 * returned by match_regex() for the word (if any) is checked instead.
 *
 * Returns TRUE when the word itself is tagged, FALSE when no regex
 * matches, and otherwise whatever word_contains() returns for the
 * regex name.
 */
static int is_entity(Dictionary dict, const char * str)
{
	const char * matched;

	if (1 != word_contains(dict, str, ENTITY_MARKER))
	{
		matched = match_regex(dict, str);
		if (matched != NULL)
		{
			return word_contains(dict, matched, ENTITY_MARKER);
		}
		return FALSE;
	}
	return TRUE;
}
|
64
|
+
|
65
|
+
|
66
|
+
/**
 * Decide whether word should be treated as a proper name.
 *
 * XXX This is a cheap hack that works only for English and is broken
 * for German! It needs to be replaced by something language-specific.
 *
 * The whole decision is delegated to is_utf8_upper(): a word that
 * starts with an upper-case letter is assumed to be a proper name,
 * and that's that.
 */
static int is_proper_name(const char * word)
{
	int starts_upper = is_utf8_upper(word);
	return starts_upper;
}
|
79
|
+
|
80
|
+
/**
 * Build a wide-character string holding every character that can be
 * construed to be a quotation mark. This works because link-grammar is
 * more or less ignorant of quotes at this time.
 *
 * The multi-byte list is converted with mbsrtowcs(), so the result
 * depends on the current locale; characters that cannot be converted
 * (and everything after them) are simply missing from the result.
 *
 * Returns a pointer to a static, always NUL-terminated buffer. The
 * buffer is rewritten on every call.
 */
static const wchar_t *list_of_quotes(void)
{
#define QUSZ 50
	static wchar_t wqs[QUSZ];
	mbstate_t mbs;
	/* Single-quotes are used for abbreviations, don't mess with them */
	/* const char * qs = "\"\'«»《》【】『』‘’`„“"; */
	const char * qs = "\"«»《》【】『』`„“";

	const char *pqs = qs;

	memset(&mbs, 0, sizeof(mbs));

	/* mbsrtowcs() writes no terminator once the length limit is hit,
	 * so reserve the last slot and terminate explicitly. */
	mbsrtowcs(wqs, &pqs, QUSZ - 1, &mbs);
	wqs[QUSZ - 1] = L'\0';

	return wqs;
}
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Return TRUE if the character is a quotation character.
|
104
|
+
*/
|
105
|
+
static int is_quote(wchar_t wc)
|
106
|
+
{
|
107
|
+
static const wchar_t *quotes = NULL;
|
108
|
+
if (NULL == quotes) quotes = list_of_quotes();
|
109
|
+
|
110
|
+
if (NULL != wcschr(quotes, wc)) return TRUE;
|
111
|
+
return FALSE;
|
112
|
+
}
|
113
|
+
|
114
|
+
/**
|
115
|
+
* Returns true if the word can be interpreted as a number.
|
116
|
+
* The ":" is included here so we allow "10:30" to be a number.
|
117
|
+
* We also allow U+00A0 "no-break space"
|
118
|
+
*/
|
119
|
+
static int is_number(const char * s)
|
120
|
+
{
|
121
|
+
mbstate_t mbs;
|
122
|
+
int nb = 1;
|
123
|
+
wchar_t c;
|
124
|
+
if (!is_utf8_digit(s)) return FALSE;
|
125
|
+
|
126
|
+
memset(&mbs, 0, sizeof(mbs));
|
127
|
+
while ((*s != 0) && (0 < nb))
|
128
|
+
{
|
129
|
+
nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
|
130
|
+
if (iswdigit(c)) { s += nb; }
|
131
|
+
|
132
|
+
/* U+00A0 no break space */
|
133
|
+
else if (0xa0 == c) { s += nb; }
|
134
|
+
|
135
|
+
else if ((*s == '.') || (*s == ',') || (*s == ':')) { s++; }
|
136
|
+
else return FALSE;
|
137
|
+
}
|
138
|
+
return TRUE;
|
139
|
+
}
|
140
|
+
|
141
|
+
/**
|
142
|
+
* Returns true if the word contains digits.
|
143
|
+
*/
|
144
|
+
static int contains_digits(const char * s)
|
145
|
+
{
|
146
|
+
mbstate_t mbs;
|
147
|
+
int nb = 1;
|
148
|
+
wchar_t c;
|
149
|
+
|
150
|
+
memset(&mbs, 0, sizeof(mbs));
|
151
|
+
while ((*s != 0) && (0 < nb))
|
152
|
+
{
|
153
|
+
nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
|
154
|
+
if (iswdigit(c)) return TRUE;
|
155
|
+
s += nb;
|
156
|
+
}
|
157
|
+
return FALSE;
|
158
|
+
}
|
159
|
+
|
160
|
+
/**
|
161
|
+
* The string s is the next word of the sentence.
|
162
|
+
* Do not issue the empty string.
|
163
|
+
* Return false if too many words or the word is too long.
|
164
|
+
*/
|
165
|
+
static int issue_sentence_word(Sentence sent, const char * s)
|
166
|
+
{
|
167
|
+
if (*s == '\0') return TRUE;
|
168
|
+
if (strlen(s) > MAX_WORD)
|
169
|
+
{
|
170
|
+
err_ctxt ec;
|
171
|
+
ec.sent = sent;
|
172
|
+
err_msg(&ec, Error,
|
173
|
+
"Error separating sentence. The word \"%s\" is too long.\n"
|
174
|
+
"A word can have a maximum of %d characters.\n", s, MAX_WORD);
|
175
|
+
return FALSE;
|
176
|
+
}
|
177
|
+
|
178
|
+
if (sent->length >= MAX_SENTENCE)
|
179
|
+
{
|
180
|
+
err_ctxt ec;
|
181
|
+
ec.sent = sent;
|
182
|
+
err_msg(&ec, Error,
|
183
|
+
"Error separating sentence. The sentence has too many words.\n");
|
184
|
+
return FALSE;
|
185
|
+
}
|
186
|
+
|
187
|
+
strcpy(sent->word[sent->length].string, s);
|
188
|
+
|
189
|
+
/* Now we record whether the first character of the word is upper-case.
|
190
|
+
(The first character may be made lower-case
|
191
|
+
later, but we may want to get at the original version) */
|
192
|
+
if (is_utf8_upper(s)) sent->word[sent->length].firstupper=1;
|
193
|
+
else sent->word[sent->length].firstupper = 0;
|
194
|
+
sent->length++;
|
195
|
+
return TRUE;
|
196
|
+
}
|
197
|
+
|
198
|
+
/*
|
199
|
+
Here's a summary of how subscripts are handled:
|
200
|
+
|
201
|
+
Reading the dictionary:
|
202
|
+
|
203
|
+
If the last "." in a string is followed by a non-digit character,
|
204
|
+
then the "." and everything after it is considered to be the subscript
|
205
|
+
of the word.
|
206
|
+
|
207
|
+
The dictionary reader does not allow you to have two words that
|
208
|
+
match according to the criterion below. (so you can't have
|
209
|
+
"dog.n" and "dog")
|
210
|
+
|
211
|
+
Quote marks are used to allow you to define words in the dictionary
|
212
|
+
which would otherwise be considered part of the dictionary, as in
|
213
|
+
|
214
|
+
";": {@Xca-} & Xx- & (W+ or Qd+) & {Xx+};
|
215
|
+
"%" : (ND- & {DD-} & <noun-sub-x> &
|
216
|
+
(<noun-main-x> or B*x+)) or (ND- & (OD- or AN+));
|
217
|
+
|
218
|
+
Rules for chopping words from the input sentence:
|
219
|
+
|
220
|
+
First the prefix chars are stripped off of the word. These
|
221
|
+
characters are "(" and "$" (and now "``")
|
222
|
+
|
223
|
+
Now, repeat the following as long as necessary:
|
224
|
+
|
225
|
+
Look up the word in the dictionary.
|
226
|
+
If it's there, the process terminates.
|
227
|
+
|
228
|
+
If it's not there and it ends in one of the right strippable
|
229
|
+
strings (see "strip_right") then remove the strippable string
|
230
|
+
and make it into a separate word.
|
231
|
+
|
232
|
+
If there is no strippable string, then the process terminates.
|
233
|
+
|
234
|
+
Rule for defining subscripts in input words:
|
235
|
+
|
236
|
+
The subscript rule is followed just as when reading the dictionary.
|
237
|
+
|
238
|
+
When does a word in the sentence match a word in the dictionary?
|
239
|
+
|
240
|
+
Matching is done as follows: Two words with subscripts must match
|
241
|
+
exactly. If neither has a subscript they must match exactly. If one
|
242
|
+
does and one doesn't then they must match when the subscript is
|
243
|
+
removed. Notice that this is symmetric.
|
244
|
+
|
245
|
+
So, under this system, the dictionary could have the words "Ill" and
|
246
|
+
also the word "Ill." It could also have the word "i.e.", which could be
|
247
|
+
used in a sentence.
|
248
|
+
*/
|
249
|
+
|
250
|
+
/* MIN(a, b): yields the smaller of a and b. Any earlier definition is
 * dropped first. Either argument may be evaluated twice, so do not pass
 * expressions with side effects. */
#undef MIN
|
251
|
+
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
252
|
+
|
253
|
+
/**
 * Look word up in the dictionary, falling back on the regexes.
 *
 * Returns TRUE if boolean_dictionary_lookup() finds the word itself.
 * Otherwise, if match_regex() yields a name for the word, the result
 * of looking that name up is returned; with no regex match the result
 * is FALSE.
 */
static int boolean_reg_dict_lookup(Dictionary dict, const char * word)
{
	const char * matched;

	if (!boolean_dictionary_lookup(dict, word))
	{
		matched = match_regex(dict, word);
		if (matched != NULL)
		{
			return boolean_dictionary_lookup(dict, matched);
		}
		return FALSE;
	}
	return TRUE;
}
|
263
|
+
|
264
|
+
/**
 * Check whether word, with its first character lower-cased, can be
 * found via boolean_reg_dict_lookup().
 *
 * Returns FALSE right away unless is_utf8_upper(word) holds. The first
 * multi-byte character is lower-cased with towlower(); if its encoded
 * length differs from the original's, a warning is printed and FALSE is
 * returned. Otherwise the leading bytes of word are temporarily
 * overwritten in place for the lookup and restored before returning,
 * so the caller sees word unchanged.
 */
static int downcase_is_in_dict(Dictionary dict, char * word)
{
	mbstate_t decode_state, encode_state;
	char lowered[MB_LEN_MAX];
	char original[MB_LEN_MAX];
	wchar_t first;
	int n_upper, n_lower;
	int k, found;

	if (!is_utf8_upper(word)) return FALSE;

	memset(&decode_state, 0, sizeof(decode_state));
	memset(&encode_state, 0, sizeof(encode_state));

	n_upper = mbrtowc (&first, word, MB_CUR_MAX, &decode_state);
	first = towlower(first);
	n_lower = wctomb_check(lowered, first, &encode_state);
	if (n_lower != n_upper)
	{
		prt_error("Warning: can't downcase multi-byte string: %s\n", word);
		return FALSE;
	}

	/* Stash the original leading bytes, then put the lower-case
	 * encoding in their place. */
	for (k = 0; k < n_lower; k++) original[k] = word[k];
	for (k = 0; k < n_lower; k++) word[k] = lowered[k];

	found = boolean_reg_dict_lookup(dict, word);

	/* Undo the in-place edit. */
	for (k = 0; k < n_upper; k++) word[k] = original[k];

	return found;
}
|
296
|
+
|
297
|
+
/**
|
298
|
+
* w points to a string, wend points to the char one after the end. The
|
299
|
+
* "word" w contains no blanks. This function splits up the word if
|
300
|
+
* necessary, and calls "issue_sentence_word()" on each of the resulting
|
301
|
+
* parts. The process is described above. Returns TRUE if OK, FALSE if
|
302
|
+
* too many punctuation marks or other separation error.
|
303
|
+
*/
|
304
|
+
static int separate_word(Sentence sent, Parse_Options opts,
|
305
|
+
const char *w, const char *wend,
|
306
|
+
int is_first_word, int quote_found)
|
307
|
+
{
|
308
|
+
size_t sz;
|
309
|
+
int i, j, len;
|
310
|
+
int r_strippable=0, l_strippable=0, u_strippable=0;
|
311
|
+
int s_strippable=0, p_strippable=0;
|
312
|
+
int n_r_stripped, s_stripped;
|
313
|
+
int word_is_in_dict, s_ok;
|
314
|
+
int issued = FALSE;
|
315
|
+
|
316
|
+
int found_number = 0;
|
317
|
+
int n_r_stripped_save;
|
318
|
+
const char * wend_save;
|
319
|
+
|
320
|
+
const char ** strip_left = NULL;
|
321
|
+
const char ** strip_right = NULL;
|
322
|
+
const char ** strip_units = NULL;
|
323
|
+
const char ** prefix = NULL;
|
324
|
+
const char ** suffix = NULL;
|
325
|
+
char word[MAX_WORD+1];
|
326
|
+
char newword[MAX_WORD+1];
|
327
|
+
|
328
|
+
const char *r_stripped[MAX_STRIP]; /* these were stripped from the right */
|
329
|
+
|
330
|
+
/* First, see if we can already recognize the word as-is. If
|
331
|
+
* so, then we are done. Else we'll try stripping prefixes, suffixes.
|
332
|
+
*/
|
333
|
+
sz = MIN(wend-w, MAX_WORD);
|
334
|
+
strncpy(word, w, sz);
|
335
|
+
word[sz] = '\0';
|
336
|
+
word_is_in_dict = FALSE;
|
337
|
+
|
338
|
+
if (boolean_reg_dict_lookup(sent->dict, word))
|
339
|
+
word_is_in_dict = TRUE;
|
340
|
+
else if (is_first_word && downcase_is_in_dict (sent->dict,word))
|
341
|
+
word_is_in_dict = TRUE;
|
342
|
+
|
343
|
+
if (word_is_in_dict)
|
344
|
+
{
|
345
|
+
return issue_sentence_word(sent, word);
|
346
|
+
}
|
347
|
+
|
348
|
+
/* Set up affix tables. */
|
349
|
+
if (sent->dict->affix_table != NULL)
|
350
|
+
{
|
351
|
+
Dictionary dict = sent->dict->affix_table;
|
352
|
+
r_strippable = dict->r_strippable;
|
353
|
+
l_strippable = dict->l_strippable;
|
354
|
+
u_strippable = dict->u_strippable;
|
355
|
+
p_strippable = dict->p_strippable;
|
356
|
+
s_strippable = dict->s_strippable;
|
357
|
+
|
358
|
+
strip_left = dict->strip_left;
|
359
|
+
strip_right = dict->strip_right;
|
360
|
+
strip_units = dict->strip_units;
|
361
|
+
prefix = dict->prefix;
|
362
|
+
suffix = dict->suffix;
|
363
|
+
}
|
364
|
+
|
365
|
+
/* Strip off punctuation, etc. on the left-hand side. */
|
366
|
+
/* XXX FIXME: this fails in certain cases: e.g.
|
367
|
+
* "By the '50s, he was very prosperous."
|
368
|
+
* where the leading quote is striped, and then "50s," cannot be
|
369
|
+
* found in the dict. Next, the comma is removed, and "50s" is still
|
370
|
+
* not in the dict ... the trick was that the comma should be
|
371
|
+
* right-stripped first, then the possible quotes.
|
372
|
+
* More generally, link-grammar does not support multiple possible
|
373
|
+
* tokenizations.
|
374
|
+
*/
|
375
|
+
for (;;)
|
376
|
+
{
|
377
|
+
for (i=0; i<l_strippable; i++)
|
378
|
+
{
|
379
|
+
/* This is UTF8-safe, I beleive ... */
|
380
|
+
sz = strlen(strip_left[i]);
|
381
|
+
if (strncmp(w, strip_left[i], sz) == 0)
|
382
|
+
{
|
383
|
+
if (!issue_sentence_word(sent, strip_left[i])) return FALSE;
|
384
|
+
w += sz;
|
385
|
+
break;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
if (i == l_strippable) break;
|
389
|
+
}
|
390
|
+
|
391
|
+
/* Its possible that the token consisted entirely of
|
392
|
+
* left-punctuation, in which case, it has all been issued.
|
393
|
+
* So -- we're done, return.
|
394
|
+
*/
|
395
|
+
if (w >= wend) return TRUE;
|
396
|
+
|
397
|
+
/* Now w points to the string starting just to the right of
|
398
|
+
* any left-stripped characters.
|
399
|
+
* stripped[] is an array of numbers, indicating the index
|
400
|
+
* numbers (in the strip_right array) of any strings stripped off;
|
401
|
+
* stripped[0] is the number of the first string stripped off, etc.
|
402
|
+
* When it breaks out of this loop, n_stripped will be the number
|
403
|
+
* of strings stripped off.
|
404
|
+
*/
|
405
|
+
for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++)
|
406
|
+
{
|
407
|
+
sz = MIN(wend-w, MAX_WORD);
|
408
|
+
strncpy(word, w, sz);
|
409
|
+
word[sz] = '\0';
|
410
|
+
if (wend == w) break; /* it will work without this */
|
411
|
+
|
412
|
+
if (boolean_reg_dict_lookup(sent->dict, word))
|
413
|
+
{
|
414
|
+
word_is_in_dict = TRUE;
|
415
|
+
break;
|
416
|
+
}
|
417
|
+
|
418
|
+
/* This could happen if it's a word after a colon, also! */
|
419
|
+
if (is_first_word && downcase_is_in_dict (sent->dict, word))
|
420
|
+
{
|
421
|
+
word_is_in_dict = TRUE;
|
422
|
+
break;
|
423
|
+
}
|
424
|
+
|
425
|
+
for (i=0; i < r_strippable; i++)
|
426
|
+
{
|
427
|
+
len = strlen(strip_right[i]);
|
428
|
+
|
429
|
+
/* the remaining w is too short for a possible match */
|
430
|
+
if ((wend-w) < len) continue;
|
431
|
+
if (strncmp(wend-len, strip_right[i], len) == 0)
|
432
|
+
{
|
433
|
+
r_stripped[n_r_stripped] = strip_right[i];
|
434
|
+
wend -= len;
|
435
|
+
break;
|
436
|
+
}
|
437
|
+
}
|
438
|
+
if (i == r_strippable) break;
|
439
|
+
}
|
440
|
+
|
441
|
+
/* Is there a number in the word? If so, then search for
|
442
|
+
* trailing units suffixes.
|
443
|
+
*/
|
444
|
+
if ((FALSE == word_is_in_dict) && contains_digits(word))
|
445
|
+
{
|
446
|
+
/* Same as above, but with a twist: the only thing that can
|
447
|
+
* preceed a units suffix is a number. This is so that we can
|
448
|
+
* split up things like "12ft" (twelve feet) but not split up
|
449
|
+
* things like "Delft blue". Multiple passes allow for
|
450
|
+
* constructions such as 12sq.ft.
|
451
|
+
*/
|
452
|
+
n_r_stripped_save = n_r_stripped;
|
453
|
+
wend_save = wend;
|
454
|
+
for (; n_r_stripped < MAX_STRIP; n_r_stripped++)
|
455
|
+
{
|
456
|
+
size_t sz = MIN(wend-w, MAX_WORD);
|
457
|
+
strncpy(word, w, sz);
|
458
|
+
word[sz] = '\0';
|
459
|
+
if (wend == w) break; /* it will work without this */
|
460
|
+
|
461
|
+
/* Number */
|
462
|
+
if (is_number(word))
|
463
|
+
{
|
464
|
+
found_number = 1;
|
465
|
+
break;
|
466
|
+
}
|
467
|
+
|
468
|
+
for (i=0; i < u_strippable; i++)
|
469
|
+
{
|
470
|
+
len = strlen(strip_units[i]);
|
471
|
+
|
472
|
+
/* the remaining w is too short for a possible match */
|
473
|
+
if ((wend-w) < len) continue;
|
474
|
+
if (strncmp(wend-len, strip_units[i], len) == 0)
|
475
|
+
{
|
476
|
+
r_stripped[n_r_stripped] = strip_units[i];
|
477
|
+
wend -= len;
|
478
|
+
break;
|
479
|
+
}
|
480
|
+
}
|
481
|
+
if (i == u_strippable) break;
|
482
|
+
}
|
483
|
+
|
484
|
+
/* The root *must* be a number! */
|
485
|
+
if (0 == found_number)
|
486
|
+
{
|
487
|
+
wend = wend_save;
|
488
|
+
n_r_stripped = n_r_stripped_save;
|
489
|
+
}
|
490
|
+
}
|
491
|
+
|
492
|
+
/* Now we strip off suffixes...w points to the remaining word,
|
493
|
+
* "wend" to the end of the word. */
|
494
|
+
|
495
|
+
s_stripped = -1;
|
496
|
+
strncpy(word, w, MIN(wend-w, MAX_WORD));
|
497
|
+
word[MIN(wend-w, MAX_WORD)] = '\0';
|
498
|
+
|
499
|
+
/* Umm, double-check, if need be ... !?? */
|
500
|
+
if (FALSE == word_is_in_dict)
|
501
|
+
{
|
502
|
+
if (boolean_reg_dict_lookup(sent->dict, word))
|
503
|
+
word_is_in_dict = TRUE;
|
504
|
+
else if (is_first_word && downcase_is_in_dict (sent->dict,word))
|
505
|
+
word_is_in_dict = TRUE;
|
506
|
+
}
|
507
|
+
|
508
|
+
if (FALSE == word_is_in_dict)
|
509
|
+
{
|
510
|
+
j=0;
|
511
|
+
for (i=0; i <= s_strippable; i++)
|
512
|
+
{
|
513
|
+
s_ok = 0;
|
514
|
+
/* Go through once for each suffix; then go through one
|
515
|
+
* final time for the no-suffix case */
|
516
|
+
if (i < s_strippable)
|
517
|
+
{
|
518
|
+
len = strlen(suffix[i]);
|
519
|
+
|
520
|
+
/* The remaining w is too short for a possible match */
|
521
|
+
if ((wend-w) < len) continue;
|
522
|
+
if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1;
|
523
|
+
}
|
524
|
+
else
|
525
|
+
len = 0;
|
526
|
+
|
527
|
+
if (s_ok || i == s_strippable)
|
528
|
+
{
|
529
|
+
strncpy(newword, w, MIN((wend-len)-w, MAX_WORD));
|
530
|
+
newword[MIN((wend-len)-w, MAX_WORD)] = '\0';
|
531
|
+
|
532
|
+
/* Check if the remainder is in the dictionary;
|
533
|
+
* for the no-suffix case, it won't be */
|
534
|
+
if (boolean_reg_dict_lookup(sent->dict, newword))
|
535
|
+
{
|
536
|
+
if ((verbosity>1) && (i < s_strippable))
|
537
|
+
printf("Splitting word into two: %s-%s\n", newword, suffix[i]);
|
538
|
+
s_stripped = i;
|
539
|
+
wend -= len;
|
540
|
+
strncpy(word, w, MIN(wend-w, MAX_WORD));
|
541
|
+
word[MIN(wend-w, MAX_WORD)] = '\0';
|
542
|
+
word_is_in_dict = TRUE;
|
543
|
+
break;
|
544
|
+
}
|
545
|
+
|
546
|
+
/* If the remainder isn't in the dictionary,
|
547
|
+
* try stripping off prefixes */
|
548
|
+
else
|
549
|
+
{
|
550
|
+
for (j=0; j<p_strippable; j++)
|
551
|
+
{
|
552
|
+
if (strncmp(w, prefix[j], strlen(prefix[j])) == 0)
|
553
|
+
{
|
554
|
+
int sz = MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD);
|
555
|
+
strncpy(newword, w+strlen(prefix[j]), sz);
|
556
|
+
newword[sz] = '\0';
|
557
|
+
if (boolean_reg_dict_lookup(sent->dict, newword))
|
558
|
+
{
|
559
|
+
if ((verbosity>1) && (i < s_strippable))
|
560
|
+
printf("Splitting word into three: %s-%s-%s\n",
|
561
|
+
prefix[j], newword, suffix[i]);
|
562
|
+
if (!issue_sentence_word(sent, prefix[j])) return FALSE;
|
563
|
+
if (i < s_strippable) s_stripped = i;
|
564
|
+
wend -= len;
|
565
|
+
w += strlen(prefix[j]);
|
566
|
+
sz = MIN(wend-w, MAX_WORD);
|
567
|
+
strncpy(word, w, sz);
|
568
|
+
word[sz] = '\0';
|
569
|
+
word_is_in_dict = TRUE;
|
570
|
+
break;
|
571
|
+
}
|
572
|
+
}
|
573
|
+
}
|
574
|
+
}
|
575
|
+
if (j != p_strippable) break;
|
576
|
+
}
|
577
|
+
}
|
578
|
+
}
|
579
|
+
|
580
|
+
/* word is now what remains after all the stripping has been done */
|
581
|
+
issued = FALSE;
|
582
|
+
|
583
|
+
/* If n_r_stripped exceed max, the "word" is most likely a long
|
584
|
+
* sequence of periods. Just accept it as an unknown "word",
|
585
|
+
* and move on.
|
586
|
+
*/
|
587
|
+
if (n_r_stripped >= MAX_STRIP)
|
588
|
+
{
|
589
|
+
n_r_stripped = 0;
|
590
|
+
word_is_in_dict = TRUE;
|
591
|
+
}
|
592
|
+
|
593
|
+
if (quote_found == TRUE) sent->post_quote[sent->length] = 1;
|
594
|
+
|
595
|
+
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
|
596
|
+
/* If the word is still not being found, then it might be
|
597
|
+
* a run-on of two words. Ask the spell-checker to split
|
598
|
+
* the word in two, if possible. Do this only if the word
|
599
|
+
* is not a proper name, and if spell-checking is enabled.
|
600
|
+
*/
|
601
|
+
if ((FALSE == word_is_in_dict) &&
|
602
|
+
TRUE == opts->use_spell_guess &&
|
603
|
+
sent->dict->spell_checker &&
|
604
|
+
(FALSE == is_proper_name(word)))
|
605
|
+
{
|
606
|
+
char **alternates = NULL;
|
607
|
+
char *sp = NULL;
|
608
|
+
char *wp;
|
609
|
+
int j, n;
|
610
|
+
n = spellcheck_suggest(sent->dict->spell_checker, &alternates, word);
|
611
|
+
for (j=0; j<n; j++)
|
612
|
+
{
|
613
|
+
/* Uhh, XXX this is not utf8 safe! */
|
614
|
+
sp = strchr(alternates[j], ' ');
|
615
|
+
if (sp) break;
|
616
|
+
}
|
617
|
+
|
618
|
+
if (sp) issued = TRUE;
|
619
|
+
|
620
|
+
wp = alternates[j];
|
621
|
+
while (sp)
|
622
|
+
{
|
623
|
+
*sp = 0x0;
|
624
|
+
if (!issue_sentence_word(sent, wp)) return FALSE;
|
625
|
+
wp = sp+1;
|
626
|
+
sp = strchr(wp, ' ');
|
627
|
+
if (NULL == sp)
|
628
|
+
{
|
629
|
+
if (!issue_sentence_word(sent, wp)) return FALSE;
|
630
|
+
}
|
631
|
+
}
|
632
|
+
if (alternates) spellcheck_free_suggest(alternates, n);
|
633
|
+
}
|
634
|
+
#endif /* HAVE_HUNSPELL */
|
635
|
+
|
636
|
+
if (FALSE == issued)
|
637
|
+
{
|
638
|
+
if (!issue_sentence_word(sent, word)) return FALSE;
|
639
|
+
}
|
640
|
+
|
641
|
+
if (s_stripped != -1)
|
642
|
+
{
|
643
|
+
if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE;
|
644
|
+
}
|
645
|
+
|
646
|
+
for (i = n_r_stripped-1; i>=0; i--)
|
647
|
+
{
|
648
|
+
if (!issue_sentence_word(sent, r_stripped[i])) return FALSE;
|
649
|
+
}
|
650
|
+
|
651
|
+
return TRUE;
|
652
|
+
}
|
653
|
+
|
654
|
+
/**
|
655
|
+
* The string s has just been read in from standard input.
|
656
|
+
* This function breaks it up into words and stores these words in
|
657
|
+
* the sent->word[] array. Returns TRUE if all is well, FALSE otherwise.
|
658
|
+
* Quote marks are treated just like blanks.
|
659
|
+
*/
|
660
|
+
int separate_sentence(Sentence sent, Parse_Options opts)
|
661
|
+
{
|
662
|
+
const char *t;
|
663
|
+
int is_first, quote_found;
|
664
|
+
Dictionary dict = sent->dict;
|
665
|
+
mbstate_t mbs;
|
666
|
+
const char * s = sent->orig_sentence;
|
667
|
+
|
668
|
+
memset(sent->post_quote, 0, MAX_SENTENCE*sizeof(int));
|
669
|
+
sent->length = 0;
|
670
|
+
|
671
|
+
if (dict->left_wall_defined)
|
672
|
+
if (!issue_sentence_word(sent, LEFT_WALL_WORD)) return FALSE;
|
673
|
+
|
674
|
+
/* Reset the multibyte shift state to the initial state */
|
675
|
+
memset(&mbs, 0, sizeof(mbs));
|
676
|
+
|
677
|
+
is_first = TRUE;
|
678
|
+
for(;;)
|
679
|
+
{
|
680
|
+
int isq;
|
681
|
+
wchar_t c;
|
682
|
+
int nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
|
683
|
+
quote_found = FALSE;
|
684
|
+
|
685
|
+
if (0 > nb) goto failure;
|
686
|
+
|
687
|
+
/* Skip all whitespace. Also, ignore *all* quotation marks.
|
688
|
+
* XXX This is sort-of a hack, but that is because LG does
|
689
|
+
* not have any intelligent support for quoted character
|
690
|
+
* strings at this time.
|
691
|
+
*/
|
692
|
+
isq = is_quote (c);
|
693
|
+
if (isq) quote_found = TRUE;
|
694
|
+
while (iswspace(c) || isq)
|
695
|
+
{
|
696
|
+
s += nb;
|
697
|
+
nb = mbrtowc(&c, s, MB_CUR_MAX, &mbs);
|
698
|
+
if (0 == nb) break;
|
699
|
+
if (0 > nb) goto failure;
|
700
|
+
isq = is_quote (c);
|
701
|
+
if (isq) quote_found = TRUE;
|
702
|
+
}
|
703
|
+
|
704
|
+
if (*s == '\0') break;
|
705
|
+
|
706
|
+
t = s;
|
707
|
+
nb = mbrtowc(&c, t, MB_CUR_MAX, &mbs);
|
708
|
+
if (0 > nb) goto failure;
|
709
|
+
while (!iswspace(c) && !is_quote(c) && (c != 0) && (nb != 0))
|
710
|
+
{
|
711
|
+
t += nb;
|
712
|
+
nb = mbrtowc(&c, t, MB_CUR_MAX, &mbs);
|
713
|
+
if (0 > nb) goto failure;
|
714
|
+
}
|
715
|
+
|
716
|
+
if (!separate_word(sent, opts, s, t, is_first, quote_found)) return FALSE;
|
717
|
+
is_first = FALSE;
|
718
|
+
s = t;
|
719
|
+
if (*s == '\0') break;
|
720
|
+
}
|
721
|
+
|
722
|
+
if (dict->right_wall_defined)
|
723
|
+
if (!issue_sentence_word(sent, RIGHT_WALL_WORD)) return FALSE;
|
724
|
+
|
725
|
+
return (sent->length > dict->left_wall_defined + dict->right_wall_defined);
|
726
|
+
|
727
|
+
failure:
|
728
|
+
prt_error("Unable to process UTF8 input string in current locale %s\n",
|
729
|
+
nl_langinfo(CODESET));
|
730
|
+
return FALSE;
|
731
|
+
}
|
732
|
+
|
733
|
+
/**
|
734
|
+
* Build the word expressions, and add a tag to the word to indicate
|
735
|
+
* that it was guessed by means of regular-expression matching.
|
736
|
+
* Also, add a subscript to the resulting word to indicate the
|
737
|
+
* rule origin.
|
738
|
+
*/
|
739
|
+
static void tag_regex_string(Sentence sent, int i, const char * type)
|
740
|
+
{
|
741
|
+
char str[MAX_WORD+1];
|
742
|
+
char * t;
|
743
|
+
X_node * e;
|
744
|
+
sent->word[i].x = build_word_expressions(sent->dict, type);
|
745
|
+
for (e = sent->word[i].x; e != NULL; e = e->next)
|
746
|
+
{
|
747
|
+
t = strchr(e->string, '.');
|
748
|
+
e->string = sent->word[i].string;
|
749
|
+
if (NULL != t)
|
750
|
+
{
|
751
|
+
snprintf(str, MAX_WORD, "%.50s[!].%.5s", e->string, t+1);
|
752
|
+
}
|
753
|
+
else
|
754
|
+
{
|
755
|
+
snprintf(str, MAX_WORD, "%.50s", e->string);
|
756
|
+
}
|
757
|
+
e->string = string_set_add(str, sent->string_set);
|
758
|
+
}
|
759
|
+
}
|
760
|
+
|
761
|
+
/**
|
762
|
+
* Puts into word[i].x the expression for the unknown word
|
763
|
+
* the parameter s is the word that was not in the dictionary
|
764
|
+
* it massages the names to have the corresponding subscripts
|
765
|
+
* to those of the unknown words
|
766
|
+
* so "grok" becomes "grok[?].v"
|
767
|
+
*/
|
768
|
+
static void handle_unknown_word(Sentence sent, int i, char * s)
|
769
|
+
{
|
770
|
+
char *t;
|
771
|
+
X_node *d;
|
772
|
+
char str[MAX_WORD+1];
|
773
|
+
|
774
|
+
sent->word[i].x = build_word_expressions(sent->dict, UNKNOWN_WORD);
|
775
|
+
if (sent->word[i].x == NULL)
|
776
|
+
assert(FALSE, "UNKNOWN_WORD should have been there");
|
777
|
+
|
778
|
+
for (d = sent->word[i].x; d != NULL; d = d->next)
|
779
|
+
{
|
780
|
+
t = strchr(d->string, '.');
|
781
|
+
if (t != NULL)
|
782
|
+
{
|
783
|
+
snprintf(str, MAX_WORD, "%.50s[?].%.5s", s, t+1);
|
784
|
+
}
|
785
|
+
else
|
786
|
+
{
|
787
|
+
snprintf(str, MAX_WORD, "%.50s[?]", s);
|
788
|
+
}
|
789
|
+
d->string = string_set_add(str, sent->string_set);
|
790
|
+
}
|
791
|
+
}
|
792
|
+
|
793
|
+
/**
|
794
|
+
* If a word appears to be mis-spelled, then add alternate
|
795
|
+
* spellings. Maybe one of those will do ...
|
796
|
+
*/
|
797
|
+
static void guess_misspelled_word(Sentence sent, int i, char * s)
|
798
|
+
{
|
799
|
+
int spelling_ok;
|
800
|
+
char str[MAX_WORD+1];
|
801
|
+
Dictionary dict = sent->dict;
|
802
|
+
X_node *d, *head = NULL;
|
803
|
+
int j, n;
|
804
|
+
char **alternates = NULL;
|
805
|
+
|
806
|
+
/* Spell-guessing is disabled if no spell-checker is speficified */
|
807
|
+
if (NULL == dict->spell_checker)
|
808
|
+
{
|
809
|
+
handle_unknown_word(sent, i, s);
|
810
|
+
return;
|
811
|
+
}
|
812
|
+
|
813
|
+
/* If the spell-checker knows about this word, and we don't ...
|
814
|
+
* Dang. We should fix it someday. Accept it as such. */
|
815
|
+
spelling_ok = spellcheck_test(dict->spell_checker, s);
|
816
|
+
if (spelling_ok)
|
817
|
+
{
|
818
|
+
handle_unknown_word(sent, i, s);
|
819
|
+
return;
|
820
|
+
}
|
821
|
+
|
822
|
+
/* Else, ask the spell-checker for alternate spellings
|
823
|
+
* and see if these are in the dict. */
|
824
|
+
n = spellcheck_suggest(dict->spell_checker, &alternates, s);
|
825
|
+
for (j=0; j<n; j++)
|
826
|
+
{
|
827
|
+
if (boolean_reg_dict_lookup(sent->dict, alternates[j]))
|
828
|
+
{
|
829
|
+
X_node *x = build_word_expressions(sent->dict, alternates[j]);
|
830
|
+
head = catenate_X_nodes(x, head);
|
831
|
+
}
|
832
|
+
}
|
833
|
+
sent->word[i].x = head;
|
834
|
+
if (alternates) spellcheck_free_suggest(alternates, n);
|
835
|
+
|
836
|
+
/* Add a [~] to the output to signify that its the result of
|
837
|
+
* guessing. */
|
838
|
+
for (d = sent->word[i].x; d != NULL; d = d->next)
|
839
|
+
{
|
840
|
+
const char * t = strchr(d->string, '.');
|
841
|
+
if (t != NULL)
|
842
|
+
{
|
843
|
+
size_t off = t - d->string;
|
844
|
+
strncpy(str, d->string, off);
|
845
|
+
str[off] = 0;
|
846
|
+
strcat(str, "[~]");
|
847
|
+
strcat(str, t);
|
848
|
+
}
|
849
|
+
else
|
850
|
+
{
|
851
|
+
snprintf(str, MAX_WORD, "%.50s[~]", s);
|
852
|
+
}
|
853
|
+
d->string = string_set_add(str, sent->string_set);
|
854
|
+
}
|
855
|
+
|
856
|
+
/* If nothing found at all... */
|
857
|
+
if (NULL == head)
|
858
|
+
{
|
859
|
+
handle_unknown_word(sent, i, s);
|
860
|
+
}
|
861
|
+
}
|
862
|
+
|
863
|
+
/**
|
864
|
+
* Corrects case of first word, fills in other proper nouns, and
|
865
|
+
* builds the expression lists for the resulting words.
|
866
|
+
*
|
867
|
+
* Algorithm:
|
868
|
+
* Apply the following step to all words w:
|
869
|
+
* If w is in the dictionary, use it.
|
870
|
+
* Else if w is identified by regex matching, use the
|
871
|
+
* appropriately matched disjunct collection.
|
872
|
+
*
|
873
|
+
* Now, we correct the first word, w.
|
874
|
+
* If w is upper case, let w' be the lower case version of w.
|
875
|
+
* If both w and w' are in the dict, concatenate these disjncts.
|
876
|
+
* Else if just w' is in dict, use disjuncts of w', together with
|
877
|
+
* the CAPITALIZED-WORDS rule.
|
878
|
+
* Else leave the disjuncts alone.
|
879
|
+
*/
|
880
|
+
int build_sentence_expressions(Sentence sent, Parse_Options opts)
|
881
|
+
{
|
882
|
+
int i, first_word; /* the index of the first word after the wall */
|
883
|
+
char *s, temp_word[MAX_WORD+1];
|
884
|
+
const char * regex_name;
|
885
|
+
X_node * e;
|
886
|
+
Dictionary dict = sent->dict;
|
887
|
+
|
888
|
+
if (dict->left_wall_defined) {
|
889
|
+
first_word = 1;
|
890
|
+
} else {
|
891
|
+
first_word = 0;
|
892
|
+
}
|
893
|
+
|
894
|
+
/* The following loop treats all words the same
|
895
|
+
* (nothing special for 1st word) */
|
896
|
+
for (i=0; i<sent->length; i++)
|
897
|
+
{
|
898
|
+
s = sent->word[i].string;
|
899
|
+
if (boolean_dictionary_lookup(sent->dict, s))
|
900
|
+
{
|
901
|
+
sent->word[i].x = build_word_expressions(sent->dict, s);
|
902
|
+
}
|
903
|
+
else if ((NULL != (regex_name = match_regex(sent->dict, s))) &&
|
904
|
+
boolean_dictionary_lookup(sent->dict, regex_name))
|
905
|
+
{
|
906
|
+
tag_regex_string(sent, i, regex_name);
|
907
|
+
}
|
908
|
+
else if (dict->unknown_word_defined && dict->use_unknown_word)
|
909
|
+
{
|
910
|
+
if (opts->use_spell_guess)
|
911
|
+
{
|
912
|
+
guess_misspelled_word(sent, i, s);
|
913
|
+
}
|
914
|
+
else
|
915
|
+
{
|
916
|
+
handle_unknown_word(sent, i, s);
|
917
|
+
}
|
918
|
+
}
|
919
|
+
else
|
920
|
+
{
|
921
|
+
/* The reason I can assert this is that the word
|
922
|
+
* should have been looked up already if we get here.
|
923
|
+
*/
|
924
|
+
assert(FALSE, "I should have found that word.");
|
925
|
+
}
|
926
|
+
}
|
927
|
+
|
928
|
+
/* Under certain cases--if it's the first word of the sentence,
|
929
|
+
* or if it follows a colon or a quotation mark--a word that's
|
930
|
+
* capitalized has to be looked up as an uncapitalized word
|
931
|
+
* (as well as a capitalized word).
|
932
|
+
* XXX This rule is English-language-oriented, and should be
|
933
|
+
* abstracted.
|
934
|
+
*/
|
935
|
+
for (i=0; i<sent->length; i++)
|
936
|
+
{
|
937
|
+
if (! (i == first_word ||
|
938
|
+
(i > 0 && strcmp(":", sent->word[i-1].string)==0) ||
|
939
|
+
sent->post_quote[i] == 1)) continue;
|
940
|
+
s = sent->word[i].string;
|
941
|
+
|
942
|
+
/* If the lower-case version of this word is in the dictionary,
|
943
|
+
* then add the disjuncts for the lower-case version. The upper
|
944
|
+
* case version disjuncts had previously come from matching the
|
945
|
+
* CAPITALIZED-WORDS regex.
|
946
|
+
*
|
947
|
+
* Err .. add the lower-case version only if the lower-case word
|
948
|
+
* is a common noun or adjective; otherwise, *replace* the
|
949
|
+
* upper-case word with the lower-case one. This allows common
|
950
|
+
* nouns and adjectives to be used for entity names: e.g.
|
951
|
+
* "Great Southern Union declares bankruptcy", allowing Great
|
952
|
+
* to be capitalized, while preventing an upper-case "She" being
|
953
|
+
* used as a proper name in "She declared bankruptcy".
|
954
|
+
*
|
955
|
+
* Arghh. This is still messed up. The capitalized-regex runs
|
956
|
+
* too early, I think. We need to *add* Sue.f (female name Sue)
|
957
|
+
* even though sue.v (the verb "to sue") is in the dict. So
|
958
|
+
* test for capitalized entity names. Glurg. Too much complexity
|
959
|
+
* here, it seems to me.
|
960
|
+
*
|
961
|
+
* This is actually a great example of a combo of an algorithm
|
962
|
+
* together with a list of words used to determine grammatical
|
963
|
+
* function.
|
964
|
+
*/
|
965
|
+
if (is_utf8_upper(s))
|
966
|
+
{
|
967
|
+
const char * lc;
|
968
|
+
downcase_utf8_str(temp_word, s, MAX_WORD);
|
969
|
+
lc = string_set_add(temp_word, sent->string_set);
|
970
|
+
|
971
|
+
/* The lower-case dict lookup might trigger regex
|
972
|
+
* matches in the dictionary. We want to avoid these.
|
973
|
+
* e.g. "Cornwallis" triggers both PL-CAPITALIZED_WORDS
|
974
|
+
* and S-WORDS. Since its not an entity, the regex
|
975
|
+
* matches will erroneously discard the upper-case version.
|
976
|
+
*/
|
977
|
+
if (boolean_dictionary_lookup(sent->dict, lc))
|
978
|
+
{
|
979
|
+
if (is_entity(sent->dict,s) ||
|
980
|
+
is_common_entity(sent->dict,lc))
|
981
|
+
{
|
982
|
+
if (1 < verbosity)
|
983
|
+
{
|
984
|
+
printf ("Info: First word: %s entity=%d common=%d\n",
|
985
|
+
s, is_entity(sent->dict,s),
|
986
|
+
is_common_entity(sent->dict,lc));
|
987
|
+
}
|
988
|
+
e = build_word_expressions(sent->dict, lc);
|
989
|
+
sent->word[i].x =
|
990
|
+
catenate_X_nodes(sent->word[i].x, e);
|
991
|
+
}
|
992
|
+
else
|
993
|
+
{
|
994
|
+
if (1 < verbosity)
|
995
|
+
{
|
996
|
+
printf("Info: First word: %s downcase only\n", lc);
|
997
|
+
}
|
998
|
+
safe_strcpy(s, lc, MAX_WORD);
|
999
|
+
e = build_word_expressions(sent->dict, s);
|
1000
|
+
free_X_nodes(sent->word[i].x);
|
1001
|
+
sent->word[i].x = e;
|
1002
|
+
}
|
1003
|
+
}
|
1004
|
+
}
|
1005
|
+
}
|
1006
|
+
|
1007
|
+
return TRUE;
|
1008
|
+
}
|
1009
|
+
|
1010
|
+
|
1011
|
+
/**
|
1012
|
+
* This just looks up all the words in the sentence, and builds
|
1013
|
+
* up an appropriate error message in case some are not there.
|
1014
|
+
* It has no side effect on the sentence. Returns TRUE if all
|
1015
|
+
* went well.
|
1016
|
+
*
|
1017
|
+
* This code is called only is the 'unkown-words' flag is set.
|
1018
|
+
*/
|
1019
|
+
int sentence_in_dictionary(Sentence sent)
|
1020
|
+
{
|
1021
|
+
int w, ok_so_far;
|
1022
|
+
char * s;
|
1023
|
+
Dictionary dict = sent->dict;
|
1024
|
+
char temp[1024];
|
1025
|
+
|
1026
|
+
ok_so_far = TRUE;
|
1027
|
+
for (w=0; w<sent->length; w++)
|
1028
|
+
{
|
1029
|
+
s = sent->word[w].string;
|
1030
|
+
if (!boolean_reg_dict_lookup(dict, s))
|
1031
|
+
{
|
1032
|
+
if (ok_so_far)
|
1033
|
+
{
|
1034
|
+
safe_strcpy(temp, "The following words are not in the dictionary:", sizeof(temp));
|
1035
|
+
ok_so_far = FALSE;
|
1036
|
+
}
|
1037
|
+
safe_strcat(temp, " \"", sizeof(temp));
|
1038
|
+
safe_strcat(temp, sent->word[w].string, sizeof(temp));
|
1039
|
+
safe_strcat(temp, "\"", sizeof(temp));
|
1040
|
+
}
|
1041
|
+
}
|
1042
|
+
if (!ok_so_far)
|
1043
|
+
{
|
1044
|
+
err_ctxt ec;
|
1045
|
+
ec.sent = sent;
|
1046
|
+
err_msg(&ec, Error, "Error: Sentence not in dictionary\n%s\n", temp);
|
1047
|
+
}
|
1048
|
+
return ok_so_far;
|
1049
|
+
}
|