grammar_police 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/c/.DS_Store +0 -0
- data/c/link-grammar.c +65 -0
- data/c/link-grammar.h +60 -0
- data/c/link-grammar.o +0 -0
- data/c/link-grammar.so +0 -0
- data/c/link-grammar/.DS_Store +0 -0
- data/c/link-grammar/.deps/analyze-linkage.Plo +198 -0
- data/c/link-grammar/.deps/and.Plo +202 -0
- data/c/link-grammar/.deps/api.Plo +244 -0
- data/c/link-grammar/.deps/build-disjuncts.Plo +212 -0
- data/c/link-grammar/.deps/command-line.Plo +201 -0
- data/c/link-grammar/.deps/constituents.Plo +201 -0
- data/c/link-grammar/.deps/count.Plo +202 -0
- data/c/link-grammar/.deps/disjunct-utils.Plo +126 -0
- data/c/link-grammar/.deps/disjuncts.Plo +123 -0
- data/c/link-grammar/.deps/error.Plo +121 -0
- data/c/link-grammar/.deps/expand.Plo +133 -0
- data/c/link-grammar/.deps/extract-links.Plo +198 -0
- data/c/link-grammar/.deps/fast-match.Plo +200 -0
- data/c/link-grammar/.deps/idiom.Plo +200 -0
- data/c/link-grammar/.deps/jni-client.Plo +217 -0
- data/c/link-grammar/.deps/link-parser.Po +1 -0
- data/c/link-grammar/.deps/massage.Plo +202 -0
- data/c/link-grammar/.deps/post-process.Plo +202 -0
- data/c/link-grammar/.deps/pp_knowledge.Plo +202 -0
- data/c/link-grammar/.deps/pp_lexer.Plo +201 -0
- data/c/link-grammar/.deps/pp_linkset.Plo +200 -0
- data/c/link-grammar/.deps/prefix.Plo +102 -0
- data/c/link-grammar/.deps/preparation.Plo +202 -0
- data/c/link-grammar/.deps/print-util.Plo +200 -0
- data/c/link-grammar/.deps/print.Plo +201 -0
- data/c/link-grammar/.deps/prune.Plo +202 -0
- data/c/link-grammar/.deps/read-dict.Plo +223 -0
- data/c/link-grammar/.deps/read-regex.Plo +123 -0
- data/c/link-grammar/.deps/regex-morph.Plo +131 -0
- data/c/link-grammar/.deps/resources.Plo +203 -0
- data/c/link-grammar/.deps/spellcheck-aspell.Plo +1 -0
- data/c/link-grammar/.deps/spellcheck-hun.Plo +115 -0
- data/c/link-grammar/.deps/string-set.Plo +198 -0
- data/c/link-grammar/.deps/tokenize.Plo +160 -0
- data/c/link-grammar/.deps/utilities.Plo +222 -0
- data/c/link-grammar/.deps/word-file.Plo +201 -0
- data/c/link-grammar/.deps/word-utils.Plo +212 -0
- data/c/link-grammar/.libs/analyze-linkage.o +0 -0
- data/c/link-grammar/.libs/and.o +0 -0
- data/c/link-grammar/.libs/api.o +0 -0
- data/c/link-grammar/.libs/build-disjuncts.o +0 -0
- data/c/link-grammar/.libs/command-line.o +0 -0
- data/c/link-grammar/.libs/constituents.o +0 -0
- data/c/link-grammar/.libs/count.o +0 -0
- data/c/link-grammar/.libs/disjunct-utils.o +0 -0
- data/c/link-grammar/.libs/disjuncts.o +0 -0
- data/c/link-grammar/.libs/error.o +0 -0
- data/c/link-grammar/.libs/expand.o +0 -0
- data/c/link-grammar/.libs/extract-links.o +0 -0
- data/c/link-grammar/.libs/fast-match.o +0 -0
- data/c/link-grammar/.libs/idiom.o +0 -0
- data/c/link-grammar/.libs/jni-client.o +0 -0
- data/c/link-grammar/.libs/liblink-grammar-java-symbols.expsym +31 -0
- data/c/link-grammar/.libs/liblink-grammar-java.4.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/c/link-grammar/.libs/liblink-grammar-java.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar-java.4.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar-java.a +0 -0
- data/c/link-grammar/.libs/liblink-grammar-java.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar-symbols.expsym +194 -0
- data/c/link-grammar/.libs/liblink-grammar.4.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Info.plist +20 -0
- data/c/link-grammar/.libs/liblink-grammar.4.dylib.dSYM/Contents/Resources/DWARF/liblink-grammar.4.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar.a +0 -0
- data/c/link-grammar/.libs/liblink-grammar.dylib +0 -0
- data/c/link-grammar/.libs/liblink-grammar.la +41 -0
- data/c/link-grammar/.libs/liblink-grammar.lai +41 -0
- data/c/link-grammar/.libs/massage.o +0 -0
- data/c/link-grammar/.libs/post-process.o +0 -0
- data/c/link-grammar/.libs/pp_knowledge.o +0 -0
- data/c/link-grammar/.libs/pp_lexer.o +0 -0
- data/c/link-grammar/.libs/pp_linkset.o +0 -0
- data/c/link-grammar/.libs/prefix.o +0 -0
- data/c/link-grammar/.libs/preparation.o +0 -0
- data/c/link-grammar/.libs/print-util.o +0 -0
- data/c/link-grammar/.libs/print.o +0 -0
- data/c/link-grammar/.libs/prune.o +0 -0
- data/c/link-grammar/.libs/read-dict.o +0 -0
- data/c/link-grammar/.libs/read-regex.o +0 -0
- data/c/link-grammar/.libs/regex-morph.o +0 -0
- data/c/link-grammar/.libs/resources.o +0 -0
- data/c/link-grammar/.libs/spellcheck-aspell.o +0 -0
- data/c/link-grammar/.libs/spellcheck-hun.o +0 -0
- data/c/link-grammar/.libs/string-set.o +0 -0
- data/c/link-grammar/.libs/tokenize.o +0 -0
- data/c/link-grammar/.libs/utilities.o +0 -0
- data/c/link-grammar/.libs/word-file.o +0 -0
- data/c/link-grammar/.libs/word-utils.o +0 -0
- data/c/link-grammar/Makefile +900 -0
- data/c/link-grammar/Makefile.am +202 -0
- data/c/link-grammar/Makefile.in +900 -0
- data/c/link-grammar/analyze-linkage.c +1317 -0
- data/c/link-grammar/analyze-linkage.h +24 -0
- data/c/link-grammar/and.c +1603 -0
- data/c/link-grammar/and.h +27 -0
- data/c/link-grammar/api-structures.h +362 -0
- data/c/link-grammar/api-types.h +72 -0
- data/c/link-grammar/api.c +1887 -0
- data/c/link-grammar/api.h +96 -0
- data/c/link-grammar/autoit/.DS_Store +0 -0
- data/c/link-grammar/autoit/README +10 -0
- data/c/link-grammar/autoit/_LGTest.au3 +22 -0
- data/c/link-grammar/autoit/_LinkGrammar.au3 +545 -0
- data/c/link-grammar/build-disjuncts.c +487 -0
- data/c/link-grammar/build-disjuncts.h +21 -0
- data/c/link-grammar/command-line.c +458 -0
- data/c/link-grammar/command-line.h +15 -0
- data/c/link-grammar/constituents.c +1836 -0
- data/c/link-grammar/constituents.h +26 -0
- data/c/link-grammar/corpus/.DS_Store +0 -0
- data/c/link-grammar/corpus/.deps/cluster.Plo +1 -0
- data/c/link-grammar/corpus/.deps/corpus.Plo +1 -0
- data/c/link-grammar/corpus/Makefile +527 -0
- data/c/link-grammar/corpus/Makefile.am +46 -0
- data/c/link-grammar/corpus/Makefile.in +527 -0
- data/c/link-grammar/corpus/README +17 -0
- data/c/link-grammar/corpus/cluster.c +286 -0
- data/c/link-grammar/corpus/cluster.h +32 -0
- data/c/link-grammar/corpus/corpus.c +483 -0
- data/c/link-grammar/corpus/corpus.h +46 -0
- data/c/link-grammar/count.c +828 -0
- data/c/link-grammar/count.h +25 -0
- data/c/link-grammar/disjunct-utils.c +261 -0
- data/c/link-grammar/disjunct-utils.h +27 -0
- data/c/link-grammar/disjuncts.c +138 -0
- data/c/link-grammar/disjuncts.h +13 -0
- data/c/link-grammar/error.c +92 -0
- data/c/link-grammar/error.h +35 -0
- data/c/link-grammar/expand.c +67 -0
- data/c/link-grammar/expand.h +13 -0
- data/c/link-grammar/externs.h +22 -0
- data/c/link-grammar/extract-links.c +625 -0
- data/c/link-grammar/extract-links.h +16 -0
- data/c/link-grammar/fast-match.c +309 -0
- data/c/link-grammar/fast-match.h +17 -0
- data/c/link-grammar/idiom.c +373 -0
- data/c/link-grammar/idiom.h +15 -0
- data/c/link-grammar/jni-client.c +779 -0
- data/c/link-grammar/jni-client.h +236 -0
- data/c/link-grammar/liblink-grammar-java.la +42 -0
- data/c/link-grammar/liblink-grammar.la +41 -0
- data/c/link-grammar/link-features.h +37 -0
- data/c/link-grammar/link-features.h.in +37 -0
- data/c/link-grammar/link-grammar-java.def +31 -0
- data/c/link-grammar/link-grammar.def +194 -0
- data/c/link-grammar/link-includes.h +465 -0
- data/c/link-grammar/link-parser.c +849 -0
- data/c/link-grammar/massage.c +329 -0
- data/c/link-grammar/massage.h +13 -0
- data/c/link-grammar/post-process.c +1113 -0
- data/c/link-grammar/post-process.h +45 -0
- data/c/link-grammar/pp_knowledge.c +376 -0
- data/c/link-grammar/pp_knowledge.h +14 -0
- data/c/link-grammar/pp_lexer.c +1920 -0
- data/c/link-grammar/pp_lexer.h +19 -0
- data/c/link-grammar/pp_linkset.c +158 -0
- data/c/link-grammar/pp_linkset.h +20 -0
- data/c/link-grammar/prefix.c +482 -0
- data/c/link-grammar/prefix.h +139 -0
- data/c/link-grammar/preparation.c +412 -0
- data/c/link-grammar/preparation.h +20 -0
- data/c/link-grammar/print-util.c +87 -0
- data/c/link-grammar/print-util.h +32 -0
- data/c/link-grammar/print.c +1085 -0
- data/c/link-grammar/print.h +16 -0
- data/c/link-grammar/prune.c +1864 -0
- data/c/link-grammar/prune.h +17 -0
- data/c/link-grammar/read-dict.c +1785 -0
- data/c/link-grammar/read-dict.h +29 -0
- data/c/link-grammar/read-regex.c +161 -0
- data/c/link-grammar/read-regex.h +12 -0
- data/c/link-grammar/regex-morph.c +126 -0
- data/c/link-grammar/regex-morph.h +17 -0
- data/c/link-grammar/resources.c +180 -0
- data/c/link-grammar/resources.h +23 -0
- data/c/link-grammar/sat-solver/.DS_Store +0 -0
- data/c/link-grammar/sat-solver/.deps/fast-sprintf.Plo +1 -0
- data/c/link-grammar/sat-solver/.deps/sat-encoder.Plo +1 -0
- data/c/link-grammar/sat-solver/.deps/util.Plo +1 -0
- data/c/link-grammar/sat-solver/.deps/variables.Plo +1 -0
- data/c/link-grammar/sat-solver/.deps/word-tag.Plo +1 -0
- data/c/link-grammar/sat-solver/Makefile +527 -0
- data/c/link-grammar/sat-solver/Makefile.am +29 -0
- data/c/link-grammar/sat-solver/Makefile.in +527 -0
- data/c/link-grammar/sat-solver/clock.hpp +33 -0
- data/c/link-grammar/sat-solver/fast-sprintf.cpp +26 -0
- data/c/link-grammar/sat-solver/fast-sprintf.hpp +7 -0
- data/c/link-grammar/sat-solver/guiding.hpp +244 -0
- data/c/link-grammar/sat-solver/matrix-ut.hpp +79 -0
- data/c/link-grammar/sat-solver/sat-encoder.cpp +2811 -0
- data/c/link-grammar/sat-solver/sat-encoder.h +11 -0
- data/c/link-grammar/sat-solver/sat-encoder.hpp +381 -0
- data/c/link-grammar/sat-solver/trie.hpp +118 -0
- data/c/link-grammar/sat-solver/util.cpp +23 -0
- data/c/link-grammar/sat-solver/util.hpp +14 -0
- data/c/link-grammar/sat-solver/variables.cpp +5 -0
- data/c/link-grammar/sat-solver/variables.hpp +829 -0
- data/c/link-grammar/sat-solver/word-tag.cpp +159 -0
- data/c/link-grammar/sat-solver/word-tag.hpp +162 -0
- data/c/link-grammar/spellcheck-aspell.c +148 -0
- data/c/link-grammar/spellcheck-hun.c +136 -0
- data/c/link-grammar/spellcheck.h +34 -0
- data/c/link-grammar/string-set.c +169 -0
- data/c/link-grammar/string-set.h +16 -0
- data/c/link-grammar/structures.h +498 -0
- data/c/link-grammar/tokenize.c +1049 -0
- data/c/link-grammar/tokenize.h +15 -0
- data/c/link-grammar/utilities.c +847 -0
- data/c/link-grammar/utilities.h +281 -0
- data/c/link-grammar/word-file.c +124 -0
- data/c/link-grammar/word-file.h +15 -0
- data/c/link-grammar/word-utils.c +526 -0
- data/c/link-grammar/word-utils.h +152 -0
- data/data/.DS_Store +0 -0
- data/data/Makefile +511 -0
- data/data/Makefile.am +4 -0
- data/data/Makefile.in +511 -0
- data/data/de/.DS_Store +0 -0
- data/data/de/4.0.affix +7 -0
- data/data/de/4.0.dict +474 -0
- data/data/de/Makefile +387 -0
- data/data/de/Makefile.am +9 -0
- data/data/de/Makefile.in +387 -0
- data/data/en/.DS_Store +0 -0
- data/data/en/4.0.affix +26 -0
- data/data/en/4.0.batch +1002 -0
- data/data/en/4.0.biolg.batch +411 -0
- data/data/en/4.0.constituent-knowledge +127 -0
- data/data/en/4.0.dict +8759 -0
- data/data/en/4.0.dict.m4 +6928 -0
- data/data/en/4.0.enwiki.batch +14 -0
- data/data/en/4.0.fixes.batch +2776 -0
- data/data/en/4.0.knowledge +306 -0
- data/data/en/4.0.regex +225 -0
- data/data/en/4.0.voa.batch +114 -0
- data/data/en/Makefile +554 -0
- data/data/en/Makefile.am +19 -0
- data/data/en/Makefile.in +554 -0
- data/data/en/README +173 -0
- data/data/en/tiny.dict +157 -0
- data/data/en/words/.DS_Store +0 -0
- data/data/en/words/Makefile +456 -0
- data/data/en/words/Makefile.am +78 -0
- data/data/en/words/Makefile.in +456 -0
- data/data/en/words/currency +205 -0
- data/data/en/words/currency.p +28 -0
- data/data/en/words/entities.given-bisex.sing +39 -0
- data/data/en/words/entities.given-female.sing +4141 -0
- data/data/en/words/entities.given-male.sing +1633 -0
- data/data/en/words/entities.locations.sing +68 -0
- data/data/en/words/entities.national.sing +253 -0
- data/data/en/words/entities.organizations.sing +7 -0
- data/data/en/words/entities.us-states.sing +11 -0
- data/data/en/words/units.1 +45 -0
- data/data/en/words/units.1.dot +4 -0
- data/data/en/words/units.3 +2 -0
- data/data/en/words/units.4 +5 -0
- data/data/en/words/units.4.dot +1 -0
- data/data/en/words/words-medical.adv.1 +1191 -0
- data/data/en/words/words-medical.prep.1 +67 -0
- data/data/en/words/words-medical.v.4.1 +2835 -0
- data/data/en/words/words-medical.v.4.2 +2848 -0
- data/data/en/words/words-medical.v.4.3 +3011 -0
- data/data/en/words/words-medical.v.4.4 +3036 -0
- data/data/en/words/words-medical.v.4.5 +3050 -0
- data/data/en/words/words.adj.1 +6794 -0
- data/data/en/words/words.adj.2 +638 -0
- data/data/en/words/words.adj.3 +667 -0
- data/data/en/words/words.adv.1 +1573 -0
- data/data/en/words/words.adv.2 +67 -0
- data/data/en/words/words.adv.3 +157 -0
- data/data/en/words/words.adv.4 +80 -0
- data/data/en/words/words.n.1 +11464 -0
- data/data/en/words/words.n.1.wiki +264 -0
- data/data/en/words/words.n.2.s +2017 -0
- data/data/en/words/words.n.2.s.biolg +1 -0
- data/data/en/words/words.n.2.s.wiki +298 -0
- data/data/en/words/words.n.2.x +65 -0
- data/data/en/words/words.n.2.x.wiki +10 -0
- data/data/en/words/words.n.3 +5717 -0
- data/data/en/words/words.n.t +23 -0
- data/data/en/words/words.v.1.1 +1038 -0
- data/data/en/words/words.v.1.2 +1043 -0
- data/data/en/words/words.v.1.3 +1052 -0
- data/data/en/words/words.v.1.4 +1023 -0
- data/data/en/words/words.v.1.p +17 -0
- data/data/en/words/words.v.10.1 +14 -0
- data/data/en/words/words.v.10.2 +15 -0
- data/data/en/words/words.v.10.3 +88 -0
- data/data/en/words/words.v.10.4 +17 -0
- data/data/en/words/words.v.2.1 +1253 -0
- data/data/en/words/words.v.2.2 +1304 -0
- data/data/en/words/words.v.2.3 +1280 -0
- data/data/en/words/words.v.2.4 +1285 -0
- data/data/en/words/words.v.2.5 +1287 -0
- data/data/en/words/words.v.4.1 +2472 -0
- data/data/en/words/words.v.4.2 +2487 -0
- data/data/en/words/words.v.4.3 +2441 -0
- data/data/en/words/words.v.4.4 +2478 -0
- data/data/en/words/words.v.4.5 +2483 -0
- data/data/en/words/words.v.5.1 +98 -0
- data/data/en/words/words.v.5.2 +98 -0
- data/data/en/words/words.v.5.3 +103 -0
- data/data/en/words/words.v.5.4 +102 -0
- data/data/en/words/words.v.6.1 +388 -0
- data/data/en/words/words.v.6.2 +401 -0
- data/data/en/words/words.v.6.3 +397 -0
- data/data/en/words/words.v.6.4 +405 -0
- data/data/en/words/words.v.6.5 +401 -0
- data/data/en/words/words.v.8.1 +117 -0
- data/data/en/words/words.v.8.2 +118 -0
- data/data/en/words/words.v.8.3 +118 -0
- data/data/en/words/words.v.8.4 +119 -0
- data/data/en/words/words.v.8.5 +119 -0
- data/data/en/words/words.y +104 -0
- data/data/lt/.DS_Store +0 -0
- data/data/lt/4.0.affix +6 -0
- data/data/lt/4.0.constituent-knowledge +24 -0
- data/data/lt/4.0.dict +135 -0
- data/data/lt/4.0.knowledge +38 -0
- data/data/lt/Makefile +389 -0
- data/data/lt/Makefile.am +11 -0
- data/data/lt/Makefile.in +389 -0
- data/grammar_police.gemspec +23 -0
- data/lib/.DS_Store +0 -0
- data/lib/grammar_police.rb +11 -0
- data/lib/grammar_police/.DS_Store +0 -0
- data/lib/grammar_police/dictionary.rb +30 -0
- data/lib/grammar_police/linkage.rb +26 -0
- data/lib/grammar_police/parse_options.rb +32 -0
- data/lib/grammar_police/sentence.rb +44 -0
- data/lib/grammar_police/version.rb +3 -0
- data/tests/.DS_Store +0 -0
- data/tests/count_linkages.rb +29 -0
- data/tests/sentences.txt +86 -0
- metadata +408 -0
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
Parse Ranking and Word Sense Statistics
|
3
|
+
---------------------------------------
|
4
|
+
|
5
|
+
This directory contains code that computes a parse ranking, as well
|
6
|
+
as a word-sense probability (based on WordNet 3.0) by looking up
|
7
|
+
frequency statistics from an SQL database. The SQLite database engine
|
8
|
+
is used because it is "administration-free" for the user, and because
|
9
|
+
its license is compatible with the current link-grammar license.
|
10
|
+
|
11
|
+
This directory also contains code for "broadening" word linkages.
|
12
|
+
See data/sql/README for more info.
|
13
|
+
|
14
|
+
This directory contains one administrative tool, "cluster-pop",
|
15
|
+
which is not built by default because users do not need this tool.
|
16
|
+
See the Makefile.am for notes on how to build it.
|
17
|
+
|
@@ -0,0 +1,286 @@
|
|
1
|
+
/*
|
2
|
+
* cluster.c
|
3
|
+
*
|
4
|
+
* Data for related-word clusters. Meant to expand disjunct coverage
|
5
|
+
* for the case where a parse cannot be completed without omitting
|
6
|
+
* a word.
|
7
|
+
*
|
8
|
+
* Copyright (c) 2009 Linas Vepstas <linasvepstas@gmail.com>
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include <stdio.h>
|
12
|
+
#include <stdlib.h>
|
13
|
+
#include <string.h>
|
14
|
+
#include <sqlite3.h>
|
15
|
+
#include "cluster.h"
|
16
|
+
#include "../build-disjuncts.h"
|
17
|
+
#include "../disjunct-utils.h"
|
18
|
+
#include "../structures.h"
|
19
|
+
#include "../utilities.h"
|
20
|
+
|
21
|
+
struct cluster_s
|
22
|
+
{
|
23
|
+
char * dbname;
|
24
|
+
sqlite3 *dbconn;
|
25
|
+
sqlite3_stmt *clu_query;
|
26
|
+
sqlite3_stmt *dj_query;
|
27
|
+
char *errmsg;
|
28
|
+
int rc;
|
29
|
+
};
|
30
|
+
|
31
|
+
/* ========================================================= */
|
32
|
+
|
33
|
+
static void * db_file_open(const char * dbname, void * user_data)
|
34
|
+
{
|
35
|
+
Cluster *c = (Cluster *) user_data;
|
36
|
+
int rc;
|
37
|
+
sqlite3 *dbconn;
|
38
|
+
c->rc = sqlite3_open_v2(dbname, &dbconn, SQLITE_OPEN_READONLY, NULL);
|
39
|
+
if (c->rc)
|
40
|
+
{
|
41
|
+
sqlite3_close(dbconn);
|
42
|
+
return NULL;
|
43
|
+
}
|
44
|
+
|
45
|
+
c->dbname = strdup(dbname);
|
46
|
+
return dbconn;
|
47
|
+
}
|
48
|
+
|
49
|
+
|
50
|
+
/**
|
51
|
+
* Initialize the cluster statistics subsystem.
|
52
|
+
*/
|
53
|
+
Cluster * lg_cluster_new(void)
|
54
|
+
{
|
55
|
+
int rc;
|
56
|
+
|
57
|
+
Cluster *c = (Cluster *) malloc(sizeof(Cluster));
|
58
|
+
c->clu_query = NULL;
|
59
|
+
c->dj_query = NULL;
|
60
|
+
c->errmsg = NULL;
|
61
|
+
c->dbname = NULL;
|
62
|
+
|
63
|
+
/* dbname = "/link-grammar/data/en/sql/clusters.db"; */
|
64
|
+
#define DBNAME "sql/clusters.db"
|
65
|
+
c->dbconn = object_open(DBNAME, db_file_open, c);
|
66
|
+
if (NULL == c->dbconn)
|
67
|
+
{
|
68
|
+
/* Very weird .. but if the database is not found, then sqlite
|
69
|
+
* reports an "out of memory" error! So hide this misleading
|
70
|
+
* error message.
|
71
|
+
*/
|
72
|
+
if (SQLITE_CANTOPEN == c->rc)
|
73
|
+
{
|
74
|
+
prt_error("Warning: Can't open database: File not found\n"
|
75
|
+
"\tWas looking for: " DBNAME);
|
76
|
+
}
|
77
|
+
else
|
78
|
+
{
|
79
|
+
prt_error("Warning: Can't open database: %s\n"
|
80
|
+
"\tWas looking for: " DBNAME,
|
81
|
+
sqlite3_errmsg(c->dbconn));
|
82
|
+
}
|
83
|
+
return c;
|
84
|
+
}
|
85
|
+
|
86
|
+
/* Now prepare the statements we plan to use */
|
87
|
+
rc = sqlite3_prepare_v2(c->dbconn,
|
88
|
+
"SELECT cluster_name FROM ClusterMembers "
|
89
|
+
"WHERE inflected_word = ?;",
|
90
|
+
-1, &c->clu_query, NULL);
|
91
|
+
if (rc != SQLITE_OK)
|
92
|
+
{
|
93
|
+
prt_error("Error: Can't prepare the cluster member statment: %s\n",
|
94
|
+
sqlite3_errmsg(c->dbconn));
|
95
|
+
}
|
96
|
+
|
97
|
+
rc = sqlite3_prepare_v2(c->dbconn,
|
98
|
+
"SELECT disjunct, cost FROM ClusterDisjuncts "
|
99
|
+
"WHERE cluster_name = ?;",
|
100
|
+
-1, &c->dj_query, NULL);
|
101
|
+
if (rc != SQLITE_OK)
|
102
|
+
{
|
103
|
+
prt_error("Error: Can't prepare the disjunct statment: %s\n",
|
104
|
+
sqlite3_errmsg(c->dbconn));
|
105
|
+
}
|
106
|
+
|
107
|
+
prt_error("Info: Cluster grouping database found at %s\n", c->dbname);
|
108
|
+
return c;
|
109
|
+
}
|
110
|
+
|
111
|
+
/**
|
112
|
+
* lg_cluster_delete -- shut down the cluster statistics subsystem.
|
113
|
+
*/
|
114
|
+
void lg_cluster_delete(Cluster *c)
|
115
|
+
{
|
116
|
+
if (NULL == c) return;
|
117
|
+
|
118
|
+
if (c->clu_query)
|
119
|
+
{
|
120
|
+
sqlite3_finalize(c->clu_query);
|
121
|
+
c->clu_query = NULL;
|
122
|
+
}
|
123
|
+
|
124
|
+
if (c->dj_query)
|
125
|
+
{
|
126
|
+
sqlite3_finalize(c->dj_query);
|
127
|
+
c->dj_query = NULL;
|
128
|
+
}
|
129
|
+
|
130
|
+
if (c->dbconn)
|
131
|
+
{
|
132
|
+
sqlite3_close(c->dbconn);
|
133
|
+
c->dbconn = NULL;
|
134
|
+
}
|
135
|
+
|
136
|
+
if (c->dbname)
|
137
|
+
{
|
138
|
+
free(c->dbname);
|
139
|
+
c->dbname = NULL;
|
140
|
+
}
|
141
|
+
free(c);
|
142
|
+
}
|
143
|
+
|
144
|
+
/* ========================================================= */
|
145
|
+
|
146
|
+
static Exp * make_exp(const char *djstr, double cost)
|
147
|
+
{
|
148
|
+
Exp *e = (Exp *) malloc(sizeof(Exp));
|
149
|
+
e->multi = 0;
|
150
|
+
e->dir = ' ';
|
151
|
+
e->cost = cost;
|
152
|
+
|
153
|
+
/* If its just a single connector, then do just that */
|
154
|
+
char *sp = strchr (djstr, ' ');
|
155
|
+
if (NULL == sp || 0x0 == sp[1])
|
156
|
+
{
|
157
|
+
e->type = CONNECTOR_type;
|
158
|
+
if ('@' == djstr[0]) { e->multi = 1; djstr++; }
|
159
|
+
size_t len = strlen(djstr) - 1;
|
160
|
+
if (sp) len--;
|
161
|
+
e->u.string = strndup(djstr, len);
|
162
|
+
e->dir = djstr[len];
|
163
|
+
return e;
|
164
|
+
}
|
165
|
+
|
166
|
+
/* If there are multiple connectors, and them together */
|
167
|
+
size_t len = sp - djstr;
|
168
|
+
char * tmp = strndup(djstr, len);
|
169
|
+
Exp *p1 = make_exp(tmp, 0.0);
|
170
|
+
free (tmp);
|
171
|
+
Exp *p2 = make_exp(sp+1, 0.0);
|
172
|
+
|
173
|
+
E_list *l;
|
174
|
+
E_list *lhead = NULL;
|
175
|
+
|
176
|
+
l = (E_list *) malloc(sizeof(E_list));
|
177
|
+
l->next = lhead;
|
178
|
+
l->e = p2;
|
179
|
+
lhead = l;
|
180
|
+
|
181
|
+
l = (E_list *) malloc(sizeof(E_list));
|
182
|
+
l->next = lhead;
|
183
|
+
l->e = p1;
|
184
|
+
lhead = l;
|
185
|
+
|
186
|
+
e->type = AND_type;
|
187
|
+
e->u.l = lhead;
|
188
|
+
|
189
|
+
return e;
|
190
|
+
}
|
191
|
+
|
192
|
+
#if NOT_NEEDED
/**
 * Join two expressions under a new OR_type node with cost 0.0.
 * The node's list holds p1 followed by p2.  When p2 is NULL, p1 is
 * returned unchanged and nothing is allocated.
 */
static Exp * or_exp(Exp *p1, Exp *p2)
{
	if (NULL == p2) return p1;

	E_list *second = (E_list *) malloc(sizeof(E_list));
	second->e = p2;
	second->next = NULL;

	E_list *first = (E_list *) malloc(sizeof(E_list));
	first->e = p1;
	first->next = second;

	Exp *node = (Exp *) malloc(sizeof(Exp));
	node->type = OR_type;
	node->multi = 0;
	node->dir = ' ';
	node->cost = 0.0;
	node->u.l = first;
	return node;
}
#endif
|
220
|
+
|
221
|
+
/**
 * Recursively release an expression tree built by make_exp().
 *
 * For a CONNECTOR_type node the connector string is freed.  For any
 * other type every list cell and the sub-expression it holds is
 * freed.  The node itself is freed in both cases; previously it was
 * freed only for connectors, so every compound node leaked.
 * A NULL argument is ignored.
 */
static void free_exp(Exp *e)
{
	if (NULL == e) return;

	if (CONNECTOR_type == e->type)
	{
		free((char *) e->u.string);
	}
	else
	{
		E_list *l = e->u.l;
		while (l)
		{
			/* Save the link before the cell is released. */
			E_list *ln = l->next;
			free_exp(l->e);
			free(l);
			l = ln;
		}
	}

	free(e);
}
|
239
|
+
|
240
|
+
/**
 * Fetch the extra disjuncts that the cluster database offers for `wrd`.
 *
 * Looks up the first cluster row that lists `wrd` as a member, then
 * converts every disjunct stored for that cluster into Disjunct
 * structures via make_exp() and build_disjuncts_for_X_node().  Each
 * stored cost is lowered by 6.0 and clamped at 0.0 before use.
 *
 * Returns the accumulated list (joined with catenate_disjuncts()),
 * or NULL when the word belongs to no cluster, a lookup fails, or the
 * database or its prepared statements are unavailable.
 */
Disjunct * lg_cluster_get_disjuncts(Cluster *c, const char * wrd)
{
	Disjunct *djl = NULL;
	const char *cluname;
	int rc;

	/* lg_cluster_new() can leave the statements unprepared (database
	 * missing, or prepare failed); there is nothing to query then. */
	if (NULL == c || NULL == c->clu_query || NULL == c->dj_query)
		return NULL;

	/* Look for a cluster containing this word */
	rc = sqlite3_bind_text(c->clu_query, 1, wrd, -1, SQLITE_STATIC);
	if (rc != SQLITE_OK) goto noclust;
	rc = sqlite3_step(c->clu_query);
	if (rc != SQLITE_ROW) goto noclust;

	/* Get the cluster name, and look for the disjuncts.  The name is
	 * bound with SQLITE_STATIC, so clu_query is reset only after the
	 * loop has finished using it. */
	cluname = (const char *) sqlite3_column_text(c->clu_query, 0);
	rc = sqlite3_bind_text(c->dj_query, 1, cluname, -1, SQLITE_STATIC);
	if (rc != SQLITE_OK) goto nodj;

	while (SQLITE_ROW == sqlite3_step(c->dj_query))
	{
		const char *djs = (const char *) sqlite3_column_text(c->dj_query, 0);
		double cost = sqlite3_column_double(c->dj_query, 1);

		/* An SQL NULL disjunct has no text to parse. */
		if (NULL == djs) continue;

		/* The stored cost is lowered by 6.0 and never goes negative.
		 * (An earlier variant, "cost += 0.5", was left commented out.) */
		cost -= 6.0;
		if (cost < 0.0) cost = 0.0;

		/* Building expressions */
		Exp *e = make_exp(djs, cost);

		/* Only exp and string are filled in; the remaining members
		 * are zeroed rather than left indeterminate. */
		X_node x;
		memset(&x, 0, sizeof x);
		x.exp = e;
		x.string = wrd;

		Disjunct *dj = build_disjuncts_for_X_node(&x, MAX_CONNECTOR_COST);
		djl = catenate_disjuncts(dj, djl);
		free_exp(e);
	}

nodj:
	sqlite3_reset(c->dj_query);
	sqlite3_clear_bindings(c->dj_query);

noclust:
	sqlite3_reset(c->clu_query);
	sqlite3_clear_bindings(c->clu_query);
	return djl;
}
|
284
|
+
|
285
|
+
|
286
|
+
/* ======================= END OF FILE ===================== */
|
@@ -0,0 +1,32 @@
|
|
1
|
+
/*
|
2
|
+
* cluster.h
|
3
|
+
*
|
4
|
+
* Data for related-word clusters. Meant to expand disjunct coverage
|
5
|
+
* for the case where a parse cannot be completed without omitting
|
6
|
+
* a word.
|
7
|
+
*
|
8
|
+
* Copyright (c) 2009 Linas Vepstas <linasvepstas@gmail.com>
|
9
|
+
*/
|
10
|
+
|
11
|
+
#ifndef _LINKGRAMMAR_CLUSTER_H
|
12
|
+
#define _LINKGRAMMAR_CLUSTER_H
|
13
|
+
|
14
|
+
#ifdef USE_CORPUS
|
15
|
+
|
16
|
+
#include "../api-types.h"
|
17
|
+
#include "../link-includes.h"
|
18
|
+
|
19
|
+
Cluster * lg_cluster_new(void);
|
20
|
+
void lg_cluster_delete(Cluster *);
|
21
|
+
|
22
|
+
Disjunct * lg_cluster_get_disjuncts(Cluster *, const char * wrd);
|
23
|
+
|
24
|
+
#else /* USE_CORPUS */
|
25
|
+
|
26
|
+
static inline Cluster * lg_cluster_new(void) { return NULL; }
|
27
|
+
static inline void lg_cluster_delete(Cluster *c) {}
|
28
|
+
static inline Disjunct * lg_cluster_get_disjuncts(Cluster *c, const char * wrd) { return NULL; }
|
29
|
+
|
30
|
+
#endif /* USE_CORPUS */
|
31
|
+
|
32
|
+
#endif /* _LINKGRAMMAR_CLUSTER_H */
|
@@ -0,0 +1,483 @@
|
|
1
|
+
/*
|
2
|
+
* corpus.c
|
3
|
+
*
|
4
|
+
* Data for corpus statistics, used to provide a parse ranking
|
5
|
+
* to drive the SAT solver, as well as parse ranking with the
|
6
|
+
* ordinary solver.
|
7
|
+
*
|
8
|
+
* Copyright (c) 2008, 2009 Linas Vepstas <linasvepstas@gmail.com>
|
9
|
+
*/
|
10
|
+
|
11
|
+
#include <stdio.h>
|
12
|
+
#include <stdlib.h>
|
13
|
+
#include <string.h>
|
14
|
+
#include <sqlite3.h>
|
15
|
+
#include "corpus.h"
|
16
|
+
#include "../api-structures.h"
|
17
|
+
#include "../disjuncts.h"
|
18
|
+
#include "../utilities.h"
|
19
|
+
|
20
|
+
struct corpus_s
|
21
|
+
{
|
22
|
+
char * dbname;
|
23
|
+
sqlite3 *dbconn;
|
24
|
+
sqlite3_stmt *rank_query;
|
25
|
+
sqlite3_stmt *sense_query;
|
26
|
+
char *errmsg;
|
27
|
+
int rc;
|
28
|
+
};
|
29
|
+
|
30
|
+
struct sense_s
|
31
|
+
{
|
32
|
+
int word;
|
33
|
+
const char * inflected_word;
|
34
|
+
const char * disjunct;
|
35
|
+
char * sense;
|
36
|
+
double score;
|
37
|
+
Sense *next;
|
38
|
+
};
|
39
|
+
|
40
|
+
/* ========================================================= */
|
41
|
+
|
42
|
+
static void * db_file_open(const char * dbname, void * user_data)
|
43
|
+
{
|
44
|
+
Corpus *c = (Corpus *) user_data;
|
45
|
+
int rc;
|
46
|
+
sqlite3 *dbconn;
|
47
|
+
c->rc = sqlite3_open_v2(dbname, &dbconn, SQLITE_OPEN_READONLY, NULL);
|
48
|
+
if (c->rc)
|
49
|
+
{
|
50
|
+
sqlite3_close(dbconn);
|
51
|
+
return NULL;
|
52
|
+
}
|
53
|
+
|
54
|
+
c->dbname = strdup(dbname);
|
55
|
+
return dbconn;
|
56
|
+
}
|
57
|
+
|
58
|
+
|
59
|
+
/**
|
60
|
+
* Initialize the corpus statistics subsystem.
|
61
|
+
*/
|
62
|
+
Corpus * lg_corpus_new(void)
|
63
|
+
{
|
64
|
+
int rc;
|
65
|
+
|
66
|
+
Corpus *c = (Corpus *) malloc(sizeof(Corpus));
|
67
|
+
c->rank_query = NULL;
|
68
|
+
c->sense_query = NULL;
|
69
|
+
c->errmsg = NULL;
|
70
|
+
c->dbname = NULL;
|
71
|
+
|
72
|
+
/* dbname = "/link-grammar/data/en/sql/disjuncts.db"; */
|
73
|
+
#define DBNAME "sql/disjuncts.db"
|
74
|
+
c->dbconn = object_open(DBNAME, db_file_open, c);
|
75
|
+
if (NULL == c->dbconn)
|
76
|
+
{
|
77
|
+
/* Very weird .. but if the database is not found, then sqlite
|
78
|
+
* reports an "out of memory" error! So hide this misleading
|
79
|
+
* error message.
|
80
|
+
*/
|
81
|
+
if (SQLITE_CANTOPEN == c->rc)
|
82
|
+
{
|
83
|
+
prt_error("Warning: Can't open database: File not found\n"
|
84
|
+
"\tWas looking for: " DBNAME);
|
85
|
+
}
|
86
|
+
else
|
87
|
+
{
|
88
|
+
prt_error("Warning: Can't open database: %s\n"
|
89
|
+
"\tWas looking for: " DBNAME,
|
90
|
+
sqlite3_errmsg(c->dbconn));
|
91
|
+
}
|
92
|
+
return c;
|
93
|
+
}
|
94
|
+
|
95
|
+
/* Now prepare the statements we plan to use */
|
96
|
+
rc = sqlite3_prepare_v2(c->dbconn,
|
97
|
+
"SELECT log_cond_probability FROM Disjuncts "
|
98
|
+
"WHERE inflected_word = ? AND disjunct = ?;",
|
99
|
+
-1, &c->rank_query, NULL);
|
100
|
+
if (rc != SQLITE_OK)
|
101
|
+
{
|
102
|
+
prt_error("Error: Can't prepare the ranking statment: %s\n",
|
103
|
+
sqlite3_errmsg(c->dbconn));
|
104
|
+
}
|
105
|
+
|
106
|
+
/* Results are returned in sorted order .. would it be faster
|
107
|
+
* to sort locally? Don't know ... */
|
108
|
+
rc = sqlite3_prepare_v2(c->dbconn,
|
109
|
+
"SELECT word_sense, log_cond_probability FROM DisjunctSenses "
|
110
|
+
"WHERE inflected_word = ? AND disjunct = ? "
|
111
|
+
"ORDER BY log_cond_probability ASC;",
|
112
|
+
-1, &c->sense_query, NULL);
|
113
|
+
if (rc != SQLITE_OK)
|
114
|
+
{
|
115
|
+
prt_error("Error: Can't prepare the sense statment: %s\n",
|
116
|
+
sqlite3_errmsg(c->dbconn));
|
117
|
+
}
|
118
|
+
|
119
|
+
prt_error("Info: Corpus statistics database found at %s\n", c->dbname);
|
120
|
+
return c;
|
121
|
+
}
|
122
|
+
|
123
|
+
/**
|
124
|
+
* lg_corpus_delete -- shut down the corpus statistics subsystem.
|
125
|
+
*/
|
126
|
+
void lg_corpus_delete(Corpus *c)
|
127
|
+
{
|
128
|
+
if (NULL == c) return;
|
129
|
+
|
130
|
+
if (c->rank_query)
|
131
|
+
{
|
132
|
+
sqlite3_finalize(c->rank_query);
|
133
|
+
c->rank_query = NULL;
|
134
|
+
}
|
135
|
+
|
136
|
+
if (c->sense_query)
|
137
|
+
{
|
138
|
+
sqlite3_finalize(c->sense_query);
|
139
|
+
c->sense_query = NULL;
|
140
|
+
}
|
141
|
+
|
142
|
+
if (c->dbconn)
|
143
|
+
{
|
144
|
+
sqlite3_close(c->dbconn);
|
145
|
+
c->dbconn = NULL;
|
146
|
+
}
|
147
|
+
|
148
|
+
if (c->dbname)
|
149
|
+
{
|
150
|
+
free(c->dbname);
|
151
|
+
c->dbname = NULL;
|
152
|
+
}
|
153
|
+
free(c);
|
154
|
+
}
|
155
|
+
|
156
|
+
/* ========================================================= */
|
157
|
+
|
158
|
+
/* LOW_SCORE is what is assumed if a disjunct-word pair is not found
|
159
|
+
* in the dictionary. It is meant to be -log_2(prob(d|w)) where
|
160
|
+
* prob(d|w) is the conditional probability of seeing the disjunct d
|
161
|
+
* given the word w. A value of 17 is about equal to 1 in 100,000.
|
162
|
+
*/
|
163
|
+
#define LOW_SCORE 17.0
|
164
|
+
|
165
|
+
/**
|
166
|
+
* get_disjunct_score -- get log probability of observing disjunt.
|
167
|
+
*
|
168
|
+
* Given an "inflected" word and a disjunct, thris routine returns the
|
169
|
+
* -log_2 conditional probability prob(d|w) of seeing the disjunct 'd'
|
170
|
+
* given that the word 'w' was observed. Here, "inflected word" means
|
171
|
+
* the link-grammar dictionary entry, complete with its trailing period
|
172
|
+
* and tag -- e.g. run.v or running.g -- everything after the dot is the
|
173
|
+
* "inflection".
|
174
|
+
*/
|
175
|
+
static double get_disjunct_score(Corpus *corp,
|
176
|
+
const char * inflected_word,
|
177
|
+
const char * disjunct)
|
178
|
+
{
|
179
|
+
double val;
|
180
|
+
int rc;
|
181
|
+
|
182
|
+
/* Look up the disjunct in the database */
|
183
|
+
rc = sqlite3_bind_text(corp->rank_query, 1,
|
184
|
+
inflected_word, -1, SQLITE_STATIC);
|
185
|
+
if (rc != SQLITE_OK)
|
186
|
+
{
|
187
|
+
prt_error("Error: SQLite can't bind word: rc=%d \n", rc);
|
188
|
+
return LOW_SCORE;
|
189
|
+
}
|
190
|
+
|
191
|
+
rc = sqlite3_bind_text(corp->rank_query, 2,
|
192
|
+
disjunct, -1, SQLITE_STATIC);
|
193
|
+
if (rc != SQLITE_OK)
|
194
|
+
{
|
195
|
+
prt_error("Error: SQLite can't bind disjunct: rc=%d \n", rc);
|
196
|
+
return LOW_SCORE;
|
197
|
+
}
|
198
|
+
|
199
|
+
rc = sqlite3_step(corp->rank_query);
|
200
|
+
if (rc != SQLITE_ROW)
|
201
|
+
{
|
202
|
+
val = LOW_SCORE;
|
203
|
+
#ifdef DEBUG
|
204
|
+
printf ("Word=%s dj=%s not found in dict, assume score=%f\n",
|
205
|
+
inflected_word, disjunct, val);
|
206
|
+
#endif
|
207
|
+
}
|
208
|
+
else
|
209
|
+
{
|
210
|
+
val = sqlite3_column_double(corp->rank_query, 0);
|
211
|
+
if (LOW_SCORE < val) val = LOW_SCORE;
|
212
|
+
#ifdef DEBUG
|
213
|
+
printf ("Word=%s dj=%s score=%f\n", inflected_word, disjunct, val);
|
214
|
+
#endif
|
215
|
+
}
|
216
|
+
|
217
|
+
/* Failure to do both a reset *and* a clear will cause subsequent
|
218
|
+
* binds tp fail. */
|
219
|
+
sqlite3_reset(corp->rank_query);
|
220
|
+
sqlite3_clear_bindings(corp->rank_query);
|
221
|
+
return val;
|
222
|
+
}
|
223
|
+
|
224
|
+
/* ========================================================= */
|
225
|
+
|
226
|
+
/**
|
227
|
+
* lg_corpus_score -- compute parse-ranking score for sentence.
|
228
|
+
*
|
229
|
+
* Given a parsed sentence, this routine will compute a parse ranking
|
230
|
+
* score, based on the probabilites of observing the indicated set of
|
231
|
+
* disjuncts in the statistics database.
|
232
|
+
*
|
233
|
+
* The score is stored in the Linkage_info->corpus_cost struct member.
|
234
|
+
*
|
235
|
+
* The score is currently computed as the average -log_2 conditional
|
236
|
+
* probability p(d|w) of observing disjunct 'd', given word 'w'.
|
237
|
+
* Lower scores are better -- they indicate more likely parses.
|
238
|
+
*/
|
239
|
+
void lg_corpus_score(Sentence sent, Linkage_info *lifo)
|
240
|
+
{
|
241
|
+
const char *infword, *djstr;
|
242
|
+
double tot_score = 0.0f;
|
243
|
+
Corpus *corp = sent->dict->corpus;
|
244
|
+
int nwords = sent->length;
|
245
|
+
int w;
|
246
|
+
|
247
|
+
/* No-op if the database is not open */
|
248
|
+
if (NULL == corp->dbconn) return;
|
249
|
+
|
250
|
+
lg_compute_disjunct_strings(sent, lifo);
|
251
|
+
|
252
|
+
/* Decrement nwords, so as to ignore the RIGHT-WALL */
|
253
|
+
nwords --;
|
254
|
+
|
255
|
+
/* Loop over each word in the sentence (skipping LEFT-WALL, which is
|
256
|
+
* word 0. */
|
257
|
+
for (w=1; w<nwords; w++)
|
258
|
+
{
|
259
|
+
Disjunct *disj = sent->parse_info->chosen_disjuncts[w];
|
260
|
+
|
261
|
+
/* disj is NULL if word did not participate in parse */
|
262
|
+
if (NULL == disj)
|
263
|
+
{
|
264
|
+
tot_score += LOW_SCORE;
|
265
|
+
continue;
|
266
|
+
}
|
267
|
+
infword = disj->string;
|
268
|
+
djstr = lifo->disjunct_list_str[w];
|
269
|
+
tot_score += get_disjunct_score(corp, infword, djstr);
|
270
|
+
}
|
271
|
+
|
272
|
+
/* Decrement nwords, so as to ignore the LEFT-WALL */
|
273
|
+
--nwords;
|
274
|
+
tot_score /= nwords;
|
275
|
+
lifo->corpus_cost = tot_score;
|
276
|
+
}
|
277
|
+
|
278
|
+
/**
 * lg_corpus_disjunct_score -- score of the disjunct chosen for one word.
 *
 * Looks up the disjunct that the parse selected for word number 'w'
 * of the linkage's sentence and returns its score as reported by
 * get_disjunct_score().
 *
 * LOW_SCORE is returned when the database is not open, when 'w' is
 * not a valid word index for the sentence, or when the word did not
 * participate in the parse.
 */
double lg_corpus_disjunct_score(Linkage linkage, int w)
{
	const char *infword, *djstr;
	Sentence sent = linkage->sent;
	Linkage_info *lifo = linkage->info;
	Corpus *corp = sent->dict->corpus;
	Disjunct *disj;

	/* No-op if the database is not open */
	if (NULL == corp->dbconn) return LOW_SCORE;

	/* Refuse indexes outside the sentence, rather than reading
	 * outside the per-word arrays. */
	if (w < 0 || sent->length <= w) return LOW_SCORE;

	/* disj is NULL if word did not participate in parse */
	disj = sent->parse_info->chosen_disjuncts[w];
	if (NULL == disj) return LOW_SCORE;

	lg_compute_disjunct_strings(sent, lifo);

	infword = disj->string;
	djstr = lifo->disjunct_list_str[w];

	return get_disjunct_score(corp, infword, djstr);
}
|
302
|
+
|
303
|
+
/* ========================================================= */
|
304
|
+
|
305
|
+
/**
|
306
|
+
* lg_corpus_senses -- Given word and disjunct, look up senses.
|
307
|
+
*
|
308
|
+
* Given a particular disjunct for a word, look up its most
|
309
|
+
* likely sense assignments from the database.
|
310
|
+
*/
|
311
|
+
|
312
|
+
static Sense * lg_corpus_senses(Corpus *corp,
|
313
|
+
const char * inflected_word,
|
314
|
+
const char * disjunct,
|
315
|
+
int wrd)
|
316
|
+
{
|
317
|
+
double log_prob;
|
318
|
+
const unsigned char *sense;
|
319
|
+
Sense *sns, *head = NULL;
|
320
|
+
int rc;
|
321
|
+
|
322
|
+
/* Look up the disjunct in the database */
|
323
|
+
rc = sqlite3_bind_text(corp->sense_query, 1,
|
324
|
+
inflected_word, -1, SQLITE_STATIC);
|
325
|
+
if (rc != SQLITE_OK)
|
326
|
+
{
|
327
|
+
prt_error("Error: SQLite can't bind word in sense query: rc=%d \n", rc);
|
328
|
+
return NULL;
|
329
|
+
}
|
330
|
+
|
331
|
+
rc = sqlite3_bind_text(corp->sense_query, 2,
|
332
|
+
disjunct, -1, SQLITE_STATIC);
|
333
|
+
if (rc != SQLITE_OK)
|
334
|
+
{
|
335
|
+
prt_error("Error: SQLite can't bind disjunct in sense query: rc=%d \n", rc);
|
336
|
+
return NULL;
|
337
|
+
}
|
338
|
+
|
339
|
+
rc = sqlite3_step(corp->sense_query);
|
340
|
+
while (SQLITE_ROW == rc)
|
341
|
+
{
|
342
|
+
sense = sqlite3_column_text(corp->sense_query, 0);
|
343
|
+
log_prob = sqlite3_column_double(corp->sense_query, 1);
|
344
|
+
// printf ("Word=%s dj=%s sense=%s score=%f\n",
|
345
|
+
// inflected_word, disjunct, sense, log_prob);
|
346
|
+
|
347
|
+
sns = (Sense *) malloc(sizeof(Sense));
|
348
|
+
sns->next = head;
|
349
|
+
head = sns;
|
350
|
+
|
351
|
+
sns->inflected_word = inflected_word;
|
352
|
+
sns->disjunct = disjunct;
|
353
|
+
sns->sense = strdup(sense);
|
354
|
+
sns->score = log_prob;
|
355
|
+
sns->word = wrd;
|
356
|
+
|
357
|
+
/* Get the next row, if any */
|
358
|
+
rc = sqlite3_step(corp->sense_query);
|
359
|
+
}
|
360
|
+
|
361
|
+
/* Failure to do both a reset *and* a clear will cause subsequent
|
362
|
+
* binds tp fail. */
|
363
|
+
sqlite3_reset(corp->sense_query);
|
364
|
+
sqlite3_clear_bindings(corp->sense_query);
|
365
|
+
|
366
|
+
return head;
|
367
|
+
}
|
368
|
+
|
369
|
+
/* ========================================================= */
|
370
|
+
|
371
|
+
/**
|
372
|
+
* lg_corpus_linkage_senses -- Given a linkage, look up senses.
|
373
|
+
*
|
374
|
+
* Given a particular linakge, look up the most likely sense
|
375
|
+
* assignments from the database.
|
376
|
+
*
|
377
|
+
* This function is not used to guide the parsing process; it is
|
378
|
+
* only an informational look-up.
|
379
|
+
*/
|
380
|
+
|
381
|
+
void lg_corpus_linkage_senses(Linkage linkage)
|
382
|
+
{
|
383
|
+
const char * infword;
|
384
|
+
Sentence sent = linkage->sent;
|
385
|
+
Dictionary dict = sent->dict;
|
386
|
+
Corpus *corp = dict->corpus;
|
387
|
+
int nwords = sent->length;
|
388
|
+
Linkage_info *lifo = linkage->info;
|
389
|
+
int w;
|
390
|
+
|
391
|
+
if (lifo->sense_list) return;
|
392
|
+
|
393
|
+
/* Set up the disjunct strings first */
|
394
|
+
lg_compute_disjunct_strings(sent, lifo);
|
395
|
+
|
396
|
+
lifo->nwords = nwords;
|
397
|
+
lifo->sense_list = (Sense **) malloc(nwords * sizeof (Sense *));
|
398
|
+
memset(lifo->sense_list, 0, nwords * sizeof (Sense *));
|
399
|
+
|
400
|
+
/* Decrement nwords, so as to ignore the RIGHT-WALL */
|
401
|
+
nwords --;
|
402
|
+
|
403
|
+
/* Loop over each word in the sentence (skipping LEFT-WALL, which is
|
404
|
+
* word 0. */
|
405
|
+
for (w=1; w<nwords; w++)
|
406
|
+
{
|
407
|
+
Disjunct *disj = sent->parse_info->chosen_disjuncts[w];
|
408
|
+
|
409
|
+
/* disj is NULL if word did not participate in parse */
|
410
|
+
if (NULL == disj)
|
411
|
+
{
|
412
|
+
continue;
|
413
|
+
}
|
414
|
+
infword = disj->string;
|
415
|
+
|
416
|
+
lifo->sense_list[w] = lg_corpus_senses(corp, infword,
|
417
|
+
lifo->disjunct_list_str[w], w);
|
418
|
+
}
|
419
|
+
}
|
420
|
+
|
421
|
+
/* ========================================================= */
|
422
|
+
/* Return bits and pieces of the sense assignments */
|
423
|
+
|
424
|
+
/**
 * lg_get_word_sense -- return the sense list for one word.
 *
 * Returns the head of the Sense list stored for word number 'word',
 * or NULL if no sense list has been built, if 'word' is outside the
 * range of stored words, or if that word has no senses.
 */
Sense * lg_get_word_sense(Linkage_info *lifo, int word)
{
	if (!lifo->sense_list) return NULL;

	/* Reject indexes on either side of the array. */
	if (word < 0) return NULL;
	if (lifo->nwords <= word) return NULL;

	return lifo->sense_list[word];
}
|
430
|
+
|
431
|
+
/** Return the Sense that follows 'sns' in its list (the 'next' link). */
Sense * lg_sense_next(Sense *sns)
{
	Sense *following = sns->next;

	return following;
}
|
435
|
+
|
436
|
+
/** Return the word index recorded in 'sns'. */
int lg_sense_get_index(Sense *sns)
{
	int index = sns->word;

	return index;
}
|
440
|
+
|
441
|
+
/** Return the inflected (subscripted) word string recorded in 'sns'. */
const char * lg_sense_get_subscripted_word(Sense *sns)
{
	const char *word_str = sns->inflected_word;

	return word_str;
}
|
445
|
+
|
446
|
+
/** Return the disjunct string recorded in 'sns'. */
const char * lg_sense_get_disjunct(Sense *sns)
{
	const char *dj_str = sns->disjunct;

	return dj_str;
}
|
450
|
+
|
451
|
+
/** Return the sense string recorded in 'sns'. */
const char * lg_sense_get_sense(Sense *sns)
{
	const char *sense_str = sns->sense;

	return sense_str;
}
|
455
|
+
|
456
|
+
/** Return the score recorded in 'sns'. */
double lg_sense_get_score(Sense *sns)
{
	double value = sns->score;

	return value;
}
|
460
|
+
|
461
|
+
/**
 * lg_sense_delete -- release the sense lists held by a Linkage_info.
 *
 * Frees every Sense node in every per-word list, together with each
 * node's sense string, then frees the array of list heads and sets
 * the sense_list pointer to NULL.  Does nothing if no sense list is
 * present.
 */
void lg_sense_delete(Linkage_info *lifo)
{
	size_t count = lifo->nwords;
	size_t idx;

	if (NULL == lifo->sense_list) return;

	for (idx = 0; idx < count; idx++)
	{
		Sense *cur = lifo->sense_list[idx];

		while (NULL != cur)
		{
			/* Step forward before releasing the node. */
			Sense *doomed = cur;
			cur = cur->next;

			free(doomed->sense);
			free(doomed);
		}
	}

	free(lifo->sense_list);
	lifo->sense_list = NULL;
}
|
482
|
+
|
483
|
+
/* ======================= END OF FILE ===================== */
|