biblicit 1.0 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,867 @@
|
|
1
|
+
A Probabilistic Answer Type Model
|
2
|
+
Christopher Pinchak
|
3
|
+
Department of Computing Science
|
4
|
+
University of Alberta
|
5
|
+
Edmonton, Alberta, Canada
|
6
|
+
pinchak@cs.ualberta.ca
|
7
|
+
Dekang Lin
|
8
|
+
Google, Inc.
|
9
|
+
1600 Amphitheatre Parkway
|
10
|
+
Mountain View, CA
|
11
|
+
lindek@google.com
|
12
|
+
Abstract
|
13
|
+
All questions are implicitly associated
|
14
|
+
with an expected answer type. Unlike
|
15
|
+
previous approaches that require a prede-
|
16
|
+
fined set of question types, we present
|
17
|
+
a method for dynamically constructing
|
18
|
+
a probability-based answer type model
|
19
|
+
for each different question. Our model
|
20
|
+
evaluates the appropriateness of a poten-
|
21
|
+
tial answer by the probability that it fits
|
22
|
+
into the question contexts. Evaluation
|
23
|
+
is performed against manual and semi-
|
24
|
+
automatic methods using a fixed set of an-
|
25
|
+
swer labels. Results show our approach to
|
26
|
+
be superior for those questions classified
|
27
|
+
as having a miscellaneous answer type.
|
28
|
+
1 Introduction
|
29
|
+
Given a question, people are usually able to form
|
30
|
+
an expectation about the type of the answer, even
|
31
|
+
if they do not know the actual answer. An accu-
|
32
|
+
rate expectation of the answer type makes it much
|
33
|
+
easier to select the answer from a sentence that
|
34
|
+
contains the query words. Consider the question
|
35
|
+
“What is the capital of Norway?” We would ex-
|
36
|
+
pect the answer to be a city and could filter out
|
37
|
+
most of the words in the following sentence:
|
38
|
+
The landed aristocracy was virtually crushed
|
39
|
+
by Hakon V, who reigned from 1299 to 1319,
|
40
|
+
and Oslo became the capital of Norway, re-
|
41
|
+
placing Bergen as the principal city of the
|
42
|
+
kingdom.
|
43
|
+
The goal of answer typing is to determine
|
44
|
+
whether a word’s semantic type is appropriate as
|
45
|
+
an answer for a question. Many previous ap-
|
46
|
+
proaches to answer typing, e.g., (Ittycheriah et al.,
|
47
|
+
2001; Li and Roth, 2002; Krishnan et al., 2005),
|
48
|
+
employ a predefined set of answer types and use
|
49
|
+
supervised learning or manually constructed rules
|
50
|
+
to classify a question according to expected an-
|
51
|
+
swer type. A disadvantage of this approach is that
|
52
|
+
there will always be questions whose answers do
|
53
|
+
not belong to any of the predefined types.
|
54
|
+
Consider the question: “What are tourist attrac-
|
55
|
+
tions in Reims?” The answer may be many things:
|
56
|
+
a church, a historic residence, a park, a famous
|
57
|
+
intersection, a statue, etc. A common method to
|
58
|
+
deal with this problem is to define a catch-all class.
|
59
|
+
This class, however, tends not to be as effective as
|
60
|
+
other answer types.
|
61
|
+
Another disadvantage of predefined answer
|
62
|
+
types is with regard to granularity. If the types
|
63
|
+
are too specific, they are more difficult to tag. If
|
64
|
+
they are too general, too many candidates may be
|
65
|
+
identified as having the appropriate type.
|
66
|
+
In contrast to previous approaches that use a su-
|
67
|
+
pervised classifier to categorize questions into a
|
68
|
+
predefined set of types, we propose an unsuper-
|
69
|
+
vised method to dynamically construct a proba-
|
70
|
+
bilistic answer type model for each question. Such
|
71
|
+
a model can be used to evaluate whether or not
|
72
|
+
a word fits into the question context. For exam-
|
73
|
+
ple, given the question “What are tourist attrac-
|
74
|
+
tions in Reims?”, we would expect the appropriate
|
75
|
+
answers to fit into the context “X is a tourist attrac-
|
76
|
+
tion.” From a corpus, we can find the words that
|
77
|
+
appeared in this context, such as:
|
78
|
+
A-Ama Temple, Aborigine, addition, Anak
|
79
|
+
Krakatau, archipelago, area, baseball,
|
80
|
+
Bletchley Park, brewery, cabaret, Cairo,
|
81
|
+
Cape Town, capital, center, ...
|
82
|
+
Using the frequency counts of these words in
|
83
|
+
the context, we construct a probabilistic model
|
84
|
+
to compute P(in(w, Γ)|w), the probability for a
|
85
|
+
word w to occur in a set of contexts Γ, given an
|
86
|
+
occurrence of w. The parameters in this model are
|
87
|
+
obtained from a large, automatically parsed, un-
|
88
|
+
labeled corpus. By asking whether a word would
|
89
|
+
occur in a particular context extracted from a ques-
|
90
|
+
393
|
91
|
+
tion, we avoid explicitly specifying a list of pos-
|
92
|
+
sible answer types. This has the added benefit
|
93
|
+
of being easily adapted to different domains and
|
94
|
+
corpora in which a list of explicit possible answer
|
95
|
+
types may be difficult to enumerate and/or identify
|
96
|
+
within the text.
|
97
|
+
The remainder of this paper is organized as fol-
|
98
|
+
lows. Section 2 discusses the work related to an-
|
99
|
+
swer typing. Section 3 discusses some of the key
|
100
|
+
concepts employed by our probabilistic model, in-
|
101
|
+
cluding word clusters and the contexts of a ques-
|
102
|
+
tion and a word. Section 4 presents our probabilis-
|
103
|
+
tic model for answer typing. Section 5 compares
|
104
|
+
the performance of our model with that of an or-
|
105
|
+
acle and a semi-automatic system performing the
|
106
|
+
same task. Finally, the concluding remarks are
|
107
|
+
made in Section 6.
|
108
|
+
2 Related Work
|
109
|
+
Light et al. (2001) performed an analysis of the
|
110
|
+
effect of multiple answer type occurrences in a
|
111
|
+
sentence. When multiple words of the same type
|
112
|
+
appear in a sentence, answer typing with fixed
|
113
|
+
types must assign each the same score. Light et
|
114
|
+
al. found that even with perfect answer sentence
|
115
|
+
identification, question typing, and semantic tag-
|
116
|
+
ging, a system could only achieve 59% accuracy
|
117
|
+
over the TREC-9 questions when using their set of
|
118
|
+
24 non-overlapping answer types. By computing
|
119
|
+
the probability of an answer candidate occurring
|
120
|
+
in the question contexts directly, we avoid having
|
121
|
+
multiple candidates with the same level of appro-
|
122
|
+
priateness as answers.
|
123
|
+
There have been a variety of approaches to de-
|
124
|
+
termine the answer types, which are also known
|
125
|
+
as Qtargets (Echihabi et al., 2003). Most previous
|
126
|
+
approaches classify the answer type of a question
|
127
|
+
as one of a set of predefined types.
|
128
|
+
Many systems construct the classification rules
|
129
|
+
manually (Cui et al., 2004; Greenwood, 2004;
|
130
|
+
Hermjakob, 2001). The rules are usually triggered
|
131
|
+
by the presence of certain words in the question.
|
132
|
+
For example, if a question contains “author” then
|
133
|
+
the expected answer type is Person.
|
134
|
+
The number of answer types as well as the num-
|
135
|
+
ber of rules can vary a great deal. For example,
|
136
|
+
(Hermjakob, 2001) used 276 rules for 122 answer
|
137
|
+
types. Greenwood (2004), on the other hand, used
|
138
|
+
46 answer types with unspecified number of rules.
|
139
|
+
The classification rules can also be acquired
|
140
|
+
with supervised learning. Ittycheriah, et al. (2001)
|
141
|
+
describe a maximum entropy based question clas-
|
142
|
+
sification scheme to classify each question as hav-
|
143
|
+
ing one of the MUC answer types. In a similar ex-
|
144
|
+
periment, Li & Roth (2002) train a question clas-
|
145
|
+
sifier based on a modified version of SNoW using
|
146
|
+
a richer set of answer types than Ittycheriah et al.
|
147
|
+
The LCC system (Harabagiu et al., 2003) com-
|
148
|
+
bines fixed types with a novel loop-back strategy.
|
149
|
+
In the event that a question cannot be classified as
|
150
|
+
one of the fixed entity types or semantic concepts
|
151
|
+
derived from WordNet (Fellbaum, 1998), the an-
|
152
|
+
swer type model backs off to a logic prover that
|
153
|
+
uses axioms derived from WordNet, along with
|
154
|
+
logic rules, to justify phrases as answers. Thus, the
|
155
|
+
LCC system is able to avoid the use of a miscel-
|
156
|
+
laneous type that often exhibits poor performance.
|
157
|
+
However, the logic prover must have sufficient ev-
|
158
|
+
idence to link the question to the answer, and gen-
|
159
|
+
eral knowledge must be encoded as axioms into
|
160
|
+
the system. In contrast, our answer type model
|
161
|
+
derives all of its information automatically from
|
162
|
+
unannotated text.
|
163
|
+
Answer types are often used as filters. It was
|
164
|
+
noted in (Radev et al., 2002) that a wrong guess
|
165
|
+
about the answer type reduces the chance for the
|
166
|
+
system to answer the question correctly by as
|
167
|
+
much as 17 times. The approach presented here
|
168
|
+
is less brittle. Even if the correct candidate does
|
169
|
+
not have the highest likelihood according to the
|
170
|
+
model, it may still be selected when the answer
|
171
|
+
extraction module takes into account other factors
|
172
|
+
such as the proximity to the matched keywords.
|
173
|
+
Furthermore, a probabilistic model makes it eas-
|
174
|
+
ier to integrate the answer type scores with scores
|
175
|
+
computed by other components in a question an-
|
176
|
+
swering system in a principled fashion.
|
177
|
+
3 Resources
|
178
|
+
Before introducing our model, we first describe
|
179
|
+
the resources used in the model.
|
180
|
+
3.1 Word Clusters
|
181
|
+
Natural language data is extremely sparse. Word
|
182
|
+
clusters are a way of coping with data sparseness
|
183
|
+
by abstracting a given word to a class of related
|
184
|
+
words. Clusters, as used by our probabilistic an-
|
185
|
+
swer typing system, play a role similar to that of
|
186
|
+
named entity types. Many methods exist for clus-
|
187
|
+
tering, e.g., (Brown et al., 1990; Cutting et al.,
|
188
|
+
1992; Pereira et al., 1993; Karypis et al., 1999).
|
189
|
+
We used the Clustering By Committee (CBC)
|
190
|
+
394
|
191
|
+
Table 1: Words and their clusters
|
192
|
+
Word Clusters
|
193
|
+
suite software, network, wireless, ...
|
194
|
+
rooms, bathrooms, restrooms, ...
|
195
|
+
meeting room, conference room, ...
|
196
|
+
ghost rabbit, squirrel, duck, elephant, frog, ...
|
197
|
+
goblins, ghosts, vampires, ghouls, ...
|
198
|
+
punk, reggae, folk, pop, hip-hop, ...
|
199
|
+
huge, larger, vast, significant, ...
|
200
|
+
coming-of-age, true-life, ...
|
201
|
+
clouds, cloud, fog, haze, mist, ...
|
202
|
+
algorithm (Pantel and Lin, 2002) on a 10 GB En-
|
203
|
+
glish text corpus to obtain 3607 clusters. The fol-
|
204
|
+
lowing is an example cluster generated by CBC:
|
205
|
+
tension, anger, anxiety, tensions, frustration,
|
206
|
+
resentment, uncertainty, confusion, conflict,
|
207
|
+
discontent, insecurity, controversy, unease,
|
208
|
+
bitterness, dispute, disagreement, nervous-
|
209
|
+
ness, sadness, despair, animosity, hostility,
|
210
|
+
outrage, discord, pessimism, anguish, ...
|
211
|
+
In the clustering generated by CBC, a word may
|
212
|
+
belong to multiple clusters. The clusters to which
|
213
|
+
a word belongs often represent the senses of the
|
214
|
+
word. Table 1 shows two example words and their
|
215
|
+
clusters.
|
216
|
+
3.2 Contexts
|
217
|
+
The context in which a word appears often im-
|
218
|
+
poses constraints on the semantic type of the word.
|
219
|
+
This basic idea has been exploited by many pro-
|
220
|
+
posals for distributional similarity and clustering,
|
221
|
+
e.g., (Church and Hanks, 1989; Lin, 1998; Pereira
|
222
|
+
et al., 1993).
|
223
|
+
Similar to Lin and Pantel (2001), we define
|
224
|
+
the contexts of a word to be the undirected paths
|
225
|
+
in dependency trees involving that word at either
|
226
|
+
the beginning or the end. The following diagram
|
227
|
+
shows an example dependency tree:
|
228
|
+
Which city hosted the 1988 Winter Olympics?
|
229
|
+
det subj
|
230
|
+
obj
|
231
|
+
NN
|
232
|
+
NN
|
233
|
+
det
|
234
|
+
The links in the tree represent dependency rela-
|
235
|
+
tionships. The direction of a link is from the head
|
236
|
+
to the modifier in the relationship. Labels associ-
|
237
|
+
ated with the links represent types of relations.
|
238
|
+
In a context, the word itself is replaced with a
|
239
|
+
variable X. We say a word is the filler of a context
|
240
|
+
if it replaces X. For example, the contexts for the
|
241
|
+
word “Olympics” in the above sentence include
|
242
|
+
the following paths:
|
243
|
+
Context of “Olympics” Explanation
|
244
|
+
X Winter
|
245
|
+
NN
|
246
|
+
Winter X
|
247
|
+
X 1988
|
248
|
+
NN
|
249
|
+
1988 X
|
250
|
+
X host
|
251
|
+
obj
|
252
|
+
host X
|
253
|
+
X host
|
254
|
+
obj
|
255
|
+
city
|
256
|
+
subj
|
257
|
+
city hosted X
|
258
|
+
In these paths, words are reduced to their root
|
259
|
+
forms and proper names are reduced to their entity
|
260
|
+
tags (we used MUC7 named entity tags).
|
261
|
+
Paths allow us to balance the specificity of con-
|
262
|
+
texts and the sparseness of data. Longer paths typ-
|
263
|
+
ically impose stricter constraints on the slot fillers.
|
264
|
+
However, they tend to have fewer occurrences,
|
265
|
+
making them more prone to errors arising from
|
266
|
+
data sparseness. We have restricted the path length
|
267
|
+
to two (involving at most three words) and require
|
268
|
+
the two ends of the path to be nouns.
|
269
|
+
We parsed the AQUAINT corpus (3GB) with
|
270
|
+
Minipar (Lin, 2001) and collected the frequency
|
271
|
+
counts of words appearing in various contexts.
|
272
|
+
Parsing and database construction is performed
|
273
|
+
off-line as the database is identical for all ques-
|
274
|
+
tions. We extracted 527,768 contexts that ap-
|
275
|
+
peared at least 25 times in the corpus. An example
|
276
|
+
context and its fillers are shown in Figure 1.
|
277
|
+
X host Olympics
|
278
|
+
subj obj
|
279
|
+
Africa 2 grant 1 readiness 2
|
280
|
+
AP 1 he 2 Rio de Janeiro 1
|
281
|
+
Argentina 1 homeland 3 Rome 1
|
282
|
+
Athens 16 IOC 1 Salt Lake City 2
|
283
|
+
Atlanta 3 Iran 2 school 1
|
284
|
+
Bangkok 1 Jakarta 1 S. Africa 1
|
285
|
+
. .. . .. . . .
|
286
|
+
decades 1 president 2 Zakopane 4
|
287
|
+
facility 1 Pusan 1
|
288
|
+
government 1 race 1
|
289
|
+
Figure 1: An example context and its fillers
|
290
|
+
3.2.1 Question Contexts
|
291
|
+
To build a probabilistic model for answer typ-
|
292
|
+
ing, we extract a set of contexts, called question
|
293
|
+
contexts, from a question. An answer is expected
|
294
|
+
to be a plausible filler of the question contexts.
|
295
|
+
Question contexts are extracted from a question
|
296
|
+
with two rules. First, if the wh-word in a ques-
|
297
|
+
tion has a trace in the parse tree, the question con-
|
298
|
+
texts are the contexts of the trace. For example, the
|
299
|
+
395
|
300
|
+
question “What do most tourists visit in Reims?”
|
301
|
+
is parsed as:
|
302
|
+
Whati
|
303
|
+
do most tourists visit ei
|
304
|
+
in Reims?
|
305
|
+
det
|
306
|
+
i
|
307
|
+
subj
|
308
|
+
det
|
309
|
+
obj
|
310
|
+
in
|
311
|
+
The symbol ei is the trace of whati. Minipar
|
312
|
+
generates the trace to indicate that the word what
|
313
|
+
is the object of visit in the deep structure of the
|
314
|
+
sentence. The following question contexts are ex-
|
315
|
+
tracted from the above question:
|
316
|
+
Context Explanation
|
317
|
+
X visit tourist
|
318
|
+
obj subj
|
319
|
+
tourist visits X
|
320
|
+
X visit Reims
|
321
|
+
obj in
|
322
|
+
visit X in Reims
|
323
|
+
The second rule deals with situations where
|
324
|
+
the wh-word is a determiner, as in the question
|
325
|
+
“Which city hosted the 1988 Winter Olympics?”
|
326
|
+
(the parse tree for which is shown in section 3.2).
|
327
|
+
In such cases, the question contexts consist of a
|
328
|
+
single context involving the noun that is modified
|
329
|
+
by the determiner. The context for the above sen-
|
330
|
+
tence is X city
|
331
|
+
subj
|
332
|
+
, corresponding to the sentence
|
333
|
+
“X is a city.” This context is used because the
|
334
|
+
question explicitly states that the desired answer is
|
335
|
+
a city. The context overrides the other contexts be-
|
336
|
+
cause the question explicitly states the desired an-
|
337
|
+
swer type. Experimental results have shown that
|
338
|
+
using this context in conjunction with other con-
|
339
|
+
texts extracted from the question produces lower
|
340
|
+
performance than using this context alone.
|
341
|
+
In the event that a context extracted from a ques-
|
342
|
+
tion is not found in the database, we shorten the
|
343
|
+
context in one of two ways. We start by replac-
|
344
|
+
ing the word at the end of the path with a wildcard
|
345
|
+
that matches any word. If this fails to yield en-
|
346
|
+
tries in the context database, we shorten the con-
|
347
|
+
text to length one and replace the end word with
|
348
|
+
automatically determined similar words instead of
|
349
|
+
a wildcard.
|
350
|
+
3.2.2 Candidate Contexts
|
351
|
+
Candidate contexts are very similar in form to
|
352
|
+
question contexts, save for one important differ-
|
353
|
+
ence. Candidate contexts are extracted from the
|
354
|
+
parse trees of the answer candidates rather than the
|
355
|
+
question. In natural language, some words may
|
356
|
+
be polysemous. For example, Washington may re-
|
357
|
+
fer to a person, a city, or a state. The occurrences
|
358
|
+
of Washington in “Washington’s descendants” and
|
359
|
+
“suburban Washington” should not be given the
|
360
|
+
same score when the question is seeking a loca-
|
361
|
+
tion. Given that the sense of a word is largely de-
|
362
|
+
termined by its local context (Choueka and Lusig-
|
363
|
+
nan, 1985), candidate contexts allow the model to
|
364
|
+
take into account the candidate answers’ senses
|
365
|
+
implicitly.
|
366
|
+
4 Probabilistic Model
|
367
|
+
The goal of an answer typing model is to evalu-
|
368
|
+
ate the appropriateness of a candidate word as an
|
369
|
+
answer to the question. If we assume that a set
|
370
|
+
of answer candidates is provided to our model by
|
371
|
+
some means (e.g., words comprising documents
|
372
|
+
extracted by an information retrieval engine), we
|
373
|
+
wish to compute the value P(in(w, ΓQ)|w). That
|
374
|
+
is, the appropriateness of a candidate answer w is
|
375
|
+
proportional to the probability that it will occur in
|
376
|
+
the question contexts ΓQ extracted from the ques-
|
377
|
+
tion.
|
378
|
+
To mitigate data sparseness, we can introduce
|
379
|
+
a hidden variable C that represents the clusters to
|
380
|
+
which the candidate answer may belong. As a can-
|
381
|
+
didate may belong to multiple clusters, we obtain:
|
382
|
+
P(in(w, ΓQ)|w) =
|
383
|
+
X
|
384
|
+
C
|
385
|
+
P(in(w, ΓQ), C|w) (1)
|
386
|
+
=
|
387
|
+
X
|
388
|
+
C
|
389
|
+
P(C|w)P(in(w, ΓQ)|C, w) (2)
|
390
|
+
Given that a word appears, we assume that it has
|
391
|
+
the same probability to appear in a context as all
|
392
|
+
other words in the same cluster. Therefore:
|
393
|
+
P(in(w, ΓQ)|C, w) ≈ P(in(C, ΓQ)|C) (3)
|
394
|
+
We can now rewrite the equation in (2) as:
|
395
|
+
P(in(w, ΓQ)|w) ≈
|
396
|
+
X
|
397
|
+
C
|
398
|
+
P(C|w)P(in(C, ΓQ)|C) (4)
|
399
|
+
This equation splits our model into two parts:
|
400
|
+
one models which clusters a word belongs to and
|
401
|
+
the other models how appropriate a cluster is to
|
402
|
+
the question contexts. When ΓQ consists of multi-
|
403
|
+
ple contexts, we make the naïve Bayes assumption
|
404
|
+
that each individual context γQ ∈ ΓQ is indepen-
|
405
|
+
dent of all other contexts given the cluster C.
|
406
|
+
P(in(w, ΓQ)|w) ≈
|
407
|
+
X
|
408
|
+
C
|
409
|
+
P(C|w)
|
410
|
+
Y
|
411
|
+
γQ∈ΓQ
|
412
|
+
P(in(C, γQ)|C) (5)
|
413
|
+
Equation (5) needs the parameters P(C|w) and
|
414
|
+
P(in(C, γQ)|C), neither of which are directly
|
415
|
+
available from the context-filler database. We will
|
416
|
+
discuss the estimation of these parameters in Sec-
|
417
|
+
tion 4.2.
|
418
|
+
396
|
419
|
+
4.1 Using Candidate Contexts
|
420
|
+
The previous model assigns the same likelihood to
|
421
|
+
every instance of a given word. As we noted in
|
422
|
+
section 3.2.2, a word may be polysemous. To take
|
423
|
+
into account a word’s context, we can instead com-
|
424
|
+
pute P(in(w, ΓQ)|w, in(w, Γw)), where Γw is the
|
425
|
+
set of contexts for the candidate word w in a re-
|
426
|
+
trieved passage.
|
427
|
+
By introducing word clusters as intermediate
|
428
|
+
variables as before and making a similar assump-
|
429
|
+
tion as in equation (3), we obtain:
|
430
|
+
P(in(w, ΓQ)|w, in(w, Γw))
|
431
|
+
=
|
432
|
+
X
|
433
|
+
C
|
434
|
+
P(in(w, ΓQ), C|w, in(w, Γw)) (6)
|
435
|
+
≈
|
436
|
+
X
|
437
|
+
C
|
438
|
+
P(C|w, in(w, Γw))P(in(C, ΓQ)|C) (7)
|
439
|
+
Like equation (4), equation (7) partitions the
|
440
|
+
model into two parts. Unlike P(C|w) in equation
|
441
|
+
(4), the probability of the cluster is now based on
|
442
|
+
the particular occurrence of the word in the candi-
|
443
|
+
date contexts. It can be estimated by:
|
444
|
+
P(C|w, in(w, Γw))
|
445
|
+
=
|
446
|
+
P(in(w, Γw)|w, C)P(w, C)
|
447
|
+
P(in(w, Γw)|w)P(w)
|
448
|
+
(8)
|
449
|
+
≈
|
450
|
+
Y
|
451
|
+
γw∈Γw
|
452
|
+
P(in(w, γw)|w, C)
|
453
|
+
Y
|
454
|
+
γw∈Γw
|
455
|
+
P(in(w, γw)|w)
|
456
|
+
× P(C|w) (9)
|
457
|
+
=
|
458
|
+
Y
|
459
|
+
γw∈Γw
|
460
|
+
„
|
461
|
+
P(C|w, in(w, γw))
|
462
|
+
P(C|w)
|
463
|
+
«
|
464
|
+
× P(C|w) (10)
|
465
|
+
4.2 Estimating Parameters
|
466
|
+
Our probabilistic model requires the parameters
|
467
|
+
P(C|w), P(C|w, in(w, γ)), and P(in(C, γ)|C),
|
468
|
+
where w is a word, C is a cluster that w belongs to,
|
469
|
+
and γ is a question or candidate context. This sec-
|
470
|
+
tion explains how these parameters are estimated
|
471
|
+
without using labeled data.
|
472
|
+
The context-filler database described in Sec-
|
473
|
+
tion 3.2 provides the joint and marginal fre-
|
474
|
+
quency counts of contexts and words (|in(γ, w)|,
|
475
|
+
|in(∗, γ)| and |in(w, ∗)|). These counts al-
|
476
|
+
low us to compute the probabilities P(in(w, γ)),
|
477
|
+
P(in(w, ∗)), and P(in(∗, γ)). We can also com-
|
478
|
+
pute P(in(w, γ)|w), which is smoothed with add-
|
479
|
+
one smoothing (see equation (11) in Figure 2).
|
480
|
+
The estimation of P(C|w) presents a challenge.
|
481
|
+
We have no corpus from which we can directly
|
482
|
+
measure P(C|w) because word instances are not
|
483
|
+
labeled with their clusters.
|
484
|
+
P(in(w, γ)|w) =
|
485
|
+
|in(w, γ)| + P(in(∗, γ))
|
486
|
+
|in(w, ∗)| + 1
|
487
|
+
(11)
|
488
|
+
Pu(C|w) =
|
489
|
+
(
|
490
|
+
1
|
491
|
+
|{C |w∈C }|
|
492
|
+
if w ∈ C,
|
493
|
+
0 otherwise
|
494
|
+
(12)
|
495
|
+
P(C|w) =
|
496
|
+
X
|
497
|
+
w ∈S(w)
|
498
|
+
sim(w, w ) × Pu(C|w )
|
499
|
+
X
|
500
|
+
{C |w∈C },
|
501
|
+
w ∈S(w)
|
502
|
+
sim(w, w ) × Pu(C |w )
|
503
|
+
(13)
|
504
|
+
P(in(C, γ)|C) =
|
505
|
+
X
|
506
|
+
w ∈C
|
507
|
+
P(C|w ) × |in(w , γ)| + P(in(∗, γ))
|
508
|
+
X
|
509
|
+
w ∈C
|
510
|
+
P(C|w ) × |in(w , ∗)| + 1
|
511
|
+
(14)
|
512
|
+
Figure 2: Probability estimation
|
513
|
+
We use the average weighted “guesses” of the
|
514
|
+
top similar words of w to compute P(C|w) (see
|
515
|
+
equation 13). The intuition is that if w and w
|
516
|
+
are similar words, P(C|w ) and P(C|w) tend
|
517
|
+
to have similar values. Since we do not know
|
518
|
+
P(C|w ) either, we substitute it with uniform dis-
|
519
|
+
tribution Pu(C|w ) as in equation (12) of Fig-
|
520
|
+
ure 2. Although Pu(C|w ) is a very crude guess,
|
521
|
+
the weighted average of a set of such guesses can
|
522
|
+
often be quite accurate.
|
523
|
+
The similarities between words are obtained as
|
524
|
+
a byproduct of the CBC algorithm. For each word,
|
525
|
+
we use S(w) to denote the top-n most similar
|
526
|
+
words (n=50 in our experiments) and sim(w, w )
|
527
|
+
to denote the similarity between words w and w .
|
528
|
+
The following is a sample similar word list for the
|
529
|
+
word suit:
|
530
|
+
S(suit) = {lawsuit 0.49, suits 0.47, com-
|
531
|
+
plaint 0.29, lawsuits 0.27, jacket 0.25, coun-
|
532
|
+
tersuit 0.24, counterclaim 0.24, pants 0.24,
|
533
|
+
trousers 0.22, shirt 0.21, slacks 0.21, case
|
534
|
+
0.21, pantsuit 0.21, shirts 0.20, sweater 0.20,
|
535
|
+
coat 0.20, ...}
|
536
|
+
The estimation for P(C|w, in(w, γw)) is sim-
|
537
|
+
ilar to that of P(C|w) except that instead of all
|
538
|
+
w ∈ S(w), we instead use {w |w ∈ S(w) ∧
|
539
|
+
in(w , γw)}. By only looking at a particular con-
|
540
|
+
text γw, we may obtain a different distribution over
|
541
|
+
C than P(C|w) specifies. In the event that the data
|
542
|
+
are too sparse to estimate P(C|w, in(w, γw)), we
|
543
|
+
fall back to using P(C|w).
|
544
|
+
P(in(C, γ)|C) is computed in (14) by assum-
|
545
|
+
ing each instance of w contains a fractional in-
|
546
|
+
stance of C and the fractional count is P(C|w).
|
547
|
+
Again, add-one smoothing is used.
|
548
|
+
397
|
549
|
+
System Median % Top 1% Top 5% Top 10% Top 50%
|
550
|
+
Oracle 0.7% 89 (57%) 123 (79%) 131 (85%) 154 (99%)
|
551
|
+
Frequency 7.7% 31 (20%) 67 (44%) 86 (56%) 112 (73%)
|
552
|
+
Our model 1.2% 71 (46%) 106 (69%) 119 (77%) 146 (95%)
|
553
|
+
no cand. contexts 2.2% 58 (38%) 102 (66%) 113 (73%) 145 (94%)
|
554
|
+
ANNIE 4.0% 54 (35%) 79 (51%) 93 (60%) 123 (80%)
|
555
|
+
Table 2: Summary of Results
|
556
|
+
5 Experimental Setup & Results
|
557
|
+
We evaluate our answer typing system by using
|
558
|
+
it to filter the contents of documents retrieved by
|
559
|
+
the information retrieval portion of a question an-
|
560
|
+
swering system. Each answer candidate in the set
|
561
|
+
of documents is scored by the answer typing sys-
|
562
|
+
tem and the list is sorted in descending order of
|
563
|
+
score. We treat the system as a filter and observe
|
564
|
+
the proportion of candidates that must be accepted
|
565
|
+
by the filter so that at least one correct answer is
|
566
|
+
accepted. A model that allows a low percentage
|
567
|
+
of candidates to pass while still allowing at least
|
568
|
+
one correct answer through is favorable to a model
|
569
|
+
in which a high number of candidates must pass.
|
570
|
+
This represents an intrinsic rather than extrinsic
|
571
|
+
evaluation (Mollá and Hutchinson, 2003) that we
|
572
|
+
believe illustrates the usefulness of our model.
|
573
|
+
The evaluation data consist of 154 questions
|
574
|
+
from the TREC-2003 QA Track (Voorhees, 2003)
|
575
|
+
satisfying the following criteria, along with the top
|
576
|
+
10 documents returned for each question as iden-
|
577
|
+
tified by NIST using the PRISE1 search engine.
|
578
|
+
• the question begins with What, Which, or
|
579
|
+
Who. We restricted the evaluation to such ques-
|
580
|
+
tions because our system is designed to deal
|
581
|
+
with questions whose answer types are often
|
582
|
+
semantically open-ended noun phrases.
|
583
|
+
• There exists an entry for the question in the an-
|
584
|
+
swer patterns provided by Ken Litkowski2.
|
585
|
+
• One of the top-10 documents returned by
|
586
|
+
PRISE contains a correct answer.
|
587
|
+
We compare the performance of our prob-
|
588
|
+
abilistic model with that of two other sys-
|
589
|
+
tems. Both comparison systems make use of a
|
590
|
+
small, predefined set of manually-assigned MUC-
|
591
|
+
7 named-entity types (location, person, organiza-
|
592
|
+
tion, cardinal, percent, date, time, duration, mea-
|
593
|
+
sure, money) augmented with thing-name (proper
|
594
|
+
1
|
595
|
+
www.itl.nist.gov/iad/894.02/works/papers/zp2/zp2.html
|
596
|
+
2
|
597
|
+
trec.nist.gov/data/qa/2003 qadata/03QA.tasks/t12.pats.txt
|
598
|
+
names of inanimate objects) and miscellaneous
|
599
|
+
(a catch-all answer type of all other candidates).
|
600
|
+
Some examples of thing-name are Guinness Book
|
601
|
+
of World Records, Thriller, Mars Pathfinder, and
|
602
|
+
Grey Cup. Examples of miscellaneous answers are
|
603
|
+
copper, oil, red, and iris.
|
604
|
+
The difference between the comparison systems is
|
605
|
+
with respect to how entity types are assigned to the
|
606
|
+
words in the candidate documents. We make use
|
607
|
+
of the ANNIE (Maynard et al., 2002) named entity
|
608
|
+
recognition system, along with a manually assigned
|
609
|
+
“oracle” strategy, to assign types to candidate an-
|
610
|
+
swers. In each case, the score for a candidate is
|
611
|
+
either 1 if it is tagged as the same type as the ques-
|
612
|
+
tion or 0 otherwise. With this scoring scheme pro-
|
613
|
+
ducing a sorted list we can compute the probability
|
614
|
+
of the first correct answer appearing at rank R = k
|
615
|
+
as follows:
|
616
|
+
P(R = k) =
|
617
|
+
k−2Y
|
618
|
+
i=0
|
619
|
+
„
|
620
|
+
t − c − i
|
621
|
+
t − i
|
622
|
+
«
|
623
|
+
c
|
624
|
+
t − k + 1
|
625
|
+
(15)
|
626
|
+
where t is the number of unique candidate answers
|
627
|
+
that are of the appropriate type and c is the number
|
628
|
+
of unique candidate answers that are correct.
|
629
|
+
Using the probabilities in equation (15), we
|
630
|
+
compute the expected rank, E(R), of the first cor-
|
631
|
+
rect answer of a given question in the system as:
|
632
|
+
E(R) =
|
633
|
+
t−c+1X
|
634
|
+
k=1
|
635
|
+
kP(R = k) (16)
|
636
|
+
Answer candidates are the set of ANNIE-
|
637
|
+
identified tokens with stop words and punctuation
|
638
|
+
removed. This yields between 900 and 8000 can-
|
639
|
+
didates for each question, depending on the top 10
|
640
|
+
documents returned by PRISE. The oracle system
|
641
|
+
represents an upper bound on using the predefined
|
642
|
+
set of answer types. The ANNIE system repre-
|
643
|
+
sents a more realistic expectation of performance.
|
644
|
+
The median percentage of candidates that are
|
645
|
+
accepted by a filter over the questions of our eval-
|
646
|
+
uation data provides one measure of performance
|
647
|
+
and is preferred to the average because of the ef-
|
648
|
+
fect of large values on the average. In QA, a sys-
|
649
|
+
tem accepting 60% of the candidates is not signif-
|
650
|
+
icantly better or worse than one accepting 100%,
|
651
|
+
398
|
652
|
+
System Measure
|
653
|
+
Question Type
|
654
|
+
All Location Person Organization Thing-Name Misc Other
|
655
|
+
(154) (57) (17) (19) (17) (37) (7)
|
656
|
+
Our model
|
657
|
+
Median 1.2% 0.8% 2.0% 1.3% 3.7% 3.5% 12.2%
|
658
|
+
Top 1% 71 34 6 9 7 13 2
|
659
|
+
Top 5% 106 53 11 11 10 19 2
|
660
|
+
Top 10% 119 55 12 17 10 22 3
|
661
|
+
Top 50% 146 56 16 18 17 34 5
|
662
|
+
Oracle
|
663
|
+
Median 0.7% 0.4% 1.0% 0.3% 0.4% 16.0% 0.3%
|
664
|
+
Top 1% 89 44 8 16 14 1 6
|
665
|
+
Top 5% 123 57 17 19 17 6 7
|
666
|
+
Top 10% 131 57 17 19 17 14 7
|
667
|
+
Top 50% 154 57 17 19 17 37 7
|
668
|
+
ANNIE
|
669
|
+
Median 4.0% 0.6% 1.4% 6.1% 100% 16.7% 50.0%
|
670
|
+
Top 1% 54 39 5 7 0 0 3
|
671
|
+
Top 5% 79 53 12 9 0 2 3
|
672
|
+
Top 10% 93 54 13 11 0 12 3
|
673
|
+
Top 50% 123 56 16 15 5 28 3
|
674
|
+
Table 3: Detailed breakdown of performance
|
675
|
+
but the effect on average is quite high. Another
|
676
|
+
measure is to observe the number of questions
|
677
|
+
with at least one correct answer in the top N% for
|
678
|
+
various values of N. By examining the number of
|
679
|
+
correct answers found in the top N% we can better
|
680
|
+
understand what an effective cutoff would be.
|
681
|
+
The overall results of our comparison can be
|
682
|
+
found in Table 2. We have added the results of
|
683
|
+
a system that scores candidates based on their fre-
|
684
|
+
quency within the document as a comparison with
|
685
|
+
a simple, yet effective, strategy. The second col-
|
686
|
+
umn is the median percentage of where the highest
|
687
|
+
scored correct answer appears in the sorted candi-
|
688
|
+
date list. Low percentage values mean the answer
|
689
|
+
is usually found high in the sorted list. The re-
|
690
|
+
maining columns list the number of questions that
|
691
|
+
have a correct answer somewhere in the top N%
|
692
|
+
of their sorted lists. This is meant to show the ef-
|
693
|
+
fects of imposing a strict cutoff prior to running
|
694
|
+
the answer type model.
|
695
|
+
The oracle system performs best, as it bene-
|
696
|
+
fits from both manual question classification and
|
697
|
+
manual entity tagging. If entity assignment is
|
698
|
+
performed by an automatic system (as it is for
|
699
|
+
ANNIE), the performance drops noticeably. Our
|
700
|
+
probabilistic model performs better than ANNIE
|
701
|
+
and achieves approximately 2/3 of the perfor-
|
702
|
+
mance of the oracle system. Table 2 also shows
|
703
|
+
that the use of candidate contexts increases the
|
704
|
+
performance of our answer type model.
|
705
|
+
Table 3 shows the performance of the oracle
|
706
|
+
system, our model, and the ANNIE system broken
|
707
|
+
down by manually-assigned answer types. Due
|
708
|
+
to insufficient numbers of questions, the cardinal,
|
709
|
+
percent, time, duration, measure, and money types
|
710
|
+
are combined into an “Other” category. When
|
711
|
+
compared with the oracle system, our model per-
|
712
|
+
forms worse overall for questions of all types ex-
|
713
|
+
cept for those seeking miscellaneous answers. For
|
714
|
+
miscellaneous questions, the oracle identifies all
|
715
|
+
tokens that do not belong to one of the other
|
716
|
+
known categories as possible answers. For all
|
717
|
+
questions of non-miscellaneous type, only a small
|
718
|
+
subset of the candidates are marked appropriate.
|
719
|
+
In particular, our model performs worse than the
|
720
|
+
oracle for questions seeking persons and thing-
|
721
|
+
names. Person questions often seek rare person
|
722
|
+
names, which occur in few contexts and are diffi-
|
723
|
+
cult to reliably cluster. Thing-name questions are
|
724
|
+
easy for a human to identify but difficult for an au-
|
725
|
+
tomatic system to identify. Thing-names are a di-
|
726
|
+
verse category and are not strongly associated with
|
727
|
+
any identifying contexts.
|
728
|
+
Our model outperforms the ANNIE system in
|
729
|
+
general, and for questions seeking organizations,
|
730
|
+
thing-names, and miscellaneous targets in partic-
|
731
|
+
ular. ANNIE may have low coverage on organi-
|
732
|
+
zation names, resulting in reduced performance.
|
733
|
+
Like the oracle, ANNIE treats all candidates not
|
734
|
+
assigned one of the categories as appropriate for
|
735
|
+
miscellaneous questions. Because ANNIE cannot
|
736
|
+
identify thing-names, they are treated as miscella-
|
737
|
+
neous. ANNIE shows low performance on thing-
|
738
|
+
names because words incorrectly assigned types
|
739
|
+
are sorted to the bottom of the list for miscella-
|
740
|
+
neous and thing-name questions. If a correct an-
|
741
|
+
swer is incorrectly assigned a type it will be sorted
|
742
|
+
near the bottom, resulting in a poor score.
|
743
|
+
399
|
744
|
+
6 Conclusions
|
745
|
+
We have presented an unsupervised probabilistic
|
746
|
+
answer type model. Our model uses contexts de-
|
747
|
+
rived from the question and the candidate answer
|
748
|
+
to calculate the appropriateness of a candidate an-
|
749
|
+
swer. Statistics gathered from a large corpus of
|
750
|
+
text are used in the calculation, and the model is
|
751
|
+
constructed to exploit these statistics without be-
|
752
|
+
ing overly specific or overly general.
|
753
|
+
The method presented here avoids the use of an
|
754
|
+
explicit list of answer types. Explicit answer types
|
755
|
+
can exhibit poor performance, especially for those
|
756
|
+
questions not fitting one of the types. They must
|
757
|
+
also be redefined when either the domain or corpus
|
758
|
+
substantially changes. By avoiding their use, our
|
759
|
+
answer typing method may be easier to adapt to
|
760
|
+
different corpora and question answering domains
|
761
|
+
(such as bioinformatics).
|
762
|
+
In addition to operating as a stand-alone answer
|
763
|
+
typing component, our system can be combined
|
764
|
+
with other existing answer typing strategies, es-
|
765
|
+
pecially in situations in which a catch-all answer
|
766
|
+
type is used. Our experimental results show that
|
767
|
+
our probabilistic model outperforms the oracle and
|
768
|
+
a system using automatic named entity recognition
|
769
|
+
under such circumstances. The performance of
|
770
|
+
our model is better than that of the semi-automatic
|
771
|
+
system, which is a better indication of the expected
|
772
|
+
performance of a comparable real-world answer
|
773
|
+
typing system.
|
774
|
+
Acknowledgments
|
775
|
+
The authors would like to thank the anonymous re-
|
776
|
+
viewers for their helpful comments on improving
|
777
|
+
the paper. The first author is supported by the Nat-
|
778
|
+
ural Sciences and Engineering Research Council
|
779
|
+
of Canada, the Alberta Ingenuity Fund, and the Al-
|
780
|
+
berta Informatics Circle of Research Excellence.
|
781
|
+
References
|
782
|
+
P.F. Brown, V.J. Della Pietra, P.V. deSouza, J.C. Lai, and R.L.
|
783
|
+
Mercer. 1990. Class-based n-gram Models of Natural
|
784
|
+
Language. Computational Linguistics, 16(2):79–85.
|
785
|
+
Y. Choueka and S. Lusignan. 1985. Disambiguation by Short
|
786
|
+
Contexts. Computer and the Humanities, 19:147–157.
|
787
|
+
K. Church and P. Hanks. 1989. Word Association Norms,
|
788
|
+
Mutual Information, and Lexicography. In Proceedings
|
789
|
+
of ACL-89, pages 76–83, Vancouver, British Columbia,
|
790
|
+
Canada.
|
791
|
+
H. Cui, K. Li, R. Sun, T-S. Chua, and M-Y. Kan. 2004. Na-
|
792
|
+
tional University of Singapore at the TREC-13 Question
|
793
|
+
Answering Main Task. In Notebook of TREC 2004, pages
|
794
|
+
34–42, Gaithersburg, Maryland.
|
795
|
+
D.R. Cutting, D. Karger, J. Pedersen, and J.W. Tukey. 1992.
|
796
|
+
Scatter/Gather: A Cluster-based Approach to Browsing
|
797
|
+
Large Document Collections. In Proceedings of SIGIR-
|
798
|
+
92, pages 318–329, Copenhagen, Denmark.
|
799
|
+
A. Echihabi, U. Hermjakob, E. Hovy, D. Marcu, E. Melz,
|
800
|
+
and D. Ravichandran. 2003. Multiple-Engine Question
|
801
|
+
Answering in TextMap. In Proceedings of TREC 2003,
|
802
|
+
pages 772–781, Gaithersburg, Maryland.
|
803
|
+
C. Fellbaum. 1998. WordNet: An Electronic Lexical
|
804
|
+
Database. MIT Press, Cambridge, Massachusetts.
|
805
|
+
M.A. Greenwood. 2004. AnswerFinder: Question Answer-
|
806
|
+
ing from your Desktop. In Proceedings of the Seventh
|
807
|
+
Annual Colloquium for the UK Special Interest Group
|
808
|
+
for Computational Linguistics (CLUK ’04), University of
|
809
|
+
Birmingham, UK.
|
810
|
+
S. Harabagiu, D. Moldovan, C. Clark, M. Bowden,
|
811
|
+
J. Williams, and J. Bensley. 2003. Answer Mining by
|
812
|
+
Combining Extraction Techniques with Abductive Rea-
|
813
|
+
soning. In Proceedings of TREC 2003, pages 375–382,
|
814
|
+
Gaithersburg, Maryland.
|
815
|
+
U. Hermjakob. 2001. Parsing and Question Classification for
|
816
|
+
Question Answering. In Proceedings of the ACL Work-
|
817
|
+
shop on Open-Domain Question Answering, Toulouse,
|
818
|
+
France.
|
819
|
+
A. Ittycheriah, M. Franz, W-J. Zhu, and A. Ratnaparkhi.
|
820
|
+
2001. Question Answering Using Maximum Entropy
|
821
|
+
Components. In Proceedings of NAACL 2001, Pittsburgh,
|
822
|
+
Pennsylvania.
|
823
|
+
G. Karypis, E.-H. Han, and V. Kumar. 1999. Chameleon: A
|
824
|
+
Hierarchical Clustering Algorithm using Dynamic Model-
|
825
|
+
ing. IEEE Computer: Special Issue on Data Analysis and
|
826
|
+
Mining, 32(8):68–75.
|
827
|
+
V. Krishnan, S. Das, and S. Chakrabarti. 2005. Enhanced
|
828
|
+
Answer Type Inference from Questions using Sequential
|
829
|
+
Models. In Proceedings of HLT/EMNLP 2005, pages
|
830
|
+
315–322, Vancouver, British Columbia, Canada.
|
831
|
+
X. Li and D. Roth. 2002. Learning Question Classifiers.
|
832
|
+
In Proceedings of COLING 2002, pages 556–562, Taipei,
|
833
|
+
Taiwan.
|
834
|
+
M. Light, G. Mann, E. Riloff, and E. Breck. 2001. Analyses
|
835
|
+
for Elucidating Current Question Answering Technology.
|
836
|
+
Natural Language Engineering, 7(4):325–342.
|
837
|
+
D. Lin and P. Pantel. 2001. Discovery of Inference Rules
|
838
|
+
for Question Answering. Natural Language Engineering,
|
839
|
+
7(4):343–360.
|
840
|
+
D. Lin. 1998. Automatic Retrieval and Clustering of Similar
|
841
|
+
Words. In Proceedings of COLING-ACL 1998, Montreal,
|
842
|
+
Québec, Canada.
|
843
|
+
D. Lin. 2001. Language and Text Analysis Tools. In Pro-
|
844
|
+
ceedings of HLT 2001, pages 222–227, San Diego, Cali-
|
845
|
+
fornia.
|
846
|
+
D. Maynard, V. Tablan, H. Cunningham, C. Ursu, H. Sag-
|
847
|
+
gion, K. Bontcheva, and Y. Wilks. 2002. Architectural
|
848
|
+
Elements of Language Engineering Robustness. Natural
|
849
|
+
Language Engineering, 8(2/3):257–274.
|
850
|
+
D. Mollá and B. Hutchinson. 2003. Intrinsic versus Extrinsic
|
851
|
+
Evaluations of Parsing Systems. In Proceedings of EACL
|
852
|
+
Workshop on Evaluation Initiatives in Natural Language
|
853
|
+
Processing, pages 43–50, Budapest, Hungary.
|
854
|
+
P. Pantel and D. Lin. 2002. Document Clustering with Com-
|
855
|
+
mittees. In Proceedings of SIGIR 2002, pages 199–206,
|
856
|
+
Tampere, Finland.
|
857
|
+
F. Pereira, N. Tishby, and L. Lee. 1993. Distributional Clus-
|
858
|
+
tering of English Words. In Proceedings of ACL 1993,
|
859
|
+
pages 183–190.
|
860
|
+
D. Radev, W. Fan, H. Qi, H. Wu, and A. Grewal. 2002. Prob-
|
861
|
+
abilistic Question Answering on the Web. In Proceedings
|
862
|
+
of the Eleventh International World Wide Web Conference.
|
863
|
+
E.M. Voorhees. 2003. Overview of the TREC 2003 Ques-
|
864
|
+
tion Answering Track. In Proceedings of TREC 2003,
|
865
|
+
Gaithersburg, Maryland.
|
866
|
+
400
|
867
|
+
|