biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,867 @@
1
+ A Probabilistic Answer Type Model
2
+ Christopher Pinchak
3
+ Department of Computing Science
4
+ University of Alberta
5
+ Edmonton, Alberta, Canada
6
+ pinchak@cs.ualberta.ca
7
+ Dekang Lin
8
+ Google, Inc.
9
+ 1600 Amphitheatre Parkway
10
+ Mountain View, CA
11
+ lindek@google.com
12
+ Abstract
13
+ All questions are implicitly associated
14
+ with an expected answer type. Unlike
15
+ previous approaches that require a prede-
16
+ fined set of question types, we present
17
+ a method for dynamically constructing
18
+ a probability-based answer type model
19
+ for each different question. Our model
20
+ evaluates the appropriateness of a poten-
21
+ tial answer by the probability that it fits
22
+ into the question contexts. Evaluation
23
+ is performed against manual and semi-
24
+ automatic methods using a fixed set of an-
25
+ swer labels. Results show our approach to
26
+ be superior for those questions classified
27
+ as having a miscellaneous answer type.
28
+ 1 Introduction
29
+ Given a question, people are usually able to form
30
+ an expectation about the type of the answer, even
31
+ if they do not know the actual answer. An accu-
32
+ rate expectation of the answer type makes it much
33
+ easier to select the answer from a sentence that
34
+ contains the query words. Consider the question
35
+ “What is the capital of Norway?” We would ex-
36
+ pect the answer to be a city and could filter out
37
+ most of the words in the following sentence:
38
+ The landed aristocracy was virtually crushed
39
+ by Hakon V, who reigned from 1299 to 1319,
40
+ and Oslo became the capital of Norway, re-
41
+ placing Bergen as the principal city of the
42
+ kingdom.
43
+ The goal of answer typing is to determine
44
+ whether a word’s semantic type is appropriate as
45
+ an answer for a question. Many previous ap-
46
+ proaches to answer typing, e.g., (Ittycheriah et al.,
47
+ 2001; Li and Roth, 2002; Krishnan et al., 2005),
48
+ employ a predefined set of answer types and use
49
+ supervised learning or manually constructed rules
50
+ to classify a question according to expected an-
51
+ swer type. A disadvantage of this approach is that
52
+ there will always be questions whose answers do
53
+ not belong to any of the predefined types.
54
+ Consider the question: “What are tourist attrac-
55
+ tions in Reims?” The answer may be many things:
56
+ a church, a historic residence, a park, a famous
57
+ intersection, a statue, etc. A common method to
58
+ deal with this problem is to define a catch-all class.
59
+ This class, however, tends not to be as effective as
60
+ other answer types.
61
+ Another disadvantage of predefined answer
62
+ types is with regard to granularity. If the types
63
+ are too specific, they are more difficult to tag. If
64
+ they are too general, too many candidates may be
65
+ identified as having the appropriate type.
66
+ In contrast to previous approaches that use a su-
67
+ pervised classifier to categorize questions into a
68
+ predefined set of types, we propose an unsuper-
69
+ vised method to dynamically construct a proba-
70
+ bilistic answer type model for each question. Such
71
+ a model can be used to evaluate whether or not
72
+ a word fits into the question context. For exam-
73
+ ple, given the question “What are tourist attrac-
74
+ tions in Reims?”, we would expect the appropriate
75
+ answers to fit into the context “X is a tourist attrac-
76
+ tion.” From a corpus, we can find the words that
77
+ appeared in this context, such as:
78
+ A-Ama Temple, Aborigine, addition, Anak
79
+ Krakatau, archipelago, area, baseball,
80
+ Bletchley Park, brewery, cabaret, Cairo,
81
+ Cape Town, capital, center, ...
82
+ Using the frequency counts of these words in
83
+ the context, we construct a probabilistic model
84
+ to compute P(in(w, Γ)|w), the probability for a
85
+ word w to occur in a set of contexts Γ, given an
86
+ occurrence of w. The parameters in this model are
87
+ obtained from a large, automatically parsed, un-
88
+ labeled corpus. By asking whether a word would
89
+ occur in a particular context extracted from a ques-
90
+ 393
91
+ tion, we avoid explicitly specifying a list of pos-
92
+ sible answer types. This has the added benefit
93
+ of being easily adapted to different domains and
94
+ corpora in which a list of explicit possible answer
95
+ types may be difficult to enumerate and/or identify
96
+ within the text.
97
+ The remainder of this paper is organized as fol-
98
+ lows. Section 2 discusses the work related to an-
99
+ swer typing. Section 3 discusses some of the key
100
+ concepts employed by our probabilistic model, in-
101
+ cluding word clusters and the contexts of a ques-
102
+ tion and a word. Section 4 presents our probabilis-
103
+ tic model for answer typing. Section 5 compares
104
+ the performance of our model with that of an or-
105
+ acle and a semi-automatic system performing the
106
+ same task. Finally, the concluding remarks are
107
+ made in Section 6.
108
+ 2 Related Work
109
+ Light et al. (2001) performed an analysis of the
110
+ effect of multiple answer type occurrences in a
111
+ sentence. When multiple words of the same type
112
+ appear in a sentence, answer typing with fixed
113
+ types must assign each the same score. Light et
114
+ al. found that even with perfect answer sentence
115
+ identification, question typing, and semantic tag-
116
+ ging, a system could only achieve 59% accuracy
117
+ over the TREC-9 questions when using their set of
118
+ 24 non-overlapping answer types. By computing
119
+ the probability of an answer candidate occurring
120
+ in the question contexts directly, we avoid having
121
+ multiple candidates with the same level of appro-
122
+ priateness as answers.
123
+ There have been a variety of approaches to de-
124
+ termine the answer types, which are also known
125
+ as Qtargets (Echihabi et al., 2003). Most previous
126
+ approaches classify the answer type of a question
127
+ as one of a set of predefined types.
128
+ Many systems construct the classification rules
129
+ manually (Cui et al., 2004; Greenwood, 2004;
130
+ Hermjakob, 2001). The rules are usually triggered
131
+ by the presence of certain words in the question.
132
+ For example, if a question contains “author” then
133
+ the expected answer type is Person.
134
+ The number of answer types as well as the num-
135
+ ber of rules can vary a great deal. For example,
136
+ (Hermjakob, 2001) used 276 rules for 122 answer
137
+ types. Greenwood (2004), on the other hand, used
138
+ 46 answer types with unspecified number of rules.
139
+ The classification rules can also be acquired
140
+ with supervised learning. Ittycheriah, et al. (2001)
141
+ describe a maximum entropy based question clas-
142
+ sification scheme to classify each question as hav-
143
+ ing one of the MUC answer types. In a similar ex-
144
+ periment, Li & Roth (2002) train a question clas-
145
+ sifier based on a modified version of SNoW using
146
+ a richer set of answer types than Ittycheriah et al.
147
+ The LCC system (Harabagiu et al., 2003) com-
148
+ bines fixed types with a novel loop-back strategy.
149
+ In the event that a question cannot be classified as
150
+ one of the fixed entity types or semantic concepts
151
+ derived from WordNet (Fellbaum, 1998), the an-
152
+ swer type model backs off to a logic prover that
153
+ uses axioms derived form WordNet, along with
154
+ logic rules, to justify phrases as answers. Thus, the
155
+ LCC system is able to avoid the use of a miscel-
156
+ laneous type that often exhibits poor performance.
157
+ However, the logic prover must have sufficient ev-
158
+ idence to link the question to the answer, and gen-
159
+ eral knowledge must be encoded as axioms into
160
+ the system. In contrast, our answer type model
161
+ derives all of its information automatically from
162
+ unannotated text.
163
+ Answer types are often used as filters. It was
164
+ noted in (Radev et al., 2002) that a wrong guess
165
+ about the answer type reduces the chance for the
166
+ system to answer the question correctly by as
167
+ much as 17 times. The approach presented here
168
+ is less brittle. Even if the correct candidate does
169
+ not have the highest likelihood according to the
170
+ model, it may still be selected when the answer
171
+ extraction module takes into account other factors
172
+ such as the proximity to the matched keywords.
173
+ Furthermore, a probabilistic model makes it eas-
174
+ ier to integrate the answer type scores with scores
175
+ computed by other components in a question an-
176
+ swering system in a principled fashion.
177
+ 3 Resources
178
+ Before introducing our model, we first describe
179
+ the resources used in the model.
180
+ 3.1 Word Clusters
181
+ Natural language data is extremely sparse. Word
182
+ clusters are a way of coping with data sparseness
183
+ by abstracting a given word to a class of related
184
+ words. Clusters, as used by our probabilistic an-
185
+ swer typing system, play a role similar to that of
186
+ named entity types. Many methods exist for clus-
187
+ tering, e.g., (Brown et al., 1990; Cutting et al.,
188
+ 1992; Pereira et al., 1993; Karypis et al., 1999).
189
+ We used the Clustering By Committee (CBC)
190
+ 394
191
+ Table 1: Words and their clusters
192
+ Word Clusters
193
+ suite software, network, wireless, ...
194
+ rooms, bathrooms, restrooms, ...
195
+ meeting room, conference room, ...
196
+ ghost rabbit, squirrel, duck, elephant, frog, ...
197
+ goblins, ghosts, vampires, ghouls, ...
198
+ punk, reggae, folk, pop, hip-pop, ...
199
+ huge, larger, vast, significant, ...
200
+ coming-of-age, true-life, ...
201
+ clouds, cloud, fog, haze, mist, ...
202
+ algorithm (Pantel and Lin, 2002) on a 10 GB En-
203
+ glish text corpus to obtain 3607 clusters. The fol-
204
+ lowing is an example cluster generated by CBC:
205
+ tension, anger, anxiety, tensions, frustration,
206
+ resentment, uncertainty, confusion, conflict,
207
+ discontent, insecurity, controversy, unease,
208
+ bitterness, dispute, disagreement, nervous-
209
+ ness, sadness, despair, animosity, hostility,
210
+ outrage, discord, pessimism, anguish, ...
211
+ In the clustering generated by CBC, a word may
212
+ belong to multiple clusters. The clusters to which
213
+ a word belongs often represent the senses of the
214
+ word. Table 1 shows two example words and their
215
+ clusters.
216
+ 3.2 Contexts
217
+ The context in which a word appears often im-
218
+ poses constraints on the semantic type of the word.
219
+ This basic idea has been exploited by many pro-
220
+ posals for distributional similarity and clustering,
221
+ e.g., (Church and Hanks, 1989; Lin, 1998; Pereira
222
+ et al., 1993).
223
+ Similar to Lin and Pantel (2001), we define
224
+ the contexts of a word to be the undirected paths
225
+ in dependency trees involving that word at either
226
+ the beginning or the end. The following diagram
227
+ shows an example dependency tree:
228
+ Which city hosted the 1988 Winter Olympics?
229
+ det subj
230
+ obj
231
+ NN
232
+ NN
233
+ det
234
+ The links in the tree represent dependency rela-
235
+ tionships. The direction of a link is from the head
236
+ to the modifier in the relationship. Labels associ-
237
+ ated with the links represent types of relations.
238
+ In a context, the word itself is replaced with a
239
+ variable X. We say a word is the filler of a context
240
+ if it replaces X. For example, the contexts for the
241
+ word “Olympics” in the above sentence include
242
+ the following paths:
243
+ Context of “Olympics” Explanation
244
+ X Winter
245
+ NN
246
+ Winter X
247
+ X 1988
248
+ NN
249
+ 1988 X
250
+ X host
251
+ obj
252
+ host X
253
+ X host
254
+ obj
255
+ city
256
+ subj
257
+ city hosted X
258
+ In these paths, words are reduced to their root
259
+ forms and proper names are reduced to their entity
260
+ tags (we used MUC7 named entity tags).
261
+ Paths allow us to balance the specificity of con-
262
+ texts and the sparseness of data. Longer paths typ-
263
+ ically impose stricter constraints on the slot fillers.
264
+ However, they tend to have fewer occurrences,
265
+ making them more prone to errors arising from
266
+ data sparseness. We have restricted the path length
267
+ to two (involving at most three words) and require
268
+ the two ends of the path to be nouns.
269
+ We parsed the AQUAINT corpus (3GB) with
270
+ Minipar (Lin, 2001) and collected the frequency
271
+ counts of words appearing in various contexts.
272
+ Parsing and database construction is performed
273
+ off-line as the database is identical for all ques-
274
+ tions. We extracted 527,768 contexts that ap-
275
+ peared at least 25 times in the corpus. An example
276
+ context and its fillers are shown in Figure 1.
277
+ X host Olympics
278
+ subj obj
279
+ Africa 2 grant 1 readiness 2
280
+ AP 1 he 2 Rio de Janeiro 1
281
+ Argentina 1 homeland 3 Rome 1
282
+ Athens 16 IOC 1 Salt Lake City 2
283
+ Atlanta 3 Iran 2 school 1
284
+ Bangkok 1 Jakarta 1 S. Africa 1
285
+ . .. . .. . . .
286
+ decades 1 president 2 Zakopane 4
287
+ facility 1 Pusan 1
288
+ government 1 race 1
289
+ Figure 1: An example context and its fillers
290
+ 3.2.1 Question Contexts
291
+ To build a probabilistic model for answer typ-
292
+ ing, we extract a set of contexts, called question
293
+ contexts, from a question. An answer is expected
294
+ to be a plausible filler of the question contexts.
295
+ Question contexts are extracted from a question
296
+ with two rules. First, if the wh-word in a ques-
297
+ tion has a trace in the parse tree, the question con-
298
+ texts are the contexts of the trace. For example, the
299
+ 395
300
+ question “What do most tourists visit in Reims?”
301
+ is parsed as:
302
+ Whati
303
+ do most tourists visit ei
304
+ in Reims?
305
+ det
306
+ i
307
+ subj
308
+ det
309
+ obj
310
+ in
311
+ The symbol ei is the trace of whati. Minipar
312
+ generates the trace to indicate that the word what
313
+ is the object of visit in the deep structure of the
314
+ sentence. The following question contexts are ex-
315
+ tracted from the above question:
316
+ Context Explanation
317
+ X visit tourist
318
+ obj subj
319
+ tourist visits X
320
+ X visit Reims
321
+ obj in
322
+ visit X in Reims
323
+ The second rule deals with situations where
324
+ the wh-word is a determiner, as in the question
325
+ “Which city hosted the 1988 Winter Olympics?”
326
+ (the parse tree for which is shown in section 3.2).
327
+ In such cases, the question contexts consist of a
328
+ single context involving the noun that is modified
329
+ by the determiner. The context for the above sen-
330
+ tence is X city
331
+ subj
332
+ , corresponding to the sentence
333
+ “X is a city.” This context is used because the
334
+ question explicitly states that the desired answer is
335
+ a city. The context overrides the other contexts be-
336
+ cause the question explicitly states the desired an-
337
+ swer type. Experimental results have shown that
338
+ using this context in conjunction with other con-
339
+ texts extracted from the question produces lower
340
+ performance than using this context alone.
341
+ In the event that a context extracted from a ques-
342
+ tion is not found in the database, we shorten the
343
+ context in one of two ways. We start by replac-
344
+ ing the word at the end of the path with a wildcard
345
+ that matches any word. If this fails to yield en-
346
+ tries in the context database, we shorten the con-
347
+ text to length one and replace the end word with
348
+ automatically determined similar words instead of
349
+ a wildcard.
350
+ 3.2.2 Candidate Contexts
351
+ Candidate contexts are very similar in form to
352
+ question contexts, save for one important differ-
353
+ ence. Candidate contexts are extracted from the
354
+ parse trees of the answer candidates rather than the
355
+ question. In natural language, some words may
356
+ be polysemous. For example, Washington may re-
357
+ fer to a person, a city, or a state. The occurrences
358
+ of Washington in “Washington’s descendants” and
359
+ “suburban Washington” should not be given the
360
+ same score when the question is seeking a loca-
361
+ tion. Given that the sense of a word is largely de-
362
+ termined by its local context (Choueka and Lusig-
363
+ nan, 1985), candidate contexts allow the model to
364
+ take into account the candidate answers’ senses
365
+ implicitly.
366
+ 4 Probabilistic Model
367
+ The goal of an answer typing model is to evalu-
368
+ ate the appropriateness of a candidate word as an
369
+ answer to the question. If we assume that a set
370
+ of answer candidates is provided to our model by
371
+ some means (e.g., words comprising documents
372
+ extracted by an information retrieval engine), we
373
+ wish to compute the value P(in(w, ΓQ)|w). That
374
+ is, the appropriateness of a candidate answer w is
375
+ proportional to the probability that it will occur in
376
+ the question contexts ΓQ extracted from the ques-
377
+ tion.
378
+ To mitigate data sparseness, we can introduce
379
+ a hidden variable C that represents the clusters to
380
+ which the candidate answer may belong. As a can-
381
+ didate may belong to multiple clusters, we obtain:
382
+ P(in(w, ΓQ)|w) =
383
+ X
384
+ C
385
+ P(in(w, ΓQ), C|w) (1)
386
+ =
387
+ X
388
+ C
389
+ P(C|w)P(in(w, ΓQ)|C, w) (2)
390
+ Given that a word appears, we assume that it has
391
+ the same probability to appear in a context as all
392
+ other words in the same cluster. Therefore:
393
+ P(in(w, ΓQ)|C, w) ≈ P(in(C, ΓQ)|C) (3)
394
+ We can now rewrite the equation in (2) as:
395
+ P(in(w, ΓQ)|w) ≈
396
+ X
397
+ C
398
+ P(C|w)P(in(C, ΓQ)|C) (4)
399
+ This equation splits our model into two parts:
400
+ one models which clusters a word belongs to and
401
+ the other models how appropriate a cluster is to
402
+ the question contexts. When ΓQ consists of multi-
403
+ ple contexts, we make the naïve Bayes assumption
404
+ that each individual context γQ ∈ ΓQ is indepen-
405
+ dent of all other contexts given the cluster C.
406
+ P(in(w, ΓQ)|w) ≈
407
+ X
408
+ C
409
+ P(C|w)
410
+ Y
411
+ γQ∈ΓQ
412
+ P(in(C, γQ)|C) (5)
413
+ Equation (5) needs the parameters P(C|w) and
414
+ P(in(C, γQ)|C), neither of which are directly
415
+ available from the context-filler database. We will
416
+ discuss the estimation of these parameters in Sec-
417
+ tion 4.2.
418
+ 396
419
+ 4.1 Using Candidate Contexts
420
+ The previous model assigns the same likelihood to
421
+ every instance of a given word. As we noted in
422
+ section 3.2.2, a word may be polysemous. To take
423
+ into account a word’s context, we can instead com-
424
+ pute P(in(w, ΓQ)|w, in(w, Γw)), where Γw is the
425
+ set of contexts for the candidate word w in a re-
426
+ trieved passage.
427
+ By introducing word clusters as intermediate
428
+ variables as before and making a similar assump-
429
+ tion as in equation (3), we obtain:
430
+ P(in(w, ΓQ)|w, in(w, Γw))
431
+ =
432
+ X
433
+ C
434
+ P(in(w, ΓQ), C|w, in(w, Γw)) (6)
435
+
436
+ X
437
+ C
438
+ P(C|w, in(w, Γw))P(in(C, ΓQ)|C) (7)
439
+ Like equation (4), equation (7) partitions the
440
+ model into two parts. Unlike P(C|w) in equation
441
+ (4), the probability of the cluster is now based on
442
+ the particular occurrence of the word in the candi-
443
+ date contexts. It can be estimated by:
444
+ P(C|w, in(w, Γw))
445
+ =
446
+ P(in(w, Γw)|w, C)P(w, C)
447
+ P(in(w, Γw)|w)P(w)
448
+ (8)
449
+
450
+ Y
451
+ γw∈Γw
452
+ P(in(w, γw)|w, C)
453
+ Y
454
+ γw∈Γw
455
+ P(in(w, γw)|w)
456
+ × P(C|w) (9)
457
+ =
458
+ Y
459
+ γw∈Γw
460
+
461
+ P(C|w, in(w, γw))
462
+ P(C|w)
463
+ «
464
+ × P(C|w) (10)
465
+ 4.2 Estimating Parameters
466
+ Our probabilistic model requires the parameters
467
+ P(C|w), P(C|w, in(w, γ)), and P(in(C, γ)|C),
468
+ where w is a word, C is a cluster that w belongs to,
469
+ and γ is a question or candidate context. This sec-
470
+ tion explains how these parameters are estimated
471
+ without using labeled data.
472
+ The context-filler database described in Sec-
473
+ tion 3.2 provides the joint and marginal fre-
474
+ quency counts of contexts and words (|in(γ, w)|,
475
+ |in(∗, γ)| and |in(w, ∗)|). These counts al-
476
+ low us to compute the probabilities P(in(w, γ)),
477
+ P(in(w, ∗)), and P(in(∗, γ)). We can also com-
478
+ pute P(in(w, γ)|w), which is smoothed with add-
479
+ one smoothing (see equation (11) in Figure 2).
480
+ The estimation of P(C|w) presents a challenge.
481
+ We have no corpus from which we can directly
482
+ measure P(C|w) because word instances are not
483
+ labeled with their clusters.
484
+ P(in(w, γ)|w) =
485
+ |in(w, γ)| + P(in(∗, γ))
486
+ |in(w, ∗)| + 1
487
+ (11)
488
+ Pu(C|w) =
489
+ (
490
+ 1
491
+ |{C |w∈C }|
492
+ if w ∈ C,
493
+ 0 otherwise
494
+ (12)
495
+ P(C|w) =
496
+ X
497
+ w ∈S(w)
498
+ sim(w, w ) × Pu(C|w )
499
+ X
500
+ {C |w∈C },
501
+ w ∈S(w)
502
+ sim(w, w ) × Pu(C |w )
503
+ (13)
504
+ P(in(C, γ)|C) =
505
+ X
506
+ w ∈C
507
+ P(C|w ) × |in(w , γ)| + P(in(∗, γ))
508
+ X
509
+ w ∈C
510
+ P(C|w ) × |in(w , ∗)| + 1
511
+ (14)
512
+ Figure 2: Probability estimation
513
+ We use the average weighted “guesses” of the
514
+ top similar words of w to compute P(C|w) (see
515
+ equation 13). The intuition is that if w and w
516
+ are similar words, P(C|w ) and P(C|w) tend
517
+ to have similar values. Since we do not know
518
+ P(C|w ) either, we substitute it with uniform dis-
519
+ tribution Pu(C|w ) as in equation (12) of Fig-
520
+ ure 2. Although Pu(C|w ) is a very crude guess,
521
+ the weighted average of a set of such guesses can
522
+ often be quite accurate.
523
+ The similarities between words are obtained as
524
+ a byproduct of the CBC algorithm. For each word,
525
+ we use S(w) to denote the top-n most similar
526
+ words (n=50 in our experiments) and sim(w, w )
527
+ to denote the similarity between words w and w .
528
+ The following is a sample similar word list for the
529
+ word suit:
530
+ S(suit) = {lawsuit 0.49, suits 0.47, com-
531
+ plaint 0.29, lawsuits 0.27, jacket 0.25, coun-
532
+ tersuit 0.24, counterclaim 0.24, pants 0.24,
533
+ trousers 0.22, shirt 0.21, slacks 0.21, case
534
+ 0.21, pantsuit 0.21, shirts 0.20, sweater 0.20,
535
+ coat 0.20, ...}
536
+ The estimation for P(C|w, in(w, γw)) is sim-
537
+ ilar to that of P(C|w) except that instead of all
538
+ w ∈ S(w), we instead use {w |w ∈ S(w) ∧
539
+ in(w , γw)}. By only looking at a particular con-
540
+ text γw, we may obtain a different distribution over
541
+ C than P(C|w) specifies. In the event that the data
542
+ are too sparse to estimate P(C|w, in(w, γw)), we
543
+ fall back to using P(C|w).
544
+ P(in(C, γ)|C) is computed in (14) by assum-
545
+ ing each instance of w contains a fractional in-
546
+ stance of C and the fractional count is P(C|w).
547
+ Again, add-one smoothing is used.
548
+ 397
549
+ System Median % Top 1% Top 5% Top 10% Top 50%
550
+ Oracle 0.7% 89 (57%) 123 (79%) 131 (85%) 154 (99%)
551
+ Frequency 7.7% 31 (20%) 67 (44%) 86 (56%) 112 (73%)
552
+ Our model 1.2% 71 (46%) 106 (69%) 119 (77%) 146 (95%)
553
+ no cand. contexts 2.2% 58 (38%) 102 (66%) 113 (73%) 145 (94%)
554
+ ANNIE 4.0% 54 (35%) 79 (51%) 93 (60%) 123 (80%)
555
+ Table 2: Summary of Results
556
+ 5 Experimental Setup & Results
557
+ We evaluate our answer typing system by using
558
+ it to filter the contents of documents retrieved by
559
+ the information retrieval portion of a question an-
560
+ swering system. Each answer candidate in the set
561
+ of documents is scored by the answer typing sys-
562
+ tem and the list is sorted in descending order of
563
+ score. We treat the system as a filter and observe
564
+ the proportion of candidates that must be accepted
565
+ by the filter so that at least one correct answer is
566
+ accepted. A model that allows a low percentage
567
+ of candidates to pass while still allowing at least
568
+ one correct answer through is favorable to a model
569
+ in which a high number of candidates must pass.
570
+ This represents an intrinsic rather than extrinsic
571
+ evaluation (Mollá and Hutchinson, 2003) that we
572
+ believe illustrates the usefulness of our model.
573
+ The evaluation data consist of 154 questions
574
+ from the TREC-2003 QA Track (Voorhees, 2003)
575
+ satisfying the following criteria, along with the top
576
+ 10 documents returned for each question as iden-
577
+ tified by NIST using the PRISE1 search engine.
578
+ • the question begins with What, Which, or
579
+ Who. We restricted the evaluation to such ques-
580
+ tions because our system is designed to deal
581
+ with questions whose answer types are often
582
+ semantically open-ended noun phrases.
583
+ • There exists an entry for the question in the an-
584
+ swer patterns provided by Ken Litkowski2.
585
+ • One of the top-10 documents returned by
586
+ PRISE contains a correct answer.
587
+ We compare the performance of our prob-
588
+ abilistic model with that of two other sys-
589
+ tems. Both comparison systems make use of a
590
+ small, predefined set of manually-assigned MUC-
591
+ 7 named-entity types (location, person, organiza-
592
+ tion, cardinal, percent, date, time, duration, mea-
593
+ sure, money) augmented with thing-name (proper
594
+ 1
595
+ www.itl.nist.gov/iad/894.02/works/papers/zp2/zp2.html
596
+ 2
597
+ trec.nist.gov/data/qa/2003_qadata/03QA.tasks/t12.pats.txt
598
+ names of inanimate objects) and miscellaneous
599
+ (a catch-all answer type of all other candidates).
600
+ Some examples of thing-name are Guinness Book
601
+ of World Records, Thriller, Mars Pathfinder, and
602
+ Grey Cup. Examples of miscellaneous answers are
603
+ copper, oil, red, and iris.
604
+ The differences in the comparison systems is
605
+ with respect to how entity types are assigned to the
606
+ words in the candidate documents. We make use
607
+ of the ANNIE (Maynard et al., 2002) named entity
608
+ recognition system, along with a manual assigned
609
+ “oracle” strategy, to assign types to candidate an-
610
+ swers. In each case, the score for a candidate is
611
+ either 1 if it is tagged as the same type as the ques-
612
+ tion or 0 otherwise. With this scoring scheme pro-
613
+ ducing a sorted list we can compute the probability
614
+ of the first correct answer appearing at rank R = k
615
+ as follows:
616
+ P(R = k) =
617
+ k−2Y
618
+ i=0
619
+
620
+ t − c − i
621
+ t − i
622
+ «
623
+ c
624
+ t − k + 1
625
+ (15)
626
+ where t is the number of unique candidate answers
627
+ that are of the appropriate type and c is the number
628
+ of unique candidate answers that are correct.
629
+ Using the probabilities in equation (15), we
630
+ compute the expected rank, E(R), of the first cor-
631
+ rect answer of a given question in the system as:
632
+ E(R) =
633
+ t−c+1X
634
+ k=1
635
+ kP(R = k) (16)
636
+ Answer candidates are the set of ANNIE-
637
+ identified tokens with stop words and punctuation
638
+ removed. This yields between 900 and 8000 can-
639
+ didates for each question, depending on the top 10
640
+ documents returned by PRISE. The oracle system
641
+ represents an upper bound on using the predefined
642
+ set of answer types. The ANNIE system repre-
643
+ sents a more realistic expectation of performance.
644
+ The median percentage of candidates that are
645
+ accepted by a filter over the questions of our eval-
646
+ uation data provides one measure of performance
647
+ and is preferred to the average because of the ef-
648
+ fect of large values on the average. In QA, a sys-
649
+ tem accepting 60% of the candidates is not signif-
650
+ icantly better or worse than one accepting 100%,
651
+ 398
652
+ System Measure
653
+ Question Type
654
+ All Location Person Organization Thing-Name Misc Other
655
+ (154) (57) (17) (19) (17) (37) (7)
656
+ Our model
657
+ Median 1.2% 0.8% 2.0% 1.3% 3.7% 3.5% 12.2%
658
+ Top 1% 71 34 6 9 7 13 2
659
+ Top 5% 106 53 11 11 10 19 2
660
+ Top 10% 119 55 12 17 10 22 3
661
+ Top 50% 146 56 16 18 17 34 5
662
+ Oracle
663
+ Median 0.7% 0.4% 1.0% 0.3% 0.4% 16.0% 0.3%
664
+ Top 1% 89 44 8 16 14 1 6
665
+ Top 5% 123 57 17 19 17 6 7
666
+ Top 10% 131 57 17 19 17 14 7
667
+ Top 50% 154 57 17 19 17 37 7
668
+ ANNIE
669
+ Median 4.0% 0.6% 1.4% 6.1% 100% 16.7% 50.0%
670
+ Top 1% 54 39 5 7 0 0 3
671
+ Top 5% 79 53 12 9 0 2 3
672
+ Top 10% 93 54 13 11 0 12 3
673
+ Top 50% 123 56 16 15 5 28 3
674
+ Table 3: Detailed breakdown of performance
675
+ but the effect on average is quite high. Another
676
+ measure is to observe the number of questions
677
+ with at least one correct answer in the top N% for
678
+ various values of N. By examining the number of
679
+ correct answers found in the top N% we can better
680
+ understand what an effective cutoff would be.
681
+ The overall results of our comparison can be
682
+ found in Table 2. We have added the results of
683
+ a system that scores candidates based on their fre-
684
+ quency within the document as a comparison with
685
+ a simple, yet effective, strategy. The second col-
686
+ umn is the median percentage of where the highest
687
+ scored correct answer appears in the sorted candi-
688
+ date list. Low percentage values mean the answer
689
+ is usually found high in the sorted list. The re-
690
+ maining columns list the number of questions that
691
+ have a correct answer somewhere in the top N%
692
+ of their sorted lists. This is meant to show the ef-
693
+ fects of imposing a strict cutoff prior to running
694
+ the answer type model.
695
+ The oracle system performs best, as it bene-
696
+ fits from both manual question classification and
697
+ manual entity tagging. If entity assignment is
698
+ performed by an automatic system (as it is for
699
+ ANNIE), the performance drops noticeably. Our
700
+ probabilistic model performs better than ANNIE
701
+ and achieves approximately 2/3 of the perfor-
702
+ mance of the oracle system. Table 2 also shows
703
+ that the use of candidate contexts increases the
704
+ performance of our answer type model.
705
+ Table 3 shows the performance of the oracle
706
+ system, our model, and the ANNIE system broken
707
+ down by manually-assigned answer types. Due
708
+ to insufficient numbers of questions, the cardinal,
709
+ percent, time, duration, measure, and money types
710
+ are combined into an “Other” category. When
711
+ compared with the oracle system, our model per-
712
+ forms worse overall for questions of all types ex-
713
+ cept for those seeking miscellaneous answers. For
714
+ miscellaneous questions, the oracle identifies all
715
+ tokens that do not belong to one of the other
716
+ known categories as possible answers. For all
717
+ questions of non-miscellaneous type, only a small
718
+ subset of the candidates are marked appropriate.
719
+ In particular, our model performs worse than the
720
+ oracle for questions seeking persons and thing-
721
+ names. Person questions often seek rare person
722
+ names, which occur in few contexts and are diffi-
723
+ cult to reliably cluster. Thing-name questions are
724
+ easy for a human to identify but difficult for au-
725
+ tomatic systems to identify. Thing-names are a di-
726
+ verse category and are not strongly associated with
727
+ any identifying contexts.
728
+ Our model outperforms the ANNIE system in
729
+ general, and for questions seeking organizations,
730
+ thing-names, and miscellaneous targets in partic-
731
+ ular. ANNIE may have low coverage on organi-
732
+ zation names, resulting in reduced performance.
733
+ Like the oracle, ANNIE treats all candidates not
734
+ assigned one of the categories as appropriate for
735
+ miscellaneous questions. Because ANNIE cannot
736
+ identify thing-names, they are treated as miscella-
737
+ neous. ANNIE shows low performance on thing-
738
+ names because words incorrectly assigned types
739
+ are sorted to the bottom of the list for miscella-
740
+ neous and thing-name questions. If a correct an-
741
+ swer is incorrectly assigned a type it will be sorted
742
+ near the bottom, resulting in a poor score.
743
+ 399
744
+ 6 Conclusions
745
+ We have presented an unsupervised probabilistic
746
+ answer type model. Our model uses contexts de-
747
+ rived from the question and the candidate answer
748
+ to calculate the appropriateness of a candidate an-
749
+ swer. Statistics gathered from a large corpus of
750
+ text are used in the calculation, and the model is
751
+ constructed to exploit these statistics without be-
752
+ ing overly specific or overly general.
753
+ The method presented here avoids the use of an
754
+ explicit list of answer types. Explicit answer types
755
+ can exhibit poor performance, especially for those
756
+ questions not fitting one of the types. They must
757
+ also be redefined when either the domain or corpus
758
+ substantially changes. By avoiding their use, our
759
+ answer typing method may be easier to adapt to
760
+ different corpora and question answering domains
761
+ (such as bioinformatics).
762
+ In addition to operating as a stand-alone answer
763
+ typing component, our system can be combined
764
+ with other existing answer typing strategies, es-
765
+ pecially in situations in which a catch-all answer
766
+ type is used. Our experimental results show that
767
+ our probabilistic model outperforms the oracle and
768
+ a system using automatic named entity recognition
769
+ under such circumstances. The performance of
770
+ our model is better than that of the semi-automatic
771
+ system, which is a better indication of the expected
772
+ performance of a comparable real-world answer
773
+ typing system.
774
+ Acknowledgments
775
+ The authors would like to thank the anonymous re-
776
+ viewers for their helpful comments on improving
777
+ the paper. The first author is supported by the Nat-
778
+ ural Sciences and Engineering Research Council
779
+ of Canada, the Alberta Ingenuity Fund, and the Al-
780
+ berta Informatics Circle of Research Excellence.
781
+ References
782
+ P.F. Brown, V.J. Della Pietra, P.V. deSouza, J.C. Lai, and R.L.
783
+ Mercer. 1990. Class-based n-gram Models of Natural
784
+ Language. Computational Linguistics, 16(2):79–85.
785
+ Y. Choueka and S. Lusignan. 1985. Disambiguation by Short
786
+ Contexts. Computer and the Humanities, 19:147–157.
787
+ K. Church and P. Hanks. 1989. Word Association Norms,
788
+ Mutual Information, and Lexicography. In Proceedings
789
+ of ACL-89, pages 76–83, Vancouver, British Columbia,
790
+ Canada.
791
+ H. Cui, K. Li, R. Sun, T-S. Chua, and M-K. Kan. 2004. Na-
792
+ tional University of Singapore at the TREC-13 Question
793
+ Answering Main Task. In Notebook of TREC 2004, pages
794
+ 34–42, Gaithersburg, Maryland.
795
+ D.R. Cutting, D. Karger, J. Pedersen, and J.W. Tukey. 1992.
796
+ Scatter/Gather: A Cluster-based Approach to Browsing
797
+ Large Document Collections. In Proceedings of SIGIR-
798
+ 92, pages 318–329, Copenhagen, Denmark.
799
+ A. Echihabi, U. Hermjakob, E. Hovy, D. Marcu, E. Melz,
800
+ and D. Ravichandran. 2003. Multiple-Engine Question
801
+ Answering in TextMap. In Proceedings of TREC 2003,
802
+ pages 772–781, Gaithersburg, Maryland.
803
+ C. Fellbaum. 1998. WordNet: An Electronic Lexical
804
+ Database. MIT Press, Cambridge, Massachusetts.
805
+ M.A. Greenwood. 2004. AnswerFinder: Question Answer-
806
+ ing from your Desktop. In Proceedings of the Seventh
807
+ Annual Colloquium for the UK Special Interest Group
808
+ for Computational Linguistics (CLUK ’04), University of
809
+ Birmingham, UK.
810
+ S. Harabagiu, D. Moldovan, C. Clark, M. Bowden,
811
+ J. Williams, and J. Bensley. 2003. Answer Mining by
812
+ Combining Extraction Techniques with Abductive Rea-
813
+ soning. In Proceedings of TREC 2003, pages 375–382,
814
+ Gaithersburg, Maryland.
815
+ U. Hermjakob. 2001. Parsing and Question Classification for
816
+ Question Answering. In Proceedings of the ACL Work-
817
+ shop on Open-Domain Question Answering, Toulouse,
818
+ France.
819
+ A. Ittycheriah, M. Franz, W-J. Zhu, and A. Ratnaparkhi.
820
+ 2001. Question Answering Using Maximum Entropy
821
+ Components. In Proceedings of NAACL 2001, Pittsburgh,
822
+ Pennsylvania.
823
+ G. Karypis, E.-H. Han, and V. Kumar. 1999. Chameleon: A
824
+ Hierarchical Clustering Algorithm using Dynamic Model-
825
+ ing. IEEE Computer: Special Issue on Data Analysis and
826
+ Mining, 32(8):68–75.
827
+ V. Krishnan, S. Das, and S. Chakrabarti. 2005. Enhanced
828
+ Answer Type Inference from Questions using Sequential
829
+ Models. In Proceedings of HLT/EMNLP 2005, pages
830
+ 315–322, Vancouver, British Columbia, Canada.
831
+ X. Li and D. Roth. 2002. Learning Question Classifiers.
832
+ In Proceedings of COLING 2002, pages 556–562, Taipei,
833
+ Taiwan.
834
+ M. Light, G. Mann, E. Riloff, and E. Breck. 2001. Analyses
835
+ for Elucidating Current Question Answering Technology.
836
+ Natural Language Engineering, 7(4):325–342.
837
+ D. Lin and P. Pantel. 2001. Discovery of Inference Rules
838
+ for Question Answering. Natural Language Engineering,
839
+ 7(4):343–360.
840
+ D. Lin. 1998. Automatic Retrieval and Clustering of Similar
841
+ Words. In Proceedings of COLING-ACL 1998, Montreal,
842
+ Québec, Canada.
843
+ D. Lin. 2001. Language and Text Analysis Tools. In Pro-
844
+ ceedings of HLT 2001, pages 222–227, San Diego, Cali-
845
+ fornia.
846
+ D. Maynard, V. Tablan, H. Cunningham, C. Ursu, H. Sag-
847
+ gion, K. Bontcheva, and Y. Wilks. 2002. Architectural
848
+ Elements of Language Engineering Robustness. Natural
849
+ Language Engineering, 8(2/3):257–274.
850
+ D. Mollá and B. Hutchinson. 2003. Intrinsic versus Extrinsic
851
+ Evaluations of Parsing Systems. In Proceedings of EACL
852
+ Workshop on Evaluation Initiatives in Natural Language
853
+ Processing, pages 43–50, Budapest, Hungary.
854
+ P. Pantel and D. Lin. 2002. Document Clustering with Com-
855
+ mittees. In Proceedings of SIGIR 2002, pages 199–206,
856
+ Tampere, Finland.
857
+ F. Pereira, N. Tishby, and L. Lee. 1993. Distributional Clus-
858
+ tering of English Words. In Proceedings of ACL 1992,
859
+ pages 183–190.
860
+ D. Radev, W. Fan, H. Qi, H. Wu, and A. Grewal. 2002. Prob-
861
+ abilistic Question Answering on the Web. In Proceedings
862
+ of the Eleventh International World Wide Web Conference.
863
+ E.M. Voorhees. 2003. Overview of the TREC 2003 Ques-
864
+ tion Answering Track. In Proceedings of TREC 2003,
865
+ Gaithersburg, Maryland.
866
+ 400
867
+