biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,867 @@
1
+ A Probabilistic Answer Type Model
2
+ Christopher Pinchak
3
+ Department of Computing Science
4
+ University of Alberta
5
+ Edmonton, Alberta, Canada
6
+ pinchak@cs.ualberta.ca
7
+ Dekang Lin
8
+ Google, Inc.
9
+ 1600 Amphitheatre Parkway
10
+ Mountain View, CA
11
+ lindek@google.com
12
+ Abstract
13
+ All questions are implicitly associated
14
+ with an expected answer type. Unlike
15
+ previous approaches that require a prede-
16
+ fined set of question types, we present
17
+ a method for dynamically constructing
18
+ a probability-based answer type model
19
+ for each different question. Our model
20
+ evaluates the appropriateness of a poten-
21
+ tial answer by the probability that it fits
22
+ into the question contexts. Evaluation
23
+ is performed against manual and semi-
24
+ automatic methods using a fixed set of an-
25
+ swer labels. Results show our approach to
26
+ be superior for those questions classified
27
+ as having a miscellaneous answer type.
28
+ 1 Introduction
29
+ Given a question, people are usually able to form
30
+ an expectation about the type of the answer, even
31
+ if they do not know the actual answer. An accu-
32
+ rate expectation of the answer type makes it much
33
+ easier to select the answer from a sentence that
34
+ contains the query words. Consider the question
35
+ “What is the capital of Norway?” We would ex-
36
+ pect the answer to be a city and could filter out
37
+ most of the words in the following sentence:
38
+ The landed aristocracy was virtually crushed
39
+ by Hakon V, who reigned from 1299 to 1319,
40
+ and Oslo became the capital of Norway, re-
41
+ placing Bergen as the principal city of the
42
+ kingdom.
43
+ The goal of answer typing is to determine
44
+ whether a word’s semantic type is appropriate as
45
+ an answer for a question. Many previous ap-
46
+ proaches to answer typing, e.g., (Ittycheriah et al.,
47
+ 2001; Li and Roth, 2002; Krishnan et al., 2005),
48
+ employ a predefined set of answer types and use
49
+ supervised learning or manually constructed rules
50
+ to classify a question according to expected an-
51
+ swer type. A disadvantage of this approach is that
52
+ there will always be questions whose answers do
53
+ not belong to any of the predefined types.
54
+ Consider the question: “What are tourist attrac-
55
+ tions in Reims?” The answer may be many things:
56
+ a church, a historic residence, a park, a famous
57
+ intersection, a statue, etc. A common method to
58
+ deal with this problem is to define a catch-all class.
59
+ This class, however, tends not to be as effective as
60
+ other answer types.
61
+ Another disadvantage of predefined answer
62
+ types is with regard to granularity. If the types
63
+ are too specific, they are more difficult to tag. If
64
+ they are too general, too many candidates may be
65
+ identified as having the appropriate type.
66
+ In contrast to previous approaches that use a su-
67
+ pervised classifier to categorize questions into a
68
+ predefined set of types, we propose an unsuper-
69
+ vised method to dynamically construct a proba-
70
+ bilistic answer type model for each question. Such
71
+ a model can be used to evaluate whether or not
72
+ a word fits into the question context. For exam-
73
+ ple, given the question “What are tourist attrac-
74
+ tions in Reims?”, we would expect the appropriate
75
+ answers to fit into the context “X is a tourist attrac-
76
+ tion.” From a corpus, we can find the words that
77
+ appeared in this context, such as:
78
+ A-Ama Temple, Aborigine, addition, Anak
79
+ Krakatau, archipelago, area, baseball,
80
+ Bletchley Park, brewery, cabaret, Cairo,
81
+ Cape Town, capital, center, ...
82
+ Using the frequency counts of these words in
83
+ the context, we construct a probabilistic model
84
+ to compute P(in(w, Γ)|w), the probability for a
85
+ word w to occur in a set of contexts Γ, given an
86
+ occurrence of w. The parameters in this model are
87
+ obtained from a large, automatically parsed, un-
88
+ labeled corpus. By asking whether a word would
89
+ occur in a particular context extracted from a ques-
90
+ 393
91
+ tion, we avoid explicitly specifying a list of pos-
92
+ sible answer types. This has the added benefit
93
+ of being easily adapted to different domains and
94
+ corpora in which a list of explicit possible answer
95
+ types may be difficult to enumerate and/or identify
96
+ within the text.
97
+ The remainder of this paper is organized as fol-
98
+ lows. Section 2 discusses the work related to an-
99
+ swer typing. Section 3 discusses some of the key
100
+ concepts employed by our probabilistic model, in-
101
+ cluding word clusters and the contexts of a ques-
102
+ tion and a word. Section 4 presents our probabilis-
103
+ tic model for answer typing. Section 5 compares
104
+ the performance of our model with that of an or-
105
+ acle and a semi-automatic system performing the
106
+ same task. Finally, the concluding remarks in are
107
+ made in Section 6.
108
+ 2 Related Work
109
+ Light et al. (2001) performed an analysis of the
110
+ effect of multiple answer type occurrences in a
111
+ sentence. When multiple words of the same type
112
+ appear in a sentence, answer typing with fixed
113
+ types must assign each the same score. Light et
114
+ al. found that even with perfect answer sentence
115
+ identification, question typing, and semantic tag-
116
+ ging, a system could only achieve 59% accuracy
117
+ over the TREC-9 questions when using their set of
118
+ 24 non-overlapping answer types. By computing
119
+ the probability of an answer candidate occurring
120
+ in the question contexts directly, we avoid having
121
+ multiple candidates with the same level of appro-
122
+ priateness as answers.
123
+ There have been a variety of approaches to de-
124
+ termine the answer types, which are also known
125
+ as Qtargets (Echihabi et al., 2003). Most previous
126
+ approaches classify the answer type of a question
127
+ as one of a set of predefined types.
128
+ Many systems construct the classification rules
129
+ manually (Cui et al., 2004; Greenwood, 2004;
130
+ Hermjakob, 2001). The rules are usually triggered
131
+ by the presence of certain words in the question.
132
+ For example, if a question contains “author” then
133
+ the expected answer type is Person.
134
+ The number of answer types as well as the num-
135
+ ber of rules can vary a great deal. For example,
136
+ (Hermjakob, 2001) used 276 rules for 122 answer
137
+ types. Greenwood (2004), on the other hand, used
138
+ 46 answer types with unspecified number of rules.
139
+ The classification rules can also be acquired
140
+ with supervised learning. Ittycheriah, et al. (2001)
141
+ describe a maximum entropy based question clas-
142
+ sification scheme to classify each question as hav-
143
+ ing one of the MUC answer types. In a similar ex-
144
+ periment, Li & Roth (2002) train a question clas-
145
+ sifier based on a modified version of SNoW using
146
+ a richer set of answer types than Ittycheriah et al.
147
+ The LCC system (Harabagiu et al., 2003) com-
148
+ bines fixed types with a novel loop-back strategy.
149
+ In the event that a question cannot be classified as
150
+ one of the fixed entity types or semantic concepts
151
+ derived from WordNet (Fellbaum, 1998), the an-
152
+ swer type model backs off to a logic prover that
153
+ uses axioms derived form WordNet, along with
154
+ logic rules, to justify phrases as answers. Thus, the
155
+ LCC system is able to avoid the use of a miscel-
156
+ laneous type that often exhibits poor performance.
157
+ However, the logic prover must have sufficient ev-
158
+ idence to link the question to the answer, and gen-
159
+ eral knowledge must be encoded as axioms into
160
+ the system. In contrast, our answer type model
161
+ derives all of its information automatically from
162
+ unannotated text.
163
+ Answer types are often used as filters. It was
164
+ noted in (Radev et al., 2002) that a wrong guess
165
+ about the answer type reduces the chance for the
166
+ system to answer the question correctly by as
167
+ much as 17 times. The approach presented here
168
+ is less brittle. Even if the correct candidate does
169
+ not have the highest likelihood according to the
170
+ model, it may still be selected when the answer
171
+ extraction module takes into account other factors
172
+ such as the proximity to the matched keywords.
173
+ Furthermore, a probabilistic model makes it eas-
174
+ ier to integrate the answer type scores with scores
175
+ computed by other components in a question an-
176
+ swering system in a principled fashion.
177
+ 3 Resources
178
+ Before introducing our model, we first describe
179
+ the resources used in the model.
180
+ 3.1 Word Clusters
181
+ Natural language data is extremely sparse. Word
182
+ clusters are a way of coping with data sparseness
183
+ by abstracting a given word to a class of related
184
+ words. Clusters, as used by our probabilistic an-
185
+ swer typing system, play a role similar to that of
186
+ named entity types. Many methods exist for clus-
187
+ tering, e.g., (Brown et al., 1990; Cutting et al.,
188
+ 1992; Pereira et al., 1993; Karypis et al., 1999).
189
+ We used the Clustering By Committee (CBC)
190
+ 394
191
+ Table 1: Words and their clusters
192
+ Word Clusters
193
+ suite software, network, wireless, ...
194
+ rooms, bathrooms, restrooms, ...
195
+ meeting room, conference room, ...
196
+ ghost rabbit, squirrel, duck, elephant, frog, ...
197
+ goblins, ghosts, vampires, ghouls, ...
198
+ punk, reggae, folk, pop, hip-pop, ...
199
+ huge, larger, vast, significant, ...
200
+ coming-of-age, true-life, ...
201
+ clouds, cloud, fog, haze, mist, ...
202
+ algorithm (Pantel and Lin, 2002) on a 10 GB En-
203
+ glish text corpus to obtain 3607 clusters. The fol-
204
+ lowing is an example cluster generated by CBC:
205
+ tension, anger, anxiety, tensions, frustration,
206
+ resentment, uncertainty, confusion, conflict,
207
+ discontent, insecurity, controversy, unease,
208
+ bitterness, dispute, disagreement, nervous-
209
+ ness, sadness, despair, animosity, hostility,
210
+ outrage, discord, pessimism, anguish, ...
211
+ In the clustering generated by CBC, a word may
212
+ belong to multiple clusters. The clusters to which
213
+ a word belongs often represent the senses of the
214
+ word. Table 1 shows two example words and their
215
+ clusters.
216
+ 3.2 Contexts
217
+ The context in which a word appears often im-
218
+ poses constraints on the semantic type of the word.
219
+ This basic idea has been exploited by many pro-
220
+ posals for distributional similarity and clustering,
221
+ e.g., (Church and Hanks, 1989; Lin, 1998; Pereira
222
+ et al., 1993).
223
+ Similar to Lin and Pantel (2001), we define
224
+ the contexts of a word to be the undirected paths
225
+ in dependency trees involving that word at either
226
+ the beginning or the end. The following diagram
227
+ shows an example dependency tree:
228
+ Which city hosted the 1988 Winter Olympics?
229
+ det subj
230
+ obj
231
+ NN
232
+ NN
233
+ det
234
+ The links in the tree represent dependency rela-
235
+ tionships. The direction of a link is from the head
236
+ to the modifier in the relationship. Labels associ-
237
+ ated with the links represent types of relations.
238
+ In a context, the word itself is replaced with a
239
+ variable X. We say a word is the filler of a context
240
+ if it replaces X. For example, the contexts for the
241
+ word “Olympics” in the above sentence include
242
+ the following paths:
243
+ Context of “Olympics” Explanation
244
+ X Winter
245
+ NN
246
+ Winter X
247
+ X 1988
248
+ NN
249
+ 1988 X
250
+ X host
251
+ obj
252
+ host X
253
+ X host
254
+ obj
255
+ city
256
+ subj
257
+ city hosted X
258
+ In these paths, words are reduced to their root
259
+ forms and proper names are reduced to their entity
260
+ tags (we used MUC7 named entity tags).
261
+ Paths allow us to balance the specificity of con-
262
+ texts and the sparseness of data. Longer paths typ-
263
+ ically impose stricter constraints on the slot fillers.
264
+ However, they tend to have fewer occurrences,
265
+ making them more prone to errors arising from
266
+ data sparseness. We have restricted the path length
267
+ to two (involving at most three words) and require
268
+ the two ends of the path to be nouns.
269
+ We parsed the AQUAINT corpus (3GB) with
270
+ Minipar (Lin, 2001) and collected the frequency
271
+ counts of words appearing in various contexts.
272
+ Parsing and database construction is performed
273
+ off-line as the database is identical for all ques-
274
+ tions. We extracted 527,768 contexts that ap-
275
+ peared at least 25 times in the corpus. An example
276
+ context and its fillers are shown in Figure 1.
277
+ X host Olympics
278
+ subj obj
279
+ Africa 2 grant 1 readiness 2
280
+ AP 1 he 2 Rio de Janeiro 1
281
+ Argentina 1 homeland 3 Rome 1
282
+ Athens 16 IOC 1 Salt Lake City 2
283
+ Atlanta 3 Iran 2 school 1
284
+ Bangkok 1 Jakarta 1 S. Africa 1
285
+ . .. . .. . . .
286
+ decades 1 president 2 Zakopane 4
287
+ facility 1 Pusan 1
288
+ government 1 race 1
289
+ Figure 1: An example context and its fillers
290
+ 3.2.1 Question Contexts
291
+ To build a probabilistic model for answer typ-
292
+ ing, we extract a set of contexts, called question
293
+ contexts, from a question. An answer is expected
294
+ to be a plausible filler of the question contexts.
295
+ Question contexts are extracted from a question
296
+ with two rules. First, if the wh-word in a ques-
297
+ tion has a trace in the parse tree, the question con-
298
+ texts are the contexts of the trace. For example, the
299
+ 395
300
+ question “What do most tourists visit in Reims?”
301
+ is parsed as:
302
+ Whati
303
+ do most tourists visit ei
304
+ in Reims?
305
+ det
306
+ i
307
+ subj
308
+ det
309
+ obj
310
+ in
311
+ The symbol ei is the trace of whati. Minipar
312
+ generates the trace to indicate that the word what
313
+ is the object of visit in the deep structure of the
314
+ sentence. The following question contexts are ex-
315
+ tracted from the above question:
316
+ Context Explanation
317
+ X visit tourist
318
+ obj subj
319
+ tourist visits X
320
+ X visit Reims
321
+ obj in
322
+ visit X in Reims
323
+ The second rule deals with situations where
324
+ the wh-word is a determiner, as in the question
325
+ “Which city hosted the 1988 Winter Olympics?”
326
+ (the parse tree for which is shown in section 3.2).
327
+ In such cases, the question contexts consist of a
328
+ single context involving the noun that is modified
329
+ by the determiner. The context for the above sen-
330
+ tence is X city
331
+ subj
332
+ , corresponding to the sentence
333
+ “X is a city.” This context is used because the
334
+ question explicitly states that the desired answer is
335
+ a city. The context overrides the other contexts be-
336
+ cause the question explicitly states the desired an-
337
+ swer type. Experimental results have shown that
338
+ using this context in conjunction with other con-
339
+ texts extracted from the question produces lower
340
+ performance than using this context alone.
341
+ In the event that a context extracted from a ques-
342
+ tion is not found in the database, we shorten the
343
+ context in one of two ways. We start by replac-
344
+ ing the word at the end of the path with a wildcard
345
+ that matches any word. If this fails to yield en-
346
+ tries in the context database, we shorten the con-
347
+ text to length one and replace the end word with
348
+ automatically determined similar words instead of
349
+ a wildcard.
350
+ 3.2.2 Candidate Contexts
351
+ Candidate contexts are very similar in form to
352
+ question contexts, save for one important differ-
353
+ ence. Candidate contexts are extracted from the
354
+ parse trees of the answer candidates rather than the
355
+ question. In natural language, some words may
356
+ be polysemous. For example, Washington may re-
357
+ fer to a person, a city, or a state. The occurrences
358
+ of Washington in “Washington’s descendants” and
359
+ “suburban Washington” should not be given the
360
+ same score when the question is seeking a loca-
361
+ tion. Given that the sense of a word is largely de-
362
+ termined by its local context (Choueka and Lusig-
363
+ nan, 1985), candidate contexts allow the model to
364
+ take into account the candidate answers’ senses
365
+ implicitly.
366
+ 4 Probabilistic Model
367
+ The goal of an answer typing model is to evalu-
368
+ ate the appropriateness of a candidate word as an
369
+ answer to the question. If we assume that a set
370
+ of answer candidates is provided to our model by
371
+ some means (e.g., words comprising documents
372
+ extracted by an information retrieval engine), we
373
+ wish to compute the value P(in(w, ΓQ)|w). That
374
+ is, the appropriateness of a candidate answer w is
375
+ proportional to the probability that it will occur in
376
+ the question contexts ΓQ extracted from the ques-
377
+ tion.
378
+ To mitigate data sparseness, we can introduce
379
+ a hidden variable C that represents the clusters to
380
+ which the candidate answer may belong. As a can-
381
+ didate may belong to multiple clusters, we obtain:
382
+ P(in(w, ΓQ)|w) =
383
+ X
384
+ C
385
+ P(in(w, ΓQ), C|w) (1)
386
+ =
387
+ X
388
+ C
389
+ P(C|w)P(in(w, ΓQ)|C, w) (2)
390
+ Given that a word appears, we assume that it has
391
+ the same probability to appear in a context as all
392
+ other words in the same cluster. Therefore:
393
+ P(in(w, ΓQ)|C, w) ≈ P(in(C, ΓQ)|C) (3)
394
+ We can now rewrite the equation in (2) as:
395
+ P(in(w, ΓQ)|w) ≈
396
+ X
397
+ C
398
+ P(C|w)P(in(C, ΓQ)|C) (4)
399
+ This equation splits our model into two parts:
400
+ one models which clusters a word belongs to and
401
+ the other models how appropriate a cluster is to
402
+ the question contexts. When ΓQ consists of multi-
403
+ ple contexts, we make the naïve Bayes assumption
404
+ that each individual context γQ ∈ ΓQ is indepen-
405
+ dent of all other contexts given the cluster C.
406
+ P(in(w, ΓQ)|w) ≈
407
+ X
408
+ C
409
+ P(C|w)
410
+ Y
411
+ γQ∈ΓQ
412
+ P(in(C, γQ)|C) (5)
413
+ Equation (5) needs the parameters P(C|w) and
414
+ P(in(C, γQ)|C), neither of which are directly
415
+ available from the context-filler database. We will
416
+ discuss the estimation of these parameters in Sec-
417
+ tion 4.2.
418
+ 396
419
+ 4.1 Using Candidate Contexts
420
+ The previous model assigns the same likelihood to
421
+ every instance of a given word. As we noted in
422
+ section 3.2.2, a word may be polysemous. To take
423
+ into account a word’s context, we can instead com-
424
+ pute P(in(w, ΓQ)|w, in(w, Γw)), where Γw is the
425
+ set of contexts for the candidate word w in a re-
426
+ trieved passage.
427
+ By introducing word clusters as intermediate
428
+ variables as before and making a similar assump-
429
+ tion as in equation (3), we obtain:
430
+ P(in(w, ΓQ)|w, in(w, Γw))
431
+ =
432
+ X
433
+ C
434
+ P(in(w, ΓQ), C|w, in(w, Γw)) (6)
435
+
436
+ X
437
+ C
438
+ P(C|w, in(w, Γw))P(in(C, ΓQ)|C) (7)
439
+ Like equation (4), equation (7) partitions the
440
+ model into two parts. Unlike P(C|w) in equation
441
+ (4), the probability of the cluster is now based on
442
+ the particular occurrence of the word in the candi-
443
+ date contexts. It can be estimated by:
444
+ P(C|w, in(w, Γw))
445
+ =
446
+ P(in(w, Γw)|w, C)P(w, C)
447
+ P(in(w, Γw)|w)P(w)
448
+ (8)
449
+
450
+ Y
451
+ γw∈Γw
452
+ P(in(w, γw)|w, C)
453
+ Y
454
+ γw∈Γw
455
+ P(in(w, γw)|w)
456
+ × P(C|w) (9)
457
+ =
458
+ Y
459
+ γw∈Γw
460
+
461
+ P(C|w, in(w, γw))
462
+ P(C|w)
463
+ «
464
+ × P(C|w) (10)
465
+ 4.2 Estimating Parameters
466
+ Our probabilistic model requires the parameters
467
+ P(C|w), P(C|w, in(w, γ)), and P(in(C, γ)|C),
468
+ where w is a word, C is a cluster that w belongs to,
469
+ and γ is a question or candidate context. This sec-
470
+ tion explains how these parameters are estimated
471
+ without using labeled data.
472
+ The context-filler database described in Sec-
473
+ tion 3.2 provides the joint and marginal fre-
474
+ quency counts of contexts and words (|in(γ, w)|,
475
+ |in(∗, γ)| and |in(w, ∗)|). These counts al-
476
+ low us to compute the probabilities P(in(w, γ)),
477
+ P(in(w, ∗)), and P(in(∗, γ)). We can also com-
478
+ pute P(in(w, γ)|w), which is smoothed with add-
479
+ one smoothing (see equation (11) in Figure 2).
480
+ The estimation of P(C|w) presents a challenge.
481
+ We have no corpus from which we can directly
482
+ measure P(C|w) because word instances are not
483
+ labeled with their clusters.
484
+ P(in(w, γ)|w) =
485
+ |in(w, γ)| + P(in(∗, γ))
486
+ |in(w, ∗)| + 1
487
+ (11)
488
+ Pu(C|w) =
489
+ (
490
+ 1
491
+ |{C |w∈C }|
492
+ if w ∈ C,
493
+ 0 otherwise
494
+ (12)
495
+ P(C|w) =
496
+ X
497
+ w ∈S(w)
498
+ sim(w, w ) × Pu(C|w )
499
+ X
500
+ {C |w∈C },
501
+ w ∈S(w)
502
+ sim(w, w ) × Pu(C |w )
503
+ (13)
504
+ P(in(C, γ)|C) =
505
+ X
506
+ w ∈C
507
+ P(C|w ) × |in(w , γ)| + P(in(∗, γ))
508
+ X
509
+ w ∈C
510
+ P(C|w ) × |in(w , ∗)| + 1
511
+ (14)
512
+ Figure 2: Probability estimation
513
+ We use the average weighted “guesses” of the
514
+ top similar words of w to compute P(C|w) (see
515
+ equation 13). The intuition is that if w and w
516
+ are similar words, P(C|w ) and P(C|w) tend
517
+ to have similar values. Since we do not know
518
+ P(C|w ) either, we substitute it with uniform dis-
519
+ tribution Pu(C|w ) as in equation (12) of Fig-
520
+ ure 2. Although Pu(C|w ) is a very crude guess,
521
+ the weighted average of a set of such guesses can
522
+ often be quite accurate.
523
+ The similarities between words are obtained as
524
+ a byproduct of the CBC algorithm. For each word,
525
+ we use S(w) to denote the top-n most similar
526
+ words (n=50 in our experiments) and sim(w, w )
527
+ to denote the similarity between words w and w .
528
+ The following is a sample similar word list for the
529
+ word suit:
530
+ S(suit) = {lawsuit 0.49, suits 0.47, com-
531
+ plaint 0.29, lawsuits 0.27, jacket 0.25, coun-
532
+ tersuit 0.24, counterclaim 0.24, pants 0.24,
533
+ trousers 0.22, shirt 0.21, slacks 0.21, case
534
+ 0.21, pantsuit 0.21, shirts 0.20, sweater 0.20,
535
+ coat 0.20, ...}
536
+ The estimation for P(C|w, in(w, γw)) is sim-
537
+ ilar to that of P(C|w) except that instead of all
538
+ w ∈ S(w), we instead use {w |w ∈ S(w) ∧
539
+ in(w , γw)}. By only looking at a particular con-
540
+ text γw, we may obtain a different distribution over
541
+ C than P(C|w) specifies. In the event that the data
542
+ are too sparse to estimate P(C|w, in(w, γw)), we
543
+ fall back to using P(C|w).
544
+ P(in(C, γ)|C) is computed in (14) by assum-
545
+ ing each instance of w contains a fractional in-
546
+ stance of C and the fractional count is P(C|w).
547
+ Again, add-one smoothing is used.
548
+ 397
549
+ System Median % Top 1% Top 5% Top 10% Top 50%
550
+ Oracle 0.7% 89 (57%) 123 (79%) 131 (85%) 154 (99%)
551
+ Frequency 7.7% 31 (20%) 67 (44%) 86 (56%) 112 (73%)
552
+ Our model 1.2% 71 (46%) 106 (69%) 119 (77%) 146 (95%)
553
+ no cand. contexts 2.2% 58 (38%) 102 (66%) 113 (73%) 145 (94%)
554
+ ANNIE 4.0% 54 (35%) 79 (51%) 93 (60%) 123 (80%)
555
+ Table 2: Summary of Results
556
+ 5 Experimental Setup & Results
557
+ We evaluate our answer typing system by using
558
+ it to filter the contents of documents retrieved by
559
+ the information retrieval portion of a question an-
560
+ swering system. Each answer candidate in the set
561
+ of documents is scored by the answer typing sys-
562
+ tem and the list is sorted in descending order of
563
+ score. We treat the system as a filter and observe
564
+ the proportion of candidates that must be accepted
565
+ by the filter so that at least one correct answer is
566
+ accepted. A model that allows a low percentage
567
+ of candidates to pass while still allowing at least
568
+ one correct answer through is favorable to a model
569
+ in which a high number of candidates must pass.
570
+ This represents an intrinsic rather than extrinsic
571
+ evaluation (Mollá and Hutchinson, 2003) that we
572
+ believe illustrates the usefulness of our model.
573
+ The evaluation data consist of 154 questions
574
+ from the TREC-2003 QA Track (Voorhees, 2003)
575
+ satisfying the following criteria, along with the top
576
+ 10 documents returned for each question as iden-
577
+ tified by NIST using the PRISE1 search engine.
578
+ • the question begins with What, Which, or
579
+ Who. We restricted the evaluation such ques-
580
+ tions because our system is designed to deal
581
+ with questions whose answer types are often
582
+ semantically open-ended noun phrases.
583
+ There exists an entry for the question in the an-
584
+ swer patterns provided by Ken Litkowski2.
585
+ • One of the top-10 documents returned by
586
+ PRISE contains a correct answer.
587
+ We compare the performance of our prob-
588
+ abilistic model with that of two other sys-
589
+ tems. Both comparison systems make use of a
590
+ small, predefined set of manually-assigned MUC-
591
+ 7 named-entity types (location, person, organiza-
592
+ tion, cardinal, percent, date, time, duration, mea-
593
+ sure, money) augmented with thing-name (proper
594
+ 1
595
+ www.itl.nist.gov/iad/894.02/works/papers/zp2/zp2.html
596
+ 2
597
+ trec.nist.gov/data/qa/2003 qadata/03QA.tasks/t12.pats.txt
598
+ names of inanimate objects) and miscellaneous
599
+ (a catch-all answer type of all other candidates).
600
+ Some examples of thing-name are Guinness Book
601
+ of World Records, Thriller, Mars Pathfinder, and
602
+ Grey Cup. Examples of miscellaneous answers are
603
+ copper, oil, red, and iris.
604
+ The differences in the comparison systems is
605
+ with respect to how entity types are assigned to the
606
+ words in the candidate documents. We make use
607
+ of the ANNIE (Maynard et al., 2002) named entity
608
+ recognition system, along with a manual assigned
609
+ “oracle” strategy, to assign types to candidate an-
610
+ swers. In each case, the score for a candidate is
611
+ either 1 if it is tagged as the same type as the ques-
612
+ tion or 0 otherwise. With this scoring scheme pro-
613
+ ducing a sorted list we can compute the probability
614
+ of the first correct answer appearing at rank R = k
615
+ as follows:
616
+ P(R = k) =
617
+ k−2Y
618
+ i=0
619
+
620
+ t − c − i
621
+ t − i
622
+ «
623
+ c
624
+ t − k + 1
625
+ (15)
626
+ where t is the number of unique candidate answers
627
+ that are of the appropriate type and c is the number
628
+ of unique candidate answers that are correct.
629
+ Using the probabilities in equation (15), we
630
+ compute the expected rank, E(R), of the first cor-
631
+ rect answer of a given question in the system as:
632
+ E(R) =
633
+ t−c+1X
634
+ k=1
635
+ kP(R = k) (16)
636
+ Answer candidates are the set of ANNIE-
637
+ identified tokens with stop words and punctuation
638
+ removed. This yields between 900 and 8000 can-
639
+ didates for each question, depending on the top 10
640
+ documents returned by PRISE. The oracle system
641
+ represents an upper bound on using the predefined
642
+ set of answer types. The ANNIE system repre-
643
+ sents a more realistic expectation of performance.
644
+ The median percentage of candidates that are
645
+ accepted by a filter over the questions of our eval-
646
+ uation data provides one measure of performance
647
+ and is preferred to the average because of the ef-
648
+ fect of large values on the average. In QA, a sys-
649
+ tem accepting 60% of the candidates is not signif-
650
+ icantly better or worse than one accepting 100%,
651
+ 398
652
+ System Measure
653
+ Question Type
654
+ All Location Person Organization Thing-Name Misc Other
655
+ (154) (57) (17) (19) (17) (37) (7)
656
+ Our model
657
+ Median 1.2% 0.8% 2.0% 1.3% 3.7% 3.5% 12.2%
658
+ Top 1% 71 34 6 9 7 13 2
659
+ Top 5% 106 53 11 11 10 19 2
660
+ Top 10% 119 55 12 17 10 22 3
661
+ Top 50% 146 56 16 18 17 34 5
662
+ Oracle
663
+ Median 0.7% 0.4% 1.0% 0.3% 0.4% 16.0% 0.3%
664
+ Top 1% 89 44 8 16 14 1 6
665
+ Top 5% 123 57 17 19 17 6 7
666
+ Top 10% 131 57 17 19 17 14 7
667
+ Top 50% 154 57 17 19 17 37 7
668
+ ANNIE
669
+ Median 4.0% 0.6% 1.4% 6.1% 100% 16.7% 50.0%
670
+ Top 1% 54 39 5 7 0 0 3
671
+ Top 5% 79 53 12 9 0 2 3
672
+ Top 10% 93 54 13 11 0 12 3
673
+ Top 50% 123 56 16 15 5 28 3
674
+ Table 3: Detailed breakdown of performance
675
+ but the effect on average is quite high. Another
676
+ measure is to observe the number of questions
677
+ with at least one correct answer in the top N% for
678
+ various values of N. By examining the number of
679
+ correct answers found in the top N% we can better
680
+ understand what an effective cutoff would be.
681
+ The overall results of our comparison can be
682
+ found in Table 2. We have added the results of
683
+ a system that scores candidates based on their fre-
684
+ quency within the document as a comparison with
685
+ a simple, yet effective, strategy. The second col-
686
+ umn is the median percentage of where the highest
687
+ scored correct answer appears in the sorted candi-
688
+ date list. Low percentage values mean the answer
689
+ is usually found high in the sorted list. The re-
690
+ maining columns list the number of questions that
691
+ have a correct answer somewhere in the top N%
692
+ of their sorted lists. This is meant to show the ef-
693
+ fects of imposing a strict cutoff prior to running
694
+ the answer type model.
695
+ The oracle system performs best, as it bene-
696
+ fits from both manual question classification and
697
+ manual entity tagging. If entity assignment is
698
+ performed by an automatic system (as it is for
699
+ ANNIE), the performance drops noticeably. Our
700
+ probabilistic model performs better than ANNIE
701
+ and achieves approximately 2/3 of the perfor-
702
+ mance of the oracle system. Table 2 also shows
703
+ that the use of candidate contexts increases the
704
+ performance of our answer type model.
705
+ Table 3 shows the performance of the oracle
706
+ system, our model, and the ANNIE system broken
707
+ down by manually-assigned answer types. Due
708
+ to insufficient numbers of questions, the cardinal,
709
+ percent, time, duration, measure, and money types
710
+ are combined into an “Other” category. When
711
+ compared with the oracle system, our model per-
712
+ forms worse overall for questions of all types ex-
713
+ cept for those seeking miscellaneous answers. For
714
+ miscellaneous questions, the oracle identifies all
715
+ tokens that do not belong to one of the other
716
+ known categories as possible answers. For all
717
+ questions of non-miscellaneous type, only a small
718
+ subset of the candidates are marked appropriate.
719
+ In particular, our model performs worse than the
720
+ oracle for questions seeking persons and thing-
721
+ names. Person questions often seek rare person
722
+ names, which occur in few contexts and are diffi-
723
+ cult to reliably cluster. Thing-name questions are
724
+ easy for a human to identify but difficult for au-
725
+ tomatic system to identify. Thing-names are a di-
726
+ verse category and are not strongly associated with
727
+ any identifying contexts.
728
+ Our model outperforms the ANNIE system in
729
+ general, and for questions seeking organizations,
730
+ thing-names, and miscellaneous targets in partic-
731
+ ular. ANNIE may have low coverage on organi-
732
+ zation names, resulting in reduced performance.
733
+ Like the oracle, ANNIE treats all candidates not
734
+ assigned one of the categories as appropriate for
735
+ miscellaneous questions. Because ANNIE cannot
736
+ identify thing-names, they are treated as miscella-
737
+ neous. ANNIE shows low performance on thing-
738
+ names because words incorrectly assigned types
739
+ are sorted to the bottom of the list for miscella-
740
+ neous and thing-name questions. If a correct an-
741
+ swer is incorrectly assigned a type it will be sorted
742
+ near the bottom, resulting in a poor score.
743
+ 399
744
+ 6 Conclusions
745
+ We have presented an unsupervised probabilistic
746
+ answer type model. Our model uses contexts de-
747
+ rived from the question and the candidate answer
748
+ to calculate the appropriateness of a candidate an-
749
+ swer. Statistics gathered from a large corpus of
750
+ text are used in the calculation, and the model is
751
+ constructed to exploit these statistics without be-
752
+ ing overly specific or overly general.
753
+ The method presented here avoids the use of an
754
+ explicit list of answer types. Explicit answer types
755
+ can exhibit poor performance, especially for those
756
+ questions not fitting one of the types. They must
757
+ also be redefined when either the domain or corpus
758
+ substantially changes. By avoiding their use, our
759
+ answer typing method may be easier to adapt to
760
+ different corpora and question answering domains
761
+ (such as bioinformatics).
762
+ In addition to operating as a stand-alone answer
763
+ typing component, our system can be combined
764
+ with other existing answer typing strategies, es-
765
+ pecially in situations in which a catch-all answer
766
+ type is used. Our experimental results show that
767
+ our probabilistic model outperforms the oracle and
768
+ a system using automatic named entity recognition
769
+ under such circumstances. The performance of
770
+ our model is better than that of the semi-automatic
771
+ system, which is a better indication of the expected
772
+ performance of a comparable real-world answer
773
+ typing system.
774
+ Acknowledgments
775
+ The authors would like to thank the anonymous re-
776
+ viewers for their helpful comments on improving
777
+ the paper. The first author is supported by the Nat-
778
+ ural Sciences and Engineering Research Council
779
+ of Canada, the Alberta Ingenuity Fund, and the Al-
780
+ berta Informatics Circle of Research Excellence.
781
+ References
782
+ P.F. Brown, V.J. Della Pietra, P.V. deSouza, J.C. Lai, and R.L.
783
+ Mercer. 1990. Class-based n-gram Models of Natural
784
+ Language. Computational Linguistics, 16(2):79–85.
785
+ Y. Choueka and S. Lusignan. 1985. Disambiguation by Short
786
+ Contexts. Computer and the Humanities, 19:147–157.
787
+ K. Church and P. Hanks. 1989. Word Association Norms,
788
+ Mutual Information, and Lexicography. In Proceedings
789
+ of ACL-89, pages 76–83, Vancouver, British Columbia,
790
+ Canada.
791
+ H. Cui, K. Li, R. Sun, T-S. Chua, and M-K. Kan. 2004. Na-
792
+ tional University of Singapore at the TREC-13 Question
793
+ Answering Main Task. In Notebook of TREC 2004, pages
794
+ 34–42, Gaithersburg, Maryland.
795
+ D.R. Cutting, D. Karger, J. Pedersen, and J.W. Tukey. 1992.
796
+ Scatter/Gather: A Cluster-based Approach to Browsing
797
+ Large Document Collections. In Proceedings of SIGIR-
798
+ 92, pages 318–329, Copenhagen, Denmark.
799
+ A. Echihabi, U. Hermjakob, E. Hovy, D. Marcu, E. Melz,
800
+ and D. Ravichandran. 2003. Multiple-Engine Question
801
+ Answering in TextMap. In Proceedings of TREC 2003,
802
+ pages 772–781, Gaithersburg, Maryland.
803
+ C. Fellbaum. 1998. WordNet: An Electronic Lexical
804
+ Database. MIT Press, Cambridge, Massachusetts.
805
+ M.A. Greenwood. 2004. AnswerFinder: Question Answer-
806
+ ing from your Desktop. In Proceedings of the Seventh
807
+ Annual Colloquium for the UK Special Interest Group
808
+ for Computational Linguistics (CLUK ’04), University of
809
+ Birmingham, UK.
810
+ S. Harabagiu, D. Moldovan, C. Clark, M. Bowden,
811
+ J. Williams, and J. Bensley. 2003. Answer Mining by
812
+ Combining Extraction Techniques with Abductive Rea-
813
+ soning. In Proceedings of TREC 2003, pages 375–382,
814
+ Gaithersburg, Maryland.
815
+ U. Hermjakob. 2001. Parsing and Question Classification for
816
+ Question Answering. In Proceedings of the ACL Work-
817
+ shop on Open-Domain Question Answering, Toulouse,
818
+ France.
819
+ A. Ittycheriah, M. Franz, W-J. Zhu, and A. Ratnaparkhi.
820
+ 2001. Question Answering Using Maximum Entropy
821
+ Components. In Proceedings of NAACL 2001, Pittsburgh,
822
+ Pennsylvania.
823
+ G. Karypis, E.-H. Han, and V. Kumar. 1999. Chameleon: A
824
+ Hierarchical Clustering Algorithm using Dynamic Model-
825
+ ing. IEEE Computer: Special Issue on Data Analysis and
826
+ Mining, 32(8):68–75.
827
+ V. Krishnan, S. Das, and S. Chakrabarti. 2005. Enhanced
828
+ Answer Type Inference from Questions using Sequential
829
+ Models. In Proceedings of HLT/EMNLP 2005, pages
830
+ 315–322, Vancouver, British Columbia, Canada.
831
+ X. Li and D. Roth. 2002. Learning Question Classifiers.
832
+ In Proceedings of COLING 2002, pages 556–562, Taipei,
833
+ Taiwan.
834
+ M. Light, G. Mann, E. Riloff, and E. Breck. 2001. Analyses
835
+ for Elucidating Current Question Answering Technology.
836
+ Natural Language Engineering, 7(4):325–342.
837
+ D. Lin and P. Pantel. 2001. Discovery of Inference Rules
838
+ for Question Answering. Natural Language Engineering,
839
+ 7(4):343–360.
840
+ D. Lin. 1998. Automatic Retrieval and Clustering of Similar
841
+ Words. In Proceedings of COLING-ACL 1998, Montreal,
842
+ Québec, Canada.
843
+ D. Lin. 2001. Language and Text Analysis Tools. In Pro-
844
+ ceedings of HLT 2001, pages 222–227, San Diego, Cali-
845
+ fornia.
846
+ D. Maynard, V. Tablan, H. Cunningham, C. Ursu, H. Sag-
847
+ gion, K. Bontcheva, and Y. Wilks. 2002. Architectural
848
+ Elements of Language Engineering Robustness. Natural
849
+ Language Engineering, 8(2/3):257–274.
850
+ D. Mollá and B. Hutchinson. 2003. Intrinsic versus Extrinsic
851
+ Evaluations of Parsing Systems. In Proceedings of EACL
852
+ Workshop on Evaluation Initiatives in Natural Language
853
+ Processing, pages 43–50, Budapest, Hungary.
854
+ P. Pantel and D. Lin. 2002. Document Clustering with Com-
855
+ mittees. In Proceedings of SIGIR 2002, pages 199–206,
856
+ Tampere, Finland.
857
+ F. Pereira, N. Tishby, and L. Lee. 1993. Distributional Clus-
858
+ tering of English Words. In Proceedings of ACL 1992,
859
+ pages 183–190.
860
+ D. Radev, W. Fan, H. Qi, H. Wu, and A. Grewal. 2002. Prob-
861
+ abilistic Question Answering on the Web. In Proceedings
862
+ of the Eleventh International World Wide Web Conference.
863
+ E.M. Voorhees. 2003. Overview of the TREC 2003 Ques-
864
+ tion Answering Track. In Proceedings of TREC 2003,
865
+ Gaithersburg, Maryland.
866
+ 400
867
+