biblicit 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,1880 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::ParserMethods;
14
+
15
+ #06/27/2003, start to make this program handle real data. So there is no evaluation, and offline classifiers should be trained beforehand.
16
+ #02/10/2004 Apply to citeseer data (with the same format of EbizSearch data)
17
+
18
+ use utf8;
19
+ use Data::Dumper;
20
+ use FindBin;
21
+ use HeaderParse::API::NamePatternMatch;
22
+ use HeaderParse::API::MultiClassChunking; #default to use all export by this module
23
+ use HeaderParse::API::LoadInformation;
24
+ use HeaderParse::Config::API_Config;
25
+ use HeaderParse::API::AssembleXMLMetadata;
26
+ use vars qw($debug %dictH %nameH %firstnameH %lastnameH %BasicFeatureDictH %InverseTagMap);
27
+ use vars qw($Classifier $offlineD $Tmp_Dir $nMinHeaderLength $nMaxHeaderLength);
28
+ use HeaderParse::API::Function qw(&AddrMatch &printDict &GenTrainVecMatrix &LineFeatureRepre &FillSpace &SeparatePunc);
29
+
30
+ my $FeatureDictH = \%BasicFeatureDictH;
31
+ my $ContextFeatureDictH;
32
+ my $SpaceAuthorFeatureDictH; #do not know if it is OK to define a hash
33
+ my $PuncAuthorFeatureDictH;
34
+ my $NameSpaceTrainVecH;
35
+ my $NameSpaceTrainF = "$offlineD"."NameSpaceTrainF";
36
+ my $SVMNameSpaceModel = "$offlineD"."NameSpaceModel";
37
+ my $TestH;
38
+ my $TrainH;
39
+ my $TotalHea = 0;
40
+
41
+ my $timestamp;
42
+
43
+ #my $offlineD = "../../offline/";
44
+ #my $TestOutF = "$TestF"."\.parsed";
45
+ #my $tmpCacheVecB = "$Tmp_Dir/tmpVec";
46
+ #my $SVMTmpResultB = "$Tmp_Dir/tmpresult";
47
+
48
+ my $FeatureDict = "$offlineD"."WrapperBaseFeaDict";
49
+ my $ContextFeatureDict = "$offlineD"."WrapperContextFeaDict";
50
+ my $SpaceAuthorFeatureDictF = "$offlineD"."WrapperSpaceAuthorFeaDict";
51
+ my $PuncAuthorFeatureDictF = "$offlineD"."WrapperPuncAuthorFeaDict";
52
+
53
+ my $linear = 1; # just want to be fast
54
+
55
+ my %evalH; # global hash to record classification result for baseline, each context round and IE
56
+ my $norm = 1;
57
+ my $testp = 1; # this is only to make the program run, no meaning.
58
+
59
+
60
+ my %TestDataIndex; #It indexes the header no in the testing dataset
61
+
62
+ #Read dictionary files
63
+ undef $/;
64
+ open(dumpFH, "$FeatureDict") || die "SVMHeaderParse: could not open $FeatureDict to read: $!";
65
+ my $string = <dumpFH>;
66
+ close(dumpFH);
67
+ eval $string;
68
+ $FeatureDictH = $VAR1;
69
+ $string ="";
70
+
71
+ open(dumpFH, "$ContextFeatureDict") || die "SVMHeaderParse: could not open $ContextFeatureDict to read: $!";
72
+ $string = <dumpFH>;
73
+ close(dumpFH);
74
+ eval $string;
75
+ $ContextFeatureDictH = $VAR1;
76
+ $string ="";
77
+
78
+ open(dumpFH, "$SpaceAuthorFeatureDictF") || die "SVMHeaderParse: could not open $SpaceAuthorFeatureDictF to read: $!";
79
+ $string = <dumpFH>;
80
+ close(dumpFH);
81
+ eval $string;
82
+ $SpaceAuthorFeatureDictH = $VAR1;
83
+ $string ="";
84
+ $/ = "\n";
85
+ #End read dictionary files
86
+
87
+
88
+ sub Parse{
89
+ my $header=shift;
90
+ $timestamp = shift;
91
+ my $success = 0;
92
+ # $tmpCacheVec = $tmpCacheVec . "\_$timestamp\_";
93
+ # $SVMTmpResult = $SVMTmpResult . "\_$timestamp\_";
94
+ my $tmpCacheVec = "$Tmp_Dir/tmpVec"."\_$timestamp\_";
95
+
96
+ my $SVMTmpResult = "$Tmp_Dir/tmpresult"."\_$timestamp\_";
97
+ $TestH = &HashEbizHeader(\$header);
98
+ $TestH = &VectorizeUnknownHeaderLine($TestH);
99
+
100
+ my $baseline = 1;
101
+ $TestH = &LineClassify($testp, "", $baseline, $FeatureDictH,
102
+ $TestH, $tmpCacheVec, $SVMTmpResult);
103
+ $TestH = &UpdatePretag($TestH);
104
+
105
+ my $maxLoop = 2;
106
+ for my $loop(1 .. $maxLoop) {
107
+ $baseline = 0;
108
+ my $NowContext = "context"."$loop";
109
+
110
+ $TestH = &LineClassify($testp, $NowContext, $baseline,
111
+ $ContextFeatureDictH, $TestH,
112
+ $tmpCacheVec, $SVMTmpResult);
113
+ $TestH = &UpdatePretag($TestH);
114
+ }
115
+
116
+ #Phase 2: Extraction Information from Multi-Class Lines and Author Lines Chunks
117
+ my $LastContext = "context"."$maxLoop";
118
+
119
+ # BUG: InfoExtract hangs on some documents.
120
+ # this is reproducible with data extracted using TET from doc 654835
121
+ # from the legacy citeseer system.
122
+ eval {
123
+ local $SIG{'ALRM'} = sub { die "alarm\n"; };
124
+ alarm 15;
125
+ $TestH = &InfoExtract($testp, $TestH,$SpaceAuthorFeatureDictH, $PuncAuthorFeatureDictH, $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult);
126
+ alarm 0;
127
+ };
128
+ if ($@) {
129
+ if ($@ eq "alarm\n") {
130
+ return 0;
131
+ }
132
+ }
133
+ $rXML = &ExportRDF($TestH);
134
+
135
+ for my $i(1..15){
136
+ unlink "$Tmp_Dir/tmpVec\_$timestamp\_test$i";
137
+ unlink "$Tmp_Dir/tmpresult\_$timestamp\_$i";
138
+ }
139
+ return $rXML;
140
+ }
141
+
142
+
143
+ # This is the header extraction module from CiteSeer.
144
+ # Only the parts related to header extraction are used.
145
+ sub ExtractHeaderInformation {
146
+ my $papertext = shift;
147
+ my $header='';
148
+
149
+ if (!(length($$papertext))){
150
+ return ('Paper text is empty');
151
+ }
152
+
153
+ # $$papertext =~ s/<[SEFC][\d\.e\+\-]*>//sgi; # remove S|E|F|C tags
154
+
155
+ if ($$papertext =~ /^(.*?\b(?:Introduction|INTRODUCTION|Contents|CONTENTS)(?:.*?\n){6})/s) {
156
+ $header = $1;
157
+ } else {
158
+ my $nLines = 150;
159
+ my @lines = split '\n', $$papertext;
160
+ my $contentLines = 0;
161
+ for (my $i=0; $i<=$#lines; $i++) {
162
+ if ($lines[$i] !~ m/^\s*$/) {
163
+ $contentLines++;
164
+ }
165
+ $header .= $lines[$i]."\n";
166
+ if ($contentLines >= $nLines) {
167
+ last;
168
+ }
169
+ }
170
+ }
171
+
172
+ # if ($$papertext =~ /^(.*?)\b(?:Abstract|ABSTRACT|Introduction|INTRODUCTION|Contents|CONTENTS|[Tt]his\s+(paper|memo|technical|article|document|report|dissertation))\b/s) { $header = $1; }
173
+ # elsif ($$papertext =~ /^(.*?)\n[\d\.\s]*(Reference|Bibliography)/si) { $header = $1; }
174
+ # else{
175
+ # return ('Header could not be extracted');
176
+ # }
177
+
178
+ if ((defined $header) && (length ($header) > $nMaxHeaderLength)) {
179
+ $header = substr ($header, 0, $nMaxHeaderLength) . '...';
180
+ }
181
+ if (length($header) < $nMinHeaderLength) {
182
+ return ('Header could not be extracted');
183
+ }
184
+ return ('',$header);
185
+ }
186
+
187
+
188
# Refresh each line's {Pretag} set from the current prediction fields.
#
# Argument: hashref keyed by line number; each entry may carry
#   {PClass}    -- "s" (single class) or "m" (multiple classes)
#   {PSClsName} -- the single predicted class (when PClass eq "s")
#   {PClsName}  -- hash of predicted classes (when PClass eq "m")
# Any previous {Pretag} is discarded, then rebuilt as a set of the
# currently predicted class name(s). Returns the same hashref.
#
# Fix: the original declared an empty prototype, `sub UpdatePretag()`,
# while taking an argument — any direct call UpdatePretag($h) was a
# compile-time error ("Too many arguments"). The prototype is removed;
# existing `&UpdatePretag(...)` callers bypassed prototypes and are
# unaffected.
sub UpdatePretag {
    my $testH = shift;

    # foreach my $testHea(sort {$a <=> $b} keys %{$testH}) {
    foreach my $LN (sort { $a <=> $b } keys %{$testH}) {
        # Drop any stale tag set before rebuilding it.
        delete($$testH{$LN}{Pretag});
        if ($$testH{$LN}{PClass} eq "s") {
            $$testH{$LN}{Pretag}{ $$testH{$LN}{PSClsName} } = 1;
        }
        elsif ($$testH{$LN}{PClass} eq "m") {
            foreach my $mytag (keys %{ $$testH{$LN}{PClsName} }) {
                $$testH{$LN}{Pretag}{$mytag} = 1;
            }
        }
    }
    # }
    return ($testH);
}
204
+
205
+
206
+ #input: the file with all Training and testing samples
207
+ #output: $HeaderH{$HeaNO}{$LineNO} = "";
208
# Read a tagged training/testing sample file into a nested hash.
#
# Arguments:
#   $simulateHeaNum -- stop once this many headers were seen (<= 0: read all)
#   $tagF           -- path of the tagged sample file
# Returns: ($HeaNO, \%HeaH) where $HeaNO is the last header number seen
#   (numbering starts at 1) and $HeaH{$headerNo}{$lineNo}{RawContent}
#   holds each trimmed content line.
#
# Fixes: 3-arg open on a lexical filehandle (was a 2-arg open on the
# bareword tagFH — a mode-injection hazard and a package-global handle);
# the misleading empty prototype `()` is removed (it rejected every
# direct call at compile time; `&`-style callers are unaffected).
sub HashAllHeader {
    my $simulateHeaNum = shift;
    my $tagF           = shift;
    my %HeaH   = ();
    my $HeaNO  = 1;   # header numbering starts from 1
    my $LineNO = 1;

    open(my $tagFH, '<', $tagF) || die "SVMHeaderParse: could not open tag file\: $tagF to read: $!";
    while (my $line = <$tagFH>) {
        $line =~ s/\+L\+//g;   # strip internal line-continuation markers
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;

        if ($line =~ /^\s*\<NEW\_HEADER\>/) {
            # Header boundary: advance header number, restart line count.
            $HeaNO++;
            $LineNO = 1;
        }
        elsif (($line =~ /^\s*$/) || ($line =~ /^\<(\/)*(\w+)\>$/)) {
            # Skip blank lines and lines holding only a tag like </author>.
            next;
        }
        else {
            $HeaH{$HeaNO}{$LineNO}{RawContent} = $line;
            $LineNO++;
        }

        # Optional early stop for simulation runs.
        if ($simulateHeaNum > 0 && $HeaNO >= $simulateHeaNum) {
            last;
        }
    }
    close($tagFH);
    return ($HeaNO, \%HeaH);
}
239
+
240
+
241
+ #HEADER_DID[1]
242
+ #TRECS: Developing a Web-based e-Commerce Business Simulation
243
+ #TRECS: Developing a Web-based
244
# Hash a raw header string into per-line records.
#
# Argument: a reference to the header text (lines separated by "\n").
# Returns:  \%HeaH where $HeaH{$lineNo}{RawContent} is the
#           whitespace-trimmed line; numbering starts at 1.
# Note: every split line gets an entry — including lines that trim to
# the empty string (blank-line filtering was deliberately commented out
# in the original and is preserved here).
sub HashEbizHeader() {
    my $headerRef = shift;
    my %parsed = ();
    my $lineNo = 1;

    # Walk the header line by line, trimming edges as we go.
    for my $text (split(/\n/, $$headerRef)) {
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
        $parsed{$lineNo}{RawContent} = $text;
        $lineNo++;
    }

    return (\%parsed);
}
271
+
272
+
273
# Build the baseline (per-line, context-free) training representation.
#
# Arguments:
#   $HeaderH      -- $HeaderH{$headerNo}{$lineNo}{...} training lines,
#                    each already carrying a {FeaVec} feature hash
#   $FeatureDictH -- feature dictionary to be filled/pruned
# Returns a 5-element list:
#   ($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH,
#    \%NameSpaceTrainVecH)
# where each line gains an SVM-light style {SVMFeaVec} string and
# %NameSpaceTrainVecH collects labelled name-pattern vectors.
#
# Relies on sibling subs &FormFeaDict / &PruneDict and the file-level
# $norm flag — not runnable in isolation.
sub BaseLineTrainSys() {
    my $HeaderH = shift;
    my $FeatureDictH = shift;

    my %InitialHash = ();
    $InitialHash{FeatureCounter} = 0;

    # Punctuation-separated author dictionary starts out empty here;
    # it is returned unchanged (only the counter is initialized).
    my $PuncAuthorDictH = \%InitialHash;
    my $SpaceAuthorDictH;
    # This is the place to generate the feature dictionary and name pattern dictionary.
    ($HeaderH, $FeatureDictH, $SpaceAuthorDictH) = &FormFeaDict($HeaderH, $FeatureDictH);
    # Prune features in Dictionary with DF < 3.
    $FeatureDictH = &PruneDict($FeatureDictH);

    # Prune features not in the pruned dict from each line's feature vector.
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {
                if (! $$FeatureDictH{$fea}{ID}) {
                    delete ($$HeaderH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            # NOTE(review): this compares a hashref against "" and is true
            # whenever {FeaVec} holds a reference — presumably meant as a
            # "vector present" check; confirm before changing.
            if ($$HeaderH{$HeaNO}{$line}{FeaVec} ne "") {
                my $tmpFeaVec = "";
                # Emit features in dictionary-ID order, as SVM-light expects.
                foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {

                    if ($norm) {
                        # Normalization: scale by the feature's max observed value.
                        $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }

                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} ";
                }
                $$HeaderH{$HeaNO}{$line}{SVMFeaVec} = "$tmpFeaVec";
            }

        }
    }

    my %NameSpaceTrainVecH = (); # a separate hash for later printing
    my $Lcount = 0;
    # Prune (and normalize) the space-separated name-pattern vectors accordingly.
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            if (exists $$HeaderH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}}) {
                    # Drop features absent from the space-author dictionary.
                    foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        if (! $$SpaceAuthorDictH{$fea}{ID}) {
                            delete($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }
                    }

                    # Normalization (same ref-vs-"" caveat as above).
                    if ($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} ne "") {
                        $Lcount++;
                        # Both vectors start with the pattern's class tag (label).
                        my $tmpFeaVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";
                        my $tmpTextVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";

                        foreach my $fea(sort{$$SpaceAuthorDictH{$a}{ID} <=> $$SpaceAuthorDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                            $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea}/$$SpaceAuthorDictH{$fea}{max});
                            $tmpFeaVec .= "$$SpaceAuthorDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                            $tmpTextVec .= "$fea\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                        }
                        $NameSpaceTrainVecH{$Lcount}{SpaceNameVec}=$tmpFeaVec;
                        $NameSpaceTrainVecH{$Lcount}{SpaceTextNameVec}=$tmpTextVec; # for debugging
                    }
                }
            }
        }
    }

    return($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH, \%NameSpaceTrainVecH);
}
347
+
348
# Add neighbor-line context tags to every header in the training set.
#
# Arguments: ($FeatureDictH, $HeaderH) — the feature dictionary and the
# nested training hash keyed by header number.
# Returns the (possibly updated) dictionary and header hash; the actual
# per-header work is delegated to &TrainAssignLineTag.
sub ContextTrainSys() {
    my $dictH    = shift;
    my $headersH = shift;

    # Process headers in ascending numeric order; each call may extend
    # the dictionary, so it is threaded through the loop.
    for my $headerNo (sort { $a <=> $b } keys %{$headersH}) {
        ($dictH, $$headersH{$headerNo}) = &TrainAssignLineTag($dictH, $$headersH{$headerNo});
    }

    return ($dictH, $headersH);
}
358
+
359
+ #this is to write all the testing lines into one file to speed up
360
# Classify every line of the test header with 15 one-vs-rest SVM
# classifiers (one per metadata class: title, author, email, ...).
#
# Arguments:
#   $testp        -- cross-validation fold id (selects the model file)
#   $nowLoop      -- context-iteration number (context mode only)
#   $baseline     -- truthy: baseline features; falsy: context features
#   $FeatureDictH -- feature dictionary (IDs and max values)
#   $HeaderH      -- per-line test hash; gains {PClass}/{PSClsName}/
#                    {PClsName}/{ClassifyResult} predictions
#   $tmpCacheVec  -- path prefix for the temporary test-vector files
#   $SVMTmpResult -- path prefix for the classifier output files
# Returns $HeaderH with predictions filled in.
#
# Depends on file-level $norm, $debug, $offlineD, $Classifier and on
# sibling subs &FormTestFeaVec / &TestAssignLineTag; shells out via
# system() and is therefore not testable in isolation.
sub LineClassify() {
    my ($testp, $nowLoop, $baseline, $FeatureDictH,
        $HeaderH, $tmpCacheVec, $SVMTmpResult) = @_;
    my %memoryH = ();
    my $GlobalLineNO = 0;

    # step1: collect all test data and write into one file
    # keep a hash to record the global lineNO and the header no / its local line no
    # here is the file for all the testing data

    # foreach my $testHea(sort {$a <=> $b} keys %{$HeaderH}) {
    if ($baseline) {
        # Filter feature vector by Feature Dictionary
        ### $$HeaderH{$testHea} = &FormTestFeaVec($FeatureDictH, $$HeaderH{$testHea});
        $HeaderH = &FormTestFeaVec($FeatureDictH, $HeaderH);
    }else {
        $HeaderH = &TestAssignLineTag($FeatureDictH, $HeaderH);
    }

    foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
        if (! $baseline) {
            # To make the iteration correct, initialize the line by removing all
            # the single and multiple class predictions left from a previous pass.
            delete($$HeaderH{$LN}{PClass});
            delete($$HeaderH{$LN}{PSClsName});
            delete($$HeaderH{$LN}{PClsName});
        }elsif ($baseline && ($$HeaderH{$LN}{FeaVec} ne "")) {
            # Modify the feature vector (normalization) and serialize it in
            # dictionary-ID order for SVM-light.
            my $tmpFeaVec = "";
            foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$LN}{FeaVec}}) {
                if (exists ($$FeatureDictH{$fea}{ID})) {

                    if ($norm) {
                        if ($debug) {
                            if ($$FeatureDictH{$fea}{max} == 0) {
                                print STDERR "fea $fea has max value 0! \n";
                            }
                        }
                        $$HeaderH{$LN}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$LN}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }

                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$LN}{FeaVec}{$fea} ";
                }
            }

            $$HeaderH{$LN}{SVMFeaVec} = "$tmpFeaVec";

            # Be careful here!! Lines with an empty vector are excluded from
            # the global numbering (they are skipped below via next).
            if ($$HeaderH{$LN}{SVMFeaVec} eq "") {
                if ($debug) {
                    # NOTE(review): $testHea comes from the commented-out outer
                    # loop above and is undefined here — this debug message
                    # prints empty header info; confirm intended.
                    print STDERR "header($testHea) -- Line($LN) has a null feature vector ($$HeaderH{$testHea}{$LN}{RawContent}) \n";
                }
                next;
            }
        }

        $GlobalLineNO++;
        # NOTE(review): $testHea is undefined (see commented-out loop above),
        # so {HeaNO} is always undef — harmless for the single-header case
        # but stale; verify before reusing %memoryH{...}{HeaNO}.
        $memoryH{$GlobalLineNO}{HeaNO} = $testHea;
        $memoryH{$GlobalLineNO}{LocalLineNO} = $LN;
    }
    # }

    # step2: write 15 files with labelled feature vectors, one per class.
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        open(testFH, ">$testF") || die "SVMHeaderParse: could not open $testF to write: $!";
        # foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
            my $tag = 1; # just to conform to the format
            if ($baseline) {
                print testFH "$tag $$HeaderH{$LN}{SVMFeaVec}\n";
            }else {
                print testFH "$tag $$HeaderH{$LN}{ContextSVMFeaVec}\n";
                #print "context feature vec is $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
            }
        }
        # } # end of collecting all the testing data into a file
        close(testFH);
    }

    # step3: SVM classify — run the external classifier once per class.
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $SVMModelF;
        my $printstr = "";
        if ($baseline) {
            $printstr = "baseline";
            $SVMModelF = "$offlineD"."$clsNO"."Model"."fold"."$testp";
        }else {
            $printstr = "context"."$nowLoop";
            $SVMModelF = "$offlineD"."$clsNO"."ContextModel"."fold"."$testp";
        }
        # print "$Classifier -v 0 $testF $SVMModelF $mySVMResult\n";
        # print "classification result from fold($testp)-class($clsNO)-$printstr\:\n";
        system("$Classifier -v 0 $testF $SVMModelF $mySVMResult");
    }

    # step4: read all the results into a hash.
    my %SVMResultHash = ();
    my %OrphanTagAssignHash = (); # This records the accuracy of assigned tags
    my %NegMeanH = (); # record the mean of the negative value each classifier made
    my %PosMinH = ();  # smallest positive score seen per classifier

    for my $clsNO(1 .. 15) {
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $myLineNO = 0;

        # Initialize %PosMinH's value (sentinel larger than any real score).
        $PosMinH{$clsNO} = 100;

        open(mySVMResultFH, "$mySVMResult") || die "SVMHeaderParse: could not open $mySVMResult to read: $!";
        while (my $myline = <mySVMResultFH>) {
            $myline =~ s/^\s+//g;
            $myline =~ s/\s+$//g;
            if ($myline !~ /^\s*$/) {
                $myLineNO++;
                if ($debug) {
                    print STDERR " current lineNo is $myLineNO and score for class $clsNO is $myline \n";
                }
                $SVMResultHash{$myLineNO}{$clsNO} = $myline;
                if ($myline < 0) {
                    $NegMeanH{$clsNO} += $myline;
                }else {
                    if ($PosMinH{$clsNO} > $myline) {
                        $PosMinH{$clsNO} = $myline;
                    }
                }
            }
        }

        if ($myLineNO < 1) {
            if ($debug) {
                print STDERR "yahoo: $mySVMResult has myLineNO 0 \n";
            }
        }else {
            # NOTE(review): divides the negative-score *sum* by the total line
            # count (not the negative-line count) — an approximation of the mean.
            $NegMeanH{$clsNO} = sprintf("%.8f", $NegMeanH{$clsNO}/$myLineNO);
        }

        close(mySVMResultFH);
    }

    my $PredTagbyMinNeg = 0;
    my $PredValbyMinNeg = 100;
    my $PredTagbyMinPos = 0;
    my $PredValbyMinPos = 100;

    # Analyze the results from the hash and fill the test hash ($HeaderH).
    for my $myline(1 .. $GlobalLineNO) {
        my @PredictTags = ();
        my $minVal = 100;
        my $CandidateTag = -1;
        my $myHeaNO = $memoryH{$myline}{HeaNO};
        my $myLineNO = $memoryH{$myline}{LocalLineNO};

        for my $clsNO(1 .. 15) {
            my $myresult = $SVMResultHash{$myline}{$clsNO};
            # Keep the classification results for multi-class line handling later.
            $$HeaderH{$myLineNO}{ClassifyResult}{$clsNO} = $myresult;
            if ($debug) {
                # NOTE(review): $result is never set (the score is in
                # $myresult) — this debug line prints an empty value.
                print STDERR "\t\t result by class $clsNO -- $result \n";
            }
            my $myRelDiv = 10;

            if ($myresult > 0) {
                # Positive margin: the classifier claims this class.
                push @PredictTags, $clsNO;
            }else {
                # Negative margin: remember the class whose score is closest
                # to the hyperplane, relative to that classifier's mean.
                $myRelDiv = sprintf("%.8f", $myresult/$NegMeanH{$clsNO});
                if ($myRelDiv < $minVal) {
                    $minVal = $myRelDiv;
                    $CandidateTag = $clsNO;
                }
                if ( (0 - $myresult) < $PredValbyMinNeg) {
                    $PredValbyMinNeg = -$myresult;
                    $PredTagbyMinNeg = $clsNO;
                }
                if (($PosMinH{$clsNO}- $myresult) < $PredValbyMinPos) {
                    $PredValbyMinPos = $PosMinH{$clsNO}- $myresult;
                    $PredTagbyMinPos = $clsNO;
                }
            }
        }
        # Assign ONLY the class nearest to the hyperplane to an orphan point
        # (a line no classifier claimed).
        if ($#PredictTags < 0) {
            push @PredictTags, $CandidateTag;
            $OrphanTagAssignHash{TotalLineNum}++;
        }

        # Fill the hash with the classification result.
        if ($#PredictTags eq 0) {
            $$HeaderH{$myLineNO}{PClass} = "s";
            $$HeaderH{$myLineNO}{PSClsName} = $PredictTags[0];
        }elsif ($#PredictTags > 0) {
            $$HeaderH{$myLineNO}{PClass} = "m";
            # the multi tags predicted in one line carry no sense of order
            for my $i(0 .. $#PredictTags) {
                $$HeaderH{$myLineNO}{PClsName}{$PredictTags[$i]} = 1;
                if ($debug) {
                    print STDERR "hea($myHeaNO)-- line($myLineNO) is classified as multi-class $PredictTags[$i] \n";
                }
            }
        }else { # impossible: an orphan tag was pushed above
            if ($debug) {
                print STDERR "hea($myHeaNO)-- line($myLineNO) is orphan\n";
            }
        }
    }
    return($HeaderH);
}
568
+
569
+
570
+ #this is to
571
+ #(1) populate the predicted items(done in the LineClassify)
572
+ #(2) Extract related information from multi-author line and multi-classline
573
+ #all information to be extracted comes from {Pchunk}
574
+ #all word distribution information comes from {Pline} word dist.;
575
+
576
# Turn per-line class predictions into tagged text chunks ({Pchunk}).
#
# (1) populates the predicted items (done in LineClassify),
# (2) extracts related information from multi-author lines and
#     multi-class lines.
# All information to be extracted comes from {Pchunk}; all word
# distribution information comes from {Pline} word dist.
#
# Arguments (in shift order):
#   $testp, $TestH, $PuncAuthorDictH, $SpaceAuthorDictH,
#   $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult
# Returns $TestH with $$TestH{$LN}{Pchunk}{...} filled in
# ({ChunkCounter} plus numbered chunks with {cls}/{content}).
#
# NOTE(review): the caller visible earlier in this file invokes
#   &InfoExtract($testp, $TestH, $SpaceAuthorFeatureDictH,
#                $PuncAuthorFeatureDictH, ...)
# i.e. Space before Punc, while the shifts below read Punc before
# Space — the two dictionaries appear swapped; confirm which order is
# intended.
# NOTE(review): $FeatureDictH is used in the chunking calls below but is
# never declared or passed in — it must resolve to a file-level variable
# (or is a latent bug); verify against the enclosing file.
sub InfoExtract() {
    my $testp = shift;
    my $TestH = shift;
    my $PuncAuthorDictH = shift;
    my $SpaceAuthorDictH = shift;
    my $SVMNameSpaceModel = shift;
    my $tmpCacheVec = shift;
    my $SVMTmpResult = shift;

    # foreach my $testHea(sort {$a <=> $b} keys %{$TestH}) {
    foreach my $LN(sort {$a <=> $b} keys %{$TestH}) {
        if ($$TestH{$LN}{'PClass'} eq "s") { # single class
            if ($$TestH{$LN}{PSClsName} ne '2') { # non-author single class: one chunk, whole line
                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = $$TestH{$LN}{PSClsName};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
            }else {
                if ($$TestH{$LN}{SClsWordCount} < 4) { # obvious single name
                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                }else { # multi-authors
                    my $Tline = $$TestH{$LN}{RawContent};
                    $Tline =~ s/<(\/)*author>//g;
                    if ($debug) {
                        print STDERR "predicted Multi-Author line -- $Tline \n";
                    }
                    my $NamePunc = 0;
                    # Judge whether this is a punctuated line or pure text-space:
                    # any non-letter/space/hyphen/dot/digit char, or the word "and".
                    if (($$TestH{$LN}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])/) || ($$TestH{$LN}{PureText} =~ /\band\b/i)) {
                        # multi-class needs while ... $punc++;
                        $NamePunc = 1;
                    }else {
                        $NamePunc = 0;
                    }

                    if ($NamePunc) {
                        # Heuristics-based separation based on features learned.
                        if (($$TestH{$LN}{PureText} =~ /Jr|Dr/) && ($$TestH{$LN}{SClsWordCount} <5)) {
                            # this is only one name
                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                        }else {
                            my $nameStr = $$TestH{$LN}{PureText};
                            $nameStr =~ s/^\s+//g;
                            $nameStr =~ s/\s+$//g;
                            # Split candidate names on comma, ampersand, or "and".
                            my @GuessedNames = split(/\,|\&|and/, $nameStr);
                            for my $i(0 .. $#GuessedNames) {
                                # chunk numbering starts from 1
                                $GuessedNames[$i] =~ s/^\s+//g;
                                $GuessedNames[$i] =~ s/\s+$//g;
                                if ($GuessedNames[$i] !~ /^\s*$/) {
                                    my @Nameparts = split(/\s+/, $GuessedNames[$i]);
                                    if ($#Nameparts < 3) {
                                        # Up to 3 words: accept as one author name.
                                        $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                        my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                    }else {
                                        # space separated names [name1 name2 name3 and name4]
                                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($GuessedNames[$i]);
                                        if ($#$PredictedNames < 1){
                                            # only 1/0 reasonable name pattern, take it
                                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                        }else { # several candidate patterns: classify to predict
                                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                                            my @names = split(/<>/, $BestNamePattern);
                                            for my $i(0 .. $#names) {
                                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }else {
                        # name Space: no punctuation — rely on name-pattern matching.
                        my $nameStr = $$TestH{$LN}{PureText};
                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                        if ($#$PredictedNames < 1){
                            # only 1/0 reasonable name pattern, take the parser-decided chunks
                            my $tmp_name_container = $$PredictedNames[0];
                            if ($#$tmp_name_container > 0) {
                                for my $kk(0 .. $#$tmp_name_container) {
                                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$tmp_name_container[$kk];
                                }
                            }else {
                                # this branch is original: whole line as one name
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $nameStr;
                            }
                        }else {
                            # classify to predict the best of several name patterns
                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                            my @names = split(/<>/, $BestNamePattern);
                            for my $i(0 .. $#names) {
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                            }
                        }
                    }
                }
            }
        # multiple class
        }elsif ($$TestH{$LN}{PClass} eq "m"){
            my (%TagH, $emailChunkH, $URLChunkH, @ArrayofHash);
            # get a hash of all predicted tags, plus a running counter
            foreach my $tag(keys %{$$TestH{$LN}{PClsName}}) {
                $TagH{counter}++;
                $TagH{$tag}++;
            }
            my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($$TestH{$LN}{PureText});
            # Preprocess -- extract email (class 6) and URL (class 12) out first
            if ($$TestH{$LN}{PClsName}{6}) {
                # component has holes of "-1" after extracting the email chunk out
                ($emailChunkH, $component) = &LocateEmailFromComponent($component);
                delete($TagH{6});
                $TagH{counter}--;
                push @ArrayofHash, $emailChunkH;
            }
            if ($$TestH{$LN}{PClsName}{12}) {
                ($URLChunkH, $component) = &LocateURLFromComponent($component);
                delete($TagH{12});
                $TagH{counter}--;
                push @ArrayofHash, $URLChunkH;
            }

            if($TagH{counter} <1){ # no additional class
                # exception: what if there is still text left ???????
                $$TestH{$LN} = &FillChunkH($$TestH{$LN},$component, \@ArrayofHash);
            # tag each word
            }elsif ($TagH{counter} == 1){
                # only one class left: everything unidentified gets that class
                my $lastTag = "";
                foreach my $tag(keys %TagH) {
                    if ($tag ne "counter") {
                        $lastTag = $tag;
                    }
                }
                # Get the remaining possible chunks separated by the email and URL
                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                # Tag every leftover chunk as the only remaining class
                foreach my $chunkNO(sort{$a<=>$b} keys %{$UnIdentifiedChunk}) {
                    $$UnIdentifiedChunk{$chunkNO}{cls} = $lastTag;
                }
                push @ArrayofHash, $UnIdentifiedChunk; # or \%myHash -- must be a reference
                # fill in the TestH chunk in an ordered way and tag each word
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            # two class module
            }elsif ($TagH{counter} == 2) {
                # needs mapping!
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if ($mytag ne "counter") {
                        push @TagsArray, $mytag;
                    }
                }

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};
                my $IdentifiedChunk;
                # continuous: a single unidentified run of words
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;
                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        # Re-base component words and separator indices to start at 0.
                        $offset = $chunk1start;
                        # adjust $component and $SepH
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    # Choose separator style by how much punctuation the line has.
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }
                    # adjust chunk positions back to absolute indices
                    if ($offset > 0) {
                        foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} == 2) { # discrete: two separate runs
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} > 2) { # discrete, unhandled
                    if ($debug) {
                        print STDERR "2 classes with 3+ chunks\n";
                    }
                }
            # see classes 3 and 4 (affiliation/address) as one class first
            }elsif (($TagH{counter} == 3) && $TagH{3} && $TagH{4}) {
                # tag array includes only 4 and the other (non-3/4) tag
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if (($mytag ne "3") && ($mytag ne "4") && ($mytag ne "counter")) {
                        push @TagsArray, $mytag;
                    }
                }
                push @TagsArray, 4;

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};

                my $IdentifiedChunk;
                my $startPos34 = 0;
                my $endPos34 = 0;
                # continuous
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;

                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        $offset = $chunk1start;
                        # adjust $component and $SepH (re-base to 0, as above)
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    # find the boundary between the 3/4 block and the other tag
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }

                    # adjust chunk positions back; record where the combined
                    # 3/4 block sits and remove it (it is re-split below)
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($offset > 0) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos}; # absolute pos
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;

                }else { # if 2 discrete chunks
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos};
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                }

                # find the boundary between 3 and 4 inside the combined block
                my $newComponent = (); # modified by Hui 03/19
                my $newSepH = $SepH;
                my $newPuncNum = 0;
                my $offset = $startPos34;
                for (my $tmpi=$startPos34; $tmpi<=$endPos34; $tmpi++) {
                    # modified by Hui 03/19/03 -$offset
                    $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                    if ($$newComponent[$tmpi-$offset] =~ /^\W+$/) {
                        $newPuncNum++;
                    }
                }

                # Re-base whichever separator map (punc/space) will be used.
                if ($newPuncNum > 1) {
                    foreach my $tmpSep(sort keys %{$$newSepH{punc}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{punc}{$newSep} = $$newSepH{punc}{$tmpSep};
                        }
                        delete($$newSepH{punc}{$tmpSep});
                    }
                }else {
                    foreach my $tmpSep(sort keys %{$$newSepH{space}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{space}{$newSep} = $$newSepH{space}{$tmpSep};
                        }
                        delete($$newSepH{space}{$tmpSep});
                    }
                }

                my @NewTagsArray = ();
                push @NewTagsArray, 3;
                push @NewTagsArray, 4;
                if ($newPuncNum > 1) {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }else {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }

                # adjust chunk positions back to absolute indices
                if ($offset > 0) {
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                        $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                    }
                }
                push @ArrayofHash, $IdentifiedChunk;
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }elsif ($TagH{counter} > 2) { # 3+ classes: unhandled
                # consider about 3 discrete chunks for 3 tags????
                if ($debug) {
                    print STDERR "do not care yet -- here is the case for 3+ classes after preprocessing \n";
                    # find the most likely position and expand to around some (like 3) words
                }
            }
        }
    }
    # }

    return($TestH);
}
941
+
942
+
943
# Dump the predicted chunk classification of every line to "output.txt"
# for manual inspection (one "lineno(...)" record per line, one indented
# "chunk(...)" record per classified chunk).
#
# Argument: $TestH — hashref keyed by line number; each line may carry
#   {Pchunk}{$n}{cls}/{content} plus the {ChunkCounter} bookkeeping key,
#   which is skipped here.
#
# Fixes: 3-arg open on a lexical handle (was a 2-arg open on the bareword
# WRITER); close is now checked so buffered write failures surface; the
# misleading empty prototype `()` is removed (it broke direct calls).
sub ExportInfo {
    my $TestH = shift;
    # NOTE(review): $testHea is a leftover from a commented-out per-header
    # loop and is always undef here, so the first record reads
    # "headerno() -- "; declared with `our` to keep referring to the same
    # package global without tripping strictures. Confirm whether the
    # field should be dropped.
    our $testHea;
    my $outF = "output.txt";
    open(my $writer, '>', $outF) || die "SVMHeaderParse: could not open $outF to write: $!";
    # foreach my $testHea(sort {$a <=> $b} keys %{$TestH}) {
    print {$writer} "headerno($testHea) -- ";
    foreach my $LN (sort { $a <=> $b } keys %{$TestH}) {
        print {$writer} "lineno($LN)\: \n ";
        foreach my $chunk (sort { $a <=> $b } keys %{$$TestH{$LN}{Pchunk}}) {
            # {ChunkCounter} is bookkeeping, not a chunk record.
            if ($chunk ne "ChunkCounter") {
                print {$writer} "\t chunk($chunk) -- class($$TestH{$LN}{Pchunk}{$chunk}{cls} <> content($$TestH{$LN}{Pchunk}{$chunk}{content} \n";
            }
        }
    }
    # }
    close($writer) || die "SVMHeaderParse: could not close $outF: $!";
}
960
+
961
+
962
# Serialize every classified chunk as "<tag>content</tag>" lines and hand
# the result to the XML assembler.
#
# Argument: $TestH — hashref of lines with {Pchunk} chunk records whose
# numeric {cls} is mapped to a tag name via the file-level %InverseTagMap.
# Chunks without any word character in {content} are skipped.
# Returns the assembled XML; the package variable $rXML is also set, as
# in the original (callers elsewhere read it).
sub ExportRDF() {
    my $TestH = shift;
    my $tagged = '';

    for my $lineNo (sort { $a <=> $b } keys %{$TestH}) {
        for my $chunkNo (sort { $a <=> $b } keys %{$$TestH{$lineNo}{Pchunk}}) {
            my $tagName = $InverseTagMap{ $$TestH{$lineNo}{Pchunk}{$chunkNo}{cls} };
            my $text    = $$TestH{$lineNo}{Pchunk}{$chunkNo}{content};
            # Only emit chunks that carry actual word content.
            next unless $text =~ /\w+/;
            $tagged .= "<$tagName>$text</$tagName>\n";
            # if($tagName =~/(url|note|date|abstract|intro|keyword|web|degree|pubnum|page)/){
            #   ... per-tag RDF emission, disabled in the original ...
            # }
        }
    }

    # print "RDF:\n\n $tagged\n";
    $rXML = &HeaderParse::API::AssembleXMLMetadata::assemble(\$tagged);
    return $rXML;
}
984
+
985
+
986
+ #Basic function: popuate information from line -- feature vector and class assignment and name patterns.
987
+ #no dictionary would be formed here
988
+ sub PopulateLineInfo4Header_unit() {
989
+ my $HeaderH = shift;
990
+ my %curState = ();
991
+
992
+ foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
993
+ my $PureTextLine = $$HeaderH{$line}{RawContent};
994
+ $PureTextLine =~ s/(\<)*\<(\/)*(\w+)\>(\>)*/ /g; # remove the tags
995
+ $PureTextLine =~ s/\+L\+//g;
996
+ $PureTextLine =~ s/^\s+//g;
997
+ $PureTextLine =~ s/\s+$//g;
998
+ #should make punctuation separate!
999
+ $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);
1000
+ $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
1001
+ #add the position of the line here!!!!
1002
+ $$HeaderH{$line}{FeaVec}{Clinepos} = $line;
1003
+ my $textFeaVec = "";
1004
+ foreach my $fea(keys %{$$HeaderH{$line}{FeaVec}}) {
1005
+ if($$HeaderH{$line}{FeaVec}{$fea} == 0) {
1006
+ delete ($$HeaderH{$line}{FeaVec}{$fea});
1007
+ }else {
1008
+ $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
1009
+ }
1010
+ }
1011
+ $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug
1012
+
1013
+ #assign class tag to each line -- not separator <<sep>><</sep>> here
1014
+ if ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/) {
1015
+ %curState = ();
1016
+ my $tmpIndex = 0; # the order of this tag showed up last time
1017
+ my $preTag = -1;
1018
+ my $mul = 0;
1019
+ while ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/g) {
1020
+ $tmpIndex++;
1021
+ my $tmptag = $4;
1022
+ if (($preTag > 0) && ($preTag ne $tagMap{$tmptag})) {
1023
+ $mul = 1;
1024
+ }
1025
+ $curState{$tagMap{$tmptag}} = $tmpIndex;
1026
+ $preTag = $tagMap{$tmptag};
1027
+ }
1028
+
1029
+ if ($mul) {
1030
+ $$HeaderH{$line}{TClass} = "m";
1031
+ my $order = 1;
1032
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1033
+ $$HeaderH{$line}{MClsName}{$tag} = $order;
1034
+ $order++;
1035
+ }
1036
+
1037
+ #represent the class distribution only for this multi-class case.
1038
+ my $Tline = $$HeaderH{$line}{RawContent};
1039
+ #main purpose is to combine </phone><email> as one <s>
1040
+ $Tline =~ s/\<(\/)*(\w+)\>/<s>/g; #replace the tags with <s>
1041
+ $Tline =~ s/^\s*<s>\s*//g;
1042
+ $Tline =~ s/\s*<s>\s*$//g;
1043
+ $Tline =~ s/<s>\s*<s>/<s>/g;
1044
+ $Tline =~ s/\s+/ /g;
1045
+
1046
+ $Tline = &SeparatePunc($Tline);
1047
+
1048
+ while ($Tline =~ /(\s+(\W+)\s+<s>)/g) {
1049
+ my $whole = $1;
1050
+ my $punc = $2;
1051
+ $punc =~ s/^\s+//g;
1052
+ $punc =~ s/\s+$//g;
1053
+
1054
+ if ($punc eq "\|") {
1055
+ $Tline =~ s/\|/\!\!\!/g;
1056
+ $whole =~ s/\|/\!\!\!/g;
1057
+ }
1058
+ $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
1059
+ if ($punc eq "\|") {
1060
+ $Tline =~ s/\!\!\!//g;
1061
+ $whole =~ s/\!\!\!//g;
1062
+ }
1063
+ }
1064
+ while ($Tline =~ /(<s>\s+(\W+)\s+)/g) {
1065
+ my $whole = $1;
1066
+ my $punc = $2;
1067
+ $punc =~ s/^\s+//g;
1068
+ $punc =~ s/\s+$//g;
1069
+ if ($punc eq "\|") {
1070
+ $Tline =~ s/\|/\!\!\!/g;
1071
+ $whole =~ s/\|/\!\!\!/g;
1072
+ }
1073
+ $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
1074
+ if ($punc eq "\|") {
1075
+ $Tline =~ s/\!\!\!/\|/g;
1076
+ $whole =~ s/\!\!\!/\|/g;
1077
+ }
1078
+ }
1079
+ $Tline =~ s/<s>/<<sep>><<\/sep>>/g;
1080
+ my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
1081
+ #Populate Truth Hash by the chunk and word-class distribution
1082
+ $$HeaderH{$line} = &AssignWordTagFromChunk($$HeaderH{$line}, $SepH, $component);
1083
+ }else {
1084
+ $$HeaderH{$line}{TClass} = "s";
1085
+ my @Tarr = split(/\s+/, $PureTextLine);
1086
+ $$HeaderH{$line}{SClsWordCount} = $#Tarr +1;
1087
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1088
+ $$HeaderH{$line}{SClsName} = $tag;
1089
+ }
1090
+
1091
+ #Fill in the word-class distribution for single class line
1092
+ my $lineContent = &SeparatePunc($$HeaderH{$line}{PureText});
1093
+ my @wordArray = split(/\s+/, $lineContent);
1094
+ undef $lineContent;
1095
+
1096
+ $$HeaderH{$line} = &AssignWordTag4SingleClassLine("truth", $$HeaderH{$line}{SClsName}, $$HeaderH{$line}, \@wordArray);
1097
+
1098
+ #but only multi-author has multiple chunks
1099
+ #all reasonable name patterns for space separated names
1100
+ #feature vec for each space namepatterns and puncutation separators
1101
+ #Test/prediction will base on the predicted line tag in another module
1102
+
1103
+ #single author
1104
+ if ($$HeaderH{$line}{SClsName} eq "2") {
1105
+ #From Truth
1106
+ if ($$HeaderH{$line}{RawContent} !~ /<<sep>>/) {
1107
+ #could we save space by indicating the pure text directly
1108
+ $$HeaderH{$line}{Tchunk}{$i}{cls} = 2;
1109
+ $$HeaderH{$line}{Tchunk}{$i}{content} = $$HeaderH{$line}{PureText};
1110
+ #multiple authors
1111
+ }else {
1112
+ my $Tline = $$HeaderH{$line}{RawContent};
1113
+ $Tline =~ s/<(\/)*author>//g;
1114
+
1115
+ my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
1116
+ my $nameStr = join(" ", @$component);
1117
+
1118
+ #judge this is punctuated line or pure text-space
1119
+ if ($$HeaderH{$line}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])|(\W+and\W+)/ig) {
1120
+ #multi-class needs while ... $punc++;
1121
+ $$HeaderH{$line}{NamePunc} = 1;
1122
+ }else {
1123
+ $$HeaderH{$line}{NameSpace} = 1;
1124
+ }
1125
+
1126
+ #{NamePuncFeaVec} and {NameSpaceFeaVec} based on number of puncs (>2)
1127
+ #{MulClsPuncFeaVec}
1128
+
1129
+ ######common to both name space and name punc ######
1130
+ my $TrueNames = &HeaderParse::API::NamePatternMatch::GetTrueName($nameStr);
1131
+ for my $i(0 .. $#$TrueNames) {
1132
+ my $j = $i+1; #chunk should start from 1
1133
+ $$HeaderH{$line}{Tchunk}{$j}{cls} = 2;
1134
+ $$HeaderH{$line}{Tchunk}{$j}{content} = "$$TrueNames[$i]";
1135
+ }
1136
+ ################################################
1137
+
1138
+ if ($$HeaderH{$line}{NamePunc}) {
1139
+ }else {
1140
+ my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
1141
+ if ($#$PredictedNames < 1) {
1142
+ #only one pattern -- do not fill name pattern
1143
+ }else {
1144
+ my $TrueIndex = &HeaderParse::API::NamePatternMatch::Duplicate($TrueNames, $PredictedNames);
1145
+ #must solve the problem
1146
+ if ($TrueIndex eq "-1") {
1147
+ if ($debug) {
1148
+ print STDERR "here the true name($TrueNames) is null from the line $content \n";
1149
+ }
1150
+ }else {
1151
+ #populate all reasonable name patterns
1152
+ for my $i(0 .. $#$PredictedNames) {
1153
+ my $candidateName = "";
1154
+ for my $j(0 .. $#{$$PredictedNames[$i]}) {
1155
+ if ($$PredictedNames[$i][$j]) {
1156
+ $candidateName .= "$$PredictedNames[$i][$j]<>";
1157
+ }
1158
+ }
1159
+ # print "candidate name\: $candidateName ";
1160
+ $$HeaderH{$line}{NamePattern}{$candidateName}{content} = $candidateName;
1161
+ ($$HeaderH{$line}{NamePattern}{$candidateName}{SpaceNameVec}) = &SpaceNameLnFeaRepre_unit($candidateName);
1162
+ if ($i eq $TrueIndex) {
1163
+ $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = 1;
1164
+ }else {
1165
+ $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = -1;
1166
+ }
1167
+ }
1168
+ }
1169
+ }
1170
+ }
1171
+ }
1172
+ }
1173
+ }
1174
+ }else { #if there is no explicit tag for this line, this line only belongs to the last class of the previous line
1175
+ my $tmpI = 0;
1176
+ foreach my $state (sort {$curState{$b} <=> $curState{$a}} keys %curState) {
1177
+ if ($tmpI > 0) {
1178
+ delete ($curState{$state});
1179
+ } #only keep the last tag
1180
+ $tmpI++;
1181
+ }
1182
+ $$HeaderH{$line}{TClass} = "s";
1183
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1184
+ $$HeaderH{$line}{SClsName} = $tag;
1185
+ }
1186
+ }
1187
+ }
1188
+
1189
+ return($HeaderH);
1190
+ }
1191
+
1192
+
1193
# Build feature vectors for header lines whose classes are not yet known.
# For each line: trim surrounding whitespace, separate punctuation, count
# words, compute the per-line feature vector, record the line position as a
# feature, drop zero-valued features, and keep a readable text rendering.
sub VectorizeUnknownHeaderLine () {
    my $HeaderH = shift;

    my %curState = ();    # retained for parity with the training variant (unused)
    for my $lineNo (sort { $a <=> $b } keys %$HeaderH) {
        my $rec = $$HeaderH{$lineNo};

        (my $trimmed = $rec->{RawContent}) =~ s/^\s+//g;
        $trimmed =~ s/\s+$//g;
        # punctuation must be split into stand-alone tokens before vectorizing
        $rec->{PureText} = &SeparatePunc($trimmed);

        my @words = split(/\s+/, $trimmed);
        $rec->{SClsWordCount} = scalar(@words);

        $rec->{FeaVec} = &LineFeatureRepre($rec->{PureText});
        # the line's position within the header is itself a feature
        $rec->{FeaVec}{Clinepos} = $lineNo;

        my $readable = "";
        for my $fea (keys %{ $rec->{FeaVec} }) {
            my $wt = $rec->{FeaVec}{$fea};
            if ($wt == 0) {
                delete $rec->{FeaVec}{$fea};   # zero features carry no signal
            }
            else {
                $readable .= "$fea($wt) ";
            }
        }
        $rec->{TextFeaVec} = $readable;        # human-readable, for debugging
    }

    return ($HeaderH);
}
1228
+
1229
+
1230
#training data are assigned the true neighbour lines' tag
# For each header line, build "context features" from the TRUE class tags of
# up to four preceding lines (features "P1<tag>".."P4<tag>") and up to four
# following lines ("N1<tag>".."N4<tag>").  A context feature is registered in
# the feature dictionary on first sight with a fixed weight/max of 0.5, and
# its document frequency (DF) is incremented on every use.  Finally the
# line's base {SVMFeaVec} string is extended with the context features
# (ordered by dictionary ID) into {ContextSVMFeaVec}.
# Returns the (mutated) feature dictionary and header hash.
sub TrainAssignLineTag() {
    my $FeatureDictH = shift;
    my $HeaderH = shift;
    my %curState = ();   # NOTE(review): unused in this sub

    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PC = 1; # 0 means the tag for current line (which might be useful)
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { #previous line
            # stop at the first previous line that has no true class tag
            if (exists $$HeaderH{$Pline}{TClass}) {
                if ($$HeaderH{$Pline}{TClass} eq "s") {
                    # single-class neighbour: one feature "P<distance><class>"
                    my $ContextFea = "P"."$PC"."$$HeaderH{$Pline}{SClsName}";
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        # first sighting: assign the next dictionary ID
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }

                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }else { # consider the order of the tag
                    # multi-class neighbour: one feature per tag, in tag order
                    foreach my $tag(sort {$$HeaderH{$Pline}{MClsName}{$a} <=> $$HeaderH{$Pline}{MClsName}{$b}} keys %{$$HeaderH{$Pline}{MClsName}}){
                        my $ContextFea = "P"."$PC"."$tag";
                        if (! $$FeatureDictH{$ContextFea}{ID}) {
                            $$FeatureDictH{FeatureCounter}++;
                            $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                            $$FeatureDictH{$ContextFea}{max} = 0.5;
                        }
                        if ($$FeatureDictH{$ContextFea}{ID}) {
                            $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                            $$FeatureDictH{$ContextFea}{DF}++;
                        }
                    }
                }
                $PC++;
                $Pline = $line - $PC;
            }else {
                last;
            }
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        # NOTE(review): unlike the previous-line loop, this one does not stop
        # when {TClass} is missing — it walks all existing following lines.
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { #next line
            if ($$HeaderH{$Nline}{TClass} eq "s") {
                my $ContextFea = "N"."$NC"."$$HeaderH{$Nline}{SClsName}";
                if (! $$FeatureDictH{$ContextFea}{ID}) {
                    $$FeatureDictH{FeatureCounter}++;
                    $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                    $$FeatureDictH{$ContextFea}{max} = 0.5;
                }
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    $$FeatureDictH{$ContextFea}{DF}++;
                }
            }else { # consider the order of the tag
                foreach my $tag(sort {$$HeaderH{$Nline}{MClsName}{$a} <=> $$HeaderH{$Nline}{MClsName}{$b}} keys %{$$HeaderH{$Nline}{MClsName}}){
                    my $ContextFea = "N"."$NC"."$tag";
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        #assemble features and their weight into string without normalization
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};
        foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        # NOTE(review): unlike TestAssignLineTag, the trailing blank is kept here
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
1317
+
1318
# Attach context features to each line of an unlabeled header, based on the
# PREDICTED tags ({Pretag}) of up to four preceding (P1..P4) and four
# following (N1..N4) lines.  At test time only features already present in
# the training dictionary are used; the dictionary itself is not modified.
# Each line's base {SVMFeaVec} is extended (ordered by dictionary ID) into
# {ContextSVMFeaVec}, with trailing whitespace stripped.
# Returns the feature dictionary and the mutated header hash.
sub TestAssignLineTag() {
    my $FeatureDictH = shift;
    my $HeaderH = shift;
    my %curState = ();

    # BUG FIX: $line was an undeclared package global ("foreach $line"),
    # which leaks state and fails under "use strict".
    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        # start from a clean slate: drop context features from a previous pass
        if(exists ($$HeaderH{$line}{ContextFeaVec})) {
            delete($$HeaderH{$line}{ContextFeaVec});
        }

        my $PC = 1; # 0 means the tag for current line (which might be useful)
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { #previous line
            if (exists $$HeaderH{$Pline}{Pretag}) {
                foreach my $tag(sort keys %{$$HeaderH{$Pline}{Pretag}}){
                    my $ContextFea = "P"."$PC"."$tag";
                    # only dictionary features are usable at test time
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    }
                }
            }
            $PC++;
            $Pline = $line - $PC;
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { #next line
            foreach my $tag(sort keys %{$$HeaderH{$Nline}{Pretag}}){
                my $ContextFea = "N"."$NC"."$tag";
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        #assemble features and their weight into string without normalization
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};

        foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        $tmpFeaVec =~ s/\s+$//g;
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
1370
+
1371
#given a line, check the number and the position of punctuation/space it contains
# Returns a 3-tuple:
#   $PuncNum     - count of punctuation separators found,
#   \%SeparatorH - {punc}{pos} / {space}{pos} maps; value 2 marks an explicit
#                  <<sep>>..<</sep>> separator, 1 a plain one,
#   \@component  - the line tokenized on whitespace (separator markers kept).
sub GetSeparatorIndex() {
    my $line = shift;
    my %SeparatorH = ();

    my $PuncNum = 0;
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    #punc means this line contains punc or only space
    #each space occupies a position and punctuations are separate
    # NOTE(review): $punc is not used below; FillSpace is defined elsewhere
    # in this module.
    my ($punc, $spaceLine) = &FillSpace($line);

    #punctuation is specific; space separator contains punctuation separators.
    my @component = split(/\s+/, $spaceLine);
    foreach my $i(0 .. $#component) {
        if ($component[$i] =~ /<<sep>>(\W+|\s*)<<\/sep>>/) {
            # explicit separator marker: keep only its payload
            $component[$i] = $1;
            if ($component[$i] eq "") {
                # empty payload: a pure space separator; restore the marker
                $component[$i] = "<<sep>><<\/sep>>";
                $SeparatorH{space}{$i} = 2;
            }else {
                # punctuation payload: counts as both punc and space separator
                $SeparatorH{punc}{$i} = 2;
                $PuncNum++;
                $SeparatorH{space}{$i} = 2;
            }
        }elsif ($component[$i] =~ /<space>/) {
            # implicit space separator token
            $SeparatorH{space}{$i} = 1;
        }elsif ($component[$i] =~ /^[^\p{IsLower}\p{IsUpper}\s+\-\d+]+$/) {
            # token made only of characters that are not letters, whitespace,
            # '+', '-' or digits: treat it as punctuation
            $SeparatorH{punc}{$i} = 1; #position(not what punc)
            $PuncNum++;
            $SeparatorH{space}{$i} = 1;
        }
    }
    return($PuncNum, \%SeparatorH, \@component);
}
1407
+
1408
+
1409
#multi-Authors line still has only one class, although 1+ authors
# Split a multi-class line into chunks at the separator positions recorded in
# $SepH, assigning the line's ordered class tags to successive chunks and to
# each word position.
#
# $LineH     - per-line hash; {MClsName}{tag} gives each tag's display order
# $SepH      - {space}{pos} separator map from GetSeparatorIndex (value > 1
#              marks an explicit <<sep>> chunk boundary)
# $component - tokenized line (array ref)
# Returns $LineH with {Tchunk}{n}{cls|content} and {Tline}{pos}{cls|OriginalWord}.
sub AssignWordTagFromChunk() {
    my ($LineH, $SepH, $component) = @_;

    # class tags in their order of appearance on the line
    my @tags = ();
    foreach my $tag(sort {$$LineH{MClsName}{$a} <=> $$LineH{MClsName}{$b}} keys %{$$LineH{MClsName}}) {
        push @tags, $tag;
    }

    my $ChunkNO = 1;
    # BUG FIX: was $tags[$tagP] with $tagP never declared (undef index, i.e.
    # index 0 only by accident, and a strict-vars failure). The first chunk
    # takes the first tag explicitly.
    my $curTag = $tags[0];
    my $WordPos = 1;
    my $chunk = "";
    for my $i(0 .. $#$component) {
        #we do not assign class to separators
        if ($$SepH{space}{$i} > 1) {
            # explicit chunk boundary: flush the accumulated chunk
            if ($chunk ne "") {
                $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
                $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;
                $chunk = "";
                $curTag = $tags[$ChunkNO];   # next chunk gets the next tag
                $ChunkNO++;
            }
        }elsif ($$component[$i] !~ /<space>|^\W+$|\<\<.*\>\>/) { # such as <<sep>> <</sep>>
            $chunk .= "$$component[$i] ";
            $$LineH{Tline}{$WordPos}{cls} = $curTag;
            $$LineH{Tline}{$WordPos}{OriginalWord} = $$component[$i];
            $WordPos++;
        }
    }

    #Fill in the last chunk (lines do not end with a separator)
    $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
    $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;

    return ($LineH);
}
1445
+
1446
+
1447
# Tag every real word of a single-class line with $curTag, recording words by
# their 1-based position.  "truth" fills {Tline}, "predict" fills {Pline};
# separator tokens (<space>, pure punctuation, <<..>> markup) are skipped but
# any other token advances the position counter.
sub AssignWordTag4SingleClassLine() {
    my ($type, $curTag, $LineH, $component) = @_;

    my $pos = 1;
    for my $token (@$component) {
        # skip separators: <space> markers, punctuation-only tokens,
        # and <<sep>>-style markup
        next if $token =~ /<space>|^\W+$|\<\<.*\>\>/;

        my $slot = ($type eq "truth")   ? "Tline"
                 : ($type eq "predict") ? "Pline"
                 :                        undef;
        if (defined $slot) {
            $$LineH{$slot}{$pos}{cls} = $curTag;
            # keep the original word at this position (added 01/08)
            $$LineH{$slot}{$pos}{OriginalWord} = $token;
        }
        $pos++;   # position advances even for an unknown $type
    }

    return ($LineH);
}
1467
+
1468
+
1469
# Read the first line of an SVM classifier result file and return it with
# trailing whitespace (including the newline) removed.
sub Analyze() {
    my $resultF = shift;
    # three-argument open with a lexical handle; the original 2-arg bareword
    # open was unsafe with attacker-influenced paths and leaked a global handle
    open(my $resultFH, '<', $resultF) || die "SVMHeaderParse: could not open $resultF to read: $!";
    my $result = <$resultFH>;
    close($resultFH);
    $result =~ s/\s+$//g;
    return($result);
}
1477
+
1478
+
1479
# Load a feature dictionary from a "<>"-delimited file with records of the
# form:  ID<>feature<>max<>DF
# The special feature "FeatureCounter" stores the feature count in its ID
# column and is kept under {FeatureCounter}{num}.
# Returns a hashref: {feature}{ID|max|DF}.
sub ReadFeatureDict() {
    my $Fname = shift;
    my %FeatureDictH;

    # 3-arg lexical open (was a 2-arg bareword open)
    open (my $FH, '<', $Fname) || die "SVMHeaderParse: could not open $Fname to read: $!";
    while (my $line = <$FH>) {
        my ($ID, $fea, $max, $DF) = split(/<>/, $line);
        $ID =~ s/^\s+//g;
        $ID =~ s/\s+$//g;

        # the counter record carries the total in the ID column
        if ($fea =~ /FeatureCounter/) {
            $FeatureDictH{$fea}{num} = $ID;
            next;
        }

        $fea =~ s/^\s+//g;
        $fea =~ s/\s+$//g;
        $max =~ s/^\s+//g;
        $max =~ s/\s+$//g;
        $DF =~ s/^\s+//g;
        # BUG FIX: was "$Df =~ s/\s+$//g;" (typo'd variable), so the trailing
        # newline of each record leaked into the stored DF value
        $DF =~ s/\s+$//g;
        $FeatureDictH{$fea}{ID} = $ID;
        $FeatureDictH{$fea}{max} = $max;
        $FeatureDictH{$fea}{DF} = $DF;
    }
    close($FH);
    return(\%FeatureDictH);
}
1507
+
1508
+
1509
# Write one-vs-rest SVM training files, one per class (1..15), into the
# directory named by the package global $offlineD.  Each line of a class file
# is "+1 <vec>" when the header line belongs to that class (single-class
# {SClsName} match or multi-class {MClsName} membership), else "-1 <vec>".
# $affix selects which vector is written: "train" -> {SVMFeaVec},
# "context" -> {ContextSVMFeaVec}; the affix is also the file extension.
sub printTrainData() {
    my $affix = shift;
    my $HeaderH = shift;   # {headerNo}{lineNo}{...} nested hash

    #Sometimes $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} is not null, but
    #$$HeaderH{$HeaNO}{$LN}{SVMFeaVec} is null so they have different file length!
    for my $clsNO(1 .. 15) {
        # NOTE(review): $offlineD is a package global defined elsewhere;
        # bareword FH and 2-arg open kept as-is.
        my $F = "$offlineD"."$clsNO"."\."."$affix";
        open(FH, ">$F") || die "SVMHeaderParse: could not open $F to write: $!";
        foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
            foreach my $LN(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
                # lines with an empty base vector are skipped for BOTH affixes
                # (this is what keeps the train/context files the same length)
                if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                    if ($affix eq "train") {
                        # NOTE(review): this inner check duplicates the outer one
                        if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                            if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                                print FH "1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }else {
                                print FH "-1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }
                        }
                    }elsif ($affix eq "context") {
                        #if ($$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} ne "") {
                        if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                            print FH "1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }else {
                            print FH "-1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }
                    }else {
                        # unknown affix: diagnostic only ("weired" sic — runtime
                        # string left unchanged)
                        print "weired -- $affix is not context nor train \n";
                    }
                }
            }
        }
        close(FH);
    }
}
1545
+
1546
+
1547
# Dump the space-separated-name training vectors to $printF, one {SpaceNameVec}
# per line, in ascending numeric key order.
sub printNameSpaceTrainData(){
    my $printF = shift;
    my $NameSpaceTrainVecH = shift;

    open(my $outFH, ">$printF") || die "SVMHeaderParse: could not open $printF to write: $!";
    my @order = sort { $a <=> $b } keys %$NameSpaceTrainVecH;
    for my $Lcount (@order) {
        print $outFH $$NameSpaceTrainVecH{$Lcount}{SpaceNameVec}, "\n";
    }
    close($outFH);
}
1557
+
1558
+
1559
# Build the feature representation for a "<>"-delimited name-pattern string.
# Each name part yields features for its surface form (via RichNameType),
# its position within the name (Last / SecLast / index), and first/last-name
# dictionary membership (package globals %firstnameH, %lastnameH, %dictH).
#
# $type "train": symbolic feature values are interned into $NameDictH (IDs
#   allocated via {FeatureCounter}), zero-weight features dropped, per-feature
#   {max} tracked; returns (\%FeatureH, $NameDictH).
# otherwise (test): features are mapped through the existing dictionary,
#   normalized by {max}, and returned as two strings
#   ($SpaceNameFeaVec "ID:wt ...", $SpaceNameTextFeaVec "name:wt ...").
sub SpaceNameLnFeaRepre() {
    my $type = shift;
    my $NamePatternStr = shift;
    my $NameDictH = shift;

    #feature generation and representation
    #It is good to make each of the apple's feature(color, shape..) separate.
    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; #remove the last <>
    my @Names = split(/<>/, $NamePatternStr);

    #try making features binary
    for my $i(0 .. $#Names) {
        my @NameComponent = split(/\s+/, $Names[$i]);
        for my $j(0 .. $#NameComponent){

            #feature generation($i = 0 is the first one)
            $FeatureH{"Name"."$i"."part"."$j"."form"} = &HeaderParse::API::NamePatternMatch::RichNameType($NameComponent[$j]);
            # position of the part inside the name
            if ($j eq $#NameComponent) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "Last";
            }elsif ($j eq $#NameComponent -1) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "SecLast";
            }else {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = $j;
            }

            #firstname, lastname information
            # (exclusive membership: FN only if not also a last name, etc.)
            if (($firstnameH{lc($NameComponent[$j])}) && (!$lastnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."FN"} = 1;
            }elsif (($lastnameH{lc($NameComponent[$j])}) && (!$firstnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."LN"} = 1;
            }elsif (! $dictH{lc($NameComponent[$j])}) {
                # not a common dictionary word at all
                $FeatureH{"Name"."$i"."part"."$j"."NonDict"} = 1;
            }

            #space for more features
        }
    }

    #Build up FeatureVec
    #code for the attribute ID separately so that the ID for features would be continuous
    if ($type eq "train") {
        foreach my $fea(sort {$a <=> $b} keys %FeatureH) {
            if (! $$NameDictH{$fea}{ID}) {
                # first sighting of this feature: allocate the next ID
                $$NameDictH{FeatureCounter}++;
                $$NameDictH{$fea}{ID} = $$NameDictH{FeatureCounter};
            }

            # symbolic values ("Last", form names, ...) are interned as IDs too
            if (! IsNumber($FeatureH{$fea})) {
                if (! exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $$NameDictH{FeatureCounter}++;
                    $$NameDictH{$FeatureH{$fea}}{ID} = $$NameDictH{FeatureCounter};
                }
                $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
            }

            if ($FeatureH{$fea} == 0) {
                delete($FeatureH{$fea});
            }else {
                # track per-feature maximum for later normalization
                if ((! exists $$NameDictH{$fea}{max}) || ($$NameDictH{$fea}{max} < $FeatureH{$fea})) {
                    $$NameDictH{$fea}{max} = $FeatureH{$fea};
                }
            }
        }
        return(\%FeatureH, $NameDictH);
    #test
    }else {
        my $SpaceNameFeaVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$NameDictH{$a}{ID} <=> $$NameDictH{$b}{ID}} keys %FeatureH) {
            if (! &IsNumber($FeatureH{$fea})) {
                # map symbolic values through the training dictionary;
                # unseen values are dropped
                if (exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
                }else {
                    delete($FeatureH{$fea});
                }
            }

            if (! ($FeatureH{$fea} && $$NameDictH{$fea}{ID})) {
                delete($FeatureH{$fea});
            }else {
                # normalize by the training-time maximum
                $FeatureH{$fea} = sprintf("%.8f", $FeatureH{$fea}/$$NameDictH{$fea}{max});
                $SpaceNameFeaVec .= "$$NameDictH{$fea}{ID}\:$FeatureH{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$FeatureH{$fea} ";
            }
        }
        return($SpaceNameFeaVec, $SpaceNameTextFeaVec);
    }
}
1649
+
1650
+
1651
# Turn one candidate name pattern ("First Last<>First2 Last2<>") into a raw
# feature hash: for every name part, its surface form (RichNameType), its
# position within the name (Last / SecLast / numeric index), and exclusive
# first-/last-name dictionary membership (globals %firstnameH/%lastnameH/%dictH).
# Unlike SpaceNameLnFeaRepre, no dictionary interning happens here.
sub SpaceNameLnFeaRepre_unit() {
    my $NamePatternStr = shift;

    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g;            # drop the trailing <> delimiter
    my @Names = split(/<>/, $NamePatternStr);

    for my $n (0 .. $#Names) {
        my @parts = split(/\s+/, $Names[$n]);
        for my $p (0 .. $#parts) {
            my $prefix = "Name" . "$n" . "part" . "$p";
            my $word   = lc($parts[$p]);

            # surface form of this name part
            $FeatureH{ $prefix . "form" } =
                &HeaderParse::API::NamePatternMatch::RichNameType($parts[$p]);

            # position of the part within the name
            if ($p == $#parts) {
                $FeatureH{ $prefix . "pos" } = "Last";
            }
            elsif ($p == $#parts - 1) {
                $FeatureH{ $prefix . "pos" } = "SecLast";
            }
            else {
                $FeatureH{ $prefix . "pos" } = $p;
            }

            # first-name / last-name dictionary membership (exclusive)
            if ($firstnameH{$word} && !$lastnameH{$word}) {
                $FeatureH{ $prefix . "FN" } = 1;
            }
            elsif ($lastnameH{$word} && !$firstnameH{$word}) {
                $FeatureH{ $prefix . "LN" } = 1;
            }
            elsif (!$dictH{$word}) {
                # not a common dictionary word at all
                $FeatureH{ $prefix . "NonDict" } = 1;
            }

            # room for more features here
        }
    }
    return (\%FeatureH);
}
1693
+
1694
+
1695
# Return 1 iff the argument looks like an unsigned decimal number: digits,
# optionally followed by one or more ".digits" groups (so "1.2.3" also
# passes); 0 otherwise.
sub IsNumber ()
{
    my $in = shift;
    return ($in =~ m/^(\d+)(\.\d+)*$/) ? 1 : 0;
}
1704
+
1705
+
1706
# Build (and prune against) the global feature dictionary from training data,
# and build a separate dictionary for the name-space features.
# For every line's {FeaVec}: zero-weight features are deleted; new features
# get the next dictionary ID; per-feature {max} and {DF} are maintained.
# For every candidate name pattern's {SpaceNameVec}: same interning, plus
# symbolic weights are mapped to IDs via IsNumber.
# Returns ($DataH, $FeatureDictH, \%NameSpaceFeaDictH), all mutated in place.
sub FormFeaDict() {
    my $DataH = shift;          # {headerNo}{lineNo}{FeaVec|NamePattern...}
    my $FeatureDictH = shift;   # shared line-feature dictionary
    my %NameSpaceFeaDictH = ();

    foreach my $HeaNO (sort {$a <=> $b} keys %{$DataH}) {
        foreach my $line (sort {$a <=> $b} keys %{$$DataH{$HeaNO}}) {
            foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{FeaVec}}) {
                if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0) {
                    # zero weight: drop and move on
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                    next;
                }else {
                    if (! $$FeatureDictH{$fea}{ID}) {
                        # first sighting: allocate the next dictionary ID
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$fea}{ID} = $$FeatureDictH{FeatureCounter};
                    }
                    if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} > $$FeatureDictH{$fea}{max}) {
                        $$FeatureDictH{$fea}{max} = $$DataH{$HeaNO}{$line}{FeaVec}{$fea};
                    }
                    $$FeatureDictH{$fea}{DF}++;
                }
                #test needs this line!
                # NOTE(review): at train time ID was just assigned above, so
                # this prune only fires for pre-seeded zero-weight features.
                if ((! $$FeatureDictH{$fea}{ID}) || ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0)) { #some basic feature defined in initialization such as pubnumber could be 0
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            #form the Name Space Feature Dict
            if (exists $$DataH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$DataH{$HeaNO}{$line}{NamePattern}}) {
                    foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        my $wt = $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea};
                        if (! $NameSpaceFeaDictH{$fea}{ID}) {
                            $NameSpaceFeaDictH{FeatureCounter}++;
                            $NameSpaceFeaDictH{$fea}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                        }
                        # symbolic weights ("Last", form names, ...) are
                        # interned and replaced by their dictionary ID
                        if (! &IsNumber($wt)) {
                            if (! exists $NameSpaceFeaDictH{$wt}{ID}) {
                                $NameSpaceFeaDictH{FeatureCounter}++;
                                $NameSpaceFeaDictH{$wt}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                            }
                            $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = $NameSpaceFeaDictH{$wt}{ID};
                        }

                        # NOTE(review): $wt still holds the ORIGINAL value here,
                        # so {max} tracks pre-interning weights.
                        if ($wt == 0) {
                            delete($$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }else {
                            if ((! exists $NameSpaceFeaDictH{$fea}{max}) || ($NameSpaceFeaDictH{$fea}{max} < $wt)) {
                                $NameSpaceFeaDictH{$fea}{max} = $wt;
                            }
                        }
                    }
                }
            }
            #end of form the dictionary for the name
        }
    }
    return($DataH, $FeatureDictH, \%NameSpaceFeaDictH);
}
1765
+
1766
+
1767
# Restrict every test line's {FeaVec} to features known to the training
# dictionary, and drop zero-weight features.  Mutates and returns $TestHeaderH.
sub FormTestFeaVec(){
    my $FeatureDictH = shift;
    my $TestHeaderH = shift;

    for my $lineNo (sort { $a <=> $b } keys %$TestHeaderH) {
        for my $fea (keys %{ $$TestHeaderH{$lineNo}{FeaVec} }) {
            # keep only dictionary features with a non-zero weight
            if (!$$FeatureDictH{$fea}{ID} || $$TestHeaderH{$lineNo}{FeaVec}{$fea} == 0) {
                delete $$TestHeaderH{$lineNo}{FeaVec}{$fea};
            }
        }
    }
    return ($TestHeaderH);
}
1780
+
1781
+
1782
# Drop useless features (zero max weight, or document frequency below 2) and
# renumber the survivors with consecutive IDs starting at 1, preserving their
# previous ID order.  FeatureCounter is exempt from the max==0 test and is
# finally overwritten with the plain count of surviving features.
sub PruneDict() {
    my $FeatureDictH = shift;

    my $nextID = 1;
    my @byID = sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
               keys %$FeatureDictH;
    for my $fea (@byID) {
        my $useless = ($fea ne "FeatureCounter" && $$FeatureDictH{$fea}{max} == 0)
                   || $$FeatureDictH{$fea}{DF} < 2;
        if ($useless) {
            delete $$FeatureDictH{$fea};
        }
        else {
            $$FeatureDictH{$fea}{ID} = $nextID++;
        }
    }

    # FeatureCounter becomes a plain scalar count of surviving features
    $$FeatureDictH{FeatureCounter} = $nextID - 1;

    return ($FeatureDictH);
}
1799
+
1800
#input is an array of name patterns
#return a string of the best name pattern
# For each candidate pattern: build its raw feature hash, map it through the
# name-space dictionary (dropping unknown symbolic values and non-dictionary
# features), normalize by {max}, write the vector to $tmpCacheVec, score it
# with the external SVM classifier (package global $Classifier), and keep the
# highest-scoring pattern.  Temp files are removed before returning.
sub PredictBestNamePattern() {
    my $PredictedNames = shift;      # array ref of arrays of name parts
    my $SVMNameSpaceModel = shift;   # path to the trained SVM model
    my $SpaceNameDictH = shift;      # name-space feature dictionary
    my $tmpCacheVec = shift;         # scratch file for the test vector
    my $SVMTmpResult = shift;        # scratch file for classifier output

    my $MaxVal = -10;                # lower than any plausible SVM score
    my $BestNamePattern = "";

    for my $i(0 .. $#$PredictedNames) {
        # re-serialize the candidate as "part<>part<>..."
        my $candidateName = "";
        for my $j(0 .. $#{$$PredictedNames[$i]}) {
            if ($$PredictedNames[$i][$j]) {
                $candidateName .= "$$PredictedNames[$i][$j]<>";
            }
        }

        my ($RawNameFeaVec) = &SpaceNameLnFeaRepre_unit($candidateName);
        #filter out the non-dictinary features
        my $SpaceNameVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$SpaceNameDictH{$a}{ID} <=> $$SpaceNameDictH{$b}{ID}} keys %{$RawNameFeaVec}) {
            my $wt = $$RawNameFeaVec{$fea};
            # symbolic values must map through the dictionary or be dropped
            if (! &IsNumber($wt)) {
                if (exists $$SpaceNameDictH{$wt}{ID}) {
                    $$RawNameFeaVec{$fea} = $$SpaceNameDictH{$wt}{ID};
                }else {
                    delete($$RawNameFeaVec{$fea});
                }
            }

            if (! (($$RawNameFeaVec{$fea}>0) && $$SpaceNameDictH{$fea}{ID})) {
                delete($$RawNameFeaVec{$fea});
            }else {
                # normalize by the training-time maximum
                $$RawNameFeaVec{$fea} = sprintf("%.8f", $$RawNameFeaVec{$fea}/$$SpaceNameDictH{$fea}{max});
                $SpaceNameVec .= "$$SpaceNameDictH{$fea}{ID}\:$$RawNameFeaVec{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$$RawNameFeaVec{$fea} ";
            }
        }
        # NOTE(review): bareword handle + 2-arg open kept as-is
        open(testVec, ">$tmpCacheVec") || die "SVMHeaderParse: could not open $tmpCacheVec to write: $!";
        # print "NamePattern FeatureVec is\: $SpaceNameTextFeaVec\n";
        print testVec "$SpaceNameVec";
        close(testVec);
        # score with the external classifier ($Classifier is a package global
        # defined elsewhere in this module — TODO confirm)
        `$Classifier -v 0 $tmpCacheVec $SVMNameSpaceModel $SVMTmpResult`;
        my $result = &Analyze($SVMTmpResult);
        if ($result > $MaxVal) {
            $MaxVal = $result;
            $BestNamePattern = $candidateName;
        }
    }

    unlink $tmpCacheVec;
    unlink $SVMTmpResult;

    #split the multiple names in order
    $BestNamePattern =~ s/\<\>$//g; #remove the last <>

    return($BestNamePattern);
}
1863
+
1864
+
1865
# Count the non-punctuation words in a whitespace-separated string.
# Punctuation-only tokens (e.g. "," or ";") are not counted.
# Returns 0 for empty or whitespace-only input.
sub WordCount() {
    my $inStr = shift;
    $inStr =~ s/^\s+//g;
    $inStr =~ s/\s+$//g;

    my $senLen = 0;
    my @words = split(/\s+/, $inStr);
    for my $i(0 .. $#words) {
        # BUG FIX: was $words[0] — every token was judged by the FIRST token,
        # so a line starting with a word counted its punctuation tokens too
        # (and a line starting with punctuation counted nothing).
        if ($words[$i] !~ /^\W+\s*$/) { #punctuation
            $senLen ++;
        }
    }
    return($senLen);
}
1879
+
1880
+ 1;