biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,1880 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::ParserMethods;
14
+
15
+ #06/27/2003, start to make this program handle real data. So there is no evaluation, and offline classifiers should be trained beforehand.
16
+ #02/10/2004 Apply to citeseer data (with the same format as EbizSearch data)
17
+
18
+ use utf8;
19
+ use Data::Dumper;
20
+ use FindBin;
21
+ use HeaderParse::API::NamePatternMatch;
22
+ use HeaderParse::API::MultiClassChunking; #default to use all export by this module
23
+ use HeaderParse::API::LoadInformation;
24
+ use HeaderParse::Config::API_Config;
25
+ use HeaderParse::API::AssembleXMLMetadata;
26
+ use vars qw($debug %dictH %nameH %firstnameH %lastnameH %BasicFeatureDictH %InverseTagMap);
27
+ use vars qw($Classifier $offlineD $Tmp_Dir $nMinHeaderLength $nMaxHeaderLength);
28
+ use HeaderParse::API::Function qw(&AddrMatch &printDict &GenTrainVecMatrix &LineFeatureRepre &FillSpace &SeparatePunc);
29
+
30
+ my $FeatureDictH = \%BasicFeatureDictH;
31
+ my $ContextFeatureDictH;
32
+ my $SpaceAuthorFeatureDictH; #do not know if it is OK to define a hash
33
+ my $PuncAuthorFeatureDictH;
34
+ my $NameSpaceTrainVecH;
35
+ my $NameSpaceTrainF = "$offlineD"."NameSpaceTrainF";
36
+ my $SVMNameSpaceModel = "$offlineD"."NameSpaceModel";
37
+ my $TestH;
38
+ my $TrainH;
39
+ my $TotalHea = 0;
40
+
41
+ my $timestamp;
42
+
43
+ #my $offlineD = "../../offline/";
44
+ #my $TestOutF = "$TestF"."\.parsed";
45
+ #my $tmpCacheVecB = "$Tmp_Dir/tmpVec";
46
+ #my $SVMTmpResultB = "$Tmp_Dir/tmpresult";
47
+
48
+ my $FeatureDict = "$offlineD"."WrapperBaseFeaDict";
49
+ my $ContextFeatureDict = "$offlineD"."WrapperContextFeaDict";
50
+ my $SpaceAuthorFeatureDictF = "$offlineD"."WrapperSpaceAuthorFeaDict";
51
+ my $PuncAuthorFeatureDictF = "$offlineD"."WrapperPuncAuthorFeaDict";
52
+
53
+ my $linear = 1; # just want to be fast
54
+
55
+ my %evalH; # global hash to record classification result for baseline, each context round and IE
56
+ my $norm = 1;
57
+ my $testp = 1; # this is only to make the program run, no meaning.
58
+
59
+
60
+ my %TestDataIndex; #It indexes the header no in the testing dataset
61
+
62
+ #Read dictionary files
63
+ undef $/;
64
+ open(dumpFH, "$FeatureDict") || die "SVMHeaderParse: could not open $FeatureDict to read: $!";
65
+ my $string = <dumpFH>;
66
+ close(dumpFH);
67
+ eval $string;
68
+ $FeatureDictH = $VAR1;
69
+ $string ="";
70
+
71
+ open(dumpFH, "$ContextFeatureDict") || die "SVMHeaderParse: could not open $ContextFeatureDict to read: $!";
72
+ $string = <dumpFH>;
73
+ close(dumpFH);
74
+ eval $string;
75
+ $ContextFeatureDictH = $VAR1;
76
+ $string ="";
77
+
78
+ open(dumpFH, "$SpaceAuthorFeatureDictF") || die "SVMHeaderParse: could not open $SpaceAuthorFeatureDictF to read: $!";
79
+ $string = <dumpFH>;
80
+ close(dumpFH);
81
+ eval $string;
82
+ $SpaceAuthorFeatureDictH = $VAR1;
83
+ $string ="";
84
+ $/ = "\n";
85
+ #End read dictionary files
86
+
87
+
88
+ sub Parse{
89
+ my $header=shift;
90
+ $timestamp = shift;
91
+ my $success = 0;
92
+ # $tmpCacheVec = $tmpCacheVec . "\_$timestamp\_";
93
+ # $SVMTmpResult = $SVMTmpResult . "\_$timestamp\_";
94
+ my $tmpCacheVec = "$Tmp_Dir/tmpVec"."\_$timestamp\_";
95
+
96
+ my $SVMTmpResult = "$Tmp_Dir/tmpresult"."\_$timestamp\_";
97
+ $TestH = &HashEbizHeader(\$header);
98
+ $TestH = &VectorizeUnknownHeaderLine($TestH);
99
+
100
+ my $baseline = 1;
101
+ $TestH = &LineClassify($testp, "", $baseline, $FeatureDictH,
102
+ $TestH, $tmpCacheVec, $SVMTmpResult);
103
+ $TestH = &UpdatePretag($TestH);
104
+
105
+ my $maxLoop = 2;
106
+ for my $loop(1 .. $maxLoop) {
107
+ $baseline = 0;
108
+ my $NowContext = "context"."$loop";
109
+
110
+ $TestH = &LineClassify($testp, $NowContext, $baseline,
111
+ $ContextFeatureDictH, $TestH,
112
+ $tmpCacheVec, $SVMTmpResult);
113
+ $TestH = &UpdatePretag($TestH);
114
+ }
115
+
116
+ #Phase 2: Extraction Information from Multi-Class Lines and Author Lines Chunks
117
+ my $LastContext = "context"."$maxLoop";
118
+
119
+ # BUG: InfoExtract hangs on some documents.
120
+ # this is reproducible with data extracted using TET from doc 654835
121
+ # from the legacy citeseer system.
122
+ eval {
123
+ local $SIG{'ALRM'} = sub { die "alarm\n"; };
124
+ alarm 15;
125
+ $TestH = &InfoExtract($testp, $TestH,$SpaceAuthorFeatureDictH, $PuncAuthorFeatureDictH, $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult);
126
+ alarm 0;
127
+ };
128
+ if ($@) {
129
+ if ($@ eq "alarm\n") {
130
+ return 0;
131
+ }
132
+ }
133
+ $rXML = &ExportRDF($TestH);
134
+
135
+ for my $i(1..15){
136
+ unlink "$Tmp_Dir/tmpVec\_$timestamp\_test$i";
137
+ unlink "$Tmp_Dir/tmpresult\_$timestamp\_$i";
138
+ }
139
+ return $rXML;
140
+ }
141
+
142
+
143
+ # This is the header extraction module from CiteSeer.
144
+ # Only the parts related to header extraction are used.
145
# Extract the candidate "header" region from the front of a paper's text.
#
# Argument: a SCALAR REF holding the full plain text of the document.
# Returns:  ('', $header)  on success, or a one-element list holding an
#           error message when the text is empty or the header is too short.
#
# Strategy: prefer everything up to (and including 6 lines past) the first
# Introduction/Contents marker; otherwise fall back to the first 150
# non-blank lines. $nMaxHeaderLength / $nMinHeaderLength are file-level
# globals defined elsewhere in this module.
sub ExtractHeaderInformation {
    my $papertext = shift;
    my $header    = '';

    return ('Paper text is empty') unless length($$papertext);

    if ($$papertext =~ /^(.*?\b(?:Introduction|INTRODUCTION|Contents|CONTENTS)(?:.*?\n){6})/s) {
        # Marker found: keep the prefix plus six following lines.
        $header = $1;
    }
    else {
        # No marker: accumulate lines until 150 non-blank lines were seen.
        my $wanted = 150;
        my $seen   = 0;
        for my $row (split '\n', $$papertext) {
            $seen++ unless $row =~ m/^\s*$/;
            $header .= $row . "\n";
            last if $seen >= $wanted;
        }
    }

    # Cap the header length; the trailing ellipsis marks truncation.
    if (defined $header && length($header) > $nMaxHeaderLength) {
        $header = substr($header, 0, $nMaxHeaderLength) . '...';
    }

    return ('Header could not be extracted') if length($header) < $nMinHeaderLength;
    return ('', $header);
}
186
+
187
+
188
# Rebuild each line's {Pretag} set from its predicted class of the previous
# iteration, so the next classification round can use it as a feature.
#
# Argument: hashref keyed by numeric line number; each entry may carry
#           {PClass} ("s" single-class or "m" multi-class), {PSClsName}
#           (the single class) and {PClsName} (hash of multi classes).
# Returns:  the same hashref, with {Pretag}{<class>} = 1 entries rebuilt.
#
# Fix: dropped the bogus empty prototype "()" — this sub takes an argument,
# and a zero-arg prototype would reject normal calls under prototype checking.
sub UpdatePretag {
    my $testH = shift;

    foreach my $LN (sort { $a <=> $b } keys %{$testH}) {
        # Clear the previous iteration's tags before re-deriving them.
        delete $$testH{$LN}{Pretag};

        if ($$testH{$LN}{PClass} eq "s") {
            $$testH{$LN}{Pretag}{ $$testH{$LN}{PSClsName} } = 1;
        }
        elsif ($$testH{$LN}{PClass} eq "m") {
            foreach my $mytag (keys %{ $$testH{$LN}{PClsName} }) {
                $$testH{$LN}{Pretag}{$mytag} = 1;
            }
        }
    }
    return ($testH);
}
204
+
205
+
206
+ #input: the file with all Training and testing samples
207
+ #output: $HeaderH{$HeaNO}{$LineNO} = "";
208
# Read a tagged training/testing sample file into a nested hash.
#
# Arguments: $simulateHeaNum — stop once this many headers were read
#                              (0 or negative means read the whole file);
#            $tagF           — path to the tag file.
# Returns:   ($HeaNO, \%HeaH) where $HeaNO is the last header number seen
#            (numbering starts at 1) and
#            $HeaH{$HeaNO}{$LineNO}{RawContent} holds each kept line.
#
# Lines that are blank, or that consist of nothing but a single open/close
# tag such as "</author>", are skipped; "<NEW_HEADER>" starts a new record.
# Fixes: dropped the bogus empty prototype "()"; replaced the 2-arg
# bareword-filehandle open with a checked 3-arg lexical open.
sub HashAllHeader {
    my $simulateHeaNum = shift;
    my $tagF           = shift;

    my %HeaH   = ();
    my $HeaNO  = 1;    # header numbering starts from 1
    my $LineNO = 1;

    open(my $tag_fh, '<', $tagF)
        or die "SVMHeaderParse: could not open tag file: $tagF to read: $!";

    while (my $line = <$tag_fh>) {
        $line =~ s/\+L\+//g;     # strip legacy line-break markers
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;

        if ($line =~ /^\s*\<NEW\_HEADER\>/) {
            $HeaNO++;
            $LineNO = 1;
        }
        elsif (($line =~ /^\s*$/) || ($line =~ /^\<(\/)*(\w+)\>$/)) {
            # Skip blank lines and lines holding only a tag like </author>.
            next;
        }
        else {
            $HeaH{$HeaNO}{$LineNO}{RawContent} = $line;
            $LineNO++;
        }

        # Optional early stop for simulation runs over a prefix of the file.
        if ($simulateHeaNum > 0 && $HeaNO >= $simulateHeaNum) {
            last;
        }
    }
    close($tag_fh);
    return ($HeaNO, \%HeaH);
}
239
+
240
+
241
+ #HEADER_DID[1]
242
+ #TRECS: Developing a Web-based e-Commerce Business Simulation
243
+ #TRECS: Developing a Web-based
244
# Split an in-memory header into a per-line hash, trimming whitespace.
#
# Argument: SCALAR REF to the header text.
# Returns:  hashref with $HeaH{$LineNO}{RawContent} = <trimmed line>,
#           line numbering starting at 1.
#
# NOTE: every line is stored, including lines that trim to the empty
# string — the original code deliberately commented out the blank-line
# filter, and that behavior is preserved here.
# Fix: dropped the bogus empty prototype "()"; removed long-dead
# commented-out file-reading and HEADER_DID parsing code.
sub HashEbizHeader {
    my $headerRef = shift;

    my %HeaH   = ();
    my $LineNO = 1;

    foreach my $line (split(/\n/, $$headerRef)) {
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;
        $HeaH{$LineNO}{RawContent} = $line;
        $LineNO++;
    }
    return (\%HeaH);
}
271
+
272
+
273
# Train-time feature preparation for the baseline (per-line) classifier.
#
# Arguments: $HeaderH      — nested training hash {HeaNO}{LineNO}{...};
#            $FeatureDictH — feature dictionary to be built/pruned.
# Returns:   ($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH,
#             \%NameSpaceTrainVecH).
#
# Pipeline: build the feature dictionary and space-separated-author-name
# dictionary (&FormFeaDict), prune low-DF features (&PruneDict), drop
# pruned features from every line's {FeaVec}, render SVM-light style
# "id:value" vectors (normalized by the per-feature max when the file-level
# $norm flag is set), and do the same for candidate author-name patterns.
# $PuncAuthorDictH is initialized here but never populated in this sub.
# Fix: dropped the bogus empty prototype "()" — this sub takes arguments.
sub BaseLineTrainSys {
    my $HeaderH      = shift;
    my $FeatureDictH = shift;

    my %InitialHash = ();
    $InitialHash{FeatureCounter} = 0;
    my $PuncAuthorDictH = \%InitialHash;
    my $SpaceAuthorDictH;

    # Build the feature dictionary and the author-name-pattern dictionary.
    ($HeaderH, $FeatureDictH, $SpaceAuthorDictH) = &FormFeaDict($HeaderH, $FeatureDictH);
    # Prune features in the dictionary with DF < 3.
    $FeatureDictH = &PruneDict($FeatureDictH);

    # Drop pruned features from each line, then render the SVM vector.
    foreach my $HeaNO (sort { $a <=> $b } keys %{$HeaderH}) {
        foreach my $line (sort { $a <=> $b } keys %{ $$HeaderH{$HeaNO} }) {
            foreach my $fea (keys %{ $$HeaderH{$HeaNO}{$line}{FeaVec} }) {
                if (!$$FeatureDictH{$fea}{ID}) {
                    delete $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea};
                }
            }

            # NOTE(review): this compares a hashref against "" — it is true
            # whenever {FeaVec} exists at all; preserved from the original.
            if ($$HeaderH{$HeaNO}{$line}{FeaVec} ne "") {
                my $tmpFeaVec = "";
                foreach my $fea (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                 keys %{ $$HeaderH{$HeaNO}{$line}{FeaVec} }) {
                    if ($norm) {
                        # Normalize by the feature's maximum observed value.
                        $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} =
                            sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }
                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} ";
                }
                $$HeaderH{$HeaNO}{$line}{SVMFeaVec} = "$tmpFeaVec";
            }
        }
    }

    # Same pruning/normalization for candidate author-name patterns; the
    # rendered vectors are collected separately for later printing.
    my %NameSpaceTrainVecH = ();
    my $Lcount = 0;
    foreach my $HeaNO (sort { $a <=> $b } keys %{$HeaderH}) {
        foreach my $line (sort { $a <=> $b } keys %{ $$HeaderH{$HeaNO} }) {
            next unless exists $$HeaderH{$HeaNO}{$line}{NamePattern};

            foreach my $CandidateNamePattern (keys %{ $$HeaderH{$HeaNO}{$line}{NamePattern} }) {
                foreach my $fea (keys %{ $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} }) {
                    if (!$$SpaceAuthorDictH{$fea}{ID}) {
                        delete $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea};
                    }
                }

                # Hashref-vs-"" comparison preserved, as above.
                if ($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} ne "") {
                    $Lcount++;
                    my $tmpFeaVec  = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";
                    my $tmpTextVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";

                    foreach my $fea (sort { $$SpaceAuthorDictH{$a}{ID} <=> $$SpaceAuthorDictH{$b}{ID} }
                                     keys %{ $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} }) {
                        # Name-pattern vectors are always max-normalized.
                        $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} =
                            sprintf("%.8f",
                                    $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea}
                                    / $$SpaceAuthorDictH{$fea}{max});
                        $tmpFeaVec  .= "$$SpaceAuthorDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                        $tmpTextVec .= "$fea\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                    }
                    $NameSpaceTrainVecH{$Lcount}{SpaceNameVec}     = $tmpFeaVec;
                    $NameSpaceTrainVecH{$Lcount}{SpaceTextNameVec} = $tmpTextVec;    # for debugging
                }
            }
        }
    }

    return ($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH, \%NameSpaceTrainVecH);
}
347
+
348
# Train-time context pass: assign neighbour-line tags to every header.
#
# Arguments: $FeatureDictH — feature dictionary (updated in place by the
#                            per-header &TrainAssignLineTag calls);
#            $HeaderH      — nested training hash {HeaNO}{LineNO}{...}.
# Returns:   ($FeatureDictH, $HeaderH) after all headers are processed.
#
# Fix: dropped the bogus empty prototype "()" — this sub takes arguments.
sub ContextTrainSys {
    my $FeatureDictH = shift;
    my $HeaderH      = shift;

    foreach my $HeaNO (sort { $a <=> $b } keys %{$HeaderH}) {
        # Assign each line the tags of its neighbouring lines as features.
        ($FeatureDictH, $$HeaderH{$HeaNO}) = &TrainAssignLineTag($FeatureDictH, $$HeaderH{$HeaNO});
    }
    return ($FeatureDictH, $HeaderH);
}
358
+
359
+ #this is to write all the testing lines into one file to speed up
360
# Classify every header line with the 15 one-vs-rest SVM classifiers.
#
# Arguments: $testp        — cross-validation fold id (selects model files);
#            $nowLoop      — context-iteration number (context mode only);
#            $baseline     — true = baseline features, false = context features;
#            $FeatureDictH — feature dictionary;
#            $HeaderH      — per-line hash for ONE header (keys are line numbers);
#            $tmpCacheVec  — path prefix for the temporary test-vector files;
#            $SVMTmpResult — path prefix for the classifier output files.
# Returns:   $HeaderH with {PClass}/{PSClsName}/{PClsName}/{ClassifyResult}
#            populated per line.
#
# File-level globals used: $norm, $debug, $Classifier, $offlineD.
# Fixes vs original: removed the bogus empty prototype "()"; removed the
# stale $testHea variable (left over from a removed per-header loop) that
# was stored undefined into %memoryH and interpolated into a debug message;
# fixed the $result/$myresult typo in a debug print; lexical 3-arg opens;
# %NegMeanH entries are initialized to avoid uninitialized-value warnings.
sub LineClassify {
    my ($testp, $nowLoop, $baseline, $FeatureDictH,
        $HeaderH, $tmpCacheVec, $SVMTmpResult) = @_;
    my %memoryH      = ();
    my $GlobalLineNO = 0;

    # --- Step 1: build per-line feature vectors and record the mapping
    #             from global (printed) line number to local line number.
    if ($baseline) {
        # Filter each line's feature vector by the feature dictionary.
        $HeaderH = &FormTestFeaVec($FeatureDictH, $HeaderH);
    }
    else {
        $HeaderH = &TestAssignLineTag($FeatureDictH, $HeaderH);
    }

    foreach my $LN (sort { $a <=> $b } keys %{$HeaderH}) {
        if (!$baseline) {
            # Reset the previous iteration's predictions so the context
            # iteration starts from a clean slate.
            delete $$HeaderH{$LN}{PClass};
            delete $$HeaderH{$LN}{PSClsName};
            delete $$HeaderH{$LN}{PClsName};
        }
        elsif ($baseline && ($$HeaderH{$LN}{FeaVec} ne "")) {
            # Render (and optionally normalize) the SVM "id:value" vector.
            my $tmpFeaVec = "";
            foreach my $fea (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                             keys %{ $$HeaderH{$LN}{FeaVec} }) {
                if (exists $$FeatureDictH{$fea}{ID}) {
                    if ($norm) {
                        if ($debug) {
                            if ($$FeatureDictH{$fea}{max} == 0) {
                                print STDERR "fea $fea has max value 0! \n";
                            }
                        }
                        $$HeaderH{$LN}{FeaVec}{$fea} =
                            sprintf("%.8f", $$HeaderH{$LN}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }
                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$LN}{FeaVec}{$fea} ";
                }
            }
            $$HeaderH{$LN}{SVMFeaVec} = "$tmpFeaVec";

            # Lines with an empty vector are not registered in %memoryH.
            # NOTE(review): they ARE still printed in step 2, so the printed
            # files can hold more lines than $GlobalLineNO — preserved from
            # the original; confirm before relying on strict alignment.
            if ($$HeaderH{$LN}{SVMFeaVec} eq "") {
                if ($debug) {
                    print STDERR "Line($LN) has a null feature vector ($$HeaderH{$LN}{RawContent}) \n";
                }
                next;
            }
        }

        $GlobalLineNO++;
        # HeaNO kept for interface compatibility; this sub processes a single
        # header, so there is no meaningful header number any more.
        $memoryH{$GlobalLineNO}{HeaNO}       = 0;
        $memoryH{$GlobalLineNO}{LocalLineNO} = $LN;
    }

    # --- Step 2: write one labelled test file per class (15 files).
    for my $clsNO (1 .. 15) {
        my $testF = "$tmpCacheVec" . "test" . "$clsNO";
        open(my $test_fh, '>', $testF)
            or die "SVMHeaderParse: could not open $testF to write: $!";
        foreach my $LN (sort { $a <=> $b } keys %{$HeaderH}) {
            my $tag = 1;    # dummy label, just to conform to the format
            if ($baseline) {
                print $test_fh "$tag $$HeaderH{$LN}{SVMFeaVec}\n";
            }
            else {
                print $test_fh "$tag $$HeaderH{$LN}{ContextSVMFeaVec}\n";
            }
        }
        close($test_fh);
    }

    # --- Step 3: run the external SVM classifier once per class.
    for my $clsNO (1 .. 15) {
        my $testF       = "$tmpCacheVec" . "test" . "$clsNO";
        my $mySVMResult = "$SVMTmpResult" . "$clsNO";
        my $SVMModelF;
        if ($baseline) {
            $SVMModelF = "$offlineD" . "$clsNO" . "Model" . "fold" . "$testp";
        }
        else {
            $SVMModelF = "$offlineD" . "$clsNO" . "ContextModel" . "fold" . "$testp";
        }
        # NOTE(review): interpolated system() — paths come from module
        # configuration, not user input; left as-is.
        system("$Classifier -v 0 $testF $SVMModelF $mySVMResult");
    }

    # --- Step 4: read the 15 result files into a hash and collect the
    #             per-class negative mean and minimum positive score.
    my %SVMResultHash      = ();
    my %OrphanTagAssignHash = ();    # counts lines with no positive class
    my %NegMeanH           = ();     # mean of negative scores per class
    my %PosMinH            = ();     # minimum positive score per class

    for my $clsNO (1 .. 15) {
        my $mySVMResult = "$SVMTmpResult" . "$clsNO";
        my $myLineNO    = 0;

        $PosMinH{$clsNO}  = 100;    # sentinel larger than any real score
        $NegMeanH{$clsNO} = 0;      # avoid uninitialized-value warnings

        open(my $result_fh, '<', $mySVMResult)
            or die "SVMHeaderParse: could not open $mySVMResult to read: $!";
        while (my $myline = <$result_fh>) {
            $myline =~ s/^\s+//g;
            $myline =~ s/\s+$//g;
            if ($myline !~ /^\s*$/) {
                $myLineNO++;
                if ($debug) {
                    print STDERR " current lineNo is $myLineNO and score for class $clsNO is $myline \n";
                }
                $SVMResultHash{$myLineNO}{$clsNO} = $myline;
                if ($myline < 0) {
                    $NegMeanH{$clsNO} += $myline;
                }
                else {
                    if ($PosMinH{$clsNO} > $myline) {
                        $PosMinH{$clsNO} = $myline;
                    }
                }
            }
        }
        if ($myLineNO < 1) {
            if ($debug) {
                print STDERR "yahoo: $mySVMResult has myLineNO 0 \n";
            }
        }
        else {
            $NegMeanH{$clsNO} = sprintf("%.8f", $NegMeanH{$clsNO}/$myLineNO);
        }
        close($result_fh);
    }

    # These minima accumulate ACROSS lines (initialized once, before the
    # loop) — preserved from the original implementation.
    my $PredTagbyMinNeg = 0;
    my $PredValbyMinNeg = 100;
    my $PredTagbyMinPos = 0;
    my $PredValbyMinPos = 100;

    # --- Step 5: turn raw scores into per-line class predictions.
    for my $myline (1 .. $GlobalLineNO) {
        my @PredictTags  = ();
        my $minVal       = 100;
        my $CandidateTag = -1;
        my $myHeaNO      = $memoryH{$myline}{HeaNO};
        my $myLineNO     = $memoryH{$myline}{LocalLineNO};

        for my $clsNO (1 .. 15) {
            my $myresult = $SVMResultHash{$myline}{$clsNO};
            # Keep the raw scores for later multi-class disambiguation.
            $$HeaderH{$myLineNO}{ClassifyResult}{$clsNO} = $myresult;
            if ($debug) {
                print STDERR "\t\t result by class $clsNO -- $myresult \n";
            }
            my $myRelDiv = 10;

            if ($myresult > 0) {
                push @PredictTags, $clsNO;
            }
            else {
                # Score relative to the class's mean negative score; the
                # class closest to its hyperplane becomes the fallback tag.
                $myRelDiv = sprintf("%.8f", $myresult/$NegMeanH{$clsNO});
                if ($myRelDiv < $minVal) {
                    $minVal       = $myRelDiv;
                    $CandidateTag = $clsNO;
                }
                if ((0 - $myresult) < $PredValbyMinNeg) {
                    $PredValbyMinNeg = -$myresult;
                    $PredTagbyMinNeg = $clsNO;
                }
                if (($PosMinH{$clsNO} - $myresult) < $PredValbyMinPos) {
                    $PredValbyMinPos = $PosMinH{$clsNO} - $myresult;
                    $PredTagbyMinPos = $clsNO;
                }
            }
        }

        # Orphan line (no positive class): assign only the class nearest
        # to its hyperplane.
        if ($#PredictTags < 0) {
            push @PredictTags, $CandidateTag;
            $OrphanTagAssignHash{TotalLineNum}++;
        }

        # Record the prediction: "s" = single class, "m" = multi class.
        if ($#PredictTags == 0) {
            $$HeaderH{$myLineNO}{PClass}    = "s";
            $$HeaderH{$myLineNO}{PSClsName} = $PredictTags[0];
        }
        elsif ($#PredictTags > 0) {
            $$HeaderH{$myLineNO}{PClass} = "m";
            # The multiple tags predicted for one line carry no ordering.
            for my $i (0 .. $#PredictTags) {
                $$HeaderH{$myLineNO}{PClsName}{ $PredictTags[$i] } = 1;
                if ($debug) {
                    print STDERR "hea($myHeaNO)-- line($myLineNO) is classified as multi-class $PredictTags[$i] \n";
                }
            }
        }
        else {    # impossible: an orphan always receives the fallback tag
            if ($debug) {
                print STDERR "hea($myHeaNO)-- line($myLineNO) is orphan\n";
            }
        }
    }
    return ($HeaderH);
}
568
+
569
+
570
+ #this is to
571
+ #(1) populate the predicted items(done in the LineClassify)
572
+ #(2) Extract related information from multi-author line and multi-class line
573
+ #all information to be extracted comes from {Pchunk}
574
+ #all word distribution information comes from {Pline} word dist.;
575
+
576
# Extract field chunks from each classified header line.
#
# For single-class ("s") lines the whole line becomes one chunk; predicted
# author lines are additionally split into individual names (by punctuation,
# "and", or SVM-ranked space-separated name patterns).  Multi-class ("m")
# lines are segmented: email (class 6) and URL (class 12) chunks are peeled
# off first, then the remainder is resolved by &Cont2ClassChunking /
# &Disc2ClassChunking_2chunk depending on how many classes and how many
# discrete text chunks remain.  Results land in {Pchunk}.
#
# Arguments (in shift order): $testp, $TestH, $PuncAuthorDictH,
#            $SpaceAuthorDictH, $SVMNameSpaceModel, $tmpCacheVec,
#            $SVMTmpResult.
# Returns:   $TestH with {Pchunk}{<n>}{cls|content} filled per line.
#
# NOTE(review): $PuncAuthorDictH is accepted but never used below, and the
# caller visible earlier in this file passes the Space dict BEFORE the Punc
# dict — the two may be swapped relative to this signature; confirm against
# all call sites before changing either side.
# NOTE(review): $FeatureDictH used in the Cont2/Disc2 calls is not declared
# in this sub — it resolves to a package global presumably set elsewhere;
# verify it is populated before this sub runs.
# Fixes vs original: removed the bogus empty prototype "()"; the ten
# copy-pasted "append a chunk" fragments are factored into _AppendChunk.
sub InfoExtract {
    my $testp             = shift;
    my $TestH             = shift;
    my $PuncAuthorDictH   = shift;
    my $SpaceAuthorDictH  = shift;
    my $SVMNameSpaceModel = shift;
    my $tmpCacheVec       = shift;
    my $SVMTmpResult      = shift;

    foreach my $LN (sort { $a <=> $b } keys %{$TestH}) {
        if ($$TestH{$LN}{'PClass'} eq "s") {    # single class
            if ($$TestH{$LN}{PSClsName} ne '2') {
                # Non-author single-class line: one chunk, whole line.
                _AppendChunk($$TestH{$LN}, $$TestH{$LN}{PSClsName}, $$TestH{$LN}{PureText});
            }
            else {
                if ($$TestH{$LN}{SClsWordCount} < 4) {
                    # Short author line: obviously a single name.
                    _AppendChunk($$TestH{$LN}, 2, $$TestH{$LN}{PureText});
                }
                else {
                    # Likely multiple authors on one line.
                    my $Tline = $$TestH{$LN}{RawContent};
                    $Tline =~ s/<(\/)*author>//g;
                    if ($debug) {
                        print STDERR "predicted Multi-Author line -- $Tline \n";
                    }

                    # Decide: punctuation-separated names or pure space-separated.
                    my $NamePunc = 0;
                    if (($$TestH{$LN}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])/)
                        || ($$TestH{$LN}{PureText} =~ /\band\b/i)) {
                        $NamePunc = 1;
                    }
                    else {
                        $NamePunc = 0;
                    }

                    if ($NamePunc) {
                        # Heuristic separation based on learned features.
                        if (($$TestH{$LN}{PureText} =~ /Jr|Dr/) && ($$TestH{$LN}{SClsWordCount} < 5)) {
                            # "Jr"/"Dr" on a short line: treat as ONE name.
                            _AppendChunk($$TestH{$LN}, 2, $$TestH{$LN}{PureText});
                        }
                        else {
                            my $nameStr = $$TestH{$LN}{PureText};
                            $nameStr =~ s/^\s+//g;
                            $nameStr =~ s/\s+$//g;
                            my @GuessedNames = split(/\,|\&|and/, $nameStr);
                            for my $i (0 .. $#GuessedNames) {
                                $GuessedNames[$i] =~ s/^\s+//g;
                                $GuessedNames[$i] =~ s/\s+$//g;
                                next if $GuessedNames[$i] =~ /^\s*$/;

                                my @Nameparts = split(/\s+/, $GuessedNames[$i]);
                                if ($#Nameparts < 3) {
                                    # At most 3 tokens: accept as one name.
                                    _AppendChunk($$TestH{$LN}, 2, $GuessedNames[$i]);
                                }
                                else {
                                    # Space-separated names [n1 n2 n3 and n4].
                                    my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($GuessedNames[$i]);
                                    if ($#$PredictedNames < 1) {
                                        # Only 0/1 reasonable pattern: take it.
                                        _AppendChunk($$TestH{$LN}, 2, $GuessedNames[$i]);
                                    }
                                    else {
                                        # Several candidates: let the SVM pick.
                                        my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                                        my @names = split(/<>/, $BestNamePattern);
                                        for my $j (0 .. $#names) {
                                            _AppendChunk($$TestH{$LN}, 2, $names[$j]);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else {
                        # Pure space-separated name sequence.
                        my $nameStr = $$TestH{$LN}{PureText};
                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                        if ($#$PredictedNames < 1) {
                            # 0/1 reasonable pattern: take the parser's chunks.
                            my $tmp_name_container = $$PredictedNames[0];
                            if ($#$tmp_name_container > 0) {
                                for my $kk (0 .. $#$tmp_name_container) {
                                    _AppendChunk($$TestH{$LN}, 2, $$tmp_name_container[$kk]);
                                }
                            }
                            else {
                                _AppendChunk($$TestH{$LN}, 2, $nameStr);
                            }
                        }
                        else {
                            # Several candidates: let the SVM pick.
                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                            my @names = split(/<>/, $BestNamePattern);
                            for my $i (0 .. $#names) {
                                _AppendChunk($$TestH{$LN}, 2, $names[$i]);
                            }
                        }
                    }
                }
            }
        }
        elsif ($$TestH{$LN}{PClass} eq "m") {    # multiple classes
            my (%TagH, $emailChunkH, $URLChunkH, @ArrayofHash);

            # Collect the set of predicted tags for this line.
            foreach my $tag (keys %{ $$TestH{$LN}{PClsName} }) {
                $TagH{counter}++;
                $TagH{$tag}++;
            }
            my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($$TestH{$LN}{PureText});

            # Preprocess: peel email (6) and URL (12) chunks off first;
            # $component keeps "-1" holes where chunks were removed.
            if ($$TestH{$LN}{PClsName}{6}) {
                ($emailChunkH, $component) = &LocateEmailFromComponent($component);
                delete $TagH{6};
                $TagH{counter}--;
                push @ArrayofHash, $emailChunkH;
            }
            if ($$TestH{$LN}{PClsName}{12}) {
                ($URLChunkH, $component) = &LocateURLFromComponent($component);
                delete $TagH{12};
                $TagH{counter}--;
                push @ArrayofHash, $URLChunkH;
            }

            if ($TagH{counter} < 1) {
                # No class left besides email/URL.
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }
            elsif ($TagH{counter} == 1) {
                # Exactly one class left: every remaining chunk gets it.
                my $lastTag = "";
                foreach my $tag (keys %TagH) {
                    if ($tag ne "counter") {
                        $lastTag = $tag;
                    }
                }
                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                foreach my $chunkNO (sort { $a <=> $b } keys %{$UnIdentifiedChunk}) {
                    $$UnIdentifiedChunk{$chunkNO}{cls} = $lastTag;
                }
                push @ArrayofHash, $UnIdentifiedChunk;
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }
            elsif ($TagH{counter} == 2) {
                # Two classes left: find the boundary between them.
                my @TagsArray = ();
                foreach my $mytag (sort keys %TagH) {
                    if ($mytag ne "counter") {
                        push @TagsArray, $mytag;
                    }
                }

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end   = $$UnIdentifiedChunk{1}{endPos};
                my $IdentifiedChunk;

                if ($$UnIdentifiedChunk{counter} == 1) {    # one continuous chunk
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH      = $SepH;
                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }
                    else {
                        # Rebase component and separator indices to offset 0.
                        $offset = $chunk1start;
                        $newComponent = ();
                        for my $tmpi ($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi - $offset] = $$component[$tmpi];
                        }
                        foreach my $tmpSep (sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete $$newSepH{$tmpSep};
                        }
                    }

                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }
                    else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }

                    # Translate chunk positions back to absolute offsets.
                    if ($offset > 0) {
                        foreach my $tmpi (sort keys %{$IdentifiedChunk}) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos}   += $offset;
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }
                elsif ($$UnIdentifiedChunk{counter} == 2) {    # two discrete chunks
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }
                elsif ($$UnIdentifiedChunk{counter} > 2) {     # unhandled
                    if ($debug) {
                        print STDERR "2 classes with 3+ chunks\n";
                    }
                }
            }
            elsif (($TagH{counter} == 3) && $TagH{3} && $TagH{4}) {
                # Three classes where two of them are 3 and 4: first treat
                # 3+4 as one merged class (labelled 4), then split 3 from 4.
                my @TagsArray = ();
                foreach my $mytag (sort keys %TagH) {
                    if (($mytag ne "3") && ($mytag ne "4") && ($mytag ne "counter")) {
                        push @TagsArray, $mytag;
                    }
                }
                push @TagsArray, 4;

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end   = $$UnIdentifiedChunk{1}{endPos};

                my $IdentifiedChunk;
                my $startPos34 = 0;
                my $endPos34   = 0;

                if ($$UnIdentifiedChunk{counter} == 1) {    # one continuous chunk
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH      = $SepH;

                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }
                    else {
                        # Rebase component and separator indices to offset 0.
                        $offset = $chunk1start;
                        $newComponent = ();
                        for my $tmpi ($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi - $offset] = $$component[$tmpi];
                        }
                        foreach my $tmpSep (sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete $$newSepH{$tmpSep};
                        }
                    }

                    # Boundary between the merged 3/4 block and the other tag.
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }
                    else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }

                    # Rebase to absolute positions and pull out the 3/4 span.
                    foreach my $tmpi (sort keys %{$IdentifiedChunk}) {
                        if ($offset > 0) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos}   += $offset;
                        }
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos};
                            $endPos34   = $$IdentifiedChunk{$tmpi}{endPos};
                            delete $$IdentifiedChunk{$tmpi};
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                }
                else {    # two discrete chunks
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    foreach my $tmpi (sort keys %{$IdentifiedChunk}) {
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos};
                            $endPos34   = $$IdentifiedChunk{$tmpi}{endPos};
                            delete $$IdentifiedChunk{$tmpi};
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                }

                # Second pass: find the boundary between class 3 and class 4
                # inside the extracted 3/4 span.
                my $newComponent = ();
                my $newSepH      = $SepH;
                my $newPuncNum   = 0;
                my $offset       = $startPos34;
                for (my $tmpi = $startPos34; $tmpi <= $endPos34; $tmpi++) {
                    $$newComponent[$tmpi - $offset] = $$component[$tmpi];
                    if ($$newComponent[$tmpi - $offset] =~ /^\W+$/) {
                        $newPuncNum++;
                    }
                }

                if ($newPuncNum > 1) {
                    foreach my $tmpSep (sort keys %{ $$newSepH{punc} }) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{punc}{$newSep} = $$newSepH{punc}{$tmpSep};
                        }
                        delete $$newSepH{punc}{$tmpSep};
                    }
                }
                else {
                    foreach my $tmpSep (sort keys %{ $$newSepH{space} }) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{space}{$newSep} = $$newSepH{space}{$tmpSep};
                        }
                        delete $$newSepH{space}{$tmpSep};
                    }
                }

                my @NewTagsArray = (3, 4);
                if ($newPuncNum > 1) {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }
                else {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }

                # Translate chunk positions back to absolute offsets.
                if ($offset > 0) {
                    foreach my $tmpi (sort keys %{$IdentifiedChunk}) {
                        $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                        $$IdentifiedChunk{$tmpi}{endPos}   += $offset;
                    }
                }
                push @ArrayofHash, $IdentifiedChunk;
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }
            elsif ($TagH{counter} > 2) {
                # 3+ remaining classes (without the 3&4 merge): not handled.
                if ($debug) {
                    print STDERR "do not care yet -- here is the case for 3+ classes after preprocessing \n";
                }
            }
        }
    }

    return ($TestH);
}

# Append one (class, content) chunk to a line's {Pchunk} structure.
# Chunk numbering starts at 1 via the {ChunkCounter} counter.
sub _AppendChunk {
    my ($lineH, $cls, $content) = @_;
    $$lineH{Pchunk}{ChunkCounter}++;
    my $ChunkPos = $$lineH{Pchunk}{ChunkCounter};
    $$lineH{Pchunk}{$ChunkPos}{cls}     = $cls;
    $$lineH{Pchunk}{$ChunkPos}{content} = $content;
    return;
}
941
+
942
+
943
# Dump every line's extracted chunks to "output.txt" for inspection.
#
# Argument: $TestH — hashref keyed by line number, each entry carrying a
#           {Pchunk} structure of {<n>}{cls|content} plus {ChunkCounter}.
# Returns:  nothing useful; writes the report file as a side effect.
#
# Fixes vs original: removed the bogus empty prototype "()"; 3-arg lexical
# open with a checked close instead of a 2-arg bareword open; the original
# printed "headerno($testHea)" where $testHea was a stale variable from a
# removed per-header loop (always undef) — the empty interpolation is
# reproduced literally so the output is byte-identical.
sub ExportInfo {
    my $TestH = shift;
    my $outF  = "output.txt";

    open(my $writer, '>', $outF)
        or die "SVMHeaderParse: could not open $outF to write: $!";

    print $writer "headerno() -- ";
    foreach my $LN (sort { $a <=> $b } keys %{$TestH}) {
        print $writer "lineno($LN)\: \n ";
        foreach my $chunk (sort { $a <=> $b } keys %{ $$TestH{$LN}{Pchunk} }) {
            # Skip the bookkeeping counter key; everything else is a chunk.
            if ($chunk ne "ChunkCounter") {
                print $writer "\t chunk($chunk) -- class($$TestH{$LN}{Pchunk}{$chunk}{cls} <> content($$TestH{$LN}{Pchunk}{$chunk}{content} \n";
            }
        }
    }
    close($writer)
        or die "SVMHeaderParse: could not close $outF: $!";
}
960
+
961
+
962
# Render the extracted chunks as tagged text and assemble the final XML.
#
# Argument: $TestH — hashref keyed by line number with {Pchunk} chunks.
# Returns:  the assembled XML produced by
#           HeaderParse::API::AssembleXMLMetadata::assemble().
#
# Each chunk whose content contains a word character is emitted as
# "<tag>content</tag>\n", where the tag name comes from the file-level
# %InverseTagMap (numeric class -> tag name).  The {ChunkCounter} key has
# no {cls}/{content}, so its empty content fails the /\w+/ test and it is
# skipped naturally — preserved from the original.
# Fixes vs original: removed the bogus empty prototype "()"; $rXML is now
# a lexical instead of an undeclared package global (the value is returned,
# so callers that assign the result are unaffected); dropped the unused
# $tempStr variable and dead commented-out code.
sub ExportRDF {
    my $TestH = shift;
    my $str   = '';

    foreach my $LN (sort { $a <=> $b } keys %{$TestH}) {
        foreach my $chunk (sort { $a <=> $b } keys %{ $$TestH{$LN}{Pchunk} }) {
            my $tag     = $InverseTagMap{ $$TestH{$LN}{Pchunk}{$chunk}{cls} };
            my $content = $$TestH{$LN}{Pchunk}{$chunk}{content};
            if ($content =~ /\w+/) {
                $str .= "<$tag>$content</$tag>\n";
            }
        }
    }

    my $rXML = &HeaderParse::API::AssembleXMLMetadata::assemble(\$str);
    return $rXML;
}
984
+
985
+
986
+ #Basic function: populate information from line -- feature vector and class assignment and name patterns.
987
+ #no dictionary would be formed here
988
+ sub PopulateLineInfo4Header_unit() {
989
+ my $HeaderH = shift;
990
+ my %curState = ();
991
+
992
+ foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
993
+ my $PureTextLine = $$HeaderH{$line}{RawContent};
994
+ $PureTextLine =~ s/(\<)*\<(\/)*(\w+)\>(\>)*/ /g; # remove the tags
995
+ $PureTextLine =~ s/\+L\+//g;
996
+ $PureTextLine =~ s/^\s+//g;
997
+ $PureTextLine =~ s/\s+$//g;
998
+ #should make punctuation separate!
999
+ $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);
1000
+ $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
1001
+ #add the position of the line here!!!!
1002
+ $$HeaderH{$line}{FeaVec}{Clinepos} = $line;
1003
+ my $textFeaVec = "";
1004
+ foreach my $fea(keys %{$$HeaderH{$line}{FeaVec}}) {
1005
+ if($$HeaderH{$line}{FeaVec}{$fea} == 0) {
1006
+ delete ($$HeaderH{$line}{FeaVec}{$fea});
1007
+ }else {
1008
+ $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
1009
+ }
1010
+ }
1011
+ $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug
1012
+
1013
+ #assign class tag to each line -- not separator <<sep>><</sep>> here
1014
+ if ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/) {
1015
+ %curState = ();
1016
+ my $tmpIndex = 0; # the order of this tag showed up last time
1017
+ my $preTag = -1;
1018
+ my $mul = 0;
1019
+ while ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/g) {
1020
+ $tmpIndex++;
1021
+ my $tmptag = $4;
1022
+ if (($preTag > 0) && ($preTag ne $tagMap{$tmptag})) {
1023
+ $mul = 1;
1024
+ }
1025
+ $curState{$tagMap{$tmptag}} = $tmpIndex;
1026
+ $preTag = $tagMap{$tmptag};
1027
+ }
1028
+
1029
+ if ($mul) {
1030
+ $$HeaderH{$line}{TClass} = "m";
1031
+ my $order = 1;
1032
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1033
+ $$HeaderH{$line}{MClsName}{$tag} = $order;
1034
+ $order++;
1035
+ }
1036
+
1037
+ #represent the class distribution only for this multi-class case.
1038
+ my $Tline = $$HeaderH{$line}{RawContent};
1039
+ #main purpose is to combine </phone><email> as one <s>
1040
+ $Tline =~ s/\<(\/)*(\w+)\>/<s>/g; #replace the tags with <s>
1041
+ $Tline =~ s/^\s*<s>\s*//g;
1042
+ $Tline =~ s/\s*<s>\s*$//g;
1043
+ $Tline =~ s/<s>\s*<s>/<s>/g;
1044
+ $Tline =~ s/\s+/ /g;
1045
+
1046
+ $Tline = &SeparatePunc($Tline);
1047
+
1048
+ while ($Tline =~ /(\s+(\W+)\s+<s>)/g) {
1049
+ my $whole = $1;
1050
+ my $punc = $2;
1051
+ $punc =~ s/^\s+//g;
1052
+ $punc =~ s/\s+$//g;
1053
+
1054
+ if ($punc eq "\|") {
1055
+ $Tline =~ s/\|/\!\!\!/g;
1056
+ $whole =~ s/\|/\!\!\!/g;
1057
+ }
1058
+ $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
1059
+ if ($punc eq "\|") {
1060
+ $Tline =~ s/\!\!\!//g;
1061
+ $whole =~ s/\!\!\!//g;
1062
+ }
1063
+ }
1064
+ while ($Tline =~ /(<s>\s+(\W+)\s+)/g) {
1065
+ my $whole = $1;
1066
+ my $punc = $2;
1067
+ $punc =~ s/^\s+//g;
1068
+ $punc =~ s/\s+$//g;
1069
+ if ($punc eq "\|") {
1070
+ $Tline =~ s/\|/\!\!\!/g;
1071
+ $whole =~ s/\|/\!\!\!/g;
1072
+ }
1073
+ $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
1074
+ if ($punc eq "\|") {
1075
+ $Tline =~ s/\!\!\!/\|/g;
1076
+ $whole =~ s/\!\!\!/\|/g;
1077
+ }
1078
+ }
1079
+ $Tline =~ s/<s>/<<sep>><<\/sep>>/g;
1080
+ my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
1081
+ #Populate Truth Hash by the chunk and word-class distribution
1082
+ $$HeaderH{$line} = &AssignWordTagFromChunk($$HeaderH{$line}, $SepH, $component);
1083
+ }else {
1084
+ $$HeaderH{$line}{TClass} = "s";
1085
+ my @Tarr = split(/\s+/, $PureTextLine);
1086
+ $$HeaderH{$line}{SClsWordCount} = $#Tarr +1;
1087
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1088
+ $$HeaderH{$line}{SClsName} = $tag;
1089
+ }
1090
+
1091
+ #Fill in the word-class distribution for single class line
1092
+ my $lineContent = &SeparatePunc($$HeaderH{$line}{PureText});
1093
+ my @wordArray = split(/\s+/, $lineContent);
1094
+ undef $lineContent;
1095
+
1096
+ $$HeaderH{$line} = &AssignWordTag4SingleClassLine("truth", $$HeaderH{$line}{SClsName}, $$HeaderH{$line}, \@wordArray);
1097
+
1098
+ #but only multi-author has multiple chunks
1099
+ #all reasonable name patterns for space separated names
1100
+ #feature vec for each space namepatterns and puncutation separators
1101
+ #Test/prediction will base on the predicted line tag in another module
1102
+
1103
+ #single author
1104
+ if ($$HeaderH{$line}{SClsName} eq "2") {
1105
+ #From Truth
1106
+ if ($$HeaderH{$line}{RawContent} !~ /<<sep>>/) {
1107
+ #could we save space by indicating the pure text directly
1108
+ $$HeaderH{$line}{Tchunk}{$i}{cls} = 2;
1109
+ $$HeaderH{$line}{Tchunk}{$i}{content} = $$HeaderH{$line}{PureText};
1110
+ #multiple authors
1111
+ }else {
1112
+ my $Tline = $$HeaderH{$line}{RawContent};
1113
+ $Tline =~ s/<(\/)*author>//g;
1114
+
1115
+ my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
1116
+ my $nameStr = join(" ", @$component);
1117
+
1118
+ #judge this is punctuated line or pure text-space
1119
+ if ($$HeaderH{$line}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])|(\W+and\W+)/ig) {
1120
+ #multi-class needs while ... $punc++;
1121
+ $$HeaderH{$line}{NamePunc} = 1;
1122
+ }else {
1123
+ $$HeaderH{$line}{NameSpace} = 1;
1124
+ }
1125
+
1126
+ #{NamePuncFeaVec} and {NameSpaceFeaVec} based on number of puncs (>2)
1127
+ #{MulClsPuncFeaVec}
1128
+
1129
+ ######common to both name space and name punc ######
1130
+ my $TrueNames = &HeaderParse::API::NamePatternMatch::GetTrueName($nameStr);
1131
+ for my $i(0 .. $#$TrueNames) {
1132
+ my $j = $i+1; #chunk should start from 1
1133
+ $$HeaderH{$line}{Tchunk}{$j}{cls} = 2;
1134
+ $$HeaderH{$line}{Tchunk}{$j}{content} = "$$TrueNames[$i]";
1135
+ }
1136
+ ################################################
1137
+
1138
+ if ($$HeaderH{$line}{NamePunc}) {
1139
+ }else {
1140
+ my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
1141
+ if ($#$PredictedNames < 1) {
1142
+ #only one pattern -- do not fill name pattern
1143
+ }else {
1144
+ my $TrueIndex = &HeaderParse::API::NamePatternMatch::Duplicate($TrueNames, $PredictedNames);
1145
+ #must solve the problem
1146
+ if ($TrueIndex eq "-1") {
1147
+ if ($debug) {
1148
+ print STDERR "here the true name($TrueNames) is null from the line $content \n";
1149
+ }
1150
+ }else {
1151
+ #populate all reasonable name patterns
1152
+ for my $i(0 .. $#$PredictedNames) {
1153
+ my $candidateName = "";
1154
+ for my $j(0 .. $#{$$PredictedNames[$i]}) {
1155
+ if ($$PredictedNames[$i][$j]) {
1156
+ $candidateName .= "$$PredictedNames[$i][$j]<>";
1157
+ }
1158
+ }
1159
+ # print "candidate name\: $candidateName ";
1160
+ $$HeaderH{$line}{NamePattern}{$candidateName}{content} = $candidateName;
1161
+ ($$HeaderH{$line}{NamePattern}{$candidateName}{SpaceNameVec}) = &SpaceNameLnFeaRepre_unit($candidateName);
1162
+ if ($i eq $TrueIndex) {
1163
+ $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = 1;
1164
+ }else {
1165
+ $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = -1;
1166
+ }
1167
+ }
1168
+ }
1169
+ }
1170
+ }
1171
+ }
1172
+ }
1173
+ }
1174
+ }else { #if there is no explicit tag for this line, this line only belongs to the last class of the previous line
1175
+ my $tmpI = 0;
1176
+ foreach my $state (sort {$curState{$b} <=> $curState{$a}} keys %curState) {
1177
+ if ($tmpI > 0) {
1178
+ delete ($curState{$state});
1179
+ } #only keep the last tag
1180
+ $tmpI++;
1181
+ }
1182
+ $$HeaderH{$line}{TClass} = "s";
1183
+ foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
1184
+ $$HeaderH{$line}{SClsName} = $tag;
1185
+ }
1186
+ }
1187
+ }
1188
+
1189
+ return($HeaderH);
1190
+ }
1191
+
1192
+
1193
sub VectorizeUnknownHeaderLine () {
    # Build the per-line feature representation for an UNLABELLED header:
    # for every line, store the punctuation-tokenized text, the word count,
    # the numeric feature vector (plus the line position as a feature), and
    # a human-readable rendering of the non-zero features.
    # Removed: an unused "my %curState" local left over from the training-side twin.
    my $HeaderH = shift;   # hashref: line number => { RawContent => raw line text, ... }

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        my $PureTextLine = $$HeaderH{$line}{RawContent};
        $PureTextLine =~ s/^\s+//g;
        $PureTextLine =~ s/\s+$//g;
        # tokenize so each punctuation mark becomes its own token
        $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);

        my @Tarr = split(/\s+/, $PureTextLine);
        $$HeaderH{$line}{SClsWordCount} = $#Tarr + 1;
        $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});

        # the line's position within the header is itself a feature
        $$HeaderH{$line}{FeaVec}{Clinepos} = $line;

        # drop zero-weight features; collect the rest in readable form
        my $textFeaVec = "";
        foreach my $fea (keys %{$$HeaderH{$line}{FeaVec}}) {
            if ($$HeaderH{$line}{FeaVec}{$fea} == 0) {
                delete($$HeaderH{$line}{FeaVec}{$fea});
            } else {
                $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
            }
        }
        $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug
    }

    return($HeaderH);
}
1228
+
1229
+
1230
#training data are assigned the true neighbour lines' tag
sub TrainAssignLineTag() {
    # Training-time context features: for each line, encode the TRUE class
    # tags of up to 4 preceding ("P") and 4 following ("N") lines as features
    # named "P<dist><cls>" / "N<dist><cls>" with fixed weight 0.5.  New
    # context features are registered in the feature dictionary as they are
    # first seen, and the collected features are appended (in dictionary-ID
    # order) to each line's SVM vector string as {ContextSVMFeaVec}.
    my $FeatureDictH = shift;   # feature dictionary: name => {ID, max, DF}; plus scalar {FeatureCounter}
    my $HeaderH = shift;        # line number => line record (TClass/SClsName/MClsName/SVMFeaVec ...)
    my %curState = ();          # NOTE(review): never used in this sub

    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PC = 1; # 0 means the tag for current line (which might be useful)
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { #previous line
            if (exists $$HeaderH{$Pline}{TClass}) {
                if ($$HeaderH{$Pline}{TClass} eq "s") {
                    # single-class neighbour: one context feature
                    my $ContextFea = "P"."$PC"."$$HeaderH{$Pline}{SClsName}";
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        # first sighting: allocate the next feature ID
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }

                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }else { # consider the order of the tag
                    # multi-class neighbour: one context feature per tag, in tag order
                    foreach my $tag(sort {$$HeaderH{$Pline}{MClsName}{$a} <=> $$HeaderH{$Pline}{MClsName}{$b}} keys %{$$HeaderH{$Pline}{MClsName}}){
                        my $ContextFea = "P"."$PC"."$tag";
                        if (! $$FeatureDictH{$ContextFea}{ID}) {
                            $$FeatureDictH{FeatureCounter}++;
                            $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                            $$FeatureDictH{$ContextFea}{max} = 0.5;
                        }
                        if ($$FeatureDictH{$ContextFea}{ID}) {
                            $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                            $$FeatureDictH{$ContextFea}{DF}++;
                        }
                    }
                }
                $PC++;
                $Pline = $line - $PC;
            }else {
                # an untagged line interrupts the context window: stop looking back
                last;
            }
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { #next line
            if ($$HeaderH{$Nline}{TClass} eq "s") {
                my $ContextFea = "N"."$NC"."$$HeaderH{$Nline}{SClsName}";
                if (! $$FeatureDictH{$ContextFea}{ID}) {
                    $$FeatureDictH{FeatureCounter}++;
                    $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                    $$FeatureDictH{$ContextFea}{max} = 0.5;
                }
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    $$FeatureDictH{$ContextFea}{DF}++;
                }
            }else { # consider the order of the tag
                foreach my $tag(sort {$$HeaderH{$Nline}{MClsName}{$a} <=> $$HeaderH{$Nline}{MClsName}{$b}} keys %{$$HeaderH{$Nline}{MClsName}}){
                    my $ContextFea = "N"."$NC"."$tag";
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        #assemble features and their weight into string without normalization
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};
        foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        # NOTE(review): unlike TestAssignLineTag, trailing whitespace is NOT
        # stripped here -- confirm downstream tooling tolerates the trailing blank
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
1317
+
1318
sub TestAssignLineTag() {
    # Test-time analogue of TrainAssignLineTag: context features are built
    # from the PREDICTED tags ({Pretag}) of up to 4 neighbouring lines, and
    # only features already registered in the dictionary are kept.  Appends
    # the result to each line's SVM vector as {ContextSVMFeaVec}.
    # Fix: the loop variable $line was an undeclared package global; it is
    # now a lexical, so repeated calls cannot leak or clobber shared state.
    my $FeatureDictH = shift;   # feature dictionary: name => {ID, max, DF}
    my $HeaderH = shift;        # line number => line record
    my %curState = ();          # NOTE(review): unused, kept for parity with the train version

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        # start from a clean context vector -- this sub may be re-run as
        # predictions are iteratively refined
        if (exists ($$HeaderH{$line}{ContextFeaVec})) {
            delete($$HeaderH{$line}{ContextFeaVec});
        }

        my $PC = 1; # 0 would be the current line itself
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { # up to 4 previous lines
            if (exists $$HeaderH{$Pline}{Pretag}) {
                foreach my $tag (sort keys %{$$HeaderH{$Pline}{Pretag}}) {
                    my $ContextFea = "P"."$PC"."$tag";
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    }
                }
            }
            $PC++;
            $Pline = $line - $PC;
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { # up to 4 following lines
            # NOTE(review): no "exists Pretag" guard here (unlike the P-loop),
            # so this deref autovivifies {Pretag} on the neighbour -- preserved as-is
            foreach my $tag (sort keys %{$$HeaderH{$Nline}{Pretag}}) {
                my $ContextFea = "N"."$NC"."$tag";
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        # append context features (dictionary-ID order) to the line's SVM vector
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};

        foreach my $fea (sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        $tmpFeaVec =~ s/\s+$//g;
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
1370
+
1371
#given a line, check the number and the position of punctuation/space it contains
sub GetSeparatorIndex() {
    # Classify each whitespace-delimited token of a <space>/<<sep>>-annotated
    # line as a punctuation separator, a space separator, or ordinary text.
    # Returns (punctuation count, marker hash {punc|space}{position}, token list);
    # marker value 2 means an explicit <<sep>> separator, 1 an implicit one.
    my $line = shift;
    my %SeparatorH = ();
    my $PuncNum = 0;

    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # FillSpace marks every inter-word gap; punctuation stays tokenized.
    # (first return value: whether the line contains punc or only space)
    my ($punc, $spaceLine) = &FillSpace($line);

    my @component = split(/\s+/, $spaceLine);
    for my $pos (0 .. $#component) {
        my $token = $component[$pos];

        if ($token =~ /<<sep>>(\W+|\s*)<<\/sep>>/) {
            # explicit separator markup: keep only what sat between the tags
            $component[$pos] = $1;
            if ($component[$pos] eq "") {
                # empty separator: restore the bare marker, a strong space break
                $component[$pos] = "<<sep>><<\/sep>>";
                $SeparatorH{space}{$pos} = 2;
            } else {
                # punctuation inside the markup: strong punc AND strong space
                $SeparatorH{punc}{$pos} = 2;
                $PuncNum++;
                $SeparatorH{space}{$pos} = 2;
            }
        } elsif ($token =~ /<space>/) {
            # plain inter-word space marker
            $SeparatorH{space}{$pos} = 1;
        } elsif ($token =~ /^[^\p{IsLower}\p{IsUpper}\s+\-\d+]+$/) {
            # token consists solely of punctuation characters
            $SeparatorH{punc}{$pos} = 1; #position(not what punc)
            $PuncNum++;
            $SeparatorH{space}{$pos} = 1;
        }
    }
    return($PuncNum, \%SeparatorH, \@component);
}
1407
+
1408
+
1409
#multi-Authors line still has only one class, although 1+ authors
sub AssignWordTagFromChunk() {
    # Split a multi-class line into chunks at strong separator positions
    # (marker value > 1 in $SepH) and assign each chunk -- and each word
    # inside it -- the class tag that corresponds to its order in {MClsName}.
    # Fills {Tchunk}{n}{cls,content} and {Tline}{wordpos}{cls,OriginalWord}.
    # Fix: the initial tag was read as $tags[$tagP] where $tagP was never
    # declared (it only worked because undef numifies to 0); now $tags[0].
    my ($LineH, $SepH, $component) = @_;

    # class tags ordered by their appearance on the line
    my @tags = ();
    foreach my $tag(sort {$$LineH{MClsName}{$a} <=> $$LineH{MClsName}{$b}} keys %{$$LineH{MClsName}}) {
        push @tags, $tag;
    }

    my $ChunkNO = 1;
    my $curTag = $tags[0];   # first chunk carries the first tag
    my $WordPos = 1;
    my $chunk = "";
    for my $i(0 .. $#$component) {
        #we do not assign class to separators
        if ($$SepH{space}{$i} > 1) {
            # strong separator: close the current chunk and advance the tag
            if ($chunk ne "") {
                $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
                $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;
                $chunk = "";
                $curTag = $tags[$ChunkNO];
                $ChunkNO++;
            }
        }elsif ($$component[$i] !~ /<space>|^\W+$|\<\<.*\>\>/) { # such as <<sep>> <</sep>>
            # ordinary word: accumulate it and record its per-word tag
            $chunk .= "$$component[$i] ";
            $$LineH{Tline}{$WordPos}{cls} = $curTag;
            $$LineH{Tline}{$WordPos}{OriginalWord} = $$component[$i];
            $WordPos++;
        }
    }

    #Fill in the last chunk
    $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
    $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;

    return ($LineH);
}
1445
+
1446
+
1447
sub AssignWordTag4SingleClassLine() {
    # Record the class tag for every real word of a single-class line.
    # $type selects the destination map: "truth" fills {Tline}, "predict"
    # fills {Pline}; each word gets {cls} and {OriginalWord} at its position.
    my ($type, $curTag, $LineH, $component) = @_;

    my $WordPos = 1;
    foreach my $token (@$component) {
        # skip markup and pure-punctuation tokens (<space>, <<sep>>, <</sep>>, ...)
        next if $token =~ /<space>|^\W+$|\<\<.*\>\>/;

        if ($type eq "truth") {
            $$LineH{Tline}{$WordPos}{cls} = $curTag;
            # added 01/08: also keep the original word at this position
            $$LineH{Tline}{$WordPos}{OriginalWord} = $token;
        } elsif ($type eq "predict") {
            $$LineH{Pline}{$WordPos}{cls} = $curTag;
            $$LineH{Pline}{$WordPos}{OriginalWord} = $token;
        }
        $WordPos++;
    }

    return ($LineH);
}
1467
+
1468
+
1469
sub Analyze() {
    # Read the first line of an SVM classifier output file and return it with
    # trailing whitespace stripped (the classifier writes a single score).
    # Improvements: three-argument open with a lexical filehandle (was a
    # 2-arg bareword open), and an empty file now yields "" instead of
    # warning on s/// against undef.
    my $resultF = shift;
    open(my $resultFH, '<', $resultF)
        || die "SVMHeaderParse: could not open $resultF to read: $!";
    my $result = <$resultFH>;
    close($resultFH);
    $result = "" unless defined $result;   # empty output file
    $result =~ s/\s+$//g;
    return($result);
}
1477
+
1478
+
1479
sub ReadFeatureDict() {
    # Load the feature dictionary written at training time.
    # File format, one feature per line: ID<>name<>max<>DF
    # A line whose name contains "FeatureCounter" stores the feature count
    # under {$fea}{num} instead of a regular entry.
    # Fixes: the trailing-whitespace strip was applied to a typo'd variable
    # ($Df instead of $DF), so every DF value kept its trailing newline;
    # also switched to a three-argument open with a lexical filehandle.
    my $Fname = shift;
    my %FeatureDictH;

    open(my $FH, '<', $Fname)
        || die "SVMHeaderParse: could not open $Fname to read: $!";
    while (my $line = <$FH>) {
        my ($ID, $fea, $max, $DF) = split(/<>/, $line);
        $ID =~ s/^\s+//g;
        $ID =~ s/\s+$//g;

        if ($fea =~ /FeatureCounter/) {
            $FeatureDictH{$fea}{num} = $ID;
            next;
        }

        $fea =~ s/^\s+//g;
        $fea =~ s/\s+$//g;
        $max =~ s/^\s+//g;
        $max =~ s/\s+$//g;
        $DF  =~ s/^\s+//g;
        $DF  =~ s/\s+$//g;   # was "$Df" (typo): DF kept its trailing newline
        $FeatureDictH{$fea}{ID}  = $ID;
        $FeatureDictH{$fea}{max} = $max;
        $FeatureDictH{$fea}{DF}  = $DF;
    }
    close($FH);
    return(\%FeatureDictH);
}
1507
+
1508
+
1509
sub printTrainData() {
    # Write one-vs-rest SVM training files: for each of the 15 line classes,
    # emit every line's feature vector labelled "1" (the line carries that
    # class, via SClsName or MClsName) or "-1" (it does not).
    # $affix selects the vector source: "train" uses {SVMFeaVec},
    # "context" uses {ContextSVMFeaVec}; the affix is also the file suffix.
    my $affix = shift;
    my $HeaderH = shift;   # header number => line number => line record

    #Sometimes $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} is not null, but
    #$$HeaderH{$HeaNO}{$LN}{SVMFeaVec} is null so they have different file length!
    for my $clsNO(1 .. 15) {
        # NOTE(review): $offlineD is a package global (output directory) set
        # elsewhere in this module -- confirm it is initialized before calling.
        my $F = "$offlineD"."$clsNO"."\."."$affix";
        open(FH, ">$F") || die "SVMHeaderParse: could not open $F to write: $!";
        foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
            foreach my $LN(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
                # lines with no base feature vector are skipped entirely
                if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                    if ($affix eq "train") {
                        # NOTE(review): this inner re-check duplicates the
                        # guard two lines up; it is redundant but harmless
                        if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                            if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                                print FH "1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }else {
                                print FH "-1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }
                        }
                    }elsif ($affix eq "context") {
                        #if ($$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} ne "") {
                        if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                            print FH "1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }else {
                            print FH "-1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }
                    }else {
                        # NOTE(review): "weired" is a typo ("weird") in this diagnostic
                        print "weired -- $affix is not context nor train \n";
                    }
                }
            }
        }
        close(FH);
    }
}
1545
+
1546
+
1547
sub printNameSpaceTrainData(){
    # Write the precomputed name-space feature vectors to $printF, one
    # {SpaceNameVec} string per line, ordered by their numeric line counter.
    # Improvement: three-argument open with a lexical filehandle (was a
    # 2-arg bareword open with the mode embedded in the interpolated path).
    my $printF = shift;
    my $NameSpaceTrainVecH = shift;   # line counter => { SpaceNameVec => "id:wt ..." }

    open(my $FH, '>', $printF)
        || die "SVMHeaderParse: could not open $printF to write: $!";
    foreach my $Lcount (sort{$a<=>$b} keys %{$NameSpaceTrainVecH}) {
        print $FH "$$NameSpaceTrainVecH{$Lcount}{SpaceNameVec}\n";
    }
    close($FH);
}
1557
+
1558
+
1559
sub SpaceNameLnFeaRepre() {
    # Feature representation for a "<>"-separated candidate name pattern.
    # Per name part it records: surface form (via RichNameType), position
    # (Last/SecLast/index), and first-name/last-name/non-dictionary evidence
    # from the global lexicons %firstnameH, %lastnameH, %dictH.
    # $type "train": registers new features (and symbolic values) in the
    #   dictionary, tracks per-feature maxima, and returns (\%FeatureH, $NameDictH).
    # otherwise (test): maps symbolic values through the dictionary, drops
    #   unknown features, scales by the training max, and returns the two
    #   assembled vector strings (numeric-ID form, readable form).
    my $type = shift;
    my $NamePatternStr = shift;
    my $NameDictH = shift;

    #feature generation and representation
    #It is good to make each of the apple's feature(color, shape..) separate.
    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; #remove the last <>
    my @Names = split(/<>/, $NamePatternStr);

    #try making features binary
    for my $i(0 .. $#Names) {
        my @NameComponent = split(/\s+/, $Names[$i]);
        for my $j(0 .. $#NameComponent){

            #feature generation($i = 0 is the first one)
            $FeatureH{"Name"."$i"."part"."$j"."form"} = &HeaderParse::API::NamePatternMatch::RichNameType($NameComponent[$j]);
            if ($j eq $#NameComponent) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "Last";
            }elsif ($j eq $#NameComponent -1) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "SecLast";
            }else {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = $j;
            }

            #firstname, lastname information
            # (mutually exclusive: FN only if NOT also a known last name, and
            #  vice versa; otherwise flag words absent from the common dictionary)
            if (($firstnameH{lc($NameComponent[$j])}) && (!$lastnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."FN"} = 1;
            }elsif (($lastnameH{lc($NameComponent[$j])}) && (!$firstnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."LN"} = 1;
            }elsif (! $dictH{lc($NameComponent[$j])}) {
                $FeatureH{"Name"."$i"."part"."$j"."NonDict"} = 1;
            }

            #space for more features
        }
    }

    #Build up FeatureVec
    #code for the attribute ID separately so that the ID for features would be continuous
    if ($type eq "train") {
        # NOTE(review): sorting string keys with a numeric comparator makes
        # every key compare equal (0), so the iteration order is effectively
        # hash order -- confirm ID assignment order does not matter downstream
        foreach my $fea(sort {$a <=> $b} keys %FeatureH) {
            if (! $$NameDictH{$fea}{ID}) {
                $$NameDictH{FeatureCounter}++;
                $$NameDictH{$fea}{ID} = $$NameDictH{FeatureCounter};
            }

            # symbolic feature values (e.g. "Last") also get dictionary IDs
            if (! IsNumber($FeatureH{$fea})) {
                if (! exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $$NameDictH{FeatureCounter}++;
                    $$NameDictH{$FeatureH{$fea}}{ID} = $$NameDictH{FeatureCounter};
                }
                $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
            }

            if ($FeatureH{$fea} == 0) {
                delete($FeatureH{$fea});
            }else {
                # track the per-feature maximum for test-time scaling
                if ((! exists $$NameDictH{$fea}{max}) || ($$NameDictH{$fea}{max} < $FeatureH{$fea})) {
                    $$NameDictH{$fea}{max} = $FeatureH{$fea};
                }
            }
        }
        return(\%FeatureH, $NameDictH);
    #test
    }else {
        my $SpaceNameFeaVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$NameDictH{$a}{ID} <=> $$NameDictH{$b}{ID}} keys %FeatureH) {
            # map symbolic values through the dictionary; drop unknown ones
            if (! &IsNumber($FeatureH{$fea})) {
                if (exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
                }else {
                    delete($FeatureH{$fea});
                }
            }

            if (! ($FeatureH{$fea} && $$NameDictH{$fea}{ID})) {
                delete($FeatureH{$fea});
            }else {
                # scale by the feature's training-time maximum
                $FeatureH{$fea} = sprintf("%.8f", $FeatureH{$fea}/$$NameDictH{$fea}{max});
                $SpaceNameFeaVec .= "$$NameDictH{$fea}{ID}\:$FeatureH{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$FeatureH{$fea} ";
            }
        }
        return($SpaceNameFeaVec, $SpaceNameTextFeaVec);
    }
}
1649
+
1650
+
1651
sub SpaceNameLnFeaRepre_unit() {
    # Build the raw (symbolic/binary) feature hash for one "<>"-separated
    # candidate name pattern, without touching any feature dictionary.
    # Per name part: surface form via RichNameType, position label, and
    # first-name/last-name/non-dictionary evidence from the global lexicons.
    my $NamePatternStr = shift;

    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g;            # drop the trailing "<>" delimiter
    my @Names = split(/<>/, $NamePatternStr);

    for my $nameIdx (0 .. $#Names) {
        my @parts = split(/\s+/, $Names[$nameIdx]);
        for my $partIdx (0 .. $#parts) {
            my $keyBase = "Name" . $nameIdx . "part" . $partIdx;
            my $word    = lc($parts[$partIdx]);

            # surface form of this name part (initial, capitalized word, ...)
            $FeatureH{$keyBase . "form"} = &HeaderParse::API::NamePatternMatch::RichNameType($parts[$partIdx]);

            # position of the part within its name
            if ($partIdx eq $#parts) {
                $FeatureH{$keyBase . "pos"} = "Last";
            } elsif ($partIdx eq $#parts - 1) {
                $FeatureH{$keyBase . "pos"} = "SecLast";
            } else {
                $FeatureH{$keyBase . "pos"} = $partIdx;
            }

            # lexicon evidence (globals %firstnameH, %lastnameH, %dictH):
            # FN/LN only when unambiguous; otherwise flag non-dictionary words
            if ($firstnameH{$word} && !$lastnameH{$word}) {
                $FeatureH{$keyBase . "FN"} = 1;
            } elsif ($lastnameH{$word} && !$firstnameH{$word}) {
                $FeatureH{$keyBase . "LN"} = 1;
            } elsif (!$dictH{$word}) {
                $FeatureH{$keyBase . "NonDict"} = 1;
            }

            # room for more features here
        }
    }
    return(\%FeatureH);
}
1693
+
1694
+
1695
sub IsNumber ()
{
    # Return 1 iff the string is digits optionally followed by one or more
    # ".digits" groups (so "1.2.3" also qualifies); no sign, no leading dot.
    # Returns 0 otherwise.
    my $in = shift;
    return ($in =~ m/^(\d+)(\.\d+)*$/) ? 1 : 0;
}
1704
+
1705
+
1706
sub FormFeaDict() {
    # Build the two training-time feature dictionaries from all headers:
    #  - $FeatureDictH for the line-level {FeaVec} features (ID, max, DF),
    #  - %NameSpaceFeaDictH for the name-pattern {SpaceNameVec} features.
    # Zero-weight features are deleted from the data as a side effect, and
    # symbolic name-pattern values are replaced by their dictionary IDs.
    my $DataH = shift;          # header number => line number => line record
    my $FeatureDictH = shift;   # may already be partially populated
    my %NameSpaceFeaDictH = ();

    foreach my $HeaNO (sort {$a <=> $b} keys %{$DataH}) {
        foreach my $line (sort {$a <=> $b} keys %{$$DataH{$HeaNO}}) {
            foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{FeaVec}}) {
                if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0) {
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                    next;
                }else {
                    # register the feature and keep its running max and DF
                    if (! $$FeatureDictH{$fea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$fea}{ID} = $$FeatureDictH{FeatureCounter};
                    }
                    if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} > $$FeatureDictH{$fea}{max}) {
                        $$FeatureDictH{$fea}{max} = $$DataH{$HeaNO}{$line}{FeaVec}{$fea};
                    }
                    $$FeatureDictH{$fea}{DF}++;
                }
                #test needs this line!
                # NOTE(review): after the else-branch above, the feature always
                # has an ID and a non-zero value, so this check appears to be
                # dead code on the training path -- kept as-is
                if ((! $$FeatureDictH{$fea}{ID}) || ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0)) { #some basic feature defined in initialization such as pubnumber could be 0
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            #form the Name Space Feature Dict
            if (exists $$DataH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$DataH{$HeaNO}{$line}{NamePattern}}) {
                    foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        my $wt = $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea};
                        if (! $NameSpaceFeaDictH{$fea}{ID}) {
                            $NameSpaceFeaDictH{FeatureCounter}++;
                            $NameSpaceFeaDictH{$fea}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                        }
                        # symbolic values (e.g. "Last") get their own IDs and
                        # are substituted into the data in place
                        if (! &IsNumber($wt)) {
                            if (! exists $NameSpaceFeaDictH{$wt}{ID}) {
                                $NameSpaceFeaDictH{FeatureCounter}++;
                                $NameSpaceFeaDictH{$wt}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                            }
                            $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = $NameSpaceFeaDictH{$wt}{ID};
                        }

                        # NOTE(review): $wt still holds the ORIGINAL value here,
                        # so the max below tracks pre-substitution weights
                        if ($wt == 0) {
                            delete($$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }else {
                            if ((! exists $NameSpaceFeaDictH{$fea}{max}) || ($NameSpaceFeaDictH{$fea}{max} < $wt)) {
                                $NameSpaceFeaDictH{$fea}{max} = $wt;
                            }
                        }
                    }
                }
            }
            #end of form the dictionary for the name
        }
    }
    return($DataH, $FeatureDictH, \%NameSpaceFeaDictH);
}
1765
+
1766
+
1767
sub FormTestFeaVec(){
    # Prune a test header's feature vectors: a test vector may only carry
    # features that were seen at training time (present in the dictionary)
    # and that have a non-zero weight; everything else is deleted in place.
    my $FeatureDictH = shift;
    my $TestHeaderH = shift;

    foreach my $ln (sort{$a<=>$b} keys %{$TestHeaderH}) {
        foreach my $fea (keys %{$$TestHeaderH{$ln}{FeaVec}}) {
            my $unknown = ! $$FeatureDictH{$fea}{ID};
            my $zero    = $$TestHeaderH{$ln}{FeaVec}{$fea} == 0;
            delete($$TestHeaderH{$ln}{FeaVec}{$fea}) if $unknown || $zero;
        }
    }
    return($TestHeaderH);
}
1780
+
1781
+
1782
sub PruneDict() {
    # Compact the feature dictionary: drop features that never took a
    # non-zero value or appeared in fewer than 2 documents (DF < 2), then
    # renumber the surviving feature IDs consecutively from 1.
    my $FeatureDictH = shift;
    my $Recount = 1;

    foreach my $DictFea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
        # NOTE(review): the "FeatureCounter" key holds a plain scalar count,
        # yet it is iterated here and hash-dereferenced ({max}/{DF}/{ID}) like
        # a feature entry.  Without "use strict 'refs'" this silently reads
        # undef via a symbolic ref, so FeatureCounter is deleted by the DF<2
        # test and then restored by the assignment after the loop -- it works,
        # but only by accident; confirm before enabling strictures.
        if ((($DictFea ne "FeatureCounter") && ($$FeatureDictH{$DictFea}{max} == 0)) || ($$FeatureDictH{$DictFea}{DF} < 2)) {
            delete($$FeatureDictH{$DictFea});
        }else {
            $$FeatureDictH{$DictFea}{ID} = $Recount;
            $Recount++;
        }
    }

    # store the new feature count (as a plain scalar, matching FormFeaDict)
    $$FeatureDictH{FeatureCounter} = $Recount-1;

    return($FeatureDictH);
}
1799
+
1800
#input is an array of name patterns
#return a string of the best name pattern
sub PredictBestNamePattern() {
    # Score every candidate name segmentation with the external SVM
    # classifier and return the highest-scoring one as a "<>"-joined string
    # (without the trailing "<>").  Uses two scratch files for the feature
    # vector and the classifier output; both are removed before returning.
    my $PredictedNames = shift;      # arrayref of candidates, each an arrayref of name strings
    my $SVMNameSpaceModel = shift;   # path to the trained name-space SVM model
    my $SpaceNameDictH = shift;      # name-space feature dictionary (ID/max per feature)
    my $tmpCacheVec = shift;         # scratch file: feature vector input
    my $SVMTmpResult = shift;        # scratch file: classifier score output

    my $MaxVal = -10;                # assumed lower than any real SVM score -- TODO confirm
    my $BestNamePattern = "";

    for my $i(0 .. $#$PredictedNames) {
        # flatten the candidate into "name1<>name2<>..." form
        my $candidateName = "";
        for my $j(0 .. $#{$$PredictedNames[$i]}) {
            if ($$PredictedNames[$i][$j]) {
                $candidateName .= "$$PredictedNames[$i][$j]<>";
            }
        }

        my ($RawNameFeaVec) = &SpaceNameLnFeaRepre_unit($candidateName);
        #filter out the non-dictinary features
        my $SpaceNameVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$SpaceNameDictH{$a}{ID} <=> $$SpaceNameDictH{$b}{ID}} keys %{$RawNameFeaVec}) {
            my $wt = $$RawNameFeaVec{$fea};
            # symbolic values are mapped to their dictionary IDs, or dropped
            if (! &IsNumber($wt)) {
                if (exists $$SpaceNameDictH{$wt}{ID}) {
                    $$RawNameFeaVec{$fea} = $$SpaceNameDictH{$wt}{ID};
                }else {
                    delete($$RawNameFeaVec{$fea});
                }
            }

            if (! (($$RawNameFeaVec{$fea}>0) && $$SpaceNameDictH{$fea}{ID})) {
                delete($$RawNameFeaVec{$fea});
            }else {
                # scale by the feature's training-time maximum
                $$RawNameFeaVec{$fea} = sprintf("%.8f", $$RawNameFeaVec{$fea}/$$SpaceNameDictH{$fea}{max}
                );
                $SpaceNameVec .= "$$SpaceNameDictH{$fea}{ID}\:$$RawNameFeaVec{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$$RawNameFeaVec{$fea} ";
            }
        }
        open(testVec, ">$tmpCacheVec") || die "SVMHeaderParse: could not open $tmpCacheVec to write: $!";
        # print "NamePattern FeatureVec is\: $SpaceNameTextVec\n";
        print testVec "$SpaceNameVec";
        close(testVec);
        # NOTE(review): $Classifier is a package global (svm_classify path)
        # set elsewhere in this module; interpolated into a shell command
        `$Classifier -v 0 $tmpCacheVec $SVMNameSpaceModel $SVMTmpResult`;
        my $result = &Analyze($SVMTmpResult);
        if ($result > $MaxVal) {
            $MaxVal = $result;
            $BestNamePattern = $candidateName;
        }
    }

    unlink $tmpCacheVec;
    unlink $SVMTmpResult;

    #split the multiple names in order
    $BestNamePattern =~ s/\<\>$//g; #remove the last <>

    return($BestNamePattern);
}
1863
+
1864
+
1865
sub WordCount() { # borrowed from AddrMatch in function.pm
    # Count the non-punctuation tokens in a whitespace-separated string.
    # Fix: the loop tested $words[0] instead of $words[$i], so the result was
    # all-or-nothing -- 0 if the FIRST token was punctuation, otherwise the
    # full token count.  Each token is now tested individually.
    my $inStr = shift;
    $inStr =~ s/^\s+//g;
    $inStr =~ s/\s+$//g;

    my $senLen = 0;
    my @words = split(/\s+/, $inStr);
    for my $i(0 .. $#words) {
        if ($words[$i] !~ /^\W+\s*$/) { # skip pure-punctuation tokens
            $senLen ++;
        }
    }
    return($senLen);
}
1879
+
1880
+ 1;