biblicit 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (322)
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,2016 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::Function;
14
+
15
+ use utf8;
16
+ use HeaderParse::Config::API_Config qw($Database_Dir);
17
+ use HeaderParse::API::LoadInformation;
18
+ require Exporter;
19
+ use Storable qw(nfreeze thaw);
20
+ use Data::Dumper;
21
+ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
22
+ use vars qw(%dictH %nameH %monthH %affiH %addrH %conjH %prepH %postcodeH %cityH %stateH %countryH %abstractH);
23
+
24
+ @ISA = qw(Exporter); # important!!
25
+ @EXPORT = qw(&weired_author &AddrMatch &printDict &GenTrainVecMatrix &GetBorderLine &LineFeatureRepre &LineFeatureRepre2 &OfflineFillSpace &FillSpace &SeparatePunc &hash_stopwords &hash_nickname &hash_affi_stopwords &hash_addrwords &hash_statewords &str_space_clean &dump_hash_to_file &nfreeze_hash_to_file &read_hash_from_file &thaw_hash_from_file &rand_split_samples_to2parts &rand_split_samples_toNparts &rand_split_hash_index_toNparts &ExtractBinaryNfoldSVMResult &GetNameVariations &get_university_emails &compute_std);
26
+
27
# AddrMatch: replace capitalized two-word place names in a header line with
# the placeholder tokens ":state:", ":country:" or ":city:", and count the
# non-punctuation tokens in the line.
#
# Looks up "lc(word[i-1]) lc(word[i])" in the package globals %stateH,
# %countryH and %cityH (loaded elsewhere from the resources/database files);
# on a hit, word[i-1] is blanked and word[i] becomes the placeholder.
#
# Parameters:
#   $inline - one line of header text
# Returns:
#   ($inline, $senLen) - the rewritten line (leading/trailing whitespace
#   stripped; blanked words leave a doubled internal space, as before) and
#   the count of tokens that are not pure punctuation.
#
# Fixed: the sub carried an empty prototype "()" although it takes an
# argument; the prototype is dropped (backward-compatible — it only loosens
# compile-time checking). Dead commented-out debug logging removed.
sub AddrMatch {
    my $inline = shift;

    my @words  = split /\s+/, $inline;
    my $senLen = 0;

    # Count the leading token unless it is pure punctuation.
    if ($words[0] !~ /^\W+\s*$/) {
        $senLen++;
    }

    # Matching state/country/city here word-pair by word-pair can be
    # time-consuming, but mirrors the original brute-force approach.
    for my $i (1 .. $#words) {
        if ($words[$i] !~ /^\W+\s*$/) {
            $senLen++;    # skip pure-punctuation tokens
        }

        # Only try a two-word place name when both words are capitalized.
        if (($words[$i - 1] =~ /^[\p{IsUpper}]/) && ($words[$i] =~ /^[\p{IsUpper}]/)) {
            my $pre = lc $words[$i - 1];
            my $now = lc $words[$i];

            if (exists $stateH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":state:";
            }
            elsif (exists $countryH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":country:";
            }
            elsif (exists $cityH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":city:";
            }
        }
    }

    # Interpolating @words joins with single spaces; only the line's ends
    # are trimmed (internal doubled spaces from blanked words remain, as in
    # the original).
    $inline = "@words";
    $inline =~ s/^\s+//;
    $inline =~ s/\s+$//;

    return ($inline, $senLen);
}
87
+
88
+
89
# printDict: finalize a feature dictionary and write it to a file.
#
# For every feature with a defined ID, converts the accumulated {mean} sum
# into a per-training-line mean (sum / $TotalTrainLineCount, "%.8f"), then
# writes one line per feature, ordered by feature ID:
#   "<df> <ID> <feature>: max(<max>) BNmean(<mean>) ANmean(<mean/max>)"
#
# Parameters:
#   $TotalTrainLineCount - number of training lines the means were summed over
#   $dictF               - output path for the dictionary file
#   %dictH               - feature name -> {ID, df, max, mean}
# Returns:
#   the (flattened copy of the) dictionary hash with {mean} rewritten.
#
# NOTE(review): a feature with max == 0 prints a diagnostic to STDERR and
# then still divides by {max} — that dies with "Illegal division by zero",
# exactly as the original did; confirm whether max==0 can actually occur.
#
# Fixed: bareword filehandle + 2-arg open replaced with a lexical handle and
# 3-arg open (mode can no longer leak in via the filename); the spurious
# empty prototype "()" is dropped. Output is byte-identical.
sub printDict {
    my ($TotalTrainLineCount, $dictF, %dictH) = @_;

    open(my $dict_fh, '>', $dictF)
        || die "SVMHeaderParse: could not open dictfile: $dictF to write\n";

    # Emit features in ascending ID order, replacing each accumulated sum
    # with its per-line mean as we go.
    foreach my $feature (sort { $dictH{$a}{ID} <=> $dictH{$b}{ID} } keys %dictH) {
        next unless defined $dictH{$feature}{ID};

        $dictH{$feature}{mean} =
            sprintf("%.8f", $dictH{$feature}{mean} / $TotalTrainLineCount);

        if ($dictH{$feature}{max} == 0) {
            print STDERR "$feature Yahoo1 \n";
        }
        my $ANmean = sprintf("%.8f", $dictH{$feature}{mean} / $dictH{$feature}{max});
        print {$dict_fh} "$dictH{$feature}{df} $dictH{$feature}{ID} $feature: max($dictH{$feature}{max}) BNmean($dictH{$feature}{mean}) ANmean($ANmean)\n";
    }
    close($dict_fh);

    return (%dictH);
}
109
+
110
# GenTrainVecMatrix: write per-line training feature vectors to a file, and
# optionally a sparse matrix file plus a tag-index file.
#
# Output format (one line per header line): "(tag1 tag2 ...) ID:value ID:value ..."
# with the literal separator "<NEW_HEADER>" between samples.
#
# Parameters:
#   $FeatureDictH     - hashref: feature name -> {ID, max, mean}
#   $TrainFeatureVecH - hashref: sample -> line -> {tag => {...}, content => {feature => value}}
#   $TrainFeatureVec  - output path for the feature-vector file
#   $TrainMatrixF     - output path for the sparse matrix file (opened only if $GenMatrix)
#   $TrainTagInd      - output path for the tag-index file (opened only if $GenMatrix)
#   $GenMatrix        - true: emit matrix + tag-index files; false: vectors only
#   $norm             - true: divide each feature value by that feature's max
#   $center           - true (with $norm, $GenMatrix==0): also subtract the normalized mean
#
# Side effects: mutates the values inside %$TrainFeatureVecH in place, then
# releases the whole hash via undef; assigns 0 to the package global
# $endTrain (presumably read by the caller — TODO confirm).
#
# NOTE(review): TrainTagIndFH is only *opened* when $GenMatrix is true, yet it
# is printed to and closed unconditionally below — when $GenMatrix is false
# those writes go to an unopened bareword handle and are silently lost.
# Confirm whether the tag-index output is intended in that mode.
sub GenTrainVecMatrix() {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    open (TrainFeatureVec, ">$TrainFeatureVec") || die "SVMHeaderParse: could not open TrainFeatureVec $TrainFeatureVec to write\n";
    if ($GenMatrix) {
        open (TrainMatrxFH, ">$TrainMatrixF") || die "SVMHeaderParse: could not open TrainMatrF\: $TrainMatrixF to write\n";
        open (TrainTagIndFH, ">$TrainTagInd") || die "SVMHeaderParse: could not open TrainTagInd\: $TrainTagInd to write\n";
    }
    my $TmpTrainLineNo = 0;
    # Samples and lines are visited in ascending numeric key order so output
    # row numbers ($TmpTrainLineNo) are stable across runs.
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
        foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo ++;

            # Emit the (possibly multi-class) tag set for this line, wrapped
            # in literal parentheses.
            #10/17 multi-class
            print TrainFeatureVec "\(";
            # print TrainTagIndFH "";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print TrainFeatureVec "$tmpCurState ";
                print TrainTagIndFH "$tmpCurState ";
            }
            print TrainFeatureVec "\) ";
            print TrainTagIndFH "\n";

            # brute force; insufficient memorty
            if ($GenMatrix == 0) {
                if ($norm) {
                    if ($center == 1) {
                        # Normalize AND center: must iterate over the FULL
                        # dictionary (not just this line's features), because
                        # centering makes absent features nonzero.
                        #a loop of each feature in the dictionary.
                        foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
                            if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
                                $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                            }
                            # norm
                            my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                            #centering
                            $featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print TrainFeatureVec "\n";
                    }else {
                        # Normalize only: iterate over just this line's
                        # features, skipping any not in the dictionary.
                        foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                            if (! defined ($$FeatureDictH{$feature}{ID})) {
                                next;
                            }
                            my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print TrainFeatureVec "\n";
                    }
                }else { # norm = 0;
                    # Raw values: emit nonzero in-dictionary features as-is.
                    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                        if (! defined ($$FeatureDictH{$feature}{ID})) {
                            next;
                        }
                        if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) { # must be != 0
                            print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        }
                    }
                    print TrainFeatureVec "\n";
                }
            }else {
                # $GenMatrix true: write the vector file AND a sparse matrix
                # row "<lineNo> <featureID> <value>" per nonzero feature.
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;
                    }
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    # NOTE(review): this branch tests plain truth, whereas the
                    # $GenMatrix==0 branches test "!= 0" — behaves differently
                    # for values like "0.00000000" (string-true). Confirm.
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
                        print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # generate the matrix file for the training samples \n";
                        print TrainMatrxFH " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print TrainFeatureVec "\n";
            }# end if

        }# end foreach l(line)
        print TrainFeatureVec "<NEW_HEADER>\n";
        print TrainTagIndFH "<NEW_HEADER>\n";
    }#end foreach s(sample)
    close (TrainFeatureVec);
    undef (%{$TrainFeatureVecH}); # release the training vector hash
    # NOTE(review): $endTrain is an undeclared package global — assumed to be
    # a completion flag read elsewhere; TODO confirm against callers.
    $endTrain = 0;
    close(TrainTagIndFH);
    if ($GenMatrix) {
        close(TrainMatrxFH);
    }
}
206
+
207
+ # this is for the plaintext class -- no difference from GenTrainVecMatrix
208
# GenOriginalTrainVecMatrix
#
# Write SVM-light style training vectors for the plain-text class.  Per the
# author's note above, it is functionally the same as GenTrainVecMatrix.
#
# Arguments (all positional):
#   $FeatureDictH     - hash ref: feature name -> { ID => int, max => num, mean => num }
#   $TrainFeatureVecH - hash ref: sample -> line -> { tag => {state=>...}, content => {feature=>value} }
#   $TrainFeatureVec  - output path for the feature-vector file
#   $TrainMatrixF     - output path for the matrix file (only used when $GenMatrix)
#   $TrainTagInd      - output path for the tag-index file (only opened when $GenMatrix)
#   $GenMatrix        - boolean: also emit the matrix file
#   $norm             - boolean: divide each feature value by the dictionary max
#   $center           - boolean: additionally subtract mean/max (centering)
#
# Side effects: writes the three files above, destroys %$TrainFeatureVecH when
# done, and resets the file-level global $endTrain to 0 (declared elsewhere in
# this file — TODO confirm its role at the call sites).
#
# NOTE(review): TrainTagIndFH is printed to unconditionally inside the loops,
# but it is only opened when $GenMatrix is true — with $GenMatrix == 0 those
# prints go to an unopened bareword handle.  Presumably callers always pass
# $GenMatrix true here; verify against callers before relying on it.
# NOTE(review): the empty prototype "()" conflicts with the sub taking eight
# arguments; it only works because callers invoke it with &, which bypasses
# prototypes.
sub GenOriginalTrainVecMatrix() {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    # Bareword handle TrainFeatureVec deliberately reuses the scalar's name.
    open (TrainFeatureVec, ">$TrainFeatureVec") || die "SVMHeaderParse: here1...could not open TrainFeatureVec $TrainFeatureVec to write\n";
    if ($GenMatrix) {
        open (TrainMatrxFH, ">$TrainMatrixF") || die "SVMHeaderParse: here2...could not open TrainMatrF\: $TrainMatrixF to write\n";
        open (TrainTagIndFH, ">$TrainTagInd") || die "SVMHeaderParse: here3...could not open TrainTagInd\: $TrainTagInd to write\n";
    }
    my $TmpTrainLineNo = 0;  # global (file-wide) line counter used as the matrix row index
    # Samples and lines are visited in ascending numeric order so output rows
    # line up with the matrix row counter.
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
        foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo ++;

            # 10/17 multi-class: emit every tag state for this line, wrapped in
            # literal parentheses in the vector file.
            print TrainFeatureVec "\(";
            # print TrainTagIndFH "";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print TrainFeatureVec "$tmpCurState ";
                print TrainTagIndFH "$tmpCurState ";
            }
            print TrainFeatureVec "\) ";
            print TrainTagIndFH "\n";

            #

            # Dead code kept from the single-class version (never runs).
            if (0) {
                print TrainFeatureVec "$$TrainFeatureVecH{$s}{$li}{tag} ";
                print TrainTagIndFH "$$TrainFeatureVecH{$s}{$li}{tag}\n";
            }

            # Brute force; author notes insufficient memory for large runs.
            if (($GenMatrix == 0) && ($norm == 1) && ($center == 1)) {
                # Normalised + centered output: loop over EVERY dictionary
                # feature (missing ones are materialised as 0 first).
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
                    if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                    }
                    # normalise by the dictionary max
                    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                    # center by subtracting mean/max
                    $featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    # Sparse format: only non-zero entries are written as ID:value.
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print TrainFeatureVec "\n";
            }elsif (($GenMatrix == 0) && ($norm == 1)) {
                # Normalised only: loop over just the features present on this line.
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;  # skip features that never made it into the dictionary
                    }
                    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print TrainFeatureVec "\n";
            }elsif($GenMatrix == 1) {
                # Matrix mode: write the sparse vector AND one "row col value"
                # line per non-zero entry to the matrix file.
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;
                    }
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # matrix file row for this training sample
                        print TrainMatrxFH " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print TrainFeatureVec "\n";
            }# end if

        }# end foreach li (line)
        # Sample separator expected downstream (see GetBorderLine).
        print TrainFeatureVec "<NEW_HEADER>\n";
        print TrainTagIndFH "<NEW_HEADER>\n";
    }#end foreach s (sample)
    close (TrainFeatureVec);
    undef (%{$TrainFeatureVecH}); # release the training vector hash
    $endTrain = 0;
    close(TrainTagIndFH);
    if ($GenMatrix) {
        close(TrainMatrxFH);
    }
}
298
+
299
+
300
+
301
# GetBorderLine
#
# Scan a vector file for sample separators (<NEW_HEADER>) and record which
# non-blank physical lines sit on a sample boundary.
#
# Parameter:
#   $InFile - path of the file containing the sample separators.
# Returns:
#   Hash ref mapping (1-based, blank-lines-excluded) line numbers to:
#     "N" - line that only has a following neighbour (first line of a sample)
#     "P" - line that only has a preceding neighbour (last line of a sample)
#   The separator line itself is counted but gets no entry.
#
# Fixes vs. the original: the empty prototype "()" contradicted the sub taking
# one argument (a compile error for any non-& call), and the bareword two-arg
# open was unsafe against filenames beginning with mode characters.
sub GetBorderLine {
    my $InFile = shift; # this file contains the sample separator
    my %BorderLineH;
    my $LineNO = 0;

    open(my $in_fh, '<', $InFile) || die "SVMHeaderParse: could not open Infile\: $InFile to read \n";
    while (my $li = <$in_fh>) {
        $li =~ s/^\s+//;
        $li =~ s/\s+$//;
        if ($li !~ /^\s*$/) {   # blank lines are not counted
            $LineNO++;
            if ($LineNO == 1) {
                $BorderLineH{$LineNO} = "N"; # only has a next line
            }elsif ($li =~ /^\<NEW\_HEADER\>/) {
                $BorderLineH{$LineNO-1} = "P"; # line before the separator ends a sample
                $BorderLineH{$LineNO+1} = "N"; # line after the separator starts one
            }
        }
    }
    close($in_fh);
    delete($BorderLineH{$LineNO+1}); # drop the marker past EOF (trailing separator)

    return(\%BorderLineH);
}
325
+
326
+ #all relevant domain databases are imported as shown at the beginning of this
327
+ #program
328
+ #useful for OfflineSeparateMultiClassLine.pl esp. for printing
329
# LineFeatureRepre2
#
# Convert one header line of text into an SVM-light style feature-vector
# string: each raw word is rewritten into a canonical token (":degree:",
# ":city:", ":Email:", ...), token/bigram counts are looked up in the feature
# dictionary, percentage-style summary features are appended, and the result
# is rendered as "label ID:value ID:value ...".
# Per the comment above, this variant exists for OfflineSeparateMultiClassLine.pl,
# mainly to support printing the vector to a file.
#
# Arguments (positional, via shift):
#   $label       - class label prepended to the output vector string
#   $line        - the raw text line to represent
#   $FeatureDictH- hash ref: feature -> { ID => int, max => num }
#   $FiletoPrint - if non-empty, the vector is also written to this file
# Returns: the feature-vector string ($SVMFeaVec).
#
# Relies on file-level globals declared elsewhere in this file: $stem and the
# lexicon hashes %degreeH, %pubnumH, %noteH, %affiH, %conjH, %prepH,
# %postcodeH, %abstractH, %keywordH, %introH, %phoneH, %monthH, %addrH,
# %cityH, %stateH, %countryH, %nameH, %dictH — presumably loaded by
# LoadInformation; verify before calling in isolation.  Also calls
# &AddrMatch and &PSTEM::stem defined outside this block.
sub LineFeatureRepre2() {
    my $label = shift;
    my $line = shift;
    my $FeatureDictH = shift;
    my $FiletoPrint = shift;

    # Hard-wired mode switches (mirror the defaults used at training time).
    my $neutral = 1;
    my $neutralAddName = 0;
    my $norm = 1;

    my %TestFeatureVecH = (); # very important: must start empty per call

    # Some of these features might not work for the single-word case (e.g.
    # senLen), so that factor may be dropped for word-level use.
    ######### categorical (count) features ################
    my $senLen = 0;
    my $dateNum = 0;
    my $DictWordNum = 0;
    my $NonDictWordNum = 0;
    my $Cap1DictWordNum = 0;
    my $Cap1NonDictWordNum = 0;
    my $digitNum = 0;
    my $others = 0;
    my $affiNum = 0;
    my $addrNum = 0; # city, state and country are all counted as addr
    # for the word-level case we might need more specific recognition
    my $capNum = 0;
    my $introNum = 0;
    my $phoneNum = 0;
    my $degreeNum = 0;
    my $pubNum = 0;
    my $noteNum = 0;
    my $pageNum = 0;
    ###

    my $TokenLine;
    if (length($line) > 1) {
        # Match bi-grams against the address database first; assumes a bi-gram
        # is unique for an address.  Also yields the sentence length.
        ($TokenLine, $senLen) = &AddrMatch($line);
        # transformed features
    }else {
        $TokenLine = $line;
    }

    my @words = split(/\s+/, $TokenLine);
    # Now start the Addr/Name confusion handling, shared between addresses and
    # people's names.  (This representation is normally not used directly.)

    # Pass 1: page markers.
    for my $i(0 .. $#words) {
        if ($words[$i] =~ /\+PAGE\+/) {
            $words[$i] = ":page:";
            $pageNum++;
        }
    } # end for each word

    # Pass 2: bi-gram match on Pubnum, Note, Degree and affiliation
    # (might be worth making this a separate function).
    if (($neutral) && (length($line) > 1)) {
        for my $i(1 .. $#words) {
            my $pre = lc($words[$i-1]);
            my $now = lc($words[$i]);
            my $prestem;
            my $nowstem;
            my $degreeMatch;
            my $pubnumMatch;
            my $noteMatch;
            my $affiMatch;

            if ($stem) {
                $prestem = &PSTEM::stem($pre);
                $nowstem = &PSTEM::stem($now);
                $degreeMatch = $degreeH{lc("$prestem $nowstem")};
                $pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
                $noteMatch = $noteH{lc("$prestem $nowstem")};
                $affiMatch = $affiH{lc("$prestem $nowstem")};
            }else { # for bigram match we do not require both words capitalized
                $degreeMatch = $degreeH{lc("$pre $now")};
                $pubnumMatch = $pubnumH{lc("$pre $now")};
                $noteMatch = $noteH{lc("$pre $now")};
                $affiMatch = $affiH{lc("$pre $now")};
            }

            # Skip when the previous word is empty or already a token.
            if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }

            # Bit flags: 1=degree, 2=pubnum, 3=note, 4=affiliation.
            my %Confuse4BiGram = (
                1 => 0,
                2 => 0,
                3 => 0,
                4 => 0
            );
            my $match = 0;
            if ($degreeMatch) {
                $Confuse4BiGram{1} = 1;
                $match = 1;
            }
            if ($pubnumMatch) {
                $Confuse4BiGram{2} = 1;
                $match = 1;
            }
            if ($noteMatch) {
                $Confuse4BiGram{3} = 1;
                $match = 1;
            }

            if ($affiMatch) {
                $Confuse4BiGram{4} = 1;
                $match = 1;
            }

            if ($match == 0) { next; }

            # Encode the flag pattern into the word itself...
            $words[$i] = "\:Confuse4BiGram";
            foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
                $words[$i] .= "$Confuse4BiGram{$ind}";
            }
            $words[$i] .= "\:";

            # ...then collapse unambiguous (single-flag) patterns into the
            # concrete token; the first word of the bigram is blanked.
            # Ambiguous multi-flag patterns stay as :Confuse4BiGramXXXX:.
            if ($words[$i] eq "\:Confuse4BiGram1000\:") {
                $words[$i-1] = "";
                $words[$i] = ":degree:";
                $degreeNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
                $words[$i-1] = "";
                $words[$i] = ":pubnum:";
                $pubNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
                $words[$i-1] = "";
                $words[$i] = ":note:";
                $noteNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
                $words[$i-1] = "";
                $words[$i] = ":affi:";
                $affiNum++;
            }
        }
    }#end with neutral bigram

    # Pass 3: single-word match on Pubnum, notes and degree, then the general
    # word-class tokenization.  Only words that are not already tokens and not
    # pure punctuation are considered.
    for my $i(0 .. $#words) {
        if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
            if ($neutral) {
                my %Confuse4Single = (
                    1 => 0,
                    2 => 0,
                    3 => 0,
                    4 => 0
                );
                my $match = 0;
                my $degreeMatch;
                my $pubnumMatch;
                my $noteMatch;
                my $affiMatch;
                my $stemword;

                if ($stem) {
                    # NOTE(review): $stemword is stemmed before it is ever
                    # assigned from $words[$i] — this stems undef, so all four
                    # lookups below miss.  stem($words[$i]) was presumably
                    # intended; confirm before fixing, as training data may
                    # have been built with this behavior.
                    $stemword = &PSTEM::stem($stemword);
                    $degreeMatch = $degreeH{$stemword};
                    $pubnumMatch = $pubnumH{$stemword};
                    $noteMatch = $noteH{$stemword};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
                }else {
                    $degreeMatch = $degreeH{lc($words[$i])};
                    $pubnumMatch = $pubnumH{lc($words[$i])};
                    $noteMatch = $noteH{lc($words[$i])};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
                }

                # Original author's note: hhan@cse.psu.edu becomes hhan.psu.edu
                # after stemming, and $stemword is lower case.
                if ($degreeMatch) {
                    $Confuse4Single{1} = 1;
                    $match = 1;
                }
                if ($pubnumMatch) {
                    $Confuse4Single{2} = 1;
                    $match = 1;
                }
                if ($noteMatch) {
                    $Confuse4Single{3} = 1;
                    $match = 1;
                }
                if ($affiMatch) {
                    $Confuse4Single{4} = 1;
                    $match = 1;
                }

                if ($match) {
                    # Same flag-pattern encoding as the bigram pass above.
                    $words[$i] = "\:Confuse4Single";
                    foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
                        $words[$i] .= "$Confuse4Single{$ind}";
                    }
                    $words[$i] .= "\:";
                    if ($words[$i] eq "\:Confuse4Single1000\:") {
                        $words[$i] = ":degree:";
                        $degreeNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
                        $words[$i] = ":pubnum:";
                        $pubNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
                        $words[$i] = ":note:";
                        $noteNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
                        $words[$i] = ":affi:";
                        $affiNum++;
                    }
                }
            }# end with neutral

            # General tokenization for anything still untokenized.
            if ($words[$i] !~ /\:\w+\:/) {
                if (exists($conjH{$words[$i]})) {
                    $words[$i] = ":conj:";
                }elsif (exists($prepH{$words[$i]})) {
                    $words[$i] = ":prep:";
                }elsif ($words[$i] =~ /\@/) {
                    $words[$i] = "\:Email\:";
                }elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
                    # NOTE(review): precedence makes this match any word
                    # containing "http"; only the ftp alternative requires
                    # "://host." — likely intended as (?:https?|ftp)://...;
                    # left as-is since training data depends on it.
                    $words[$i] = "\:http\:";
                }elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # starts with a capital letter
                    if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
                        $words[$i] = ":SingleCap:"; # like "M" or "M."
                        $capNum ++; # actually only counts single caps
                    }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
                        $words[$i] = ":postcode:";
                    }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
                        # "Abstract"/"Keywords" only count at line start.
                        $words[$i] = ":abstract:";
                    }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
                        $words[$i] = ":keyword:";
                    }elsif ($introH{lc($words[$i])}) {
                        $words[$i] = ":intro:";
                        $introNum++;
                    }elsif ($phoneH{lc($words[$i])}) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif ($monthH{lc($words[$i])}) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }else {
                        if ($neutral) {
                            # Neutral mode: address-ish lexicons first, then
                            # name/dictionary, then shape-based fallbacks.
                            if ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) {
                                $words[$i] = ":city:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) {
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ($dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { # like BU-CS-93-015
                                # Break mixed identifiers into shape tokens.
                                # NOTE: the inner loop reuses $i; foreach
                                # implicitly localizes its loop variable, so
                                # the outer $i is restored afterwards.
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                # All-caps word: token carries its length.
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }else {
                            # Non-neutral mode: check the confusable lexicons
                            # directly before the address/name fallbacks.
                            if ($degreeH{lc($words[$i])}) {
                                $words[$i] = ":degree:";
                                $degreeNum++;
                            }elsif ($pubnumH{lc($words[$i])}) {
                                $words[$i] = ":pubnum:";
                                $pubNum++;
                            }elsif ($noteH{lc($words[$i])}) {
                                $words[$i] = ":note:";
                                $noteNum++;
                            }elsif ($monthH{lc($words[$i])}) {
                                $words[$i] = ":month:";
                                $dateNum++;
                            }elsif ($affiH{lc($words[$i])}) {
                                $words[$i] = ":affi:";
                                $affiNum++;
                            }elsif ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) {
                                $words[$i] = ":city:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) {
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ( $dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { # like BU-CS-93-015
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }
                    }#end with neutral
                }elsif ($words[$i] =~ /^[\p{IsLower}]/) { # starts lower-case
                    if (exists ($phoneH{$words[$i]})) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif (exists ($monthH{lc($words[$i])})) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }elsif ($keywordH{lc($words[$i])}) {
                        $words[$i] = ":keyword:";
                    }elsif (exists $dictH{lc($words[$i])}) {
                        $words[$i] = ":DictWord:";
                        $DictWordNum ++;
                    }else {# author note: should consider the mixture of digits and letters
                        $words[$i] = ":NonDictWord:";
                        $NonDictWordNum ++;
                    }
                }elsif ($words[$i] =~ /^[\d\-]+$/) { # like 30332-0280 or 1111
                    # Replace each digit run with :Dig[len]: inside a copy.
                    my $newword = $words[$i];
                    while ($words[$i] =~ /(\d+)/g) {
                        my $dig = $1;
                        my $diglen = length($dig);
                        $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
                    }
                    $words[$i] = $newword;
                    $digitNum++;
                }elsif ($words[$i] =~ /^(\W+)(.*)$/) { # starts with a non-word character
                    # Consume the remainder, emitting shape tokens per segment.
                    my $nonword = $1;
                    my $rest = $2;
                    $words[$i] = $nonword;
                    while (length($rest) > 0) {
                        if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:CapWords\:".length($tmp); # length may be relaxed
                        }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:LowerWords\:".length($tmp);
                        }elsif ($rest =~ /^(\d+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:Digs\:".length($tmp);
                        }else { # pass the head character through verbatim
                            my $restLen = length($rest);
                            $restLen--;
                            $words[$i] .= substr($rest, 0, 1);
                            $rest = substr($rest, 1, $restLen);
                        }
                    }
                }else {
                    $others++;
                }
            }
        }else {
            # already a token or pure punctuation — leave untouched
        }
    }

    # Count unigram tokens that exist in the feature dictionary.
    for my $i(0 .. $#words) {
        if (exists ($$FeatureDictH{$words[$i]}{ID})) {
            $TestFeatureVecH{$words[$i]}++;
        }
    }

    # Add the bigram features.
    if (length($line) > 1) {
        for my $i(1 .. $#words) { # author note: (0 .. $#words-1) misbehaves sometimes
            my $pre = $words[$i-1];
            my $now = $words[$i];
            # add the bigram into the test vector when the dictionary knows it
            if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
                $TestFeatureVecH{"$pre $now"}++;
            }
        } # end with bigram features
    }

    # Summary features, normalised by sentence length (F1-style).
    $TestFeatureVecH{CsenLen} = $senLen;
    if ($senLen > 0) {
        $TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
        $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
        $TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
        $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
        $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
        $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
        $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
        $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
        $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
        $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
        $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
        $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
        $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
        $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
        $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
        $TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
        #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
    }else {
        #print "null line\: $line \n";
    }

    # Optionally mirror the vector to a file.
    if ($FiletoPrint ne "") {
        open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: here4...could not open $FiletoPrint to write\n";
        print PFH "$label ";
    }

    # Render "label ID:value ..." in ascending feature-ID order, normalising
    # each value by the dictionary max when $norm is set.
    my $SVMFeaVec = "$label "; # this is a string
    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
        if ($TestFeatureVecH{$feature} != 0){
            if ($norm) {
                if ($$FeatureDictH{$feature}{max} != 0) {
                    # print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
                    my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
                    $TestFeatureVecH{$feature} = $tmpval;
                    #print " $TestFeatureVecH{$feature} \n";
                }else {
                    # zero max in the dictionary: leave the raw value
                }
            }
            if ($FiletoPrint ne "") {
                print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
            }
            $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
        }else {
            # zero-valued feature: omitted (sparse representation)
        }
    }
    if ($FiletoPrint ne "") {
        print PFH "\n";
        close(PFH);
    }

    my $convertedStr = join(" ", @words);  # tokenized line; kept for debugging
    #return(\%TestFeatureVecH);
    return($SVMFeaVec);
    #return($convertedStr);
}
821
+
822
+
823
+ sub LineFeatureRepre() {
824
+ my $line = shift;
825
+ my $neutral = 1;
826
+ my $neutralAddName = 0;
827
+ my $norm = 1;
828
+ my %TestFeatureVecH = (); #very important
829
+
830
+ #some of these features might not work for single word case such as
831
+ #senLen, so might just take this factor out for word case
832
+ #########categorical features################
833
+ my $senLen = 0;
834
+ my $dateNum = 0;
835
+ my $DictWordNum = 0;
836
+ my $NonDictWordNum = 0;
837
+ my $Cap1DictWordNum = 0;
838
+ my $Cap1NonDictWordNum = 0;
839
+ my $digitNum = 0;
840
+ my $others = 0;
841
+ my $affiNum = 0;
842
+ my $addrNum = 0; # let city, state, country all counted as the addr
843
+ # for word case, we might need more specific recognition
844
+ my $capNum = 0;
845
+ my $introNum = 0;
846
+ my $phoneNum = 0;
847
+ my $degreeNum = 0;
848
+ my $pubNum = 0;
849
+ my $noteNum = 0;
850
+ my $pageNum = 0;
851
+ ###
852
+
853
+ my $TokenLine;
854
+ if (length($line) > 1) {
855
+ ($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
856
+ #transformed features
857
+ }else {
858
+ $TokenLine = $line;
859
+ }
860
+ my @words = split(/\s+/, $TokenLine);
861
+ #now start the AddrNameConfu, shared among address and people's name
862
+ #normally do not use this representation
863
+
864
+ for my $i(0 .. $#words) {
865
+ if ($words[$i] =~ /\+PAGE\+/) {
866
+ $words[$i] = ":page:";
867
+ $pageNum++;
868
+ }
869
+ } # end with for each word
870
+
871
+ #match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)
872
+ if (($neutral) && (length($line) > 1)) {
873
+ for my $i(1 .. $#words) {
874
+ my $pre = lc($words[$i-1]);
875
+ my $now = lc($words[$i]);
876
+ my $prestem;
877
+ my $nowstem;
878
+ my $degreeMatch;
879
+ my $pubnumMatch;
880
+ my $noteMatch;
881
+ my $affiMatch;
882
+
883
+ if ($stem) {
884
+ $prestem = &PSTEM::stem($pre);
885
+ $nowstem = &PSTEM::stem($now);
886
+ $degreeMatch = $degreeH{lc("$prestem $nowstem")};
887
+ $pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
888
+ $noteMatch = $noteH{lc("$prestem $nowstem")};
889
+ $affiMatch = $affiH{lc("$prestem $nowstem")};
890
+ }else { # for bigram match, we do not request both to be capitalized
891
+ $degreeMatch = $degreeH{lc("$pre $now")};
892
+ $pubnumMatch = $pubnumH{lc("$pre $now")};
893
+ $noteMatch = $noteH{lc("$pre $now")};
894
+ $affiMatch = $affiH{lc("$pre $now")};
895
+ }
896
+
897
+
898
+ if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }
899
+
900
+ my %Confuse4BiGram = (
901
+ 1 => 0,
902
+ 2 => 0,
903
+ 3 => 0,
904
+ 4 => 0
905
+ );
906
+ my $match = 0;
907
+ if ($degreeMatch) {
908
+ $Confuse4BiGram{1} = 1;
909
+ $match = 1;
910
+ }
911
+ if ($pubnumMatch) {
912
+ $Confuse4BiGram{2} = 1;
913
+ $match = 1;
914
+ }
915
+ if ($noteMatch) {
916
+ $Confuse4BiGram{3} = 1;
917
+ $match = 1;
918
+ }
919
+
920
+ if ($affiMatch) {
921
+ $Confuse4BiGram{4} = 1;
922
+ $match = 1;
923
+ }
924
+
925
+ if ($match == 0) { next; }
926
+
927
+ $words[$i] = "\:Confuse4BiGram";
928
+ foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
929
+ $words[$i] .= "$Confuse4BiGram{$ind}";
930
+ }
931
+ $words[$i] .= "\:";
932
+
933
+ if ($words[$i] eq "\:Confuse4BiGram1000\:") {
934
+ $words[$i-1] = "";
935
+ $words[$i] = ":degree:";
936
+ $degreeNum++;
937
+ }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
938
+ $words[$i-1] = "";
939
+ $words[$i] = ":pubnum:";
940
+ $pubNum++;
941
+ }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
942
+ $words[$i-1] = "";
943
+ $words[$i] = ":note:";
944
+ $noteNum++;
945
+ }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
946
+ $words[$i-1] = "";
947
+ $words[$i] = ":affi:";
948
+ $affiNum++;
949
+ }
950
+ }
951
+ }#end with neutral bigram
952
+
953
+ # single words match on Pubnum, notes and degree!
954
+ for my $i(0 .. $#words) {
955
+ if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
956
+ if ($neutral) {
957
+ my %Confuse4Single = (
958
+ 1 => 0,
959
+ 2 => 0,
960
+ 3 => 0,
961
+ 4 => 0
962
+ );
963
+ my $match = 0;
964
+ my $degreeMatch;
965
+ my $pubnumMatch;
966
+ my $noteMatch;
967
+ my $affiMatch;
968
+ my $stemword;
969
+
970
+ if ($stem) {
971
+ $stemword = &PSTEM::stem($stemword);
972
+ $degreeMatch = $degreeH{$stemword};
973
+ $pubnumMatch = $pubnumH{$stemword};
974
+ $noteMatch = $noteH{$stemword};
975
+ $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
976
+ }else {
977
+ $degreeMatch = $degreeH{lc($words[$i])};
978
+ $pubnumMatch = $pubnumH{lc($words[$i])};
979
+ $noteMatch = $noteH{lc($words[$i])};
980
+ $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
981
+ }
982
+
983
+ #because hhan@cse.psu.edu will become hhan.psu.edu after stemming
984
+ #and $stemword is lower case
985
+ if ($degreeMatch) {
986
+ $Confuse4Single{1} = 1;
987
+ $match = 1;
988
+ }
989
+ if ($pubnumMatch) {
990
+ $Confuse4Single{2} = 1;
991
+ $match = 1;
992
+ }
993
+ if ($noteMatch) {
994
+ $Confuse4Single{3} = 1;
995
+ $match = 1;
996
+ }
997
+ if ($affiMatch) {
998
+ $Confuse4Single{4} = 1;
999
+ $match = 1;
1000
+ }
1001
+
1002
+ if ($match) {
1003
+ $words[$i] = "\:Confuse4Single";
1004
+ foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
1005
+ $words[$i] .= "$Confuse4Single{$ind}";
1006
+ }
1007
+ $words[$i] .= "\:";
1008
+ if ($words[$i] eq "\:Confuse4Single1000\:") {
1009
+ $words[$i] = ":degree:";
1010
+ $degreeNum++;
1011
+ }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
1012
+ $words[$i] = ":pubnum:";
1013
+ $pubNum++;
1014
+ }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
1015
+ $words[$i] = ":note:";
1016
+ $noteNum++;
1017
+ }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
1018
+ $words[$i] = ":affi:";
1019
+ $affiNum++;
1020
+ }
1021
+ }
1022
+ }# end with neutral
1023
+
1024
+ if ($words[$i] !~ /\:\w+\:/) {
1025
+ if (exists($conjH{$words[$i]})) {
1026
+ $words[$i] = ":conj:";
1027
+ }elsif (exists($prepH{$words[$i]})) {
1028
+ $words[$i] = ":prep:";
1029
+ }elsif ($words[$i] =~ /\@/) {
1030
+ $words[$i] = "\:Email\:";
1031
+ }elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
1032
+ $words[$i] = "\:http\:";
1033
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
1034
+ if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
1035
+ $words[$i] = ":SingleCap:"; #like M
1036
+ $capNum ++; # actually only the number of single cap
1037
+ }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
1038
+ $words[$i] = ":postcode:";
1039
+ }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
1040
+ $words[$i] = ":abstract:";
1041
+ }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
1042
+ $words[$i] = ":keyword:";
1043
+ }elsif ($introH{lc($words[$i])}) {
1044
+ $words[$i] = ":intro:";
1045
+ $introNum++;
1046
+ }elsif ($phoneH{lc($words[$i])}) {
1047
+ $words[$i] = ":phone:";
1048
+ $phoneNum++;
1049
+ }elsif ($monthH{lc($words[$i])}) {
1050
+ $words[$i] = ":month:";
1051
+ $dateNum++;
1052
+ }else {
1053
+ if ($neutral) {
1054
+ if ($addrH{lc($words[$i])}) {
1055
+ $words[$i] = ":addr:";
1056
+ $addrNum++;
1057
+ }elsif ($cityH{lc($words[$i])}) { #If not neutral class
1058
+ $words[$i] = ":city:";
1059
+ $addrNum++;
1060
+ }elsif ($stateH{lc($words[$i])}) {
1061
+ $words[$i] = ":state:";
1062
+ $addrNum++;
1063
+ }elsif ($countryH{lc($words[$i])}) {
1064
+ $words[$i] = ":country:";
1065
+ $addrNum++;
1066
+ }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
1067
+ $words[$i] = ":MayName:";
1068
+ $Cap1NonDictWordNum ++;
1069
+ }elsif ($dictH{lc($words[$i])}) {
1070
+ $words[$i] = ":Cap1DictWord:";
1071
+ $Cap1DictWordNum ++;
1072
+ }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
1073
+ my @Parts = split(/\W+|\-/, $words[$i]);
1074
+ for $i(0 .. $#Parts) {
1075
+ if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
1076
+ my $len = length($Parts[$i]);
1077
+ # $Parts[$i] = "\:LowerWord"."$len"."\:";
1078
+ $Parts[$i] = "\:LowerWords\:";
1079
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
1080
+ my $len = length($Parts[$i]);
1081
+ # $Parts[$i] = "\:CapWord"."$len"."\:";
1082
+ $Parts[$i] = "\:CapWords\:";
1083
+ }elsif ($Parts[$i] =~ /^\d+$/) {
1084
+ my $len = length($Parts[$i]);
1085
+ # $Parts[$i] = "\:Dig\[$len\]\:";
1086
+ $Parts[$i] = "\:Digs\:";
1087
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
1088
+ $Parts[$i] = "\:MixCaseWords\:";
1089
+ }else {
1090
+ my $len = length($Parts[$i]);
1091
+ $Parts[$i] = "\:Mix\[$len\]\:";
1092
+ }
1093
+ }
1094
+ $words[$i] = join("\-", @Parts);
1095
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
1096
+ my $len = length($words[$i]);
1097
+ $words[$i] = "\:CapWord"."$len"."\:";
1098
+ # $words[$i] = "\:CapWords\:";
1099
+ }else {
1100
+ $words[$i] = ":Cap1NonDictWord:";
1101
+ $Cap1NonDictWordNum ++;
1102
+ }
1103
+ }else {#end with neutral
1104
+
1105
+ if ($degreeH{lc($words[$i])}) {
1106
+ $words[$i] = ":degree:";
1107
+ $degreeNum++;
1108
+ }elsif ($pubnumH{lc($words[$i])}) {
1109
+ $words[$i] = ":pubnum:";
1110
+ $pubNum++;
1111
+ }elsif ($noteH{lc($words[$i])}) {
1112
+ $words[$i] = ":note:";
1113
+ $noteNum++;
1114
+ }elsif ($monthH{lc($words[$i])}) {
1115
+ $words[$i] = ":month:";
1116
+ $dateNum++;
1117
+ }elsif ($affiH{lc($words[$i])}) {
1118
+ $words[$i] = ":affi:";
1119
+ $affiNum++;
1120
+ }elsif ($addrH{lc($words[$i])}) {
1121
+ $words[$i] = ":addr:";
1122
+ $addrNum++;
1123
+ }elsif ($cityH{lc($words[$i])}) { #If not neutral class
1124
+ $words[$i] = ":city:";
1125
+ # $words[$i] = ":addr:";
1126
+ $addrNum++;
1127
+ }elsif ($stateH{lc($words[$i])}) {
1128
+ $words[$i] = ":state:";
1129
+ # $words[$i] = ":addr:";
1130
+ $addrNum++;
1131
+ }elsif ($countryH{lc($words[$i])}) {
1132
+ $words[$i] = ":country:";
1133
+ # $words[$i] = ":addr:";
1134
+ $addrNum++;
1135
+ }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
1136
+ $words[$i] = ":MayName:";
1137
+ $Cap1NonDictWordNum ++;
1138
+ }elsif ( $dictH{lc($words[$i])}) {
1139
+ $words[$i] = ":Cap1DictWord:";
1140
+ $Cap1DictWordNum ++;
1141
+ }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
1142
+ my @Parts = split(/\W+|\-/, $words[$i]);
1143
+ for $i(0 .. $#Parts) {
1144
+ if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
1145
+ my $len = length($Parts[$i]);
1146
+ # $Parts[$i] = "\:LowerWord"."$len"."\:";
1147
+ $Parts[$i] = "\:LowerWords\:";
1148
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
1149
+ my $len = length($Parts[$i]);
1150
+ # $Parts[$i] = "\:CapWord"."$len"."\:";
1151
+ $Parts[$i] = "\:CapWords\:";
1152
+ }elsif ($Parts[$i] =~ /^\d+$/) {
1153
+ my $len = length($Parts[$i]);
1154
+ # $Parts[$i] = "\:Dig\[$len\]\:";
1155
+ $Parts[$i] = "\:Digs\:";
1156
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
1157
+ $Parts[$i] = "\:MixCaseWords\:";
1158
+ }else {
1159
+ my $len = length($Parts[$i]);
1160
+ $Parts[$i] = "\:Mix\[$len\]\:";
1161
+ }
1162
+ }
1163
+ $words[$i] = join("\-", @Parts);
1164
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
1165
+ my $len = length($words[$i]);
1166
+ $words[$i] = "\:CapWord"."$len"."\:";
1167
+ # $words[$i] = "\:CapWords\:";
1168
+ }else {
1169
+ $words[$i] = ":Cap1NonDictWord:";
1170
+ $Cap1NonDictWordNum ++;
1171
+ }
1172
+ }
1173
+ }#end with else neutral
1174
+ }elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
1175
+ if (exists ($phoneH{$words[$i]})) {
1176
+ $words[$i] = ":phone:";
1177
+ $phoneNum++;
1178
+ }elsif (exists ($monthH{lc($words[$i])})) {
1179
+ $words[$i] = ":month:";
1180
+ $dateNum++;
1181
+ }elsif ($keywordH{lc($words[$i])}) {
1182
+ $words[$i] = ":keyword:";
1183
+ }elsif (exists $dictH{lc($words[$i])}) {
1184
+ $words[$i] = ":DictWord:";
1185
+ $DictWordNum ++;
1186
+ }else {# should consider the mixure of digit and letters
1187
+ $words[$i] = ":NonDictWord:";
1188
+ $NonDictWordNum ++;
1189
+ }
1190
+ }elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
1191
+ my $newword = $words[$i];
1192
+ while ($words[$i] =~ /(\d+)/g) {
1193
+ my $dig = $1;
1194
+ my $diglen = length($dig);
1195
+ $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
1196
+ }
1197
+ $words[$i] = $newword;
1198
+ $digitNum++;
1199
+ }elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
1200
+ my $nonword = $1;
1201
+ my $rest = $2;
1202
+ $words[$i] = $nonword;
1203
+ while (length($rest) > 0) {
1204
+ if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
1205
+ my $tmp = $1;
1206
+ $rest = $2;
1207
+ $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
1208
+ }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
1209
+ my $tmp = $1;
1210
+ $rest = $2;
1211
+ $words[$i] .= "\:LowerWords\:".length($tmp);
1212
+ }elsif ($rest =~ /^(\d+)(.*)$/) {
1213
+ my $tmp = $1;
1214
+ $rest = $2;
1215
+ $words[$i] .= "\:Digs\:".length($tmp);
1216
+ }else { #get the head character
1217
+ my $restLen = length($rest);
1218
+ $restLen--;
1219
+ $words[$i] .= substr($rest, 0, 1);
1220
+ $rest = substr($rest, 1, $restLen);
1221
+ }
1222
+ }
1223
+ }else {
1224
+ $others++;
1225
+ }
1226
+ }
1227
+ }else {
1228
+ # print " already token or punctuation\: $words[$i] \n";
1229
+ }
1230
+ }
1231
+
1232
+ for my $i(0 .. $#words) {
1233
+ # if (exists ($$FeatureDictH{$words[$i]}{ID})) {
1234
+ $TestFeatureVecH{$words[$i]}++;
1235
+ # }
1236
+ }
1237
+
1238
+ # here we add in the bigrams
1239
+ if (length($line) > 1) {
1240
+ for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
1241
+ my $pre = $words[$i-1];
1242
+ my $now = $words[$i];
1243
+ # add bigram into dict and train or test vector
1244
+ # if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
1245
+ $TestFeatureVecH{"$pre $now"}++;
1246
+ # }
1247
+ } # end with bigram features
1248
+ }
1249
+
1250
+ # try to normalize using F1
1251
+ $TestFeatureVecH{CsenLen} = $senLen;
1252
+ if ($senLen > 0) {
1253
+ $TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
1254
+ $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
1255
+ $TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
1256
+ $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
1257
+ $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
1258
+ $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
1259
+ $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
1260
+ $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
1261
+ $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
1262
+ $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
1263
+ $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
1264
+ $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
1265
+ $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
1266
+ $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
1267
+ $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
1268
+ $TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
1269
+ #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
1270
+ }else {
1271
+ #print "null line\: $line \n";
1272
+ }
1273
+
1274
+ if ($FiletoPrint ne "") {
1275
+ open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: could not open $FiletoPrint to write: $!";
1276
+ print PFH "$label ";
1277
+ }
1278
+
1279
+ if (0) {
1280
+ my $SVMFeaVec = ""; #this is a string
1281
+ foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
1282
+ if ($TestFeatureVecH{$feature} != 0){
1283
+ if ($norm) {
1284
+ if ($$FeatureDictH{$feature}{max} != 0) {
1285
+ # print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
1286
+ my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
1287
+ $TestFeatureVecH{$feature} = $tmpval;
1288
+ #print " $TestFeatureVecH{$feature} \n";
1289
+ }else {
1290
+ #print "zero max\: $feature \n";
1291
+ }
1292
+ }
1293
+ if ($FiletoPrint ne "") {
1294
+ print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
1295
+ }
1296
+ $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
1297
+ }else {
1298
+ #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
1299
+ }
1300
+ }
1301
+ }
1302
+
1303
+
1304
+ if ($FiletoPrint ne "") {
1305
+ print PFH "\n";
1306
+ close(PFH);
1307
+ }
1308
+
1309
+ my $convertedStr = join(" ", @words);
1310
+ return(\%TestFeatureVecH);
1311
+ # return($SVMFeaVec);
1312
+ #return($convertedStr);
1313
+ }
1314
+
1315
+
1316
# Placeholder for a per-word feature representation of a line.
# Currently returns an empty feature list; kept for interface compatibility.
# Params:  $line - raw text line
#          $dict - feature dictionary (currently unused)
# Returns: reference to an (empty) array of word features.
# NOTE: the original declared this as "sub WordFeatureRepre()" -- the empty
# prototype forbids normal calls with arguments; it has been removed.
sub WordFeatureRepre {
    my ($line, $dict) = @_;
    my @FeatureLine = ();    # TODO: populate with word-level features

    return(\@FeatureLine);
}
1324
+
1325
+ #Given a line, make the space explicit
1326
# Make whitespace explicit in a line that may contain <<sep>>-delimited
# regions.  Text inside <<sep>>...<</sep>> spans is left untouched; elsewhere
# every whitespace run is replaced by " <space> " and the markers around a
# lone punctuation token are collapsed again.
# Params:  $content - the input line
# Returns: ($punc, $content) where $punc is 1 if the line appears to contain
#          punctuation (or the word "and"), 0 if space is the only separator.
# NOTE: the original's empty prototype "()" has been removed (it forbids
# normal calls with arguments); the unused $lineNO local was dropped.
sub FillSpace { #recognize <<sep>>, instead of <sep>
    my $content = shift;

    # drop whitespace directly abutting the separator tags
    $content =~ s/\s+<<sep>>/<<sep>>/g;
    $content =~ s/<<\/sep>>\s+/<<\/sep>>/g;

    my $punc = 0; # space is the only separator
    # NOTE(review): the bracketed part is a character class, so the embedded
    # "(<<sep>>)" sequences act as individual characters, not the literal
    # tag -- preserved byte-for-byte to keep behavior unchanged.
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<<sep>>)(<<\/sep>>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # the capturing parens keep the <<sep>> spans in @Seq
    my @Seq = split(/(<<sep>>[^\<\>]*<<\/sep>>)/, $content);
    for my $i (0 .. $#Seq) {
        next if $Seq[$i] =~ /<<sep>>/;    # leave separator spans untouched
        $Seq[$i] =~ s/\s+<<sep>>/<<sep>>/g;
        $Seq[$i] =~ s/<<\/sep>>\s+/<<\/sep>>/g;
        # make every whitespace run explicit, then strip the markers that
        # ended up around a bare punctuation token
        $Seq[$i] =~ s/\s+/ \<space\> /g;
        $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
1357
+
1358
+
1359
+ #Given a line, make the space explicit
1360
# Offline variant of FillSpace(): identical logic, but the separator tags are
# <sep>...</sep> instead of <<sep>>...<</sep>>.
# Params:  $content - the input line
# Returns: ($punc, $content); $punc is 1 when punctuation (or " and ") is
#          detected, 0 when space is the only separator.
# NOTE: the original's empty prototype "()" has been removed and the unused
# $lineNO local dropped.
sub OfflineFillSpace { #recognize <sep>
    my $content = shift;

    # drop whitespace directly abutting the separator tags
    $content =~ s/\s+<sep>/<sep>/g;
    $content =~ s/<\/sep>\s+/<\/sep>/g;

    my $punc = 0; # space is the only separator
    # NOTE(review): as in FillSpace(), the "(<sep>)" fragments inside the
    # character class are single characters -- preserved as-is.
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<sep>)(<\/sep>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # the capturing parens keep the <sep> spans in @Seq
    my @Seq = split(/(<sep>[^\<\>]*<\/sep>)/, $content);
    for my $i (0 .. $#Seq) {
        next if $Seq[$i] =~ /<sep>/;      # leave separator spans untouched
        $Seq[$i] =~ s/\s+<sep>/<sep>/g;
        $Seq[$i] =~ s/<\/sep>\s+/<\/sep>/g;
        # make every whitespace run explicit, then strip the markers that
        # ended up around a bare punctuation token
        $Seq[$i] =~ s/\s+/ \<space\> /g;
        $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
1391
+
1392
+
1393
# Backup (2001-08) version of SeparatePunc(): pads punctuation with spaces so
# it tokenizes separately.
# NOTE(review): the second substitution does NOT capture the word before the
# period, so "Dept. of" becomes " . of" (the word is dropped).  That is why
# this is the "bak" version; behavior preserved exactly.
# NOTE: the original's empty prototype "()" has been removed.
sub SeparatePunc0108bak {
    my $line = shift;

    #added 12/16 -- trim leading/trailing whitespace
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # pad any punctuation char (not letter/digit/space/+-<>.) followed by space
    $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    # detach (and drop the word before) a sentence-final period
    $line =~ s/[\w+]{3,}(\.)\s+/ $1 /g;
    # NOTE(review): no /g -- only the first whitespace run is squeezed;
    # preserved as-is.
    $line =~ s/\s+/ /;

    return($line);
}
1406
+
1407
# Pad punctuation with spaces so it tokenizes as separate words, keeping the
# word before a sentence-final period: "Dept. of" becomes "Dept . of".
# Params:  $line - input text
# Returns: the line with punctuation detached.
# Open question from the original author: should abbreviation dots
# (Dr., Sep.) be kept attached, or should every dot be detached?
# NOTE: the original's empty prototype "()" has been removed.
sub SeparatePunc {
    my $line = shift;

    # trim leading/trailing whitespace
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # pad any punctuation char (not letter/digit/space/+-<>.) followed by space
    $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    # detach a period that ends a word of 3+ chars: "Dept. of" -> "Dept . of"
    $line =~ s/([\w+]{3,})(\.)\s+/$1 $2 /g;

    # $line =~ s/\W+$//g; #remove last punctuation
    # NOTE(review): no /g here, so only the first whitespace run is squeezed --
    # preserved as-is; confirm whether global squeezing was intended.
    $line =~ s/\s+/ /;

    return($line);
}
1425
+
1426
+
1427
# Heuristic check whether a candidate author string is "weird" (i.e. probably
# not a real personal name: an affiliation, a title fragment, etc.).
# Params:  $str - candidate author string
# Returns: ($weired, $str) -- $weired is 1 if the string looks like a
#          non-name, 0 otherwise; $str is the cleaned, Title-Cased name built
#          from the surviving words.
# Depends on the sibling helper &str_space_clean().
# (The misspelling "weired" is the original identifier; kept for callers.)
sub weired_author(){
    my $str = shift;

    my $weired = 0;
    # Words whose presence marks the string as a non-name.  Matched later as
    # an alternation, so multi-word entries behave as substrings.
    my %weired_words = (
        'Departamento' =>1,
        'IN PRESS'=>1,
        'PRESS'=>1,
        'Center'=>1,
        'Ltd' =>1,
        'Universidad'=>1,
        'chair' =>1,
        'Submitted'=>1,
        'pp'=>1,
        'Version'=>1,
        'Thesis' =>1,
        'Proposal' =>1,
        'University'=>1,
        'Universiteit'=>1,
        'Institut'=>1,
        'extended'=>1,
        'abstract'=>1,
        'Laboratoire'=>1,
        'COVER PAGE'=>1,
        'COVER'=>1,
        'Page' => 1,
        'Job Title'=>1,
        'Job'=>1,
        'Title'=>1,
        'Case Study'=>1,
        'Case Sludy'=>1,
        'Case'=>1,
        'Report'=>1,
        'Reply'=>1,
        'A Report'=>1,
        'A Reply'=>1,
        'Research'=>1,
        'Paper'=>1,
        'Research Paper'=>1,
        'Research Project'=>1,
        'Project'=>1,
        'Retrospective'=>1,
        'Roadmap'=>1,
        'Tutorial'=>1,
        'WORKING PAPER'=>1,
        'Working' =>1,
        'White Paper'=>1,
        'in honor of'=>1,
        'international' =>1,
        'Dataset' =>1,
        'Sample' =>1,
        'Network'=>1,
        'Networks'=>1,
        'Academiae'=>1,
        'company'=>1,
        'Submitted'=>1,
    );

    # Words silently dropped from the name (editor markers, suffixes, ...).
    my %filter_words = (
        'honor'=>1,
        'ed'=>1,
        'eds'=>1,
        'jr'=>1,
        'jr\.'=>1,
        'authors'=>1,
        'author' =>1,
        'editor'=>1,
        'editors'=>1,
        'with'=>1,
        'by'=>1,
    );

    #if separate authors into individuals.
    ## $str =~ s/^\s*[^\p{IsLower}\p{IsUpper}\d\-\.]//g;
    # $str =~ s/[^\p{IsLower}\p{IsUpper}\d\-\.]\s*$//g;

    # Build one big alternation of all the weird words for the final test.
    # NOTE(review): the words are not quotemeta'd; entries like 'jr\.' rely on
    # that -- confirm before changing.
    my @weired_words_arr = keys %weired_words;
    my $weired_words_str = join("|", @weired_words_arr);

    #print "\n\nbefore: $str\n";
    # Normalize: space after dots, strip digits, strip leading/trailing
    # punctuation, squeeze whitespace.
    $str =~ s/\./\. /g;
    $str =~ s/\d+//g;
    $str =~ s/^\s*\W+//g;
    $str =~ s/\W+\s*$//g;
    $str =~ s/\s+/ /g;
    $str = &str_space_clean($str);
    #print "after: $str \n";

    my @words = split(/\s+/, $str);
    my $lcase_num = 0;            # count of all-lowercase words
    my $weired_form = 0;          # count of words with odd characters
    my @new_name = ();            # surviving, normalized name words
    my $pure_single_letter = 1;   # stays 1 if every word is an initial
    for my $i(0 .. $#words) {
        # any word longer than one char that is not an initial ("X.")
        # disproves the "all initials" hypothesis
        if ( (length($words[$i]) > 1) && ($words[$i] !~ /^\w\.$/)) {
            $pure_single_letter = 0;
        }
        if ($filter_words{lc($words[$i])} || ($words[$i] !~ /\w/)) {
            next;    # drop filler words and pure punctuation
        }else {
            if ($words[$i] =~ /^[\p{IsLower}\-]+$/) {
                $lcase_num++;
            }elsif ($words[$i] =~ /[^\p{IsLower}\p{IsUpper}\-\.]/) {
                $weired_form++;
            }
            # make the first letter capitalized
            $words[$i] = ucfirst(lc($words[$i]));
            push @new_name, $words[$i];
        }
    }
    # Weird if: only initials, contains a weird word, too many words (>5),
    # fewer than 2 surviving words, fewer than 2 well-formed words, or more
    # than 2 lowercase words.
    if (($pure_single_letter) || ($str =~ /$weired_words_str/) || ($#words > 4) || ($#new_name <1) || (($#words +1 - $weired_form) < 2) || ($lcase_num>2)) {
        $weired = 1;
    }
    #print "weired:? $weired \n";
    $str = join(' ', @new_name);
    #print "final str $str\n";

    return($weired, $str);
}
1546
+
1547
+
1548
+
1549
+ #turn array into hash map { $hash_name{$_} =$some_value } @array_name;
1550
# Load the stopword list from "$Database_Dir/stopwords" (one word per line).
# Returns: hashref mapping stopword => occurrence count.
# Depends on the package global $Database_Dir and sibling &str_space_clean().
sub hash_stopwords {
    my $stopword = "$Database_Dir/stopwords";
    my %stopH = ();
    # three-arg open with a lexical handle; report $! on failure
    open(my $stop_fh, '<', $stopword)
        || die "SVMHeaderParse: could not open $stopword to read: $!\n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);   # trims whitespace incl. newline
        $stopH{$line}++;
    }
    close($stop_fh);
    return(\%stopH);
}
1561
+
1562
# Load affiliation stopwords from "$Database_Dir/affi.txt".  Each line may
# start with a numeric count, which is stripped; words are lower-cased.
# Returns: hashref mapping lc(word) => occurrence count.
# Depends on the package global $Database_Dir and sibling &str_space_clean().
sub hash_affi_stopwords {
    my $stopword = "$Database_Dir/affi.txt";
    my %stopH = ();
    open(my $stop_fh, '<', $stopword)
        || die "SVMHeaderParse: could not open $stopword to read: $!\n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;             # drop a leading frequency count
        $stopH{lc($line)}++;
    }
    close($stop_fh);
    return(\%stopH);
}
1575
+
1576
# Load the nickname table from "$Database_Dir/nickname.txt".  Each line lists
# a canonical first name followed by its aliases, separated by "<>" or commas.
# Returns: hashref of hashrefs: lc(first name) => { lc(alias) => 1, ... }.
# Depends on the package global $Database_Dir and sibling &str_space_clean().
sub hash_nickname{
    my $stopword = "$Database_Dir/nickname.txt";
    my %stopH = ();
    open(my $stop_fh, '<', $stopword)
        || die "SVMHeaderParse: could not open $stopword to read: $!\n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);
        my @names = split(/<>|\s*\,\s*/, $line);
        # names[0] is canonical; the rest are its aliases
        for my $i(1 .. $#names) {
            $stopH{lc($names[0])}{lc($names[$i])} = 1;
        }
    }
    close($stop_fh);
    return(\%stopH);
}
1591
+
1592
# Load US state names from "$Database_Dir/statename.txt"; each line is
# "StateName, Abbr".
# Returns: hashref mapping abbreviation => full state name.
# Depends on the package global $Database_Dir and sibling &str_space_clean().
sub hash_statewords {
    my $stopword = "$Database_Dir/statename.txt";
    my %stopH = ();
    open(my $stop_fh, '<', $stopword)
        || die "SVMHeaderParse: could not open $stopword to read: $!\n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);
        my ($state, $abbr) = split(/\s*\,\s*/, $line);
        $stopH{$abbr} = $state;
    }
    close($stop_fh);
    return(\%stopH);
}
1605
+
1606
+
1607
# Load address keywords from "$Database_Dir/addr.txt".  Each line may start
# with a numeric count, which is stripped; words are lower-cased.
# Returns: hashref mapping lc(word) => occurrence count.
# Depends on the package global $Database_Dir and sibling &str_space_clean().
sub hash_addrwords {
    my $stopword = "$Database_Dir/addr.txt";
    my %stopH = ();
    open(my $stop_fh, '<', $stopword)
        || die "SVMHeaderParse: could not open $stopword to read: $!\n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;             # drop a leading frequency count
        $stopH{lc($line)}++;
    }
    close($stop_fh);
    return(\%stopH);
}
1620
+
1621
# Collapse every internal whitespace run to a single space and strip leading
# and trailing whitespace (including newlines/tabs).
# Params:  $str - input string
# Returns: the cleaned string.
# NOTE: the original declared this as "sub str_space_clean()" -- the empty
# prototype forbids normal calls with an argument (all in-file callers use
# the &-form, which bypasses prototypes); it has been removed.
sub str_space_clean {
    my $str = shift;

    $str =~ s/\s+/ /g;    # squeeze runs of whitespace
    $str =~ s/^\s+//g;    # trim left
    $str =~ s/\s+$//g;    # trim right
    return($str);
}
1629
+
1630
# Serialize a data structure to a file with Storable's nfreeze (network byte
# order, portable across machines).  Counterpart of thaw_hash_from_file().
# Params:  $H - reference to the structure to freeze
#          $F - target file name
sub nfreeze_hash_to_file {
    my $H = shift;
    my $F = shift;

    my $mystring = nfreeze($H);
    # lexical handle, three-arg open
    open(my $dump_fh, '>', $F)
        || die "SVMHeaderParse: could not open $F to write: $!";
    binmode($dump_fh);    # frozen Storable data is binary; avoid CRLF mangling
    print {$dump_fh} $mystring;
    close($dump_fh) || die "SVMHeaderParse: could not close $F: $!";
}
1639
+
1640
# Serialize a data structure to a file as Perl source via Data::Dumper.
# Counterpart of read_hash_from_file().
# Params:  $H - reference to the structure to dump
#          $F - target file name
# NOTE: $d and $mystring were undeclared package globals in the original;
# they are now lexicals.
sub dump_hash_to_file {
    my $H = shift;
    my $F = shift;

    my $d = Data::Dumper->new([$H]);
    my $mystring = $d->Dump;
    open(my $dump_fh, '>', $F)
        || die "SVMHeaderParse: could not open $F to write: $!";
    print {$dump_fh} $mystring;
    close($dump_fh) || die "SVMHeaderParse: could not close $F: $!";
}
1650
+
1651
+
1652
# Read a Data::Dumper dump file and return the data structure it defines.
# Counterpart of dump_hash_to_file().
# Params:  $file - file containing "$VAR1 = ...;" Perl source
# Returns: the reference assigned to $VAR1 by the file.
# SECURITY: this executes the file's contents via string eval -- only ever
# use it on trusted files written by dump_hash_to_file().
# NOTE: the original set "undef $/" globally and restored it by assignment
# (leaving it undef if open died); $/ is now properly localized.
sub read_hash_from_file {
    my $file = shift;

    open(my $dump_fh, '<', $file)
        || die "SVMHeaderParse: could not open $file to read. \n";
    my $string = do { local $/; <$dump_fh> };   # slurp; $/ auto-restored
    close($dump_fh);

    our $VAR1;      # name used by Data::Dumper output
    local $VAR1;    # keep the package global clean for callers
    eval($string);  # sets $VAR1 (errors were silently ignored originally too)
    return($VAR1);
}
1664
+
1665
# Read a file written by nfreeze_hash_to_file() and thaw it back into a data
# structure with Storable.
# Params:  $file - file containing nfreeze()d binary data
# Returns: the thawed reference.
# NOTE: the original set "undef $/" globally and restored it by assignment
# (leaving it undef if open died); $/ is now properly localized.
sub thaw_hash_from_file {
    my $file = shift;

    open(my $dump_fh, '<', $file)
        || die "SVMHeaderParse: could not open $file to read. \n";
    binmode($dump_fh);    # frozen Storable data is binary
    my $string = do { local $/; <$dump_fh> };   # slurp; $/ auto-restored
    close($dump_fh);

    my $VAR1 = thaw($string);
    return($VAR1);
}
1677
+
1678
# Randomly split an array of samples into two parts by ratio.
# Params:  $samples - arrayref of samples; NOTE: mutated (part-1 picks are
#                     removed from it)
#          $ratio   - fraction of samples to place in part 1
# Returns: (\@part1, \@part2).
# NOTE(review): $total_num is $#$samples (count-1), yet the loop runs
# count times -- looks like an off-by-one in how many elements land in each
# part; TODO confirm against callers before changing.
# NOTE(review): $t, $j, $r are package globals (file does not use strict);
# srand(time) makes the split non-reproducible.
sub rand_split_samples_to2parts() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;   # informational only; not used below
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=0;$j--){
        $r=int(rand($j));   # pick a random index among the remaining samples
        if (($total_num - $j) < $num1) {
            # the first $num1 draws go to part 1 and are removed from @$samples
            push @part1, $$samples[$r];
            #adjust the samples after the selected one
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }else {
            # remaining draws are copied (not removed) into part 2
            push @part2, $$samples[$r];
        }
    }
    return(\@part1, \@part2);
}
1704
+
1705
# Variant of rand_split_samples_to2parts(): randomly draws part 1 out of the
# sample array and returns the (shrunk) original array as part 2.
# Params:  $samples - arrayref of samples; mutated in place (part-1 picks
#                     are removed)
#          $ratio   - fraction of samples to place in part 1
# Returns: (\@part1, $samples) -- the second element is the SAME arrayref
#          the caller passed in, now containing only the leftovers.
# NOTE(review): same $#$samples/count off-by-one as in the v1 sub, and the
# same undeclared globals $t, $j, $r; TODO confirm.
sub rand_split_samples_to2parts_v2() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;   # informational only; not used below
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=0;$j--){
        $r=int(rand($j));   # pick a random index among the remaining samples
        if (($total_num - $j) < $num1) {
            # the first $num1 draws go to part 1 and are removed from @$samples
            push @part1, $$samples[$r];
            #adjust the samples after the selected one
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }
    }
    return(\@part1, $samples);
}
1729
+
1730
# Randomly partition an array of samples into $fold roughly equal parts
# (for n-fold cross-validation).
# Params:  $samples - arrayref of samples; emptied as a side effect
#          $fold    - number of parts
# Returns: @data, where $data[1] .. $data[$fold] are arrayrefs of samples
#          (index 0 is unused).
# NOTE(review): the loop stops at $j>=1, so with $total_num = $#$samples
# (count-1) one element appears to remain undistributed; TODO confirm.
# NOTE(review): $t, $j, $r are undeclared package globals; srand(time) makes
# the partition non-reproducible.
sub rand_split_samples_toNparts() {
    my $samples = shift; #array
    my $fold = shift;

    my $total_num = $#$samples;
    my $unit = int($total_num/$fold +1);              # samples per fold
    my $last_fold = $total_num - $unit*($fold-1);     # informational only

    my @data = ();
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=1;$j--){
        $r=int(rand($j));   # pick a random index among the remaining samples
        # fold index grows as draws accumulate: 1, 2, ..., $fold
        my $subfold = int(($total_num - $j)/$unit) + 1;
        push @{$data[$subfold]}, $$samples[$r];
        #adjust the samples after the selected one
        for my $k($r .. $#$samples-1) {
            $$samples[$k] = $$samples[$k+1];
        }
        pop @$samples;
    }
    return(@data);
}
1753
+
1754
+ sub rand_split_hash_index_toNparts() {
1755
+ my $sample_hash = shift; #hash
1756
+ my $fold = shift;
1757
+
1758
+ my @sample_arr = keys %{$sample_hash};
1759
+ my $total_num = $#sample_arr;
1760
+ my $unit = int($total_num/$fold +1);
1761
+ my $last_fold = $total_num - $unit*($fold-1);
1762
+
1763
+ my @data = ();
1764
+ $t=time;
1765
+ srand($t); #seed
1766
+ for($j=$total_num;$j>=1;$j--){
1767
+ $r=int(rand($j));
1768
+ my $subfold = int(($total_num - $j)/$unit) + 1;
1769
+
1770
+ my $name = $sample_arr[$r];
1771
+ my %pos = ();
1772
+ if ($$sample_hash{$name}{label} > -1) {
1773
+ my @tmp = split(/\<\>/, $$sample_hash{$name}{label});
1774
+ map { $pos{$_} =1 } @tmp;
1775
+ }
1776
+ foreach my $file_name (keys %{$$sample_hash{$name}{name}}) {
1777
+ my ($tmp, $num) = split(/\_\_/, $file_name);
1778
+ my $label = "-1";
1779
+ if ($pos{$num}) {
1780
+ $label = "+1";
1781
+ }
1782
+ push @{$data[$subfold]}, "$label<>$file_name<>$$sample_hash{$name}{name}{$file_name}{snippet}";
1783
+ }
1784
+
1785
+ for my $k($r .. $#sample_arr-1) {
1786
+ $sample_arr[$k] = $sample_arr[$k+1];
1787
+ }
1788
+ pop @sample_arr;
1789
+ }
1790
+ return(@data);
1791
+ }
1792
+
1793
# Average the accuracy / precision / recall figures found in an SVM-light
# n-fold output file and print the averages to STDERR.
# Params:  $in - path to the concatenated SVM-light result file
# Returns: nothing meaningful (prints to STDERR).
# NOTE: the original sorted the result keys with a numeric "<=>" comparator,
# but the keys are the strings "A"/"P"/"R" -- numerically they all compare
# equal (with warnings), leaving the output order undefined.  Fixed to "cmp".
sub ExtractBinaryNfoldSVMResult {
    my $in = shift;
    my %ResultH = ();

    open(my $in_fh, '<', $in)
        || die "SVMHeaderParse: could not open $in to read \n";
    while (my $line = <$in_fh>) {
        # e.g. "Accuracy on test set: 90.00%"
        if ($line =~ /Accuracy on test set: (\d+\.\d+)\%/) {
            $ResultH{A}{count}++;
            $ResultH{A}{sum} += $1;
        }
        # e.g. "Precision/recall on test set: 80.00%/70.00%"
        if ($line =~ /Precision\/recall on test set\: (.*)\%\/(.*)\%/) {
            my $P = $1;
            my $R = $2;
            if ($P =~ /\d+\.\d+/) {
                $ResultH{P}{count}++;
                $ResultH{P}{sum} += $P;
            }
            if ($R =~ /\d+\.\d+/) {
                $ResultH{R}{count}++;
                $ResultH{R}{sum} += $R;
            }
        }
    }
    close($in_fh);

    print STDERR "average result from cross validation \n";
    foreach my $eval (sort {$a cmp $b} keys %ResultH) {
        $ResultH{$eval}{avg} = sprintf("%.8f", $ResultH{$eval}{sum}/$ResultH{$eval}{count});
        print STDERR "evaluation($eval) -- $ResultH{$eval}{avg}\n";
    }
}
1824
+
1825
+ ## get alias file
1826
+ sub GetNameVariations1() {
1827
+ my $personalName = shift; #like _Chris_S._Mellish__1.txt
1828
+
1829
+ my @QueryNameParts = split(/\s+|\-/, $personalName);
1830
+ my %NameVariations;
1831
+ my ($FirstName, $LastName, $FI, $LI, $MI1, $MI2, $FI_LI, $AllInitial, $AllName, $FILN, $FIMI1LastName, $FIMI1MI2LastName)
1832
+ = ('','','','','','','','','','','','','');
1833
+
1834
+ $FirstName = $QueryNameParts[0];
1835
+ $LastName = $QueryNameParts[$#QueryNameParts];
1836
+
1837
+ $NameVariations{$FirstName} = "FN";
1838
+ $NameVariations{$LastName} = "LN";
1839
+
1840
+ $FI = substr($FirstName, 0, 1);
1841
+ $LI = substr($LastName, 0, 1);
1842
+
1843
+ $FI_LI= "$FI"."$LI";
1844
+ $FILN = "$FI"."$LastName";
1845
+ $FNLI = "$FirstName"."$LI";
1846
+ $NameVariations{$FILN} = "FILN";
1847
+ $NameVariations{$FNLI} = "FNLI";
1848
+
1849
+ for my $i(0 .. $#QueryNameParts) {
1850
+ $QueryNameParts[$i] =~ s/\W+//g;
1851
+ $AllName .= $QueryNameParts[$i];
1852
+ }
1853
+ # dependts on whether this name contains 3 parts or 4 parts
1854
+ if ($#QueryNameParts < 1) {next;}
1855
+ if ($#QueryNameParts eq 1) {
1856
+ $AllInitial = $FI_LI;
1857
+ $NameVariations{$AllInitial} = "all_initial";
1858
+ }else {
1859
+ $NameVariations{$FI_LI} = "FILI";
1860
+ $NameVariations{"$FN"."$QueryNameParts[1]"} = "FNMN";
1861
+ if ($#QueryNameParts eq 2) {
1862
+ $MI1 = substr($QueryNameParts[1], 0, 1);
1863
+ $AllInitial= "$FI"."$MI1"."$LI";
1864
+ $FIMI1LastName = "$FI"."$MI1"."$LastName";
1865
+ $NameVariations{$AllInitial} = "all_initial";
1866
+ $NameVariations{$FIMI1LastName} = "FIMI1LN";
1867
+ }elsif ($#NameParts eq 3) {
1868
+ $MI1 = substr($QueryNameParts[1], 0, 1);
1869
+ $MI2 = substr($QueryNameParts[2], 0, 1);
1870
+ $AllInitial = "$FI"."$MI1"."$MI2"."$LI";
1871
+ $FIMI1LastName = "$FI"."$MI1"."$LastName";
1872
+ $FIMI1MI2LastName = "$FI"."$MI1"."$MI2"."$LastName";
1873
+ $NameVariations{$AllInitial} = "all_initial";
1874
+ $NameVariations{$FIMI1LastName} = "FIMI1LN";
1875
+ $NameVariations{$FIMI1MI2LastName} = "FIMI1MI2LN";
1876
+ }
1877
+ }
1878
+
1879
+ ## It will take chance for this exact match
1880
+ if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
1881
+ $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
1882
+ $NameVariations{$PartLastName} = "partial_LN";
1883
+ }
1884
+ return(\%NameVariations);
1885
+ }
1886
+
1887
# Build a hash of spelling/abbreviation variations for a personal name,
# including nickname expansions of the first name.  Some generated keys
# contain literal "\w*" fragments and are meant to be used as regexes.
# Params:  $personalName - e.g. "Chris S. Mellish"
#          $nickname     - hashref: lc(first name) => { alias => 1, ... }
# Returns: hashref mapping each variation to a tag describing its form
#          (FN, LN, FILN, FNLI, LNFI, full_name, FNLN, LNFN, ...).
# Fixes over the original:
#  - "next" outside a loop (ran for 1-part names, a runtime error) is now an
#    early return;
#  - dereferencing a missing nickname entry is now guarded;
#  - previously-undeclared globals are lexicals; numeric comparisons use ==.
sub GetNameVariations {
    my $personalName = shift;
    my $nickname = shift;

    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;

    my $FirstName = $QueryNameParts[0];
    my $LastName  = $QueryNameParts[$#QueryNameParts];
    # (using only the first 5 letters here decreased performance when not
    # doing substring matching -- see the original's commented experiment)
    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName}  = "LN";

    # nickname aliases count as first-name variations; guard against a first
    # name that has no alias entry
    my $aliases = $$nickname{lc($FirstName)} || {};
    foreach my $alias (keys %{$aliases}) {
        $NameVariations{$alias} = "FN";
    }

    my $FI = substr($FirstName, 0, 1);   # first initial
    my $LI = substr($LastName, 0, 1);    # last initial

    my $FI_LI = "$FI"."$LI";
    my $FILN  = "$FI"."\\"."w*"."$LastName";   # e.g. C\w*Mellish (regex form)
    my $FNLI  = "$FirstName"."$LI";
    my $LNFI  = "$LastName"."$FI";
    $NameVariations{$FILN} = "FILN";
    $NameVariations{"$FI"."\."."$LastName"} = "FILN";
    $NameVariations{$FNLI} = "FNLI";
    $NameVariations{$LNFI} = "LNFI";

    # strip punctuation from each part ("S." -> "S") and build the
    # concatenated full name
    my $AllName = '';
    for my $i(0 .. $#QueryNameParts) {
        $QueryNameParts[$i] =~ s/\W+//g;
        $AllName .= $QueryNameParts[$i];
    }
    $NameVariations{$AllName} = "full_name";
    $NameVariations{"\\"."w*"."$FirstName"."\\"."w*"."$LastName"."\\"."w*"} = "FNLN";
    $NameVariations{"$FirstName"."\."."$LastName"} = "FNLN";
    $NameVariations{"$LastName"."$FirstName"} = "LNFN";

    # middle-name variations depend on whether the name has 2, 3 or 4 parts
    if ($#QueryNameParts < 1) {
        return(\%NameVariations);   # single token: nothing more to derive
    }elsif ($#QueryNameParts == 1) {
        $NameVariations{$FI_LI} = "all_initial";
    }else {
        $NameVariations{$FI_LI} = "FILI";
        $NameVariations{"$FirstName"."$QueryNameParts[1]"} = "FNMN";
        if ($#QueryNameParts == 2) {
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            $NameVariations{"$FI"."$MI1"."$LI"} = "all_initial";
            $NameVariations{"$FI"."$MI1"."$LastName"} = "FIMI1LN";
        }elsif ($#QueryNameParts == 3) {
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            my $MI2 = substr($QueryNameParts[2], 0, 1);
            $NameVariations{"$FI"."$MI1"."$MI2"."$LI"} = "all_initial";
            $NameVariations{"$FI"."$MI1"."$LastName"} = "FIMI1LN";
            $NameVariations{"$FI"."$MI1"."$LI"} = "FIMI1LI";
            $NameVariations{"$FI"."$MI2"."$LI"} = "FIMI2LI";
            $NameVariations{"$QueryNameParts[2]"."$LastName"} = "MI2LN";
            $NameVariations{"$FI"."$MI1"."$MI2"."$LastName"} = "FIMI1MI2LN";
        }
    }

    ## It will take chance for this exact match
    # NOTE(review): "< 4" combined with substr(...,0,5) looks inverted (a
    # shorter-than-4-char name is returned whole) -- preserved as-is, confirm.
    if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
        my $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
        $NameVariations{$PartLastName} = "partial_LN";
    }
    return(\%NameVariations);
}
1970
+
1971
# Convert the scraped university list (HTML) into a simple "college<>url"
# text file.  Reads "$Database_Dir/university_list/univ-full.html" and writes
# "$Database_Dir/university_list.txt".
# Returns: a hashref that is always empty (kept for interface compatibility;
# callers apparently ignore it).
# NOTE: the original's empty prototype "()", bareword handles and 2-arg
# opens were replaced; the file is now streamed line by line instead of
# slurped into an array.
sub get_university_emails {
    my $univ = "$Database_Dir/university_list/univ-full.html";
    my $simple_format = "$Database_Dir/university_list.txt";

    my %H = ();
    open(my $univ_fh, '<', $univ)
        || die "SVMHeaderParse: could not open $univ to read: $!\n";
    open(my $out_fh, '>', $simple_format)
        || die "SVMHeaderParse: could not open $simple_format to write: $!";
    while (my $line = <$univ_fh>) {
        # match list entries like: <LI> <A HREF="http://...">College Name</A>
        if ($line =~ /\<LI\>\s+\<A\s+HREF\=\"([^\"]*)\"\>(.*)\<\/A\>/) {
            my $url = $1;
            my $college = $2;
            print {$out_fh} "$college<>$url\n";
        }
    }
    close($univ_fh);
    close($out_fh) || die "SVMHeaderParse: could not close $simple_format: $!";
    return(\%H);
}
1991
+
1992
+ #input: an array of value
1993
+ sub compute_std() {
1994
+ my $arr = shift;
1995
+ my $mean = 0;
1996
+ my $std = 0;
1997
+
1998
+ #cal mean
1999
+ for my $i(0 .. $#$arr) {
2000
+ $mean += $$arr[$i];
2001
+ }
2002
+ $mean = sprintf("%.3f", $mean/($#$arr+1));
2003
+
2004
+ for my $i(0 .. $#$arr) {
2005
+ $std += ($$arr[$i]-$mean)**2;
2006
+ }
2007
+ my $temp = sprintf("%.8f", $std/$#$arr);
2008
+ $std = sqrt($temp);
2009
+
2010
+ return($mean, $std);
2011
+ }
2012
+
2013
+
2014
+
2015
+
2016
+ 1;