biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322):
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,2016 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::Function;
14
+
15
+ use utf8;
16
+ use HeaderParse::Config::API_Config qw($Database_Dir);
17
+ use HeaderParse::API::LoadInformation;
18
+ require Exporter;
19
+ use Storable qw(nfreeze thaw);
20
+ use Data::Dumper;
21
+ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
22
+ use vars qw(%dictH %nameH %monthH %affiH %addrH %conjH %prepH %postcodeH %cityH %stateH %countryH %abstractH);
23
+
24
+ @ISA = qw(Exporter); # important!!
25
+ @EXPORT = qw(&weired_author &AddrMatch &printDict &GenTrainVecMatrix &GetBorderLine &LineFeatureRepre &LineFeatureRepre2 &OfflineFillSpace &FillSpace &SeparatePunc &hash_stopwords &hash_nickname &hash_affi_stopwords &hash_addrwords &hash_statewords &str_space_clean &dump_hash_to_file &nfreeze_hash_to_file &read_hash_from_file &thaw_hash_from_file &rand_split_samples_to2parts &rand_split_samples_toNparts &rand_split_hash_index_toNparts &ExtractBinaryNfoldSVMResult &GetNameVariations &get_university_emails &compute_std);
26
+
27
# Replace two-word state / country / city names found in the package
# lookup tables (%stateH, %countryH, %cityH) with the placeholder tokens
# :state:, :country: and :city:, and count the non-punctuation words.
#
# Arguments: $inline - one header line (string).
# Returns:   ($inline, $senLen) - the rewritten line (leading/trailing
#            whitespace stripped) and the word count excluding tokens
#            made purely of punctuation.
#
# Note: the empty prototype "()" was removed - this sub takes an
# argument, and a visible () prototype makes AddrMatch($line) a
# compile-time error at non-& call sites.
sub AddrMatch {

    my $inline = shift;

    # split(' ', ...) discards leading whitespace, so an indented line no
    # longer produces a spurious empty first "word" (the old
    # split(/\s+/, ...) did, inflating $senLen by one).
    my @words = split(' ', $inline);
    my $senLen = 0;

    # A token consisting solely of punctuation does not count as a word;
    # the @words guard also avoids an uninitialized warning on empty input.
    if (@words && $words[0] !~ /^\W+\s*$/) {
        $senLen++;
    }

    # Match state/country/city names using pairs of adjacent words.
    # This step might be very time consuming.
    for my $i (1 .. $#words) {
        if ($words[$i] !~ /^\W+\s*$/) {
            $senLen++;
        }
        # Only consider a bigram when both words start with a capital
        # letter (proper-noun heuristic).
        if (($words[$i-1] =~ /^[\p{IsUpper}]/) && ($words[$i] =~ /^[\p{IsUpper}]/)) {
            my $pre = lc($words[$i-1]);
            my $now = lc($words[$i]);
            if (exists $stateH{"$pre $now"}) {
                $words[$i-1] = "";
                $words[$i]   = ":state:";
            } elsif (exists $countryH{"$pre $now"}) {
                $words[$i-1] = "";
                $words[$i]   = ":country:";
            } elsif (exists $cityH{"$pre $now"}) {
                $words[$i-1] = "";
                $words[$i]   = ":city:";
            }
        }
    }

    # Re-join and trim.  A replaced first word leaves a doubled space
    # behind; callers tolerate that, matching the original behaviour.
    $inline = "@words";
    $inline =~ s/^\s+//;
    $inline =~ s/\s+$//;

    return ($inline, $senLen);
}
87
+
88
+
89
# Write the feature dictionary to $dictF (one feature per line, in
# ascending feature-ID order) and return the dictionary with each
# feature's accumulated mean converted to a per-training-line mean.
#
# Arguments:
#   $TotalTrainLineCount - number of training lines (divisor for mean)
#   $dictF               - path of the dictionary file to write
#   %dictH               - feature => { ID, df, mean, max } records
# Returns: the updated %dictH.
#
# Note: the empty prototype "()" was removed - this sub takes arguments,
# and a visible () prototype would make normal calls compile-time errors.
sub printDict {
    my ($TotalTrainLineCount, $dictF, %dictH) = @_;

    # Three-arg open with a lexical handle; the old two-arg form
    # (">$dictF") was open to mode injection through the filename.
    open(my $dict_fh, '>', $dictF) || die "SVMHeaderParse: could not open dictfile\: $dictF to write\n";

    # Replace the old FeatureDictH means with per-line means, emitting in
    # ID order so the file doubles as an ID -> feature map.
    foreach my $feature (sort { $dictH{$a}{ID} <=> $dictH{$b}{ID} } keys %dictH) {
        next unless defined $dictH{$feature}{ID};

        $dictH{$feature}{mean} =
            sprintf("%.8f", $dictH{$feature}{mean} / $TotalTrainLineCount);

        my $ANmean;
        if ($dictH{$feature}{max} == 0) {
            # Previously this diagnostic was followed by an unguarded
            # division that died with "Illegal division by zero"; keep
            # the diagnostic but record a zero normalized mean instead.
            print STDERR "$feature Yahoo1 \n";
            $ANmean = sprintf("%.8f", 0);
        }
        else {
            $ANmean = sprintf("%.8f", $dictH{$feature}{mean} / $dictH{$feature}{max});
        }

        print {$dict_fh} "$dictH{$feature}{df} $dictH{$feature}{ID} $feature\: max\($dictH{$feature}{max}\) BNmean\($dictH{$feature}{mean}\) ANmean\($ANmean\)\n";
    }
    close($dict_fh);

    return (%dictH);
}
109
+
110
# Write the SVM training feature vectors for every (sample, line) pair in
# %$TrainFeatureVecH to $TrainFeatureVec, and, when $GenMatrix is true,
# also a sparse matrix file ($TrainMatrixF) and a tag-index file
# ($TrainTagInd).
#
# Arguments:
#   $FeatureDictH     - ref to feature dictionary: feature => {ID, mean, max}
#   $TrainFeatureVecH - ref to nested hash: sample => line => {tag, content}
#                       (emptied via undef before returning)
#   $TrainFeatureVec  - output path for the feature-vector file
#   $TrainMatrixF     - output path for the sparse matrix (only if $GenMatrix)
#   $TrainTagInd      - output path for the tag index (only if $GenMatrix)
#   $GenMatrix        - boolean: also emit matrix + tag-index files
#   $norm             - boolean: divide each feature value by its dict max
#   $center           - when 1 (with $norm, without $GenMatrix): subtract the
#                       normalized dictionary mean after normalizing
# Returns: nothing meaningful (last expression is a close/if).
#
# NOTE(review): TrainTagIndFH and TrainMatrxFH are only opened when
# $GenMatrix is true, yet TrainTagIndFH is printed unconditionally below -
# with $GenMatrix false those prints go to an unopened bareword handle.
# Presumably the sub is always called with $GenMatrix set; confirm at callers.
sub GenTrainVecMatrix() {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    open (TrainFeatureVec, ">$TrainFeatureVec") || die "SVMHeaderParse: could not open TrainFeatureVec $TrainFeatureVec to write\n";
    if ($GenMatrix) {
        open (TrainMatrxFH, ">$TrainMatrixF") || die "SVMHeaderParse: could not open TrainMatrF\: $TrainMatrixF to write\n";
        open (TrainTagIndFH, ">$TrainTagInd") || die "SVMHeaderParse: could not open TrainTagInd\: $TrainTagInd to write\n";
    }
    # Running line counter across all samples; used as the row index in
    # the sparse matrix file.
    my $TmpTrainLineNo = 0;
    # Samples and lines are keyed numerically; sort to keep output stable.
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
        foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo ++;

            #10/17 multi-class
            # Emit the line's tag set as "(tag1 tag2 ...) " before its features.
            print TrainFeatureVec "\(";
            # print TrainTagIndFH "";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print TrainFeatureVec "$tmpCurState ";
                print TrainTagIndFH "$tmpCurState ";
            }
            print TrainFeatureVec "\) ";
            print TrainTagIndFH "\n";

            # brute force; insufficient memory
            if ($GenMatrix == 0) {
                if ($norm) {
                    if ($center == 1) {
                        # Centering requires iterating the WHOLE dictionary
                        # (not just this line's features), because absent
                        # features become 0 - mean/max, which is non-zero.
                        #a loop of each feature in the dictionary.
                        foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
                            if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
                                $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                            }
                            # norm: scale by the dictionary max for this feature.
                            my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                            #centering: subtract the normalized mean.
                            $featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            # Sparse output: only non-zero values are written,
                            # as "ID:value " pairs (SVM-light style).
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print TrainFeatureVec "\n";
                    }else {
                        # Normalize only; iterate just this line's features.
                        foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                            # Features absent from the dictionary are skipped.
                            if (! defined ($$FeatureDictH{$feature}{ID})) {
                                next;
                            }
                            my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print TrainFeatureVec "\n";
                    }
                }else { # norm = 0;
                    # Raw values, no scaling; still sparse (skip zeros and
                    # features missing from the dictionary).
                    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                        if (! defined ($$FeatureDictH{$feature}{ID})) {
                            next;
                        }
                        if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) { # must be != 0
                            print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        }
                    }
                    print TrainFeatureVec "\n";
                }
            }else {
                # $GenMatrix true: emit both the vector file and the sparse
                # matrix rows "<line> <featureID> <value>".
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;
                    }
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    # NOTE(review): this branch tests simple truthiness, not
                    # "!= 0" as above, so the string "0.00000000" (true) would
                    # still be printed - confirm whether that is intended.
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
                        print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # generate the matrix file for the training samples \n";
                        print TrainMatrxFH " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print TrainFeatureVec "\n";
            }# end if

        }# end foreach l(line)
        # Header boundary marker between samples in both output files.
        print TrainFeatureVec "<NEW_HEADER>\n";
        print TrainTagIndFH "<NEW_HEADER>\n";
    }#end foreach s(sample)
    close (TrainFeatureVec);
    undef (%{$TrainFeatureVecH}); # release the training vector hash
    # NOTE(review): $endTrain is an undeclared package global not visible in
    # this chunk; presumably a shared training-loop flag - verify its reader.
    $endTrain = 0;
    close(TrainTagIndFH);
    if ($GenMatrix) {
        close(TrainMatrxFH);
    }
}
206
+
207
+ # this is for the plaintext class -- no difference from GenTrainVecMatrix
208
# Write the training feature vectors (SVM-light format) and, optionally, a
# sparse matrix file plus a tag-index file, from the in-memory training hash.
# "No difference from GenTrainVecMatrix" per the original author -- kept for
# the plaintext class.
#
# Arguments (positional):
#   $FeatureDictH     - hashref: feature name => {ID, max, mean}
#   $TrainFeatureVecH - hashref: sample => line => {tag => {...}, content => {feature => value}}
#   $TrainFeatureVec  - path of the feature-vector output file
#   $TrainMatrixF     - path of the matrix output file (only used when $GenMatrix)
#   $TrainTagInd      - path of the tag-index output file (only used when $GenMatrix)
#   $GenMatrix        - 1 to also emit the matrix and tag-index files
#   $norm             - 1 to divide each feature value by its dictionary max
#   $center           - 1 to additionally subtract mean/max (only honored when
#                       $GenMatrix == 0 and $norm == 1)
#
# Side effects: empties %$TrainFeatureVecH to release memory and resets the
# package-global $endTrain to 0 (mirroring GenTrainVecMatrix).
sub GenOriginalTrainVecMatrix() {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    # Lexical handles + 3-arg open (original used global bareword handles and 2-arg open).
    open(my $VecFH, '>', $TrainFeatureVec) || die "SVMHeaderParse: here1...could not open TrainFeatureVec $TrainFeatureVec to write\n";
    my ($MatrixFH, $TagFH);
    if ($GenMatrix) {
        open($MatrixFH, '>', $TrainMatrixF) || die "SVMHeaderParse: here2...could not open TrainMatrF\: $TrainMatrixF to write\n";
        open($TagFH, '>', $TrainTagInd) || die "SVMHeaderParse: here3...could not open TrainTagInd\: $TrainTagInd to write\n";
    }

    my $TmpTrainLineNo = 0;
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
        foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo++;

            # Multi-class tag prefix, e.g. "(tag1 tag2 ) ".
            print {$VecFH} "\(";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print {$VecFH} "$tmpCurState ";
                # BUGFIX: the original printed to the tag handle even when it was
                # never opened ($GenMatrix == 0), causing "print() on unopened
                # filehandle" warnings; output is unchanged since those prints
                # produced nothing anyway.
                print {$TagFH} "$tmpCurState " if $GenMatrix;
            }
            print {$VecFH} "\) ";
            print {$TagFH} "\n" if $GenMatrix;

            # brute force; insufficient memory
            if (($GenMatrix == 0) && ($norm == 1) && ($center == 1)) {
                # Dense pass over the whole dictionary so that features missing
                # from this line still get centered (value 0 before centering).
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
                    if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                    }
                    # normalize by the feature's max ...
                    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                    # ... then center by mean/max
                    $featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$VecFH} "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print {$VecFH} "\n";
            }elsif (($GenMatrix == 0) && ($norm == 1)) {
                # Sparse pass: only features actually present on this line.
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;
                    }
                    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$VecFH} "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print {$VecFH} "\n";
            }elsif ($GenMatrix == 1) {
                foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    if (! defined ($$FeatureDictH{$feature}{ID})) {
                        next;
                    }
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$VecFH} "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # matrix row for the training sample: <line-no> <feature-id> <value>
                        print {$MatrixFH} " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print {$VecFH} "\n";
            }# end if
            # NOTE(review): when $GenMatrix == 0 and $norm == 0 no feature list
            # (not even a terminating newline) is emitted for the line --
            # preserved from the original; presumably that configuration is
            # never used.

        }# end foreach li (line)
        print {$VecFH} "<NEW_HEADER>\n";
        print {$TagFH} "<NEW_HEADER>\n" if $GenMatrix;
    }#end foreach s (sample)
    close($VecFH);
    undef(%{$TrainFeatureVecH});    # release the training vector hash
    $endTrain = 0;                  # package global, reset as in GenTrainVecMatrix
    if ($GenMatrix) {
        close($MatrixFH);
        close($TagFH);              # only opened when $GenMatrix (see BUGFIX above)
    }
}
298
+
299
+
300
+
301
# Scan a vector file and record which physical line numbers border a sample.
# Samples are separated by lines beginning with "<NEW_HEADER>".
#
# Argument: path of the file containing the sample separators.
# Returns a hashref: line number => "N" (first line of a sample, only has a
# next neighbor) or "P" (last line before a separator, only has a previous
# neighbor).  Blank lines are not counted; separator lines are counted.
sub GetBorderLine() {
    my $InFile = shift; # this file contains the sample separator
    my %BorderLineH;
    my $LineNO = 0;

    # 3-arg open with a lexical handle (original: 2-arg open on bareword INFH).
    open(my $InFH, '<', $InFile) || die "SVMHeaderParse: could not open Infile\: $InFile to read \n";
    while (my $li = <$InFH>) {
        $li =~ s/^\s+//g;
        $li =~ s/\s+$//g;
        if ($li !~ /^\s*$/) {
            $LineNO++;
            if ($LineNO == 1) {
                $BorderLineH{$LineNO} = "N"; # only has a next line
            }elsif ($li =~ /^\<NEW\_HEADER\>/) {
                $BorderLineH{$LineNO-1} = "P"; # line before the separator ends a sample
                $BorderLineH{$LineNO+1} = "N"; # line after the separator starts one
            }
        }
    }
    close($InFH);
    # A trailing separator predicts a following line that never arrives.
    delete($BorderLineH{$LineNO+1});

    return(\%BorderLineH);
}
325
+
326
+ #all relevant domain databases are imported as shown at the beginning of this
327
+ #program
328
+ #useful for OfflineSeparateMultiClassLine.pl esp. for printing
329
# Convert one text line into an SVM-light feature-vector string:
# "<label> <featureID>:<value> <featureID>:<value> ...".
# Variant of LineFeatureRepre that takes the label/dictionary/output file as
# arguments; used by OfflineSeparateMultiClassLine.pl, esp. for printing.
#
# Arguments (positional):
#   $label        - class label prepended to the vector
#   $line         - raw text line to featurize
#   $FeatureDictH - hashref: feature => {ID, max}
#   $FiletoPrint  - if non-empty, the vector is also written to this file
#
# Relies on package globals initialized elsewhere in this module: $stem and
# the lexicon hashes (%degreeH %pubnumH %noteH %affiH %conjH %prepH %postcodeH
# %abstractH %keywordH %introH %phoneH %monthH %addrH %cityH %stateH %countryH
# %nameH %dictH), plus &AddrMatch and &PSTEM::stem.
sub LineFeatureRepre2() {
    my $label = shift;
    my $line = shift;
    my $FeatureDictH = shift;
    my $FiletoPrint = shift;

    my $neutral = 1;
    my $neutralAddName = 0;
    my $norm = 1;

    my %TestFeatureVecH = (); #very important

    #some of these features might not work for the single word case (e.g.
    #senLen), so that factor might be dropped for the word case
    #########categorical features################
    my $senLen = 0;
    my $dateNum = 0;
    my $DictWordNum = 0;
    my $NonDictWordNum = 0;
    my $Cap1DictWordNum = 0;
    my $Cap1NonDictWordNum = 0;
    my $digitNum = 0;
    my $others = 0;
    my $affiNum = 0;
    my $addrNum = 0; # city, state, country are all counted as addr
    # for the word case, we might need more specific recognition
    my $capNum = 0;
    my $introNum = 0;
    my $phoneNum = 0;
    my $degreeNum = 0;
    my $pubNum = 0;
    my $noteNum = 0;
    my $pageNum = 0;
    ###

    my $TokenLine;
    if (length($line) > 1) {
        # match bi-grams against the address database; assumes a bi-gram is
        # unique for an address.  Also returns the token count ($senLen).
        ($TokenLine, $senLen) = &AddrMatch($line);
    }else {
        $TokenLine = $line;
    }

    my @words = split(/\s+/, $TokenLine);

    # page tokens injected upstream
    for my $i(0 .. $#words) {
        if ($words[$i] =~ /\+PAGE\+/) {
            $words[$i] = ":page:";
            $pageNum++;
        }
    } # end with for each word

    #match bi-grams on Pubnum, Note, Degree and affiliation
    if (($neutral) && (length($line) > 1)) {
        for my $i(1 .. $#words) {
            my $pre = lc($words[$i-1]);
            my $now = lc($words[$i]);
            my $prestem;
            my $nowstem;
            my $degreeMatch;
            my $pubnumMatch;
            my $noteMatch;
            my $affiMatch;

            if ($stem) {
                $prestem = &PSTEM::stem($pre);
                $nowstem = &PSTEM::stem($now);
                $degreeMatch = $degreeH{lc("$prestem $nowstem")};
                $pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
                $noteMatch = $noteH{lc("$prestem $nowstem")};
                $affiMatch = $affiH{lc("$prestem $nowstem")};
            }else { # for bigram match, we do not require both to be capitalized
                $degreeMatch = $degreeH{lc("$pre $now")};
                $pubnumMatch = $pubnumH{lc("$pre $now")};
                $noteMatch = $noteH{lc("$pre $now")};
                $affiMatch = $affiH{lc("$pre $now")};
            }

            if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }

            # flag vector: 1=degree 2=pubnum 3=note 4=affiliation
            my %Confuse4BiGram = (
                1 => 0,
                2 => 0,
                3 => 0,
                4 => 0
            );
            my $match = 0;
            if ($degreeMatch) {
                $Confuse4BiGram{1} = 1;
                $match = 1;
            }
            if ($pubnumMatch) {
                $Confuse4BiGram{2} = 1;
                $match = 1;
            }
            if ($noteMatch) {
                $Confuse4BiGram{3} = 1;
                $match = 1;
            }
            if ($affiMatch) {
                $Confuse4BiGram{4} = 1;
                $match = 1;
            }

            if ($match == 0) { next; }

            # encode the flags, e.g. ":Confuse4BiGram1000:"
            $words[$i] = "\:Confuse4BiGram";
            foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
                $words[$i] .= "$Confuse4BiGram{$ind}";
            }
            $words[$i] .= "\:";

            # an unambiguous match collapses the bigram into a single token
            if ($words[$i] eq "\:Confuse4BiGram1000\:") {
                $words[$i-1] = "";
                $words[$i] = ":degree:";
                $degreeNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
                $words[$i-1] = "";
                $words[$i] = ":pubnum:";
                $pubNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
                $words[$i-1] = "";
                $words[$i] = ":note:";
                $noteNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
                $words[$i-1] = "";
                $words[$i] = ":affi:";
                $affiNum++;
            }
        }
    }#end with neutral bigram

    # single-word match on Pubnum, notes and degree!
    for my $i(0 .. $#words) {
        if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
            if ($neutral) {
                my %Confuse4Single = (
                    1 => 0,
                    2 => 0,
                    3 => 0,
                    4 => 0
                );
                my $match = 0;
                my $degreeMatch;
                my $pubnumMatch;
                my $noteMatch;
                my $affiMatch;
                my $stemword;

                if ($stem) {
                    # BUGFIX: the original did $stemword = &PSTEM::stem($stemword),
                    # stemming the still-undefined variable, so every stemmed
                    # lookup below missed.  Stem the lower-cased current word,
                    # matching the bigram section above.
                    $stemword = &PSTEM::stem(lc($words[$i]));
                    $degreeMatch = $degreeH{$stemword};
                    $pubnumMatch = $pubnumH{$stemword};
                    $noteMatch = $noteH{$stemword};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
                }else {
                    $degreeMatch = $degreeH{lc($words[$i])};
                    $pubnumMatch = $pubnumH{lc($words[$i])};
                    $noteMatch = $noteH{lc($words[$i])};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
                }

                #because hhan@cse.psu.edu will become hhan.psu.edu after stemming
                #and $stemword is lower case
                if ($degreeMatch) {
                    $Confuse4Single{1} = 1;
                    $match = 1;
                }
                if ($pubnumMatch) {
                    $Confuse4Single{2} = 1;
                    $match = 1;
                }
                if ($noteMatch) {
                    $Confuse4Single{3} = 1;
                    $match = 1;
                }
                if ($affiMatch) {
                    $Confuse4Single{4} = 1;
                    $match = 1;
                }

                if ($match) {
                    $words[$i] = "\:Confuse4Single";
                    foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
                        $words[$i] .= "$Confuse4Single{$ind}";
                    }
                    $words[$i] .= "\:";
                    if ($words[$i] eq "\:Confuse4Single1000\:") {
                        $words[$i] = ":degree:";
                        $degreeNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
                        $words[$i] = ":pubnum:";
                        $pubNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
                        $words[$i] = ":note:";
                        $noteNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
                        $words[$i] = ":affi:";
                        $affiNum++;
                    }
                }
            }# end with neutral

            if ($words[$i] !~ /\:\w+\:/) {
                if (exists($conjH{$words[$i]})) {
                    $words[$i] = ":conj:";
                }elsif (exists($prepH{$words[$i]})) {
                    $words[$i] = ":prep:";
                }elsif ($words[$i] =~ /\@/) {
                    $words[$i] = "\:Email\:";
                }elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
                    # NOTE(review): regex precedence bug -- this matches "http"
                    # anywhere OR "ftp://host."; likely intended
                    # /(?:http|ftp)\:\/\/(\w+\.){1,}/i.  Left unchanged so
                    # features stay compatible with existing trained models.
                    $words[$i] = "\:http\:";
                }elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # capitalized first letter
                    if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
                        $words[$i] = ":SingleCap:"; #like M
                        $capNum ++; # actually only the number of single caps
                    }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
                        $words[$i] = ":postcode:";
                    }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
                        $words[$i] = ":abstract:";
                    }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
                        $words[$i] = ":keyword:";
                    }elsif ($introH{lc($words[$i])}) {
                        $words[$i] = ":intro:";
                        $introNum++;
                    }elsif ($phoneH{lc($words[$i])}) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif ($monthH{lc($words[$i])}) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }else {
                        if ($neutral) {
                            if ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) {
                                $words[$i] = ":city:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) {
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ($dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }else {
                            # non-neutral variant: specific lexicon classes first
                            if ($degreeH{lc($words[$i])}) {
                                $words[$i] = ":degree:";
                                $degreeNum++;
                            }elsif ($pubnumH{lc($words[$i])}) {
                                $words[$i] = ":pubnum:";
                                $pubNum++;
                            }elsif ($noteH{lc($words[$i])}) {
                                $words[$i] = ":note:";
                                $noteNum++;
                            }elsif ($monthH{lc($words[$i])}) {
                                $words[$i] = ":month:";
                                $dateNum++;
                            }elsif ($affiH{lc($words[$i])}) {
                                $words[$i] = ":affi:";
                                $affiNum++;
                            }elsif ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) {
                                $words[$i] = ":city:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) {
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ( $dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }
                    }#end with neutral
                }elsif ($words[$i] =~ /^[\p{IsLower}]/) { # lower-case first letter
                    if (exists ($phoneH{$words[$i]})) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif (exists ($monthH{lc($words[$i])})) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }elsif ($keywordH{lc($words[$i])}) {
                        $words[$i] = ":keyword:";
                    }elsif (exists $dictH{lc($words[$i])}) {
                        $words[$i] = ":DictWord:";
                        $DictWordNum ++;
                    }else {# should consider the mixture of digits and letters
                        $words[$i] = ":NonDictWord:";
                        $NonDictWordNum ++;
                    }
                }elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
                    my $newword = $words[$i];
                    while ($words[$i] =~ /(\d+)/g) {
                        my $dig = $1;
                        my $diglen = length($dig);
                        $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
                    }
                    $words[$i] = $newword;
                    $digitNum++;
                }elsif ($words[$i] =~ /^(\W+)(.*)$/) { #starts with a non-word character
                    my $nonword = $1;
                    my $rest = $2;
                    $words[$i] = $nonword;
                    # peel off runs of caps / lower / digits, copying other
                    # characters through one at a time
                    while (length($rest) > 0) {
                        if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
                        }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:LowerWords\:".length($tmp);
                        }elsif ($rest =~ /^(\d+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:Digs\:".length($tmp);
                        }else { #take the head character
                            my $restLen = length($rest);
                            $restLen--;
                            $words[$i] .= substr($rest, 0, 1);
                            $rest = substr($rest, 1, $restLen);
                        }
                    }
                }else {
                    $others++;
                }
            }
        }else {
            # print " already token or punctuation\: $words[$i] \n";
        }
    }

    # unigram features present in the dictionary
    for my $i(0 .. $#words) {
        if (exists ($$FeatureDictH{$words[$i]}{ID})) {
            $TestFeatureVecH{$words[$i]}++;
        }
    }

    # here we add in the bigrams
    if (length($line) > 1) {
        for my $i(1 .. $#words) { #not good for (0 .. $#words-1) sometimes
            my $pre = $words[$i-1];
            my $now = $words[$i];
            if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
                $TestFeatureVecH{"$pre $now"}++;
            }
        } # end with bigram features
    }

    # continuous features: counts normalized by the line's token count
    $TestFeatureVecH{CsenLen} = $senLen;
    if ($senLen > 0) {
        $TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
        $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
        $TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
        $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
        $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
        $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
        $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
        $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
        $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
        $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
        $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
        $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
        $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
        $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
        $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
        $TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
        #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
    }else {
        #print "null line\: $line \n";
    }

    # Lexical handle + 3-arg open (original: 2-arg open on bareword PFH).
    my $PFH;
    if ($FiletoPrint ne "") {
        open($PFH, '>', $FiletoPrint) || die "SVMHeaderParse: here4...could not open $FiletoPrint to write\n";
        print {$PFH} "$label ";
    }

    my $SVMFeaVec = "$label "; #this is a string
    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
        if ($TestFeatureVecH{$feature} != 0){
            if ($norm) {
                if ($$FeatureDictH{$feature}{max} != 0) {
                    my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
                    $TestFeatureVecH{$feature} = $tmpval;
                }else {
                    #print "zero max\: $feature \n";
                }
            }
            if ($FiletoPrint ne "") {
                print {$PFH} "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
            }
            $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
        }else {
            #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
        }
    }
    if ($FiletoPrint ne "") {
        print {$PFH} "\n";
        close($PFH);
    }

    my $convertedStr = join(" ", @words); # transformed line, kept for debugging
    #return(\%TestFeatureVecH);
    return($SVMFeaVec);
    #return($convertedStr);
}
821
+
822
+
823
+ sub LineFeatureRepre() {
824
+ my $line = shift;
825
+ my $neutral = 1;
826
+ my $neutralAddName = 0;
827
+ my $norm = 1;
828
+ my %TestFeatureVecH = (); #very important
829
+
830
+ #some of these features might not work for single word case such as
831
+ #senLen, so might just take this factor out for word case
832
+ #########categorical features################
833
+ my $senLen = 0;
834
+ my $dateNum = 0;
835
+ my $DictWordNum = 0;
836
+ my $NonDictWordNum = 0;
837
+ my $Cap1DictWordNum = 0;
838
+ my $Cap1NonDictWordNum = 0;
839
+ my $digitNum = 0;
840
+ my $others = 0;
841
+ my $affiNum = 0;
842
+ my $addrNum = 0; # let city, state, country all counted as the addr
843
+ # for word case, we might need more specific recognition
844
+ my $capNum = 0;
845
+ my $introNum = 0;
846
+ my $phoneNum = 0;
847
+ my $degreeNum = 0;
848
+ my $pubNum = 0;
849
+ my $noteNum = 0;
850
+ my $pageNum = 0;
851
+ ###
852
+
853
+ my $TokenLine;
854
+ if (length($line) > 1) {
855
+ ($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
856
+ #transformed features
857
+ }else {
858
+ $TokenLine = $line;
859
+ }
860
+ my @words = split(/\s+/, $TokenLine);
861
+ #now start the AddrNameConfu, shared among address and people's name
862
+ #normally do not use this representation
863
+
864
+ for my $i(0 .. $#words) {
865
+ if ($words[$i] =~ /\+PAGE\+/) {
866
+ $words[$i] = ":page:";
867
+ $pageNum++;
868
+ }
869
+ } # end with for each word
870
+
871
+ #match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)
872
+ if (($neutral) && (length($line) > 1)) {
873
+ for my $i(1 .. $#words) {
874
+ my $pre = lc($words[$i-1]);
875
+ my $now = lc($words[$i]);
876
+ my $prestem;
877
+ my $nowstem;
878
+ my $degreeMatch;
879
+ my $pubnumMatch;
880
+ my $noteMatch;
881
+ my $affiMatch;
882
+
883
+ if ($stem) {
884
+ $prestem = &PSTEM::stem($pre);
885
+ $nowstem = &PSTEM::stem($now);
886
+ $degreeMatch = $degreeH{lc("$prestem $nowstem")};
887
+ $pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
888
+ $noteMatch = $noteH{lc("$prestem $nowstem")};
889
+ $affiMatch = $affiH{lc("$prestem $nowstem")};
890
+ }else { # for bigram match, we do not request both to be capitalized
891
+ $degreeMatch = $degreeH{lc("$pre $now")};
892
+ $pubnumMatch = $pubnumH{lc("$pre $now")};
893
+ $noteMatch = $noteH{lc("$pre $now")};
894
+ $affiMatch = $affiH{lc("$pre $now")};
895
+ }
896
+
897
+
898
+ if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }
899
+
900
+ my %Confuse4BiGram = (
901
+ 1 => 0,
902
+ 2 => 0,
903
+ 3 => 0,
904
+ 4 => 0
905
+ );
906
+ my $match = 0;
907
+ if ($degreeMatch) {
908
+ $Confuse4BiGram{1} = 1;
909
+ $match = 1;
910
+ }
911
+ if ($pubnumMatch) {
912
+ $Confuse4BiGram{2} = 1;
913
+ $match = 1;
914
+ }
915
+ if ($noteMatch) {
916
+ $Confuse4BiGram{3} = 1;
917
+ $match = 1;
918
+ }
919
+
920
+ if ($affiMatch) {
921
+ $Confuse4BiGram{4} = 1;
922
+ $match = 1;
923
+ }
924
+
925
+ if ($match == 0) { next; }
926
+
927
+ $words[$i] = "\:Confuse4BiGram";
928
+ foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
929
+ $words[$i] .= "$Confuse4BiGram{$ind}";
930
+ }
931
+ $words[$i] .= "\:";
932
+
933
+ if ($words[$i] eq "\:Confuse4BiGram1000\:") {
934
+ $words[$i-1] = "";
935
+ $words[$i] = ":degree:";
936
+ $degreeNum++;
937
+ }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
938
+ $words[$i-1] = "";
939
+ $words[$i] = ":pubnum:";
940
+ $pubNum++;
941
+ }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
942
+ $words[$i-1] = "";
943
+ $words[$i] = ":note:";
944
+ $noteNum++;
945
+ }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
946
+ $words[$i-1] = "";
947
+ $words[$i] = ":affi:";
948
+ $affiNum++;
949
+ }
950
+ }
951
+ }#end with neutral bigram
952
+
953
+ # single words match on Pubnum, notes and degree!
954
+ for my $i(0 .. $#words) {
955
+ if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
956
+ if ($neutral) {
957
+ my %Confuse4Single = (
958
+ 1 => 0,
959
+ 2 => 0,
960
+ 3 => 0,
961
+ 4 => 0
962
+ );
963
+ my $match = 0;
964
+ my $degreeMatch;
965
+ my $pubnumMatch;
966
+ my $noteMatch;
967
+ my $affiMatch;
968
+ my $stemword;
969
+
970
+ if ($stem) {
971
+ $stemword = &PSTEM::stem($stemword);
972
+ $degreeMatch = $degreeH{$stemword};
973
+ $pubnumMatch = $pubnumH{$stemword};
974
+ $noteMatch = $noteH{$stemword};
975
+ $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
976
+ }else {
977
+ $degreeMatch = $degreeH{lc($words[$i])};
978
+ $pubnumMatch = $pubnumH{lc($words[$i])};
979
+ $noteMatch = $noteH{lc($words[$i])};
980
+ $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
981
+ }
982
+
983
+ #because hhan@cse.psu.edu will become hhan.psu.edu after stemming
984
+ #and $stemword is lower case
985
+ if ($degreeMatch) {
986
+ $Confuse4Single{1} = 1;
987
+ $match = 1;
988
+ }
989
+ if ($pubnumMatch) {
990
+ $Confuse4Single{2} = 1;
991
+ $match = 1;
992
+ }
993
+ if ($noteMatch) {
994
+ $Confuse4Single{3} = 1;
995
+ $match = 1;
996
+ }
997
+ if ($affiMatch) {
998
+ $Confuse4Single{4} = 1;
999
+ $match = 1;
1000
+ }
1001
+
1002
+ if ($match) {
1003
+ $words[$i] = "\:Confuse4Single";
1004
+ foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
1005
+ $words[$i] .= "$Confuse4Single{$ind}";
1006
+ }
1007
+ $words[$i] .= "\:";
1008
+ if ($words[$i] eq "\:Confuse4Single1000\:") {
1009
+ $words[$i] = ":degree:";
1010
+ $degreeNum++;
1011
+ }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
1012
+ $words[$i] = ":pubnum:";
1013
+ $pubNum++;
1014
+ }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
1015
+ $words[$i] = ":note:";
1016
+ $noteNum++;
1017
+ }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
1018
+ $words[$i] = ":affi:";
1019
+ $affiNum++;
1020
+ }
1021
+ }
1022
+ }# end with neutral
1023
+
1024
+ if ($words[$i] !~ /\:\w+\:/) {
1025
+ if (exists($conjH{$words[$i]})) {
1026
+ $words[$i] = ":conj:";
1027
+ }elsif (exists($prepH{$words[$i]})) {
1028
+ $words[$i] = ":prep:";
1029
+ }elsif ($words[$i] =~ /\@/) {
1030
+ $words[$i] = "\:Email\:";
1031
+ }elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
1032
+ $words[$i] = "\:http\:";
1033
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
1034
+ if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
1035
+ $words[$i] = ":SingleCap:"; #like M
1036
+ $capNum ++; # actually only the number of single cap
1037
+ }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
1038
+ $words[$i] = ":postcode:";
1039
+ }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
1040
+ $words[$i] = ":abstract:";
1041
+ }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
1042
+ $words[$i] = ":keyword:";
1043
+ }elsif ($introH{lc($words[$i])}) {
1044
+ $words[$i] = ":intro:";
1045
+ $introNum++;
1046
+ }elsif ($phoneH{lc($words[$i])}) {
1047
+ $words[$i] = ":phone:";
1048
+ $phoneNum++;
1049
+ }elsif ($monthH{lc($words[$i])}) {
1050
+ $words[$i] = ":month:";
1051
+ $dateNum++;
1052
+ }else {
1053
+ if ($neutral) {
1054
+ if ($addrH{lc($words[$i])}) {
1055
+ $words[$i] = ":addr:";
1056
+ $addrNum++;
1057
+ }elsif ($cityH{lc($words[$i])}) { #If not neutral class
1058
+ $words[$i] = ":city:";
1059
+ $addrNum++;
1060
+ }elsif ($stateH{lc($words[$i])}) {
1061
+ $words[$i] = ":state:";
1062
+ $addrNum++;
1063
+ }elsif ($countryH{lc($words[$i])}) {
1064
+ $words[$i] = ":country:";
1065
+ $addrNum++;
1066
+ }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
1067
+ $words[$i] = ":MayName:";
1068
+ $Cap1NonDictWordNum ++;
1069
+ }elsif ($dictH{lc($words[$i])}) {
1070
+ $words[$i] = ":Cap1DictWord:";
1071
+ $Cap1DictWordNum ++;
1072
+ }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
1073
+ my @Parts = split(/\W+|\-/, $words[$i]);
1074
+ for $i(0 .. $#Parts) {
1075
+ if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
1076
+ my $len = length($Parts[$i]);
1077
+ # $Parts[$i] = "\:LowerWord"."$len"."\:";
1078
+ $Parts[$i] = "\:LowerWords\:";
1079
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
1080
+ my $len = length($Parts[$i]);
1081
+ # $Parts[$i] = "\:CapWord"."$len"."\:";
1082
+ $Parts[$i] = "\:CapWords\:";
1083
+ }elsif ($Parts[$i] =~ /^\d+$/) {
1084
+ my $len = length($Parts[$i]);
1085
+ # $Parts[$i] = "\:Dig\[$len\]\:";
1086
+ $Parts[$i] = "\:Digs\:";
1087
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
1088
+ $Parts[$i] = "\:MixCaseWords\:";
1089
+ }else {
1090
+ my $len = length($Parts[$i]);
1091
+ $Parts[$i] = "\:Mix\[$len\]\:";
1092
+ }
1093
+ }
1094
+ $words[$i] = join("\-", @Parts);
1095
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
1096
+ my $len = length($words[$i]);
1097
+ $words[$i] = "\:CapWord"."$len"."\:";
1098
+ # $words[$i] = "\:CapWords\:";
1099
+ }else {
1100
+ $words[$i] = ":Cap1NonDictWord:";
1101
+ $Cap1NonDictWordNum ++;
1102
+ }
1103
+ }else {#end with neutral
1104
+
1105
+ if ($degreeH{lc($words[$i])}) {
1106
+ $words[$i] = ":degree:";
1107
+ $degreeNum++;
1108
+ }elsif ($pubnumH{lc($words[$i])}) {
1109
+ $words[$i] = ":pubnum:";
1110
+ $pubNum++;
1111
+ }elsif ($noteH{lc($words[$i])}) {
1112
+ $words[$i] = ":note:";
1113
+ $noteNum++;
1114
+ }elsif ($monthH{lc($words[$i])}) {
1115
+ $words[$i] = ":month:";
1116
+ $dateNum++;
1117
+ }elsif ($affiH{lc($words[$i])}) {
1118
+ $words[$i] = ":affi:";
1119
+ $affiNum++;
1120
+ }elsif ($addrH{lc($words[$i])}) {
1121
+ $words[$i] = ":addr:";
1122
+ $addrNum++;
1123
+ }elsif ($cityH{lc($words[$i])}) { #If not neutral class
1124
+ $words[$i] = ":city:";
1125
+ # $words[$i] = ":addr:";
1126
+ $addrNum++;
1127
+ }elsif ($stateH{lc($words[$i])}) {
1128
+ $words[$i] = ":state:";
1129
+ # $words[$i] = ":addr:";
1130
+ $addrNum++;
1131
+ }elsif ($countryH{lc($words[$i])}) {
1132
+ $words[$i] = ":country:";
1133
+ # $words[$i] = ":addr:";
1134
+ $addrNum++;
1135
+ }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
1136
+ $words[$i] = ":MayName:";
1137
+ $Cap1NonDictWordNum ++;
1138
+ }elsif ( $dictH{lc($words[$i])}) {
1139
+ $words[$i] = ":Cap1DictWord:";
1140
+ $Cap1DictWordNum ++;
1141
+ }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
1142
+ my @Parts = split(/\W+|\-/, $words[$i]);
1143
+ for $i(0 .. $#Parts) {
1144
+ if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
1145
+ my $len = length($Parts[$i]);
1146
+ # $Parts[$i] = "\:LowerWord"."$len"."\:";
1147
+ $Parts[$i] = "\:LowerWords\:";
1148
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
1149
+ my $len = length($Parts[$i]);
1150
+ # $Parts[$i] = "\:CapWord"."$len"."\:";
1151
+ $Parts[$i] = "\:CapWords\:";
1152
+ }elsif ($Parts[$i] =~ /^\d+$/) {
1153
+ my $len = length($Parts[$i]);
1154
+ # $Parts[$i] = "\:Dig\[$len\]\:";
1155
+ $Parts[$i] = "\:Digs\:";
1156
+ }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
1157
+ $Parts[$i] = "\:MixCaseWords\:";
1158
+ }else {
1159
+ my $len = length($Parts[$i]);
1160
+ $Parts[$i] = "\:Mix\[$len\]\:";
1161
+ }
1162
+ }
1163
+ $words[$i] = join("\-", @Parts);
1164
+ }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
1165
+ my $len = length($words[$i]);
1166
+ $words[$i] = "\:CapWord"."$len"."\:";
1167
+ # $words[$i] = "\:CapWords\:";
1168
+ }else {
1169
+ $words[$i] = ":Cap1NonDictWord:";
1170
+ $Cap1NonDictWordNum ++;
1171
+ }
1172
+ }
1173
+ }#end with else neutral
1174
+ }elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
1175
+ if (exists ($phoneH{$words[$i]})) {
1176
+ $words[$i] = ":phone:";
1177
+ $phoneNum++;
1178
+ }elsif (exists ($monthH{lc($words[$i])})) {
1179
+ $words[$i] = ":month:";
1180
+ $dateNum++;
1181
+ }elsif ($keywordH{lc($words[$i])}) {
1182
+ $words[$i] = ":keyword:";
1183
+ }elsif (exists $dictH{lc($words[$i])}) {
1184
+ $words[$i] = ":DictWord:";
1185
+ $DictWordNum ++;
1186
+ }else {# should consider the mixure of digit and letters
1187
+ $words[$i] = ":NonDictWord:";
1188
+ $NonDictWordNum ++;
1189
+ }
1190
+ }elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
1191
+ my $newword = $words[$i];
1192
+ while ($words[$i] =~ /(\d+)/g) {
1193
+ my $dig = $1;
1194
+ my $diglen = length($dig);
1195
+ $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
1196
+ }
1197
+ $words[$i] = $newword;
1198
+ $digitNum++;
1199
+ }elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
1200
+ my $nonword = $1;
1201
+ my $rest = $2;
1202
+ $words[$i] = $nonword;
1203
+ while (length($rest) > 0) {
1204
+ if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
1205
+ my $tmp = $1;
1206
+ $rest = $2;
1207
+ $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
1208
+ }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
1209
+ my $tmp = $1;
1210
+ $rest = $2;
1211
+ $words[$i] .= "\:LowerWords\:".length($tmp);
1212
+ }elsif ($rest =~ /^(\d+)(.*)$/) {
1213
+ my $tmp = $1;
1214
+ $rest = $2;
1215
+ $words[$i] .= "\:Digs\:".length($tmp);
1216
+ }else { #get the head character
1217
+ my $restLen = length($rest);
1218
+ $restLen--;
1219
+ $words[$i] .= substr($rest, 0, 1);
1220
+ $rest = substr($rest, 1, $restLen);
1221
+ }
1222
+ }
1223
+ }else {
1224
+ $others++;
1225
+ }
1226
+ }
1227
+ }else {
1228
+ # print " already token or punctuation\: $words[$i] \n";
1229
+ }
1230
+ }
1231
+
1232
+ for my $i(0 .. $#words) {
1233
+ # if (exists ($$FeatureDictH{$words[$i]}{ID})) {
1234
+ $TestFeatureVecH{$words[$i]}++;
1235
+ # }
1236
+ }
1237
+
1238
+ # here we add in the bigrams
1239
+ if (length($line) > 1) {
1240
+ for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
1241
+ my $pre = $words[$i-1];
1242
+ my $now = $words[$i];
1243
+ # add bigram into dict and train or test vector
1244
+ # if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
1245
+ $TestFeatureVecH{"$pre $now"}++;
1246
+ # }
1247
+ } # end with bigram features
1248
+ }
1249
+
1250
+ # try to normalize using F1
1251
+ $TestFeatureVecH{CsenLen} = $senLen;
1252
+ if ($senLen > 0) {
1253
+ $TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
1254
+ $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
1255
+ $TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
1256
+ $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
1257
+ $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
1258
+ $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
1259
+ $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
1260
+ $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
1261
+ $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
1262
+ $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
1263
+ $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
1264
+ $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
1265
+ $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
1266
+ $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
1267
+ $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
1268
+ $TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
1269
+ #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
1270
+ }else {
1271
+ #print "null line\: $line \n";
1272
+ }
1273
+
1274
+ if ($FiletoPrint ne "") {
1275
+ open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: could not open $FiletoPrint to write: $!";
1276
+ print PFH "$label ";
1277
+ }
1278
+
1279
+ if (0) {
1280
+ my $SVMFeaVec = ""; #this is a string
1281
+ foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
1282
+ if ($TestFeatureVecH{$feature} != 0){
1283
+ if ($norm) {
1284
+ if ($$FeatureDictH{$feature}{max} != 0) {
1285
+ # print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
1286
+ my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
1287
+ $TestFeatureVecH{$feature} = $tmpval;
1288
+ #print " $TestFeatureVecH{$feature} \n";
1289
+ }else {
1290
+ #print "zero max\: $feature \n";
1291
+ }
1292
+ }
1293
+ if ($FiletoPrint ne "") {
1294
+ print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
1295
+ }
1296
+ $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
1297
+ }else {
1298
+ #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
1299
+ }
1300
+ }
1301
+ }
1302
+
1303
+
1304
+ if ($FiletoPrint ne "") {
1305
+ print PFH "\n";
1306
+ close(PFH);
1307
+ }
1308
+
1309
+ my $convertedStr = join(" ", @words);
1310
+ return(\%TestFeatureVecH);
1311
+ # return($SVMFeaVec);
1312
+ #return($convertedStr);
1313
+ }
1314
+
1315
+
1316
# Word-level feature representation for a line.
# Currently a stub: the arguments are accepted but no features are
# produced, so the caller always receives a reference to an empty list.
sub WordFeatureRepre() {
    my ($text_line, $feature_dict) = @_;

    my @feature_line = ();

    return(\@feature_line);
}
1324
+
1325
# Given a line, make the whitespace explicit: outside <<sep>>...<</sep>>
# spans every whitespace run becomes a literal " <space> " token, and a
# single punctuation mark that ends up between two <space> tokens is
# re-exposed with plain spaces around it.
# Returns ($punc, $content):
#   $punc    - 1 when the line contains punctuation (any character outside
#              letters/digits/whitespace/'-'/'.'/sep-markup characters, or
#              the word "and"), 0 otherwise.
#   $content - the rewritten line.
# (Removed the unused $lineNO local from the original.)
sub FillSpace() { #recognize <<sep>>, instead of <sep>
    my $content = shift;

    # Drop whitespace touching the sep markers so it does not generate
    # spurious <space> tokens.
    $content =~ s/\s+<<sep>>/<<sep>>/g;
    $content =~ s/<<\/sep>>\s+/<<\/sep>>/g;

    my $punc = 0; # space is the only separator
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<<sep>>)(<<\/sep>>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # The capturing group keeps the <<sep>>...<</sep>> spans in @Seq.
    my @Seq = split(/(<<sep>>[^\<\>]*<<\/sep>>)/, $content);
    for my $i (0 .. $#Seq) {
        if ($Seq[$i] =~ /<<sep>>/) {
            # Separator span: left untouched.
        }else {
            # Plain text: strip space around sep markers again, make every
            # whitespace run explicit, then unwrap punctuation that is
            # flanked by <space> tokens.
            $Seq[$i] =~ s/\s+<<sep>>/<<sep>>/g;
            $Seq[$i] =~ s/<<\/sep>>\s+/<<\/sep>>/g;
            $Seq[$i] =~ s/\s+/ \<space\> /g;
            $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
        }
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
1357
+
1358
+
1359
# Given a line, make the whitespace explicit.  Identical to FillSpace()
# except that the separator markup uses single angle brackets
# (<sep>...</sep>) as produced by the offline pipeline.
# Returns ($punc, $content); see FillSpace() for the meaning.
# (Removed the unused $lineNO local from the original.)
sub OfflineFillSpace() { #recognize <sep>
    my $content = shift;

    # Drop whitespace touching the sep markers.
    $content =~ s/\s+<sep>/<sep>/g;
    $content =~ s/<\/sep>\s+/<\/sep>/g;

    my $punc = 0; # space is the only separator
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<sep>)(<\/sep>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # The capturing group keeps the <sep>...</sep> spans in @Seq.
    my @Seq = split(/(<sep>[^\<\>]*<\/sep>)/, $content);
    for my $i (0 .. $#Seq) {
        if ($Seq[$i] =~ /<sep>/) {
            # Separator span: left untouched.
        }else {
            # Plain text: strip space around sep markers again, make every
            # whitespace run explicit, then unwrap punctuation flanked by
            # <space> tokens.
            $Seq[$i] =~ s/\s+<sep>/<sep>/g;
            $Seq[$i] =~ s/<\/sep>\s+/<\/sep>/g;
            $Seq[$i] =~ s/\s+/ \<space\> /g;
            $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
        }
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
1391
+
1392
+
1393
# Legacy (2001-08 backup) variant of SeparatePunc(): trims the line, pads
# punctuation with spaces, and detaches a trailing period from words of
# three or more characters.
# NOTE(review): unlike the live SeparatePunc(), the second substitution
# here discards the word itself ("Dept. of" -> " . of") -- preserved
# verbatim since this is the archived behaviour.
sub SeparatePunc0108bak() {
    my $text = shift;

    # Trim both ends (added 12/16 in the original).
    $text =~ s/^\s+//g;
    $text =~ s/\s+$//g;

    # Pad punctuation (anything outside letters/digits/space/'-'/'<'/'>'/'.')
    # that is followed by whitespace.
    $text =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    # Detach a trailing dot from 3+ character words (word is dropped here).
    $text =~ s/[\w+]{3,}(\.)\s+/ $1 /g;
    # Collapse the first whitespace run only (no /g in the original).
    $text =~ s/\s+/ /;

    return($text);
}
1406
+
1407
# Trim a line, pad punctuation with spaces, and split a trailing period
# off words of three or more characters: "Dept. of" becomes "Dept . of".
# Short abbreviations such as "Dr." or "Sep." keep their dot attached
# because of the {3,} length requirement.
sub SeparatePunc() {
    my $text = shift;

    $text =~ s/^\s+//g;
    $text =~ s/\s+$//g;

    # Pad punctuation followed by whitespace.
    $text =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    $text =~ s/([\w+]{3,})(\.)\s+/$1 $2 /g; # "Dept. of" becomes "Dept . of"

    # $text =~ s/\W+$//g; #remove last punctuation
    # NOTE(review): no /g below, so only the first whitespace run is
    # collapsed -- presumably deliberate for this pipeline; confirm.
    $text =~ s/\s+/ /;

    return($text);
}
1425
+
1426
+
1427
# Heuristic check of whether an author-name candidate string is "weird",
# i.e. looks like an affiliation, title or document type rather than a
# personal name, normalising it at the same time.
# ("weired" is the original author's spelling, preserved for callers.)
# Input : $str - candidate author string.
# Output: ($weired, $str) where $weired is 1 when the string is rejected
#         and $str is the cleaned, ucfirst-normalised name.
sub weired_author(){
    my $str = shift;

    my $weired = 0;
    # Words whose presence marks the string as a non-name (institutions,
    # document types, titles ...).  Note: 'Submitted' appears twice; the
    # duplicate key in a hash literal is harmless.
    my %weired_words = (
        'Departamento' =>1,
        'IN PRESS'=>1,
        'PRESS'=>1,
        'Center'=>1,
        'Ltd' =>1,
        'Universidad'=>1,
        'chair' =>1,
        'Submitted'=>1,
        'pp'=>1,
        'Version'=>1,
        'Thesis' =>1,
        'Proposal' =>1,
        'University'=>1,
        'Universiteit'=>1,
        'Institut'=>1,
        'extended'=>1,
        'abstract'=>1,
        'Laboratoire'=>1,
        'COVER PAGE'=>1,
        'COVER'=>1,
        'Page' => 1,
        'Job Title'=>1,
        'Job'=>1,
        'Title'=>1,
        'Case Study'=>1,
        'Case Sludy'=>1,
        'Case'=>1,
        'Report'=>1,
        'Reply'=>1,
        'A Report'=>1,
        'A Reply'=>1,
        'Research'=>1,
        'Paper'=>1,
        'Research Paper'=>1,
        'Research Project'=>1,
        'Project'=>1,
        'Retrospective'=>1,
        'Roadmap'=>1,
        'Tutorial'=>1,
        'WORKING PAPER'=>1,
        'Working' =>1,
        'White Paper'=>1,
        'in honor of'=>1,
        'international' =>1,
        'Dataset' =>1,
        'Sample' =>1,
        'Network'=>1,
        'Networks'=>1,
        'Academiae'=>1,
        'company'=>1,
        'Submitted'=>1,
    );

    # Filler words that are silently dropped from the name (compared
    # case-insensitively) instead of causing rejection.
    my %filter_words = (
        'honor'=>1,
        'ed'=>1,
        'eds'=>1,
        'jr'=>1,
        'jr\.'=>1,
        'authors'=>1,
        'author' =>1,
        'editor'=>1,
        'editors'=>1,
        'with'=>1,
        'by'=>1,
    );

    #if separate authors into individuals.
    ## $str =~ s/^\s*[^\p{IsLower}\p{IsUpper}\d\-\.]//g;
    # $str =~ s/[^\p{IsLower}\p{IsUpper}\d\-\.]\s*$//g;

    # Build an alternation of all reject words; it is used below as an
    # unanchored, case-sensitive substring match (so e.g. 'pp' matches
    # inside longer words -- NOTE(review): confirm this is intended).
    my @weired_words_arr = keys %weired_words;
    my $weired_words_str = join("|", @weired_words_arr);

    #print "\n\nbefore: $str\n";
    # Normalise: space out periods, drop digits, strip leading/trailing
    # punctuation, squeeze whitespace.
    $str =~ s/\./\. /g;
    $str =~ s/\d+//g;
    $str =~ s/^\s*\W+//g;
    $str =~ s/\W+\s*$//g;
    $str =~ s/\s+/ /g;
    $str = &str_space_clean($str);
    #print "after: $str \n";

    my @words = split(/\s+/, $str);
    my $lcase_num = 0;          # words that are entirely lower-case
    my $weired_form = 0;        # words containing unexpected characters
    my @new_name = ();          # surviving, normalised words
    my $pure_single_letter = 1; # stays 1 while every word is an initial
    for my $i(0 .. $#words) {
        # Any token longer than one char that is not an initial ("X.")
        # clears the all-initials flag.
        if ( (length($words[$i]) > 1) && ($words[$i] !~ /^\w\.$/)) {
            $pure_single_letter = 0;
        }
        if ($filter_words{lc($words[$i])} || ($words[$i] !~ /\w/)) {
            next;   # drop filler words and punctuation-only tokens
        }else {
            if ($words[$i] =~ /^[\p{IsLower}\-]+$/) {
                $lcase_num++;
            }elsif ($words[$i] =~ /[^\p{IsLower}\p{IsUpper}\-\.]/) {
                $weired_form++;
            }
            #make the first letter capitalized
            $words[$i] = ucfirst(lc($words[$i]));
            push @new_name, $words[$i];
        }
    }
    # Reject when: all tokens are initials, a reject word appears, more
    # than five words, fewer than two kept words, fewer than two
    # well-formed words, or more than two all-lower-case words.
    if (($pure_single_letter) || ($str =~ /$weired_words_str/) || ($#words > 4) || ($#new_name <1) || (($#words +1 - $weired_form) < 2) || ($lcase_num>2)) {
        $weired = 1;
    }
    #print "weired:? $weired \n";
    $str = join(' ', @new_name);
    #print "final str $str\n";

    return($weired, $str);
}
1546
+
1547
+
1548
+
1549
# Load $Database_Dir/stopwords (one word per line) into a hash.
# Returns a hashref mapping stopword => occurrence count.
sub hash_stopwords {
    my $stopword = "$Database_Dir/stopwords";
    my %stopH = ();
    # 3-arg open with a lexical handle (the original used a 2-arg open on
    # the bareword handle stopReader).
    open(my $stop_fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$stop_fh>) {
        $line = &str_space_clean($line);
        $stopH{$line}++;
    }
    close($stop_fh);
    return(\%stopH);
}
1561
+
1562
# Load $Database_Dir/affi.txt into a hash of affiliation words.  Each
# line may carry a leading numeric count, which is stripped; the word is
# lower-cased.  Returns a hashref mapping lc(word) => occurrence count.
sub hash_affi_stopwords {
    my $stopword = "$Database_Dir/affi.txt";
    my %stopH = ();
    # 3-arg open with a lexical handle (original: 2-arg open, bareword FH).
    open(my $affi_fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$affi_fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;      # drop a leading frequency count
        $stopH{lc($line)}++;
    }
    close($affi_fh);
    return(\%stopH);
}
1575
+
1576
# Load $Database_Dir/nickname.txt into a two-level hash:
# $h{lc(canonical)}{lc(alias)} = 1.  Each line is split on "<>" or commas;
# element 0 is the canonical first name, the rest are its aliases.
sub hash_nickname{
    my $stopword = "$Database_Dir/nickname.txt";
    my %stopH = ();
    # 3-arg open with a lexical handle (original: 2-arg open, bareword FH).
    open(my $nick_fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$nick_fh>) {
        $line = &str_space_clean($line);
        my @names = split(/<>|\s*\,\s*/, $line);
        for my $i(1 .. $#names) {
            $stopH{lc($names[0])}{lc($names[$i])} = 1;
        }
    }
    close($nick_fh);
    return(\%stopH);
}
1591
+
1592
# Load $Database_Dir/statename.txt, one "State, AB" pair per line.
# Returns a hashref mapping abbreviation => full state name.
sub hash_statewords {
    my $stopword = "$Database_Dir/statename.txt";
    my %stopH = ();
    # 3-arg open with a lexical handle (original: 2-arg open, bareword FH).
    open(my $state_fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$state_fh>) {
        $line = &str_space_clean($line);
        my ($state, $abbr) = split(/\s*\,\s*/, $line);
        $stopH{$abbr} = $state;
    }
    close($state_fh);
    return(\%stopH);
}
1605
+
1606
+
1607
# Load $Database_Dir/addr.txt into a hash of address words.  A leading
# numeric count per line is stripped; words are lower-cased.
# Returns a hashref mapping lc(word) => occurrence count.
sub hash_addrwords {
    my $stopword = "$Database_Dir/addr.txt";
    my %stopH = ();
    # 3-arg open with a lexical handle (original: 2-arg open, bareword FH).
    open(my $addr_fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$addr_fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;      # drop a leading frequency count
        $stopH{lc($line)}++;
    }
    close($addr_fh);
    return(\%stopH);
}
1620
+
1621
# Normalise whitespace in a string: collapse every internal run of
# whitespace to a single space and trim both ends.
sub str_space_clean() {
    my $text = shift;

    $text =~ s/\s+/ /g;     # squeeze internal whitespace
    $text =~ s/^\s+//;      # trim left
    $text =~ s/\s+$//;      # trim right
    return($text);
}
1629
+
1630
# Serialise structure $H with Storable::nfreeze and write the frozen
# bytes to file $F.  Counterpart of thaw_hash_from_file().
# Fixes: lexical handle + 3-arg open (was bareword dumpFH / 2-arg),
# binmode for the binary Storable stream, and a checked close so
# buffered write errors are not silently lost.
sub nfreeze_hash_to_file() {
    my $H = shift;
    my $F = shift;

    my $frozen = nfreeze($H);
    open(my $dump_fh, '>', $F) || die "SVMHeaderParse: could not open $F to write: $!";
    binmode($dump_fh);                  # frozen data is binary
    print {$dump_fh} $frozen;
    close($dump_fh) || die "SVMHeaderParse: could not close $F after write: $!";
}
1639
+
1640
# Write structure $H to file $F as Data::Dumper text ('$VAR1 = ...;').
# Counterpart of read_hash_from_file().
# Fixes: $d and $mystring were undeclared package globals in the
# original (breaks under 'use strict'); bareword dumpFH / 2-arg open
# replaced by a lexical handle with 3-arg open and a checked close.
sub dump_hash_to_file() {
    my $H = shift;
    my $F = shift;

    my $dumper = Data::Dumper->new([$H]);
    my $mystring = $dumper->Dump;
    open(my $dump_fh, '>', $F) || die "SVMHeaderParse: could not open $F to write: $!";
    print {$dump_fh} $mystring;
    close($dump_fh) || die "SVMHeaderParse: could not close $F after write: $!";
}
1650
+
1651
+
1652
# Read a Data::Dumper dump written by dump_hash_to_file() and return the
# reconstructed structure (the value assigned to $VAR1 in the dump).
# SECURITY: the file content is executed via eval(); only ever call this
# on trusted, locally generated files.
# Fixes: slurp mode is now local-ised (the original assigned the global
# $/ and hard-reset it to "\n", clobbering any caller setting); the
# result is taken from eval's return value instead of the package-global
# $VAR1; lexical handle + 3-arg open.
sub read_hash_from_file() {
    my $file = shift;

    open(my $dump_fh, '<', $file) || die "SVMHeaderParse: could not open $file to read. \n";
    my $string = do { local $/; <$dump_fh> };   # slurp whole file
    close($dump_fh);

    # Dumper output has the form '$VAR1 = ...;', so eval returns the
    # assigned structure (undef on a corrupt dump, as before).
    my $data = eval($string);
    return($data);
}
1664
+
1665
# Read a Storable nfreeze dump (as written by nfreeze_hash_to_file())
# and return the thawed structure.
# Fixes: slurp mode is local-ised (the original assigned the global $/
# and hard-reset it to "\n"); binmode for the binary stream; lexical
# handle + 3-arg open (was bareword dumpFH / 2-arg).
sub thaw_hash_from_file() {
    my $file = shift;

    open(my $dump_fh, '<', $file) || die "SVMHeaderParse: could not open $file to read. \n";
    binmode($dump_fh);                           # frozen data is binary
    my $string = do { local $/; <$dump_fh> };    # slurp whole file
    close($dump_fh);

    my $VAR1 = thaw($string);
    return($VAR1);
}
1677
+
1678
# Randomly split the samples in @$samples into two parts, the first
# holding roughly $ratio of them.  NOTE: @$samples is mutated -- every
# sample drawn into part 1 is removed from it.
# Returns (\@part1, \@part2).
# NOTE(review): $total_num is $#$samples (count - 1), so sizes are
# computed from N-1 rather than N, and part-2 draws are sampled without
# removal (duplicates possible) -- both kept as-is to preserve behaviour.
# Fixed: $t, $j and $r were undeclared package globals.
sub rand_split_samples_to2parts() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 0; $j--) {
        my $r = int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            # Remove the chosen sample by shifting the tail left one slot.
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }else {
            push @part2, $$samples[$r];
        }
    }
    return(\@part1, \@part2);
}
1704
+
1705
# Variant of rand_split_samples_to2parts(): draws ~$ratio of the samples
# into part 1 (removing them from @$samples) and returns the mutated
# input array ref itself as the second part.
# Returns (\@part1, $samples).
# NOTE(review): $total_num is $#$samples (count - 1), so the split size
# is computed from N-1 -- kept as-is to preserve behaviour.
# Fixed: $t, $j and $r were undeclared package globals.
sub rand_split_samples_to2parts_v2() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 0; $j--) {
        my $r = int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            # Remove the chosen sample by shifting the tail left one slot.
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }
    }
    return(\@part1, $samples);
}
1729
+
1730
# Randomly partition the samples in @$samples into $fold folds for cross
# validation.  NOTE: @$samples is consumed (emptied down to one element).
# Returns @data where $data[1] .. $data[$fold] are array refs of samples
# ($data[0] is unused).
# NOTE(review): the loop stops at $j >= 1, so one sample is never
# assigned to any fold -- kept as-is to preserve behaviour.
# Fixed: $t, $j and $r were undeclared package globals.
sub rand_split_samples_toNparts() {
    my $samples = shift; #array
    my $fold = shift;

    my $total_num = $#$samples;
    my $unit = int($total_num/$fold +1);
    my $last_fold = $total_num - $unit*($fold-1);

    my @data = ();
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 1; $j--) {
        my $r = int(rand($j));
        my $subfold = int(($total_num - $j)/$unit) + 1;   # folds are 1-based
        push @{$data[$subfold]}, $$samples[$r];
        # Remove the chosen sample by shifting the tail left one slot.
        for my $k($r .. $#$samples-1) {
            $$samples[$k] = $$samples[$k+1];
        }
        pop @$samples;
    }
    return(@data);
}
1753
+
1754
# Randomly partition a hash of samples into $fold folds for cross
# validation.  From the accesses below, %$sample_hash holds, per key
# $name:
#   {label}               - "-1" or a "<>"-joined list of positive
#                           snippet numbers for that name,
#   {name}{$file_name}{snippet} - snippet text, where $file_name looks
#                           like "something__NUM".
# Each output entry is the string "label<>file_name<>snippet", labelled
# "+1" when the file's trailing number appears in {label}, else "-1".
# Returns @data with folds at 1-based indices ($data[0] unused).
# NOTE(review): $t, $j and $r are package globals (no "my") -- breaks
# under 'use strict'; also the loop stops at $j >= 1, so one name is
# never assigned to a fold.  Confirm before relying on exact coverage.
sub rand_split_hash_index_toNparts() {
    my $sample_hash = shift; #hash
    my $fold = shift;

    my @sample_arr = keys %{$sample_hash};
    my $total_num = $#sample_arr;
    my $unit = int($total_num/$fold +1);          # names per fold
    my $last_fold = $total_num - $unit*($fold-1);

    my @data = ();
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=1;$j--){
        $r=int(rand($j));                          # pick a random remaining name
        my $subfold = int(($total_num - $j)/$unit) + 1;   # 1-based fold index

        my $name = $sample_arr[$r];
        # Collect the positive snippet numbers for this name.
        my %pos = ();
        if ($$sample_hash{$name}{label} > -1) {
            my @tmp = split(/\<\>/, $$sample_hash{$name}{label});
            map { $pos{$_} =1 } @tmp;
        }
        # Emit one "label<>file<>snippet" record per file of this name.
        foreach my $file_name (keys %{$$sample_hash{$name}{name}}) {
            my ($tmp, $num) = split(/\_\_/, $file_name);  # "base__NUM"
            my $label = "-1";
            if ($pos{$num}) {
                $label = "+1";
            }
            push @{$data[$subfold]}, "$label<>$file_name<>$$sample_hash{$name}{name}{$file_name}{snippet}";
        }

        # Remove the chosen name by shifting the tail left one slot.
        for my $k($r .. $#sample_arr-1) {
            $sample_arr[$k] = $sample_arr[$k+1];
        }
        pop @sample_arr;
    }
    return(@data);
}
1792
+
1793
# Average cross-validation results from file $in.  Scans for lines of
# the form "Accuracy on test set: X%" and
# "Precision/recall on test set: P%/R%", accumulates per-measure sums
# and counts under the keys A, P and R, prints each average to STDERR
# and returns the result hash ref (the return value is new and
# backward-compatible; the original returned nothing useful).
# Fixes: the sort used numeric <=> on the string keys A/P/R (all compare
# as 0, with warnings) -- now cmp; lexical handle + 3-arg open.
sub ExtractBinaryNfoldSVMResult() {
    my $in = shift;
    my %ResultH = ();

    open(my $in_fh, '<', $in) || die "SVMHeaderParse: could not open $in to read \n";
    while (my $line = <$in_fh>) {
        if ($line =~ /Accuracy on test set: (\d+\.\d+)\%/) {
            $ResultH{A}{count}++;
            $ResultH{A}{sum} += $1;
        }
        if ($line =~ /Precision\/recall on test set\: (.*)\%\/(.*)\%/) {
            my $P = $1;
            my $R = $2;
            # Only count numeric fields -- presumably skips folds where
            # the tool printed a non-numeric value; confirm against the
            # SVM-light output format.
            if ($P =~ /\d+\.\d+/) {
                $ResultH{P}{count}++;
                $ResultH{P}{sum} += $P;
            }
            if ($R =~ /\d+\.\d+/) {
                $ResultH{R}{count}++;
                $ResultH{R}{sum} += $R;
            }
        }
    }
    close($in_fh);

    print STDERR "average result from cross validation \n";
    foreach my $eval (sort {$a cmp $b} keys %ResultH) {
        $ResultH{$eval}{avg} = sprintf("%.8f", $ResultH{$eval}{sum}/$ResultH{$eval}{count});
        print STDERR "evaluation($eval) -- $ResultH{$eval}{avg}\n";
    }
    return(\%ResultH);
}
1824
+
1825
## get alias file
# Build a hash of spelling variations for a personal name such as
# "Chris S. Mellish".  Keys are the variant strings; values label the
# variant kind (FN, LN, FILN, FNLI, FILI, FNMN, all_initial, FIMI1LN,
# FIMI1MI2LN, partial_LN).  Legacy version; GetNameVariations() is the
# richer one that also handles nicknames.
# Fixes vs the original: "next" outside any loop (now an explicit
# return); "$#NameParts" typo for $#QueryNameParts, which made the
# 4-part branch unreachable; the undefined $FN interpolation in the FNMN
# key (now $FirstName, matching GetNameVariations); $FNLI and
# $PartLastName are now lexically declared; the initializer list matches
# the variable count.
sub GetNameVariations1() {
    my $personalName = shift; #like _Chris_S._Mellish__1.txt

    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;
    my ($FirstName, $LastName, $FI, $LI, $MI1, $MI2, $FI_LI, $AllInitial,
        $AllName, $FILN, $FNLI, $FIMI1LastName, $FIMI1MI2LastName)
        = ('','','','','','','','','','','','','');

    $FirstName = $QueryNameParts[0];
    $LastName = $QueryNameParts[$#QueryNameParts];

    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName} = "LN";

    $FI = substr($FirstName, 0, 1);
    $LI = substr($LastName, 0, 1);

    $FI_LI= "$FI"."$LI";
    $FILN = "$FI"."$LastName";
    $FNLI = "$FirstName"."$LI";
    $NameVariations{$FILN} = "FILN";
    $NameVariations{$FNLI} = "FNLI";

    # Strip punctuation from each part; $AllName accumulates them.
    for my $i(0 .. $#QueryNameParts) {
        $QueryNameParts[$i] =~ s/\W+//g;
        $AllName .= $QueryNameParts[$i];
    }
    # depends on whether this name contains 2, 3 or 4 parts
    if ($#QueryNameParts < 1) {
        # Single-token name: nothing further to derive.  (The original
        # used "next" here, which exits a sub only with a warning.)
        return(\%NameVariations);
    }
    if ($#QueryNameParts eq 1) {
        $AllInitial = $FI_LI;
        $NameVariations{$AllInitial} = "all_initial";
    }else {
        $NameVariations{$FI_LI} = "FILI";
        $NameVariations{"$FirstName"."$QueryNameParts[1]"} = "FNMN";
        if ($#QueryNameParts eq 2) {
            $MI1 = substr($QueryNameParts[1], 0, 1);
            $AllInitial= "$FI"."$MI1"."$LI";
            $FIMI1LastName = "$FI"."$MI1"."$LastName";
            $NameVariations{$AllInitial} = "all_initial";
            $NameVariations{$FIMI1LastName} = "FIMI1LN";
        }elsif ($#QueryNameParts eq 3) {
            $MI1 = substr($QueryNameParts[1], 0, 1);
            $MI2 = substr($QueryNameParts[2], 0, 1);
            $AllInitial = "$FI"."$MI1"."$MI2"."$LI";
            $FIMI1LastName = "$FI"."$MI1"."$LastName";
            $FIMI1MI2LastName = "$FI"."$MI1"."$MI2"."$LastName";
            $NameVariations{$AllInitial} = "all_initial";
            $NameVariations{$FIMI1LastName} = "FIMI1LN";
            $NameVariations{$FIMI1MI2LastName} = "FIMI1MI2LN";
        }
    }

    ## It will take chance for this exact match
    # NOTE(review): "< 4" looks inverted (substr 0,5 of a <4-char name is
    # the whole name) -- kept as-is to preserve behaviour; confirm intent.
    if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
        my $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
        $NameVariations{$PartLastName} = "partial_LN";
    }
    return(\%NameVariations);
}
1886
+
1887
# Build a hash of spelling variations for a personal name such as
# "Chris S. Mellish", including nickname expansions of the first name.
# $nickname is a hashref of the form returned by hash_nickname():
# $nickname->{lc(first_name)}{alias} = 1.  Keys of the returned hash are
# variant strings (some deliberately contain literal regex fragments
# like "\w*" for later pattern matching); values label the variant kind.
# Fixes vs the original: "next" outside any loop replaced by an empty
# guard branch; $FNLI, $LNFI, $FIMI1LI, $FIMI2LI, $MN2LastName and
# $PartLastName are now lexically declared; the initializer list matches
# the variable count.
sub GetNameVariations() {
    my $personalName = shift; #like _Chris_S._Mellish__1.txt
    my $nickname = shift;

    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;
    my ($FirstName, $LastName, $FI, $LI, $MI1, $MI2, $FI_LI, $AllInitial,
        $AllName, $FILN, $FNLI, $LNFI, $FIMI1LastName, $FIMI1MI2LastName,
        $FIMI1LI, $FIMI2LI, $MN2LastName)
        = ('','','','','','','','','','','','','','','','','');

    $FirstName = $QueryNameParts[0];
    $LastName = $QueryNameParts[$#QueryNameParts];
    # using first 5 letters decreases performance if not using substring matching
    # if (length($QueryNameParts[0]) > 4) {
    #   $FirstName = substr($QueryNameParts[0],0,5);
    # }
    # if (length($QueryNameParts[$#QueryNameParts]) > 4) {
    #   $LastName = substr($QueryNameParts[$#QueryNameParts],0,5);
    # }
    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName} = "LN";
    # Every known nickname of the first name is also an FN variant.
    foreach my $alias(keys %{$$nickname{lc($FirstName)}}) {
        $NameVariations{$alias} = "FN";
    }
    $FI = substr($FirstName, 0, 1);
    $LI = substr($LastName, 0, 1);

    $FI_LI= "$FI"."$LI";
    $FILN = "$FI"."\\"."w*"."$LastName";   # literal "\w*" regex fragment
    $FNLI = "$FirstName"."$LI";
    $LNFI = "$LastName"."$FI";
    $NameVariations{$FILN} = "FILN";
    $NameVariations{"$FI"."\."."$LastName"} = "FILN";
    $NameVariations{$FNLI} = "FNLI";
    $NameVariations{$LNFI} = "LNFI";

    # Strip punctuation from each part; $AllName accumulates them.
    for my $i(0 .. $#QueryNameParts) {
        $QueryNameParts[$i] =~ s/\W+//g;
        $AllName .= $QueryNameParts[$i];
    }
    $NameVariations{$AllName} = "full_name";
    $NameVariations{"\\"."w*"."$FirstName"."\\"."w*"."$LastName"."\\"."w*"} = "FNLN";
    $NameVariations{"$FirstName"."\."."$LastName"} = "FNLN";
    $NameVariations{"$LastName"."$FirstName"} = "LNFN";

    # depends on whether this name contains 3 parts or 4 parts
    if ($#QueryNameParts < 1) {
        # Single-token name: no middle/initial variants to derive.
        # (The original used "next" here, which exits a sub with a warning.)
    }
    elsif ($#QueryNameParts eq 1) {
        $AllInitial = $FI_LI;
        $NameVariations{$AllInitial} = "all_initial";
    }else {
        $NameVariations{$FI_LI} = "FILI";
        $NameVariations{"$FirstName"."$QueryNameParts[1]"} = "FNMN";
        if ($#QueryNameParts eq 2) {
            $MI1 = substr($QueryNameParts[1], 0, 1);
            $AllInitial= "$FI"."$MI1"."$LI";
            $FIMI1LastName = "$FI"."$MI1"."$LastName";
            $NameVariations{$AllInitial} = "all_initial";
            $NameVariations{$FIMI1LastName} = "FIMI1LN";
        }elsif ($#QueryNameParts eq 3) {
            $MI1 = substr($QueryNameParts[1], 0, 1);
            $MI2 = substr($QueryNameParts[2], 0, 1);
            $AllInitial = "$FI"."$MI1"."$MI2"."$LI";
            $FIMI1LastName = "$FI"."$MI1"."$LastName";
            $FIMI2LI = "$FI"."$MI2"."$LI";
            $FIMI1LI = "$FI"."$MI1"."$LI";
            $FIMI1MI2LastName = "$FI"."$MI1"."$MI2"."$LastName";
            $MN2LastName = "$QueryNameParts[2]"."$LastName";
            $NameVariations{$AllInitial} = "all_initial";
            $NameVariations{$FIMI1LastName} = "FIMI1LN";
            $NameVariations{$FIMI1LI} = "FIMI1LI";
            $NameVariations{$FIMI2LI} = "FIMI2LI";
            $NameVariations{$MN2LastName} = "MI2LN";
            $NameVariations{$FIMI1MI2LastName} = "FIMI1MI2LN";
        }
    }

    ## It will take chance for this exact match
    # NOTE(review): "< 4" looks inverted (substr 0,5 of a <4-char name is
    # the whole name) -- kept as-is to preserve behaviour; confirm intent.
    if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
        my $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
        $NameVariations{$PartLastName} = "partial_LN";
    }
    return(\%NameVariations);
}
1970
+
1971
# Convert the bundled HTML university list into a simple "College<>URL"
# text file ($Database_Dir/university_list.txt), one line per matching
# '<LI> <A HREF="...">Name</A>' entry.
# NOTE(review): %H is never populated -- callers appear to rely only on
# the side effect of writing the file; the (empty) hashref return is
# kept for compatibility.
# Fixes: lexical handles + 3-arg open (was bareword UNIV/simpleWriter
# with 2-arg open).
sub get_university_emails() {
    my $univ = "$Database_Dir/university_list/univ-full.html";
    my $simple_format = "$Database_Dir/university_list.txt";

    my %H = ();
    open(my $univ_fh, '<', $univ) || die "SVMHeaderParse: could not open $univ to read. \n";
    my @content = <$univ_fh>;
    close($univ_fh);

    open(my $out_fh, '>', $simple_format) || die "SVMHeaderParse: could not open $simple_format to write: $!";
    for my $i(0 .. $#content) {
        # Matches lines like: <LI> <A HREF="http://...">College Name</A>
        if ($content[$i] =~ /\<LI\>\s+\<A\s+HREF\=\"([^\"]*)\"\>(.*)\<\/A\>/) {
            my $url = $1;
            my $college = $2;
            print {$out_fh} "$college<>$url\n";
        }
    }
    close($out_fh);
    return(\%H);
}
1991
+
1992
#input: an array of values (by reference)
# Compute the mean and standard deviation of the numbers in @$arr.
# Returns ($mean, $std).  The mean is rounded to 3 decimals *before* the
# deviation pass and the variance uses the sample (n-1) denominator --
# both kept from the original.
# Fixed: an empty list now returns (0, 0) and a single element returns
# ($mean, 0) instead of dying on division by zero.
sub compute_std() {
    my $arr = shift;
    my $mean = 0;
    my $std = 0;

    my $n = scalar(@$arr);
    return (0, 0) if $n == 0;   # no data: no mean or deviation

    #cal mean
    for my $i(0 .. $#$arr) {
        $mean += $$arr[$i];
    }
    $mean = sprintf("%.3f", $mean/$n);

    return ($mean, 0) if $n == 1;   # sample std undefined for one value

    for my $i(0 .. $#$arr) {
        $std += ($$arr[$i]-$mean)**2;
    }
    my $temp = sprintf("%.8f", $std/($n-1));
    $std = sqrt($temp);

    return($mean, $std);
}
2012
+
2013
+
2014
+
2015
+
2016
+ 1;