biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,968 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::AssembleXMLMetadata;
14
+
15
+ # example input:
16
+ #<DID> 5 </DID>
17
+ #<note>Third ILOG International Users Meeting , 9 10 July 1997 , Paris , France</note>
18
+ #<title>Daily management of an earth observation satellite :</title>
19
+ #<title>comparison of ILOG Solver with dedicated algorithms</title>
20
+ #<title>for Valued Constraint Satisfaction Problems</title>
21
+ #<author>Michel Lemaitre</author>
22
+ #<author>G'erard Verfaillie</author>
23
+ #<abstract>ONERA/CERT</abstract>
24
+ #<abstract>2 , avenue</abstract>
25
+ #<address>Edouard Belin -- BP 4025 --</address>
26
+ #<address>31055 Toulouse cedex 4 -- France</address>
27
+ #<email>fMichel.Lemaitre,Gerard.Verfaillieg@cert.fr</email>
28
+
29
+ ################Function Description########################
30
+ # Find the name and its affiliation/address mapping
31
+ #Assumption:
32
+ #(1) It is valid to split the header by authors into chunks; each resulting chunk has complete information about the authors in that chunk
33
+ #(2) Use edit-distance to find the mapping between authors and emails.
34
+
35
+ #For each author chunk with N authors.
36
+ #case 1: N = 1;
37
+ # (a) if (exists following affi. and addr.) {
38
+ # combine the following affi. and addrs.
39
+ # }
40
+ # (b) else
41
+ # warning
42
+
43
+ #case 2: N > 1;
44
+ # (a) if the following affi. and addrs == 1
45
+ # these N people share the affi. and addr.
46
+ # (b) if the following affi. and addrs == N
47
+ # map 1-1
48
+ # (c) otherwise
49
+ # warning
50
+
51
+ #package finalize_metata_extraction_v4;
52
+ use utf8;
53
+ use HeaderParse::Config::API_Config;
54
+ use Data::Dumper;
55
+ use String::Approx 'adist';
56
+ use HeaderParse::API::Function qw(&weired_author);
57
+ use CSXUtil::SafeText qw(&cleanXML &cleanAll);
58
+ # assemble: takes a reference to a tagged-header string, parses it with parse_xml, and returns the result of output_xml.
59
+ # NOTE(review): the empty prototype `()` declares no parameters although one is read from @_ -- presumably callers use the &-form, which bypasses prototypes; confirm.
60
+ sub assemble(){
61
+ my $rstr = shift;
62
+ # Trim leading and trailing whitespace in the caller's string (it is modified through the reference).
63
+ $$rstr =~ s/^\s+//g;
64
+ $$rstr =~ s/\s+$//g;
65
+ # Split on <DID>n</DID> markers; apart from the splice below, @xml_arr is not read again in this sub.
66
+ my @xml_arr = split(/<DID>\s*(\d+)\s*<\/DID>/, $$rstr);
67
+ # $did and %xml_hash are not declared in this sub; the whole input string is stored under the fixed id 1.
68
+ $did=1;
69
+ $xml_hash{$did} = $$rstr;
70
+
71
+ # Drop the first element of the split (whatever precedes the first <DID> marker, or the whole string when there is none).
72
+ splice(@xml_arr,0,1);
73
+
74
+ my %xml_hash_parsed;
75
+
76
+ # Parse authors and all their attributes; the first return value is used as a hash ref below, the second as a true/false flag.
77
+ ($xml_hash_parsed{$did}, $uncertain) = &parse_xml($did, $xml_hash{$did},$uncertain_addr);
78
+ # Keep the raw input next to the parsed data, then remove it from %xml_hash.
79
+ $xml_hash_parsed{$did}{raw} = $xml_hash{$did};
80
+ delete($xml_hash{$did});
81
+ # A true $uncertain is reported on STDERR and counted in $uncertain_addr (which is not declared in this sub).
82
+ if ($uncertain) {
83
+ print STDERR "\n\n$did has mismatched address parsing \n";
84
+ $uncertain_addr++;
85
+ }
86
+ # Overwrite the caller's string with a Data::Dumper dump of the parsed structure.
87
+ my $handle = Data::Dumper->new([\%xml_hash_parsed]);
88
+ $$rstr = $handle->Dump;
89
+ # Return whatever output_xml produces for the parsed structure (output_xml is defined outside this view).
90
+ my $rFinalStr = &output_xml(\%xml_hash_parsed);
91
+ return $rFinalStr;
92
+ }
93
+
94
+
95
+
96
+ #cluster needs initialization --the first cluster!
97
+ sub parse_xml () {
98
+ my $did = shift;
99
+ my $str = shift;
100
+ my $uncertain_addr = shift;
101
+ my %xml_hash = ();
102
+
103
+ my @lines = split(/\s*\n\s*/, $str);
104
+ my $pre_stat = "";
105
+ my %pre_email = ();
106
+ #pre address/affiliation info.
107
+ my $pre_cluster_id = "";
108
+ my $pre_add_cluster_id = "";
109
+
110
+ my $cluster_affi_exist = 0; # may not be useful
111
+ my $cluster_addr_exist = 0; #may not be useful
112
+
113
+ my $abstractComplete = 0;
114
+
115
+ my $line_count = 0;
116
+ for my $i(0 .. $#lines) {
117
+ my $line = $lines[$i];
118
+ $line = &string_clean($line);
119
+ if ($line =~ /^\s*$/) {next;}
120
+ if ($line =~ /\<(\w+)\>(.*)\<\/\w+\>/) {
121
+ my $tag = $1;
122
+ my $content = $2;
123
+ #$content = &lclean($content);
124
+ $line_count++;
125
+ $content = &string_clean($content);
126
+ if ($pre_stat ne $tag) { #different tags
127
+ if ($tag =~ /abstract/) {
128
+ if (!defined $xml_hash{$tag}) {
129
+ $xml_hash{$tag} .= "$content";
130
+ } else {
131
+ $abstractComplete = 1;
132
+ }
133
+ } elsif ($tag =~ /author/) {
134
+ $content =~ s/\s*(\,|\;)\s*/ /g;
135
+ $content = &string_clean($content);
136
+ $xml_hash{cluster_num}++;
137
+ my $cluster_id = $xml_hash{cluster_num};
138
+ $xml_hash{cluster}{$cluster_id}{start} = $line_count;
139
+ $xml_hash{cluster}{$cluster_id}{end} = $line_count;
140
+
141
+ #heuristically judge the correctness of name parsing, and clean names
142
+ my @multi_names = split(/\s+and\s+/i, $content);
143
+ for my $i(0 .. $#multi_names) {
144
+ my $name = $multi_names[$i];
145
+ my ($weireness, $clean_name) = &weired_author($name);
146
+ if ($weireness) {next;}
147
+ $xml_hash{cluster}{$cluster_id}{author_num}++;
148
+ my $author_id = $xml_hash{cluster}{$cluster_id}{author_num};
149
+ $xml_hash{cluster}{$cluster_id}{author}{$author_id}{name} = $clean_name;
150
+ }
151
+
152
+ #within cluster parameters update
153
+ $cluster_affi_exist = 0;
154
+ $cluster_addr_exist = 0;
155
+ }elsif ($tag =~ /affiliation/) {
156
+ my $cluster_id = $xml_hash{cluster_num};
157
+ if ($cluster_id <0) {
158
+ print STDERR "warning $did has affiliations ahead of authors \n";
159
+ }else{
160
+ #start a new add_cluster regardless what different tags the previous line has
161
+ $xml_hash{cluster}{$cluster_id}{add_cluster_num}++;
162
+ my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num};
163
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}++;
164
+ my $affi_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num};
165
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi}{$affi_id} = $content;
166
+ #not good
167
+ if ($pre_stat eq "email") {
168
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{pre_email} = %pre_email;
169
+ }
170
+ }
171
+ $cluster_affi_exist =1;
172
+ }elsif ($tag =~ /address/) {
173
+ my $cluster_id = $xml_hash{cluster_num};
174
+ if ($cluster_id <0) {
175
+ #print "warning $did has affiliations ahead of authors \n";
176
+ }else{
177
+ if ($pre_stat !~ /affiliation/) {
178
+ $xml_hash{cluster}{$cluster_id}{add_cluster_num}++;
179
+ }
180
+ my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num};
181
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}++;
182
+ my $addr_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num};
183
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr}{$addr_id} = $content;
184
+ #not good
185
+ if ($pre_stat eq "email") {
186
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{pre_email} = %pre_email;
187
+ }
188
+ }
189
+ $cluster_addr_exist =1;
190
+ }elsif ($tag =~ /email/) { # other tags
191
+ if ($content =~ /\@/) {
192
+ $parsed_emails = &parse_email($content);
193
+ #concatenate these emails to the new one
194
+ for my $i(0 .. $#$parsed_emails) {
195
+ $$parsed_emails[$i] = &string_clean($$parsed_emails[$i]);
196
+ $xml_hash{email_num}++;
197
+ my $email_id = $xml_hash{email_num};
198
+ $xml_hash{email}{$email_id} = $$parsed_emails[$i];
199
+ $pre_email{$$parsed_emails[$i]}++;
200
+ #if previous taf is affi/addr. point the pre-affi/addr down to email
201
+ if (($pre_stat eq "affiliation") || ($pre_stat eq "address")) {
202
+ $xml_hash{cluster}{$pre_cluster_id}{add_cluster}{$pre_add_cluster_id}{next_email}{$$parsed_emails[$i]} = 1;
203
+ }
204
+ }
205
+ }
206
+ }else {
207
+ $xml_hash{$tag}= $content;
208
+ }
209
+ }else { #same tags with the previous line
210
+ my $cluster_id = $xml_hash{cluster_num};
211
+ if ($tag =~ /author/) {
212
+ $content =~ s/\s*(\,|\;)\s*/ /g;
213
+ $content = &string_clean($content);
214
+ if ($xml_hash{cluster}{$cluster_id}{end} != ($line_count-1)) {
215
+ die "SVMHeaderParse: $did cluster assignment inappropriate";
216
+ }else {
217
+ $xml_hash{cluster}{$cluster_id}{end} = $line_count;
218
+ #heuristically judge the correctness of name parsing, and clean names
219
+ my @multi_names = split(/\s+and\s+/i, $content);
220
+ for my $i(0 .. $#multi_names) {
221
+ my $name = $multi_names[$i];
222
+ my ($weireness, $clean_name) = &weired_author($name);
223
+ if ($weireness) {next;}
224
+ $xml_hash{cluster}{$cluster_id}{author_num}++;
225
+ my $author_id = $xml_hash{cluster}{$cluster_id}{author_num};
226
+ $xml_hash{cluster}{$cluster_id}{author}{$author_id}{name} = $clean_name;
227
+ }
228
+ }
229
+ }elsif ($tag =~ /affiliation/) {
230
+ my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num};
231
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}++;
232
+ my $affi_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num};
233
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi}{$affi_id} = $content;
234
+ $cluster_affi_exist =1;
235
+ }elsif ($tag =~ /address/) {
236
+ my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num};
237
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}++;
238
+ my $addr_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num};
239
+ $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr}{$addr_id} = $content;
240
+ $cluster_addr_exist =1;
241
+ }elsif ($tag =~ /email/) {# other tags
242
+ if ($content =~ /\@/) {
243
+ $parsed_emails = &parse_email($content);
244
+ #concatenate these emails to the new one
245
+ for my $i(0 .. $#$parsed_emails) {
246
+ $$parsed_emails[$i] = &string_clean($$parsed_emails[$i]);
247
+ $xml_hash{email_num}++;
248
+ my $email_id = $xml_hash{email_num};
249
+ $xml_hash{email}{$email_id} = $$parsed_emails[$i];
250
+ $pre_email{$$parsed_emails[$i]}++;
251
+ }
252
+ }
253
+ } elsif ($tag =~ /abstract/) {
254
+ if ($abstractComplete <= 0) {
255
+ $xml_hash{$tag} .= "\n$content";
256
+ }
257
+ }else {
258
+ $xml_hash{$tag} .= " $content";
259
+ }
260
+ }
261
+ #parameters update
262
+ if (($tag eq "affiliation") || ($tag eq "address")) {
263
+ $pre_cluster_id = $xml_hash{cluster_num};
264
+ $pre_add_cluster_id = $xml_hash{cluster}{$pre_cluster_id}{add_cluster_num};
265
+ %pre_email = ();
266
+ }elsif ($tag ne "email") {
267
+ %pre_email = ();
268
+ $pre_cluster_id = "";
269
+ $pre_add_cluster_id = "";
270
+ }
271
+ $pre_stat = $tag;
272
+ }
273
+ }
274
+
275
+ $xml_hash{'abstractEnded'} = $abstractComplete;
276
+
277
+ #the order of adjusting email and address is non changable
278
+ $xml_hash = &adjust_email(\%xml_hash);
279
+ ($xml_hash, $uncertain) = &adjust_addr($xml_hash, $uncertain_addr);
280
+
281
+ return($xml_hash,$uncertain);
282
+ }
283
+
284
+
285
# Split the text of an e-mail header line into individual addresses.
#
# Recognised layouts:
#   - one address:                          jane@cs.example.edu
#   - local parts sharing a single domain:  {jane, joe; ann}@cs.example.edu
#   - several complete addresses separated by "," / ";" or by whitespace
#
# Parameter: $content - raw line text; a leading "email:"-style label is
#            removed before parsing.
# Returns:   reference to an array of addresses with enclosing brackets and
#            surrounding whitespace stripped. Empty entries are dropped.
#
# The former empty prototype "()" was removed because the sub takes an
# argument; callers using the &parse_email(...) form are unaffected.
sub parse_email {
    my $content = shift;
    $content = "" unless defined $content;

    my @all_emails = ();
    $content =~ s/(email|e-mail|e mail)(s)*(\s*\:\s*)*//gi;
    my @email_parts = split(/\@/, $content);
    if ($#email_parts < 2) {    # at most one @
        if ($content =~ /\,|;|\{|\}|\[|\]/) {
            # several people sharing the same domain part
            $content =~ s/\{|\}|\[|\]//g;
            my ($pre, $last) = split(/\@/, $content);
            $pre  = "" unless defined $pre;
            $last = "" unless defined $last;
            # The guard above accepts both "," and ";" as separators, so the
            # local parts have to be split on both of them as well.
            foreach my $author (split(/[,;]/, $pre)) {
                next unless $author =~ /\S/;    # skip blanks such as "a,,b"
                push @all_emails, $author . "\@" . $last;
            }
        } else {
            push @all_emails, $content;
        }
    } else {
        # More than one @: every piece is expected to be a complete address.
        # A single @ shared by several people is not handled in this branch.
        my @emails = ($content =~ /\,|\;/)
                   ? split(/\,|\;/, $content)
                   : split(/\s+/, $content);
        push @all_emails, @emails;
    }

    # Strip enclosing brackets/whitespace and drop entries that end up empty.
    my @cleaned = ();
    foreach my $email (@all_emails) {
        $email =~ s/^[\(\[\s\<]+//;
        $email =~ s/[\]\)\s\>]+$//;
        push @cleaned, $email if $email ne "";
    }
    return(\@cleaned);
}
326
+
327
# Pair authors with e-mail addresses by string similarity.
#
# For every (author, e-mail) combination the distance between the lower-cased
# author name and the lower-cased local part of the address is computed with
# adist() (not defined in this block; presumably String::Approx - confirm).
# Pairs are then accepted greedily from the smallest absolute distance upward,
# each author and each address being used at most once.
#
# Parameter: $adjust_hash - hash ref holding {cluster}{<id>}{author}{<id>}{name}
#            and {email}{<id>}.
# Returns:   the same hash ref; matched authors gain an {email} entry.
#
# The previous version stored the distance as a plain number and then used
# that number as a hash (..{rank}, ..{score}). That only "worked" through
# symbolic references and fails under "use strict 'refs'". The resulting score
# was twice the distance, so ordering candidates by the distance itself gives
# the same matching. Ties are now broken deterministically by cluster id,
# author id and address instead of by hash iteration order.
sub adjust_email {
    my $adjust_hash = shift;

    # every candidate is [distance, cluster id, author id, e-mail address]
    my @candidates = ();
    foreach my $cluster_id (sort {$a<=>$b} keys %{$$adjust_hash{cluster}}) {
        foreach my $author_id (sort {$a<=>$b} keys %{$$adjust_hash{cluster}{$cluster_id}{author}}) {
            my $name = $$adjust_hash{cluster}{$cluster_id}{author}{$author_id}{name};
            next if (!defined($name) || $name !~ /\w/);
            foreach my $email_id (sort {$a<=>$b} keys %{$$adjust_hash{email}}) {
                my $email = $$adjust_hash{email}{$email_id};
                next unless defined $email;
                my ($email_name) = split(/\@/, $email);
                next if (!defined($email_name) || $email_name !~ /\w/);
                my $dist = abs(adist(lc($email_name), lc($name)));
                push @candidates, [$dist, $cluster_id, $author_id, $email];
            }
        }
    }

    my %picked_author = ();
    my %picked_email  = ();
    my @ordered = sort {
           $$a[0] <=> $$b[0]
        || $$a[1] <=> $$b[1]
        || $$a[2] <=> $$b[2]
        || $$a[3] cmp $$b[3]
    } @candidates;

    foreach my $candidate (@ordered) {
        my ($dist, $cluster_id, $author_id, $email) = @{$candidate};
        my $name_id = "$cluster_id"."_"."$author_id";
        if ($picked_author{$name_id} || $picked_email{$email}) {next;}
        $picked_author{$name_id} = 1;
        $picked_email{$email}    = 1;
        $$adjust_hash{cluster}{$cluster_id}{author}{$author_id}{email} = $email;
    }

    return($adjust_hash);
}
393
+
394
# Distribute the collected affiliation/address sets of every author cluster
# over the authors of that cluster.
#
# Parameter: $H - hash ref with {cluster}{<id>}{author_num, add_cluster_num,
#            author, add_cluster}. A second argument supplied by the caller
#            is ignored, as before.
# Returns:   ($H, $uncertain); $uncertain is 1 when some cluster has several
#            authors and a different number (>1) of address sets, and e-mails
#            could not be used as separators between those sets.
#
# Strategy per cluster:
#   one author,  several sets  -> all sets are merged for that author
#   one author,  one set       -> the set is given to that author
#   n authors,   n sets        -> author i receives set i
#   n authors,   m != n sets   -> e-mail separators, then first/last heuristic
#   n authors,   one set       -> every author receives the set
#
# Fix: in the "n authors, n sets" case the combined sets used to be looked up
# with the id of the author cluster, so all authors were given the same set
# (or none at all). They are now looked up with the author id.
#
# The "&name(...)" call form is kept because the sibling subs are declared
# with empty prototypes.
sub adjust_addr {
    my $H = shift;

    # set when the address/affiliation assignment is questionable
    my $uncertain = 0;

    foreach my $cluster_id (sort {$a<=>$b} keys %{$$H{cluster}}) {
        my $cluster     = $$H{cluster}{$cluster_id};
        my $author_num  = $$cluster{author_num} || 0;
        my $address_num = $$cluster{add_cluster_num} || 0;

        if ($author_num == 1) {
            my ($affi, $addr) = ("", "");
            if ($address_num > 1) {
                # one author with several address sets: combine all of them
                ($affi, $addr) = &combine_all_addr_set($cluster);
            } elsif ($address_num == 1) {
                ($affi, $addr) = &combine_first_addr_set($cluster);
            }
            if ($affi ne "") {
                $$cluster{author}{1}{affi} = $affi;
            }
            if ($addr ne "") {
                $$cluster{author}{1}{addr} = $addr;
            }
        } elsif ($author_num > 1) {
            if ($address_num > 1) {
                if ($address_num == $author_num) {
                    # as many sets as authors: assign them one to one
                    my $addr_cluster = &combine_addr_set($cluster);
                    for my $author_id (1 .. $author_num) {
                        $$cluster{author}{$author_id}{affi} = $$addr_cluster{$author_id}{affi};
                        $$cluster{author}{$author_id}{addr} = $$addr_cluster{$author_id}{addr};
                    }
                } else {
                    # check whether e-mails act as separators between sets
                    my $consonence = &check_email_as_address_separator($$cluster{add_cluster});
                    if (($consonence eq "pre") || ($consonence eq "next")) {
                        $cluster = &adjust_cluster_by_email_separator($consonence, $cluster);
                        $$H{cluster}{$cluster_id} = $cluster;
                    } else {
                        $uncertain = 1;
                    }
                    # fallback: first author gets the first set, last author
                    # gets the last set (only where still unassigned)
                    $cluster = &assign_edge_address($cluster);
                    $$H{cluster}{$cluster_id} = $cluster;
                }
            } elsif ($address_num == 1) {
                my ($affi, $addr) = &combine_first_addr_set($cluster);
                for my $author_id (1 .. $author_num) {
                    $$cluster{author}{$author_id}{affi} = $affi;
                    $$cluster{author}{$author_id}{addr} = $addr;
                }
            }
        }
    }

    return($H, $uncertain);
}
462
+
463
# Merge the affiliation lines and the address lines of address set 1.
#
# Parameter: hash ref of one author cluster (uses {add_cluster}{1}).
# Returns:   ($affi, $addr) - the entries of each kind in numeric key order,
#            joined with "; " and without leading whitespace.
# Side effect: the {affi} and {addr} entries of address set 1 are deleted.
sub combine_first_addr_set() {
    my ($cluster) = @_;

    my %joined = ();
    for my $kind ('affi', 'addr') {
        my @ids = sort {$a <=> $b} keys %{$$cluster{'add_cluster'}{1}{$kind}};
        my $text = join("; ", map { $$cluster{'add_cluster'}{1}{$kind}{$_} } @ids);
        # same result as prefixing every entry with "; " and then removing
        # the leading separator together with the whitespace around it
        $text =~ s/^\s+//;
        $joined{$kind} = $text;
        delete($$cluster{'add_cluster'}{1}{$kind});
    }

    return($joined{'affi'}, $joined{'addr'});
}
482
+
483
# Merge affiliation and address lines separately for every address set.
#
# Parameter: hash ref of one author cluster (uses {add_cluster}{<set id>}).
# Returns:   reference to a hash {<set id>}{affi|addr} holding the entries of
#            each kind in numeric key order, joined with "; "; kinds whose
#            joined text is empty are left out.
# Side effect: {affi} and {addr} are deleted from every address set.
sub combine_addr_set() {
    my ($cluster) = @_;

    my %combined = ();

    for my $set_id (sort {$a <=> $b} keys %{$$cluster{'add_cluster'}}) {
        for my $kind ('affi', 'addr') {
            my @ids = sort {$a <=> $b} keys %{$$cluster{'add_cluster'}{$set_id}{$kind}};
            my $text = join("; ", map { $$cluster{'add_cluster'}{$set_id}{$kind}{$_} } @ids);
            # equivalent to removing a leading "; " separator and the
            # whitespace that follows it
            $text =~ s/^\s+//;
            $combined{$set_id}{$kind} = $text if ($text ne "");
            delete($$cluster{'add_cluster'}{$set_id}{$kind});
        }
    }

    return(\%combined);
}
513
+
514
+
515
# Merge the affiliation lines and the address lines of ALL address sets of a
# cluster (used when a single author has several address sets).
#
# Parameter: hash ref of one author cluster (uses {add_cluster}{<set id>}).
# Returns:   ($affi, $addr) - entries in numeric set/key order joined with
#            "; ", without leading whitespace.
# Side effect: {affi} and {addr} are deleted from every address set.
#
# Fix: the result used to start with a stray "; " separator because, unlike
# the sibling combine_* subs, the leading separator was never removed. The
# unused %add_cluster was dropped, and so was the empty prototype, since the
# sub takes an argument.
sub combine_all_addr_set {
    my $H = shift;

    my @affi_parts = ();
    my @addr_parts = ();

    foreach my $cluster_id (sort {$a <=> $b} keys %{$$H{'add_cluster'}}) {
        foreach my $affi_id (sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{affi}}) {
            push @affi_parts, $$H{'add_cluster'}{$cluster_id}{affi}{$affi_id};
        }
        foreach my $addr_id (sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{addr}}) {
            push @addr_parts, $$H{'add_cluster'}{$cluster_id}{addr}{$addr_id};
        }
        delete($$H{'add_cluster'}{$cluster_id}{affi});
        delete($$H{'add_cluster'}{$cluster_id}{addr});
    }

    my $affi = join("; ", @affi_parts);
    my $addr = join("; ", @addr_parts);
    # mirror the sibling subs, which also drop leading whitespace
    $affi =~ s/^\s+//;
    $addr =~ s/^\s+//;

    return($affi,$addr);
}
535
+
536
# Join the affiliation lines of one address set.
#
# Parameter: hash ref of an address set (uses {affi}{<line id>}).
# Returns:   the lines in numeric key order joined with "; ", without leading
#            whitespace; "" when there are none.
sub combine_affi() {
    my ($set) = @_;

    my @ids = sort {$a <=> $b} keys %{$$set{affi}};
    my $affi = join("; ", map { $$set{affi}{$_} } @ids);
    # equivalent to removing a leading "; " separator plus following blanks
    $affi =~ s/^\s+//;
    return($affi);
}
546
+
547
# Join the address lines of one address set.
#
# Parameter: hash ref of an address set (uses {addr}{<line id>}).
# Returns:   the lines in numeric key order joined with "; ", without leading
#            whitespace; "" when there are none.
sub combine_addr() {
    my ($set) = @_;

    my @ids = sort {$a <=> $b} keys %{$$set{addr}};
    my $addr = join("; ", map { $$set{addr}{$_} } @ids);
    # equivalent to removing a leading "; " separator plus following blanks
    $addr =~ s/^\s+//;

    return($addr);
}
558
+
559
# Decide whether e-mail lines separate the address sets of a cluster.
#
# Parameter: $H - hash ref {<address set id>} => { pre_email  => {<email> => n},
#                                                   next_email => {<email> => n} }
# Returns:   0       no address set carries a usable e-mail
#            "pre"   the first usable e-mail found precedes its address set
#            "next"  the first usable e-mail found follows its address set
#            -1      conflict state
#
# NOTE(review): with the conditions below (kept exactly as they were) the
# conflict state can never be entered starting from 0, "pre" or "next"; once
# a direction is chosen, e-mails of the other direction are ignored. Confirm
# whether that is intended.
#
# Fixes: the e-mail hashes were iterated with "sort %hash", which walks keys
# AND values, so a counter value such as 1 was mistaken for an e-mail. Values
# that are not hash references are now skipped instead of being dereferenced
# symbolically. The empty prototype was dropped because the sub takes an
# argument.
sub check_email_as_address_separator {
    my $H = shift;

    my $consonence = 0;    # 0 is null state; -1 is conflict
    return($consonence) unless (ref($H) eq 'HASH');

    foreach my $add_cluster_id (sort {$a<=>$b} keys %{$H}) {
        if ($consonence eq "-1") {last;}

        # "pre" is examined before "next", as in the original code
        foreach my $side ('pre', 'next') {
            my $other  = ($side eq 'pre') ? 'next' : 'pre';
            my $emails = $$H{$add_cluster_id}{$side.'_email'};
            next unless (ref($emails) eq 'HASH');

            foreach my $email_key (sort keys %{$emails}) {
                next unless ($email_key =~ /\w/);
                if (($consonence eq "0") || ($consonence eq $side)) {
                    $consonence = $side;
                } elsif ($consonence ne $other) {
                    $consonence = -1;
                }
                last;    # the first usable e-mail decides for this side
            }
        }
    } #end of checking emails as the address separators

    return($consonence);
}
593
+
594
+
595
+
596
# Give each author the address set that is linked to the author's e-mail.
#
# Parameters: $mode    - "pre" to use the e-mails stored under {pre_email} of
#                        each address set, "next" to use {next_email}; any
#                        other value leaves the cluster untouched.
#             $cluster - hash ref of one author cluster ({author}, {add_cluster}).
# Returns:    the same cluster hash ref. For every address set, each author
#             whose {email} equals one of the selected e-mails receives the
#             set's combined affiliation and address.
sub adjust_cluster_by_email_separator() {
    my ($mode, $cluster) = @_;

    my %slot_for = ('pre' => 'pre_email', 'next' => 'next_email');
    my $slot = $slot_for{$mode};
    return($cluster) unless defined $slot;

    foreach my $set_id (sort {$a<=>$b} keys %{$$cluster{add_cluster}}) {
        my $linked_emails = $$cluster{add_cluster}{$set_id}{$slot};
        # check each e-mail and assign the set to the matching author
        foreach my $linked_email (keys %{$linked_emails}) {
            foreach my $author_id (keys %{$$cluster{author}}) {
                next unless ($linked_email eq $$cluster{author}{$author_id}{email});
                # the lines of the set have to be combined first
                my $affi = &combine_affi($$cluster{add_cluster}{$set_id});
                my $addr = &combine_addr($$cluster{add_cluster}{$set_id});
                $$cluster{author}{$author_id}{affi} = $affi;
                $$cluster{author}{$author_id}{addr} = $addr;
            }
        }
    }

    return($cluster);
}
637
+
638
+
639
+ #assign first name first address and last name last address
640
# Fallback heuristic: the first author receives the first address set and the
# last author receives the last address set, but only when the author has
# neither an address nor an affiliation yet.
#
# Parameter: hash ref of one author cluster ({author}, {author_num},
#            {add_cluster}, {add_cluster_num}).
# Returns:   the same hash ref.
sub assign_edge_address () {
    my ($cluster) = @_;

    # Both edge authors are inspected before anything is written, so that an
    # assignment to the first author cannot influence the check for the last.
    my $first_addr = $$cluster{author}{1}{addr};
    my $first_affi = $$cluster{author}{1}{affi};
    my $last_addr  = $$cluster{author}{$$cluster{author_num}}{addr};
    my $last_affi  = $$cluster{author}{$$cluster{author_num}}{affi};

    unless (($first_addr ne "") || ($first_affi ne "")) {
        my $affi = &combine_affi($$cluster{add_cluster}{1});
        my $addr = &combine_addr($$cluster{add_cluster}{1});
        $$cluster{author}{1}{affi} = $affi;
        $$cluster{author}{1}{addr} = $addr;
    }

    unless (($last_addr ne "") || ($last_affi ne "")) {
        my $affi = &combine_affi($$cluster{add_cluster}{$$cluster{add_cluster_num}});
        my $addr = &combine_addr($$cluster{add_cluster}{$$cluster{add_cluster_num}});
        $$cluster{author}{$$cluster{author_num}}{affi} = $affi;
        $$cluster{author}{$$cluster{author_num}}{addr} = $addr;
    }

    return($cluster);
}
666
+
667
# Render the parsed header information as an XML fragment.
#
# Parameter: $parsed_hash - hash ref {<document id>} => { title, cluster,
#            keyword, abstract, abstractEnded, date }.
# Returns:   reference to a string: an <algorithm> element (name and version
#            taken from $algName / $algVersion, which are defined outside this
#            sub) containing title, authors, keywords, abstract and date of
#            every document, followed by <validHeader>1</validHeader> when a
#            non-empty title and at least one author entry were seen,
#            <validHeader>0</validHeader> otherwise.
#
# Fixes:
#  - The validHeader test used $did after the loop that declares it, so it
#    looked at an unrelated (normally undefined) key. Whether a title exists
#    is now recorded inside the loop.
#  - $titlelength and $authorcount were never used, and computing them
#    autovivified junk entries in the caller's hash; they were removed.
#  - The empty prototype was dropped because the sub takes an argument.
sub output_xml {
    my $parsed_hash  = shift;
    my $author_found = 0;
    my $title_found  = 0;

    my $l_algName = $algName;
    my $l_algVersion = $algVersion;
    cleanXML(\$l_algName);
    cleanXML(\$l_algVersion);

    my $str = "<algorithm name=\"$l_algName\" version=\"$l_algVersion\">\n";

    foreach my $did (sort {$a <=> $b} keys %{$parsed_hash}) {
        my $title = $$parsed_hash{$did}{title};
        if (defined($title) && length($title) > 0) {
            $title_found = 1;
        }
        $title = repairPunctuation($title);
        cleanAll(\$title);

        $str.="<title>$title</title>\n";
        $str.="<authors>\n";
        foreach my $cluster_id (sort {$a <=> $b} keys %{$$parsed_hash{$did}{cluster}}) {
            if ($cluster_id =~ /\d+/) {
                foreach my $author_id ( sort {$a <=> $b} keys %{$$parsed_hash{$did}{cluster}{$cluster_id}{author}}) {
                    $author_found = 1;
                    my $author = $$parsed_hash{$did}{cluster}{$cluster_id}{author}{$author_id};

                    my $name = $$author{name};
                    cleanAll(\$name);
                    $name = normalizeName($name);

                    my $affi = $$author{affi};
                    $affi = repairPunctuation($affi);
                    cleanAll(\$affi);

                    my $addr = $$author{addr};
                    $addr = repairPunctuation($addr);
                    cleanAll(\$addr);

                    my $email = $$author{email};
                    cleanAll(\$email);

                    # authors without a usable name are not written at all
                    if ($name =~ /\w/) {
                        $str.="<author>\n";
                        $str.="<name>$name</name>\n";
                        if ($affi =~ /\w/) {
                            $str.="<affiliation>$affi</affiliation>\n";
                        }
                        if ($addr =~ /\w/) {
                            $str.="<address>$addr</address>\n";
                        }
                        if ($email =~ /\w/) {
                            $str.="<email>$email</email>\n";
                        }
                        $str.="</author>\n";
                    }
                }
            }
        }
        $str.="</authors>\n";

        my $keywords = $$parsed_hash{$did}{keyword};
        if ($keywords =~ /\w/) {
            $keywords = repairPunctuation($keywords);
            my @keywords = normalizeKeywords($keywords);
            $str .= "<keywords>\n";
            foreach my $keyword (@keywords) {
                cleanAll(\$keyword);
                $str .= "<keyword>$keyword</keyword>\n";
            }
            $str .= "</keywords>\n";
        }

        my $abstract = $$parsed_hash{$did}{abstract};
        if ($abstract =~ /\w/) {
            $abstract = repairPunctuation($abstract);
            $abstract = normalizeAbstract($abstract, $$parsed_hash{$did}{abstractEnded});
            cleanAll(\$abstract);
            $str .= "<abstract>$abstract</abstract>\n";
        }

        my $date = $$parsed_hash{$did}{date};
        if ($date =~ /\d/) {
            $date = repairPunctuation($date);
            cleanAll(\$date);
            $date = normalizeDate($date);
            if (defined $date) {
                $str .= "<date>$date</date>\n";
            }
        }
    }

    my $validHeader;

    if ($title_found && $author_found) {
        $validHeader = "<validHeader>1</validHeader>";
    }
    else {
        $validHeader = "<validHeader>0</validHeader>";
    }

    $str.="$validHeader\n";
    $str.="</algorithm>\n";

    return \$str;
}
771
+
772
+
773
# Normalise an author name: collapse runs of whitespace to single blanks and
# drop every stand-alone "and" token (matched case-insensitively).
#
# Parameter: the raw name.
# Returns:   the remaining tokens joined with single spaces.
sub normalizeName {
    my ($name) = @_;
    my @kept = grep { $_ !~ m/^and$/i } split(" ", $name);
    return join " ", @kept;
}
785
+
786
+
787
# Split a keyword line into individual keywords.
#
# Parameter: the raw keyword text.
# Returns:   list of tokens obtained by splitting on ":", ";" or "," (with
#            optional surrounding whitespace), each passed through
#            trimPunctuation(). When the first token contains "keyword" or
#            "keyphrase" (any case) it is treated as a label and left out.
sub normalizeKeywords {
    my ($text) = @_;
    my @tokens = map { scalar(trimPunctuation($_)) } split('\s*[\:\;\,]\s*', $text);
    if ($tokens[0] =~ m/keyword|keyphrase/i) {
        # the leading token is only the "Keywords" label
        return @tokens[1..$#tokens];
    }
    return @tokens;
}
798
+
799
# Turn the raw abstract lines into a single paragraph.
#
# Parameters: $text          - abstract text, one source line per "\n".
#             $abstractEnded - > 0 when the end of the abstract was detected
#                              upstream; otherwise the text is truncated
#                              heuristically (see below).
# Returns:    the joined abstract; "" when there are no lines.
#
# Truncation when the end is unknown: keep lines up to the first one that
# ends a sentence (trailing ".") once at least 5 lines were taken, and never
# keep more than 15 lines.
#
# Fixes:
#  - The sentence-end test read an undeclared $line instead of the current
#    line, so it never matched.
#  - The loop stopped one line early, which always dropped the last line and
#    produced an empty result for a one-line abstract.
sub normalizeAbstract {
    my ($text, $abstractEnded) = @_;
    return "" unless defined $text;

    my @lines = split '\n', $text;
    if ($#lines < 0) {
        return "";
    }

    if (($abstractEnded || 0) <= 0) {
        my $minLines = 5;
        my $maxLines = 15;
        my $lineCount = 0;
        for my $i (0 .. $#lines) {
            $lineCount++;
            if (($lineCount >= $minLines) && $lines[$i] =~ m/\.\s*$/) {
                last;
            }
            if ($lineCount >= $maxLines) {
                last;
            }
        }
        @lines = @lines[0..($lineCount-1)];
    }

    my $abstract = "";
    foreach my $line (@lines) {
        # skip section headings and blank lines
        if ($line =~ m/\b(?:Abstract|ABSTRACT|abstract|Introduction|INTRODUCTION)\:?\s*$/ ||
            $line =~ m/^\s*$/) {
            next;
        }
        # no blank after a trailing hyphen or at the very beginning
        if ($abstract =~ m/\-$/ || $abstract =~ m/^\s*$/s) {
            $abstract .= $line;
        } else {
            $abstract .= " $line";
        }
    }
    return $abstract;
}
837
+
838
# Extract a publication year from a date string.
#
# Parameter: the raw date text.
# Returns:   the first stand-alone four-digit number, provided it is not more
#            than three years past the current local year; undef otherwise.
sub normalizeDate {
    my ($date) = @_;

    return undef unless ($date =~ m/(\b\d{4}\b)/);
    my $year = $1;

    my $currentYear = (localtime(time))[5] + 1900;
    return ($year <= $currentYear+3) ? $year : undef;
}
850
+
851
# Remove a trailing run of punctuation (with the whitespace after it) and a
# leading run of punctuation (with the whitespace before it) from a string.
#
# Parameter: the text to trim.
# Returns:   the trimmed copy; the argument itself is not modified.
sub trimPunctuation {
    my ($text) = @_;
    for ($text) {
        # trailing punctuation first, then leading punctuation
        s/[\.\,\<\>\?\/\:\;\"\'\{\[\}\]\+\=\_\-\(\)\*\&\^\%\$\#\@\!\~\`\\\|]+\s*$//;
        s/^\s*[\.\,\<\>\?\/\:\;\"\'\{\[\}\]\+\=\_\-\(\)\*\&\^\%\$\#\@\!\~\`\\\|]+//;
    }
    return $text;
}
857
+
858
+
859
# Strip leading and trailing whitespace.
#
# Parameter: the string to clean.
# Returns:   the trimmed copy; the argument itself is not modified.
sub string_clean() {
    my ($str) = @_;
    for ($str) {
        s/^\s+//g;
        s/\s+$//g;
    }

    return($str);
}
866
+
867
+
868
# Constructor: builds the lookup data for the ISO Latin-1 character entities.
#
# Parameter: class name to bless into.
# Returns:   blessed hash ref with
#              XMLindent  - indentation string
#              hashentity - { '&name;' => code point } for code points 160..255
#              hashstr    - alternation "name;|name;|...;" for use in a regex
#
# The position of a name in @upperentities determines its code point
# (index + 160).
#
# Fixes:
#  - The "173" placeholder (soft hyphen, code point 173) sits at index 13.
#    The replacement by '#173' was applied to index 12, which discarded "not"
#    and mapped '&#173;' to 172.
#  - "supl" was a typo for the entity "sup1" (code point 185).
sub new
{
    my $classname = shift;

    my $self = { XMLindent => ' ' };

    my @upperentities = qw (nbsp iexcl cent pound curren yen brvbar sect
                            uml copy ordf laquo not 173 reg macr deg plusmn
                            sup2 sup3 acute micro para middot cedil sup1
                            ordm raquo frac14 half frac34 iquest Agrave
                            Aacute Acirc Atilde Auml Aring AElig Ccedil
                            Egrave Eacute Ecirc Euml Igrave Iacute Icirc
                            Iuml ETH Ntilde Ograve Oacute Ocirc Otilde Ouml
                            times Oslash Ugrave Uacute Ucirc Uuml Yacute
                            THORN szlig agrave aacute acirc atilde auml
                            aring aelig ccedil egrave eacute ecirc euml
                            igrave iacute icirc iuml eth ntilde ograve
                            oacute ocirc otilde ouml divide oslash ugrave
                            uacute ucirc uuml yacute thorn yuml);
    # code point 173 is represented by its numeric form instead of a name
    $upperentities[13] = '#173';

    $self->{'hashentity'} = {};
    for ( my $i=0; $i<=$#upperentities; $i++ )
    {
        my $key = '&'.$upperentities[$i].';';
        $self->{'hashentity'}->{$key}=$i+160;
    }

    $self->{'hashstr'} = (join (';|', @upperentities)).';';

    bless $self, $classname;
    return $self;
}
901
+
902
# Build the lookup data for the ISO Latin-1 character entities.
#
# Takes no arguments.
# Returns: hash ref with
#            XMLindent  - indentation string
#            hashentity - { '&name;' => code point } for code points 160..255
#            hashstr    - alternation "name;|name;|...;" for use in a regex
#
# The position of a name in @upperentities determines its code point
# (index + 160).
#
# Fixes:
#  - The "173" placeholder (soft hyphen, code point 173) sits at index 13.
#    The replacement by '#173' was applied to index 12, which discarded "not"
#    and mapped '&#173;' to 172.
#  - "supl" was a typo for the entity "sup1" (code point 185).
sub char_converter()
{

    my $H = { XMLindent => ' ' };

    my @upperentities = qw (nbsp iexcl cent pound curren yen brvbar sect
                            uml copy ordf laquo not 173 reg macr deg plusmn
                            sup2 sup3 acute micro para middot cedil sup1
                            ordm raquo frac14 half frac34 iquest Agrave
                            Aacute Acirc Atilde Auml Aring AElig Ccedil
                            Egrave Eacute Ecirc Euml Igrave Iacute Icirc
                            Iuml ETH Ntilde Ograve Oacute Ocirc Otilde Ouml
                            times Oslash Ugrave Uacute Ucirc Uuml Yacute
                            THORN szlig agrave aacute acirc atilde auml
                            aring aelig ccedil egrave eacute ecirc euml
                            igrave iacute icirc iuml eth ntilde ograve
                            oacute ocirc otilde ouml divide oslash ugrave
                            uacute ucirc uuml yacute thorn yuml);
    # code point 173 is represented by its numeric form instead of a name
    $upperentities[13] = '#173';

    $H->{'hashentity'} = {};
    for ( my $i=0; $i<=$#upperentities; $i++ )
    {
        my $key = '&'.$upperentities[$i].';';
        $H->{'hashentity'}->{$key}=$i+160;
    }

    $H->{'hashstr'} = (join (';|', @upperentities)).';';

    return $H;
}
933
+
934
# Tidy the spacing around punctuation: a whitespace character directly in
# front of one of . , ; ] ) : } ! ? > - is removed, and leading and trailing
# whitespace is stripped.
#
# Parameter: the text to repair.
# Returns:   the repaired copy; the argument itself is not modified.
sub repairPunctuation {
    my ($text) = @_;
    for ($text) {
        s/ / /gs;
        s/\s([\.\,\;\]\)\:\}\!\?\>\-])/$1/gs;
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
942
+
943
+ # clean XML version two - for single-line streams
944
# Make a single-line text safe for inclusion in XML.
#
# Parameter: the text; undef is passed through.
# Returns:   the cleaned text. Steps, in order:
#   1. named ISO Latin-1 entities (&nbsp; ...) -> hexadecimal character
#      references, using the table returned by char_converter()
#   2. ampersands that do not start a character reference or one of the five
#      predefined XML entities -> &amp;
#   3. bytes 0xA0-0xFF -> decimal character references
#   4. control characters and bytes 0x80-0x9F are removed
#   5. "<" and ">" -> &lt; and &gt;
#   6. whitespace runs -> one blank; leading/trailing blanks removed
#
# Fixes:
#  - $H was an undeclared package variable; it is now lexical.
#  - Step 2 accepted only decimal digits after "&#x", so every reference
#    produced by step 1 (all of them contain A-F) was re-escaped to
#    "&amp;#x...;". Hexadecimal digits are now accepted, and at least one
#    digit is required.
#  - The final trim only fired when the text had blanks on BOTH ends.
sub lclean
{
    my $t = shift;
    return undef if (! defined $t );

    my $H = &char_converter();
    my $entity_names = $H->{'hashstr'};
    # make ISOlat1 entities into Unicode character entities
    $t =~ s/&($entity_names)/sprintf ("&#x%04X;", $H->{'hashentity'}->{"&$1"})/ge;
    # escape non-XML-encoded ampersands (including from other characters sets)
    $t =~ s/&(?!(?:\#[0-9]+|\#x[0-9A-Fa-f]+|amp|lt|gt|apos|quot);)/&amp;/g;
    # convert extended ascii into Unicode character entities
    $t =~ s/([\xa0-\xff])/'&#'.ord ($1).';'/ge;
    # remove extended ascii that doesnt translate into ISO8859/1
    $t =~ s/[\x00-\x08\x0B\x0C\x0E-\x1f\x80-\x9f]//g;
    # make tags delimiters into entities
    $t =~ s/</&lt;/g;
    $t =~ s/>/&gt;/g;
    # flatten whitespace
    $t =~ s/[\s\t\r\n]+/ /g;
    # kill leading and terminating spaces (each side independently)
    $t =~ s/^[ ]+//;
    $t =~ s/[ ]+$//;
    return $t;
}
967
+
968
+ 1;