biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,537 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::NamePatternMatch;
14
+
15
+ use utf8;
16
+
17
+ #this is wrapper version, with <<>> as the separator
18
+
19
+ #11/26-11/29
20
+ #Function 1:
21
+ #use model match method for the multi-author lines separated only by space to get all the legal patterns for the line;
22
+
23
+ #input: a line from file [MultiAuthorLines.space.processed]
24
+ #for example:
25
+ #Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns
26
+
27
+
28
+ #Function 2:
29
+ #FeatureRepresentation, given the predicted name sequence
30
+ #like\: Chungki Lee<>James E. Burns<>
31
+
32
+
33
+ #Idea: uses a recursive function, which might be improved by dynamic programming later
34
# Predicts the author names contained in one multi-author header line whose
# names are separated only by spaces.
#
# Argument:
#   $line - the line to analyse; may contain the wrapper markers <<sep>>,
#           <</sep>> and <space>.
# Returns the scalar produced by printNameArray for the candidate
# segmentations found by SeekSep.
#
# The empty prototype is kept so the declaration stays unchanged; calls written
# with a leading & (as used throughout this file) bypass it.
sub NamePatternMatch() {
    my $line = shift;

    # print "NAME LINE: $line\n";

    # Turn the wrapper markers into plain blanks.
    $line =~ s/(<<sep>>)|(<<\/sep>>)|(<space>)/ /g;

    # Remove isolated punctuation or digits and ignore the lower-case words,
    # because the output is the extracted names. The word index would have to
    # be kept to compute the final performance -- problem still to be solved.
    $line =~ s/\s+[^\p{IsUpper}]+(\s+|$)/ /g;

    $line =~ s/\+L/ /g;
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    my @authors = split(/\s+/, $line);

    # Step 1: type string for every short span of words.
    # NOTE(review): $AuthSeqMat is a package variable in the original code; it
    # is left as one here in case code outside this sub reads it -- confirm and
    # make it lexical if nothing does.
    $AuthSeqMat = &FillSequenceMatrix(\@authors);

    # Step 2: recursive search for the positions where a new name starts.
    # (The separator matrix the original built here via InitializeSepMat was
    # never read, so it is no longer computed.)
    my ($separator, $sequenceArr, $separatorArr) = &SeekSep(0, $#authors, $AuthSeqMat);

    my $PredictedNames = &printNameArray($sequenceArr, $separatorArr, \@authors, 0);
    return ($PredictedNames);
}
62
+
63
+ #this is the recursive function of getting the separators.
64
+ # $SeparatorMat might not be useful
65
# Private helper of SeekSep: lists the segmentations one half of a split
# contributes, each as a pair [ \@typeStrings, \@separators ].
# An unsplittable half contributes only its whole-span type string and no
# separators; a splittable half contributes every segmentation it returned.
sub _SeekSepCandidates {
    my ($separable, $sequenceArr, $separatorArr) = @_;

    return ([ [ $$sequenceArr[0][0] ], [] ]) unless $separable;

    my @candidates = ();
    for my $n (0 .. $#$sequenceArr) {
        # A segmentation without a recorded separator list gets an empty one.
        push @candidates, [ $$sequenceArr[$n], $$separatorArr[$n] || [] ];
    }
    return @candidates;
}

# Recursively enumerates the ways the words $i .. $j of an author line can be
# cut into names.
#
# Arguments:
#   $i, $j       - index of the first and last word of the span (inclusive)
#   $SequenceMat - reference to the matrix built by FillSequenceMatrix; cell
#                  [$a][$b] holds the type string of the words $a .. $b
# Returns a three-element list:
#   $separator          - 1 if at least one valid split of the span was found,
#                         otherwise 0
#   \@FinalsequenceArr  - the candidate segmentations, each an array reference
#                         of type strings (one string per name)
#   \@FinalseparatorArr - for every segmentation produced by a split, the array
#                         reference of word indices at which a new name starts
#
# Fix compared with the original: when the left half was itself split, the
# separators were appended to an undeclared package array @SeparatorArr (capital
# S) while the lexical @separatorArr that was pushed stayed empty, so those
# separator lists were lost. @SeparatorArr was also never reset, so it kept
# growing across iterations and recursive calls. All working arrays are now
# lexical and rebuilt for every candidate.
#
# NOTE(review): the whole-span candidate of a span of up to four words is pushed
# without a matching separator entry, so the two returned arrays can differ in
# length. This is unchanged from the original -- confirm how printNameArray
# pairs them.
sub SeekSep() {
    my ($i, $j, $SequenceMat) = @_;
    my $separator         = 0;
    my @FinalseparatorArr = ();
    my @FinalsequenceArr  = ();

    # A span of at most three words is never split; a span of four words
    # (e.g. "Paulo A. R. Lorenzo") is additionally kept whole as a candidate.
    if ($i + 3 >= $j) {
        if (&Duplicate([$$SequenceMat[$i][$j]], \@FinalsequenceArr) eq "-1") {
            push @FinalsequenceArr, [$$SequenceMat[$i][$j]];
        }
    }

    # Try every cut position $k that leaves at least two words on the left and
    # one on the right. The range is empty for spans of three words or fewer.
    for my $k ($i + 2 .. $j - 1) {
        my ($leftSeparator, $leftSequenceArr, $leftSeparatorArr) =
            &SeekSep($i, $k - 1, $SequenceMat);
        my ($rightSeparator, $rightSequenceArr, $rightSeparatorArr) =
            &SeekSep($k, $j, $SequenceMat);

        # A half that cannot be split further must itself be a valid name.
        next if !$leftSeparator  && !&valid($$leftSequenceArr[0][0]);
        next if !$rightSeparator && !&valid($$rightSequenceArr[0][0]);

        $separator = 1;    # both halves are usable for this cut

        my @leftCandidates =
            _SeekSepCandidates($leftSeparator, $leftSequenceArr, $leftSeparatorArr);
        my @rightCandidates =
            _SeekSepCandidates($rightSeparator, $rightSequenceArr, $rightSeparatorArr);

        # Combine every left segmentation with every right one around $k.
        for my $left (@leftCandidates) {
            for my $right (@rightCandidates) {
                my @sequenceArr  = (@{ $left->[0] }, @{ $right->[0] });
                my @separatorArr = (@{ $left->[1] }, "$k", @{ $right->[1] });

                if (&Duplicate(\@sequenceArr, \@FinalsequenceArr) eq "-1") {
                    push @FinalsequenceArr,  [@sequenceArr];
                    push @FinalseparatorArr, [@separatorArr];
                }
            }
        }
    }

    return ($separator, \@FinalsequenceArr, \@FinalseparatorArr);
}
197
+
198
+ # this is to initialize the separator matrix
199
sub InitializeSepMat() {
    # Build the initial separator matrix for rows 0 .. $size and return a
    # reference to it.
    my $size = shift;
    my @matrix;

    foreach my $row (0 .. $size) {
        # Cells left of the diagonal are flagged "-2" (error).
        $matrix[$row][$_] = "-2" foreach 0 .. $row - 1;
        # The diagonal itself is "-1" (no separator).
        $matrix[$row][$row] = "-1";
    }

    # Two cells just above the diagonal are also preset to "no separator":
    # the first one, and the last one when the matrix has more than two rows.
    $matrix[0][1] = "-1";
    $matrix[$size - 1][$size] = "-1" if ($size - 1) > 0;

    return \@matrix;
}
215
+
216
# Build the sequence matrix for a tokenized author string.
#
# $authors is a reference to an array of name tokens. Cell [$i][$i] holds the
# type code NameType() returns for token $i. Cell [$i][$j] with $j > $i holds
# the concatenated codes of tokens $i .. $j and is only filled while that
# string stays shorter than 6 characters.
#
# Returns a reference to the matrix.
sub FillSequenceMatrix() {
    my $authors = shift;
    my @SequenceMat = ();

    # Step 1: classify every single token on the diagonal.
    for my $i (0 .. $#$authors) {
        $SequenceMat[$i][$i] = &NameType($$authors[$i]);
    }

    # Cap the span length (marked "needs adjust" by the original author).
    # The test must look at the referenced token list ($#$authors is its last
    # index, i.e. token count - 1). Testing $#authors instead consults an
    # unrelated array named @authors, so the cap would never apply.
    my $maxStep = ($#$authors > 4) ? 3 : $#$authors;

    # Step 2: extend each span by one token at a time.
    for my $step (1 .. $maxStep) {
        for my $i (0 .. $#$authors - $step) {
            my $prefix = $SequenceMat[$i][$i + $step - 1];

            # A missing prefix means the shorter span was already rejected;
            # extending it would yield a bogus one-token pattern.
            next unless defined $prefix;

            my $tmpStr = $prefix . $SequenceMat[$i + $step][$i + $step];
            if (length($tmpStr) < 6) {
                $SequenceMat[$i][$i + $step] = $tmpStr;
            }
        }
    }

    return(\@SequenceMat);
}
245
+
246
sub NameType() {
    # Classify one name token and return its one-letter type code:
    #   "I"  a single uppercase letter, optionally followed by dots
    #   "F"  longer than one character and starting with an uppercase letter
    #   "s"  longer than one character and starting with a lowercase letter
    #   "o"  anything else (also reported on STDERR)
    my $name = shift;

    return "I" if $name =~ /^[\p{IsUpper}](\.)*$/;

    if (length($name) > 1) {
        return "F" if $name =~ /^[\p{IsUpper}]+/;
        return "s" if $name =~ /^[\p{IsLower}]+/;
    }

    print STDERR "odd $name \n";
    return "o";
}
262
+
263
+
264
sub RichNameType() {
    # Classify one name token after trimming surrounding whitespace.
    # Returns:
    #   "I"   only uppercase letters, optionally followed by dots
    #   "F-"  starts uppercase, longer than one character, contains a
    #         hyphen between two word characters
    #   "F"   starts uppercase, longer than one character, no such hyphen
    #   "s"   starts lowercase, longer than one character
    #   "o"   anything else (also reported on STDERR)
    my $name = shift;

    $name =~ s/^\s+//g;
    $name =~ s/\s+$//g;

    return "I" if $name =~ /^[\p{IsUpper}]+(\.)*$/;

    if (length($name) > 1) {
        if ($name =~ /^[\p{IsUpper}]+/) {
            return ($name =~ /\w\-\w/) ? "F-" : "F";
        }
        return "s" if $name =~ /^[\p{IsLower}]+/;
    }

    print STDERR "odd $name \n";
    return "o";
}
287
+
288
+
289
# Obsolete: dump a 2D matrix, one row per line, to a filehandle.
#
#   $arr       reference to an array of row array references
#   $OutFH     handle to write to; when omitted, the package handle named
#              OutFH is used
#   $TrueIndex optional row index to label "+1 " instead of "-1"; with no
#              index given, every row is labelled "-1"
#
# Each row is written as its label followed by "<value><>" for every true
# cell and "0" for every false or missing cell, then a newline.
sub printMatrix() { # obsolete
    my ($arr, $OutFH, $TrueIndex) = @_;

    # Write to the handle the caller passed in, falling back to the
    # package-level OutFH handle when none is given.
    my $fh = defined $OutFH ? $OutFH : \*OutFH;

    for my $i (0 .. $#$arr) {
        my $line = (defined $TrueIndex && $i eq $TrueIndex) ? "+1 " : "-1";

        for my $j (0 .. $#{$$arr[$i]}) {
            $line .= $$arr[$i][$j] ? "$$arr[$i][$j]<>" : "0";
        }

        print {$fh} "$line\n";
    }

    return;
}
310
+
311
+
312
sub RemoveDuplicates(){ # remove duplicates(obsolete)
    # Return a reference to the list of row indices of $arr whose contents
    # have not appeared in an earlier row (the first occurrence wins).
    my $arr = shift;
    my %seen;
    my @index;

    for my $i (0 .. $#$arr) {
        # Serialize the row: true cells as "<value><>", everything else "<>".
        my $key = join "",
            map { ($$arr[$i][$_] ? "$$arr[$i][$_]" : "") . "<>" }
            0 .. $#{$$arr[$i]};

        next if $seen{"$key"};

        # Store a constant 1 rather than $i, so that row 0 still tests true.
        $seen{"$key"} = 1;
        push @index, $i;
    }

    return \@index;
}
333
+
334
+
335
sub Duplicate() { # see if a name array is already in a 2D name matrix
    # Returns the index of the first row of $arr that serializes to the same
    # string as $matchArr, or "-1" when no row matches. A plain truth test on
    # the result is not enough, because a hit on row 0 returns 0.
    my $matchArr = shift;
    my $arr = shift;

    my $matchStr = join "", map { "$$matchArr[$_]<>" } 0 .. $#{$matchArr};

    for my $i (0 .. $#$arr) {
        # Only true cells of the candidate row take part in the comparison.
        my $tmpstr = join "",
            map  { "$$arr[$i][$_]<>" }
            grep { $$arr[$i][$_] }
            0 .. $#{$$arr[$i]};

        return $i if $tmpstr eq $matchStr;
    }

    return "-1";
}
361
+
362
+
363
+ #Prints the pattern (FFF, FI..), separator and corresponding names
364
+ # if $PrintPattern = 1; and $print = 1;
365
+ # Open question: if $separator is 0, should the whole sequence be returned as a single name?
365
+ # That case has not been considered yet.
367
+
368
# Rebuild the author names of every candidate segmentation.
#
#   $NameArr       reference to an array of candidate pattern rows
#   $SeparatorArr  reference to an array holding, per candidate, the word
#                  positions at which a new name starts
#   $OriginalName  reference to the array of original name words
#   $PrintPattern  when true, dump each candidate's pattern row and
#                  separators to STDERR
#
# Returns a reference to a 2D array: element [$i][$n] is the $n-th name of
# candidate $i, its words joined by single spaces.
sub printNameArray() {
    my $NameArr      = shift;
    my $SeparatorArr = shift;
    my $OriginalName = shift;
    my $PrintPattern = shift;
    my @nameToReturn = ();

    my $print = 0;    # debug switch: also echo the rebuilt names to STDERR

    for my $i (0 .. $#$NameArr) {
        if ($PrintPattern) {
            print STDERR "$i\-\- ";
            for my $j (0 .. $#{$$NameArr[$i]}) {
                if ($$NameArr[$i][$j]) {
                    print STDERR "$$NameArr[$i][$j] ";
                }else {
                    print STDERR "0 ";
                }
            }
            print STDERR "Separator \[";
            for my $j (0 .. $#{$$SeparatorArr[$i] || []}) {
                print STDERR "$$SeparatorArr[$i][$j] ";
            }
            print STDERR "\]\n";
        }

        # Every separator closes the name that ends just before it. One
        # extra boundary past the last word closes the final name, so a
        # single loop handles all segments.
        my @boundaries = (@{ $$SeparatorArr[$i] || [] }, $#$OriginalName + 1);
        my $start = 0;

        for my $n (0 .. $#boundaries) {
            my $end = $boundaries[$n] - 1;

            # Lexical on purpose: the name buffer must not leak into a
            # package variable.
            my $tmpName = join "", map { "$$OriginalName[$_] " } $start .. $end;

            if ($print) {
                print STDERR $tmpName, ($n < $#boundaries ? "<>" : "\n");
            }

            $tmpName =~ s/\s+$//g;
            $nameToReturn[$i][$n] = $tmpName;
            $start = $end + 1;
        }
    }

    return(\@nameToReturn);
}
436
+
437
# Obsolete: dump every distinct candidate segmentation to STDERR.
#
# Takes the same $NameArr, $SeparatorArr and $OriginalName references as
# printNameArray(). Rows of $NameArr that repeat an earlier row are skipped
# (see RemoveDuplicates()). For each remaining row the pattern, its
# separators and the original words grouped by those separators are printed.
# Nothing is returned; the original computed an "uncertainty" flag (more than
# one distinct row) but never returned it.
sub printNameArray2() { #obsolete; which remove the duplicates before printing
    my $NameArr = shift;
    my $SeparatorArr = shift;
    my $OriginalName = shift;

    my $UniqueIndex = &RemoveDuplicates($NameArr);

    for my $u (@$UniqueIndex) {
        # The row index goes to STDERR like the rest of the line, so the
        # dump is not split across two streams.
        print STDERR "$u\-\- ";
        for my $j (0 .. $#{$$NameArr[$u]}) {
            if ($$NameArr[$u][$j]) {
                print STDERR "$$NameArr[$u][$j] ";
            }else {
                print STDERR "0 ";
            }
        }
        print STDERR "Separator \[";
        for my $j (0 .. $#{$$SeparatorArr[$u]}) {
            print STDERR "$$SeparatorArr[$u][$j] ";
        }
        print STDERR "\]\n";

        # Print the original words, with "<>" after each name that a
        # separator closes.
        my $start = 0;
        print STDERR "Original name\:";
        for my $j (0 .. $#{$$SeparatorArr[$u]}) {
            my $end = $$SeparatorArr[$u][$j] - 1;
            for my $k ($start .. $end) {
                print STDERR "$$OriginalName[$k] ";
            }
            print STDERR "<>";
            $start = $end + 1;
        }
        # Remaining words after the last separator form the final name.
        for my $k ($start .. $#$OriginalName) {
            print STDERR "$$OriginalName[$k] ";
        }
        print STDERR "\n";
    }

    return;
}
483
+
484
sub valid() {
    # Return 1 when $str is one of the accepted name patterns, 0 otherwise.
    # Pattern letters are the type codes used elsewhere in this file
    # (F = full name, I = initial, s = lowercase word).
    my $str = shift;

    my %ValidPattern = map { $_ => 1 } (
        "FF",       # [FullName] [FullName]
        "FFF",      # [FullName] [FullName] [FullName]
        "FIF",      # FIF and derivatives: [FullName] [NameInitial]{1,3} [FullName]
        "FIIF",     # Paulo A. R. Lorenzo
        "FIIIF",
        "IF",       # IF and derivatives: [NameInitial]{1,2} [FullName]
        "IIF",
        "IFF",      # E. Christopher Lewis
        "FssF",     # Th.P. van der Weide
    );

    return $ValidPattern{$str} ? 1 : 0;
}
504
+
505
+ # this is to present the macro features of the predicted names see
506
+ # if this is correct prediction;
507
+ # make it ready for training/testing or not?
508
+
509
sub NamePatternFeatureRepresent() {
    # Placeholder: no feature representation is computed; always returns 1.
    my $result = 1;
    return $result;
}
514
+
515
+ #Input format: Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns
516
sub GetTrueName() {
    # Split a tagged author string on <<sep>>...<</sep>> markers and clean
    # each piece. Returns a reference to the array of cleaned author names.
    my $in = shift;
    my @authors = split(/<<sep>>[^(<<)(>>)]*<<\/sep>>/, $in);

    for (@authors) {
        # Turn the <space> markers back into blanks.
        s/(<space>)/ /g;

        # Collapse whitespace-delimited runs that contain no uppercase
        # letter (isolated punctuation, digits, stray lowercase letters).
        # Side effect: "Th.P. van der Weide" is reduced to "Th.P. Weide".
        s/\s+[^\p{IsUpper}]+(\s+|$)/ /g;

        s/\+L/ /g;

        # Trim both ends.
        s/^\s+//g;
        s/\s+$//g;
    }

    return \@authors;
}
535
+
536
+
537
+ 1;
@@ -0,0 +1,68 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ #!/usr/bin/perl -w
14
+ package HeaderParse::API::Parser;
15
+ use strict;
16
+ use utf8;
17
+ use HeaderParse::API::ParserMethods;
18
+ use IO::Handle;
19
+ use HeaderParse::Config::API_Config;
20
+ use vars qw($ServerURL $repositoryLocation $algVersion);
21
+
22
+
23
# Read a document's text from disk, extract its header and parse it.
#
#   $fileID  path of the text file to read (used as the file name as-is)
#   $jobID   passed through unchanged to ParserMethods::Parse
#
# Returns a three-element list ($status, $message, $response):
#   (1, "", $response)   on success
#   (0, $message, undef) when the file is missing or cannot be opened, when
#                        header extraction reports a fault, or when Parse
#                        returns "0" (reported as a timeout)
sub _parseHeader{
    my ($fileID, $jobID) = @_;
    my ($header, $faultMessage, $rResponse, $papertext);

    my $status = 1;
    my $msg = "";

    # my $file = "$repositoryLocation/$fileID";
    # print "file: $file\n";
    my $file = $fileID;

    if (! -e $file) {
        return fatal("File does not exist: $file");
    }

    # Lexical handle: a bareword handle is a package global and could be
    # clobbered by any other code using the same name.
    open(my $in, "<:utf8", $file) or
        return fatal("Could not open file: $file");
    {
        local $/ = undef;    # slurp the whole file in one read
        $papertext = <$in>;
    }
    close $in;

    ($faultMessage, $header) =
        &HeaderParse::API::ParserMethods::ExtractHeaderInformation(\$papertext);

    if (defined($faultMessage) && length($faultMessage)) {
        # error occurred while extracting the header
        return fatal("file $file: $faultMessage");
    }

    $rResponse =
        &HeaderParse::API::ParserMethods::Parse($header, $jobID);
    if (defined($rResponse) && $rResponse eq "0") {
        return fatal("Timeout while parsing");
    }
    return ($status, $msg, $rResponse);
}
62
+
63
sub fatal {
    # Build the failure triple: status 0, the given message, no response.
    my $msg = $_[0];
    return (0, $msg, undef);
}
67
+
68
+ 1;