biblicit 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,537 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package HeaderParse::API::NamePatternMatch;
14
+
15
+ use utf8;
16
+
17
+ #this is the wrapper version, with <<>> as the separator
18
+
19
+ #11/26-11/29
20
+ #Function 1:
21
+ #use model match method for the multi-author lines separated only by space to get all the legal patterns for the line;
22
+
23
+ #input: a line from file [MultiAuthorLines.space.processed]
24
+ #for example:
25
+ #Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns
26
+
27
+
28
+ #Function 2:
29
+ #FeatureRepresentation, given the predicted name sequence
30
+ #like\: Chungki Lee<>James E. Burns<>
31
+
32
+
33
+ #Idea: uses a recursive function, which might be improved by dynamic programming later
34
# Entry point: turn one space-separated multi-author line into the predicted
# name sequences.
#
# Input : a line in the wrapper format, e.g.
#         "Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns"
# Output: whatever printNameArray() returns (evaluated in scalar context) for
#         the candidate segmentations found by SeekSep().
#
# The empty prototype is inert: every call in this file uses the &name(...)
# form, which bypasses prototype checking.
sub NamePatternMatch() {
    my $line = shift;

    # print "NAME LINE: $line\n";

    # Preprocess the line. The statements are order-dependent.
    for ($line) {
        # Wrapper markup becomes plain whitespace.
        s/(<<sep>>)|(<<\/sep>>)|(<space>)/ /g;

        # Remove isolated punctuation or digits and ignore lower-case words,
        # because the output is the extracted names. The original word index
        # is lost here although it would be needed to compute the final
        # performance -- problem still to be solved.
        s/\s+[^\p{IsUpper}]+(\s+|$)/ /g;

        s/\+L/ /g;

        # Trim both ends so that split() yields no empty leading token.
        s/^\s+//g;
        s/\s+$//g;
    }

    my @tokens = split(/\s+/, $line);

    # Step 1: pattern matrix for every span of tokens.
    # NOTE: $AuthSeqMat is deliberately left as the package variable it was.
    $AuthSeqMat = &FillSequenceMatrix(\@tokens);

    # The separator matrix is built but its value is not used below.
    my $SeparatorMat = &InitializeSepMat($#tokens);

    # Step 2: recursive search for the separators.
    my ($found, $sequences, $separators) = &SeekSep(0, $#tokens, $AuthSeqMat);

    # printNameArray() is called in scalar context.
    my $predicted = &printNameArray($sequences, $separators, \@tokens, 0);
    return ($predicted);
}
62
+
63
+ #this is the recursive function of getting the separators.
64
+ # $SeparatorMat might not be useful
65
# Recursively enumerate the ways the words $i .. $j can be cut into names.
#
# Parameters:
#   $i, $j       - inclusive word indices of the span to segment
#   $SequenceMat - reference to the matrix built by FillSequenceMatrix();
#                  cell [$i][$j] holds the name-type pattern of that span
#
# Returns a three-element list:
#   $separator          - 1 if at least one valid split point exists, else 0
#   \@FinalsequenceArr  - candidates, each a reference to a list of pattern
#                         strings (one string per proposed name)
#   \@FinalseparatorArr - for each candidate, at the SAME index, a reference
#                         to the list of word indices at which a new name
#                         starts (empty for an unsplit span)
#
# Fixes relative to the previous version:
#   * separators of left-separable halves were accumulated in the misspelled
#     package variable @SeparatorArr, so the pushed separator lists were
#     empty and the global grew across iterations and recursive calls;
#   * the unsplit four-word candidate had no separator entry, which shifted
#     every later separator list by one position against its sequence;
#   * all working arrays are now lexical.
#
# The empty prototype is kept for consistency with the rest of the file; it
# is inert because callers use the &name(...) form.
sub SeekSep() {
    my ($i, $j, $SequenceMat) = @_;

    my $separator         = 0;
    my @FinalsequenceArr  = ();
    my @FinalseparatorArr = ();

    # Store one candidate (pattern list + separator list) unless an equal
    # pattern list is already known. Both lists are pushed together so the
    # two result arrays always stay index-aligned.
    my $record = sub {
        my ($sequence, $separators) = @_;
        if (&Duplicate($sequence, \@FinalsequenceArr) eq "-1") {
            push @FinalsequenceArr,  [@$sequence];
            push @FinalseparatorArr, [@$separators];
        }
    };

    # A span of up to four words (e.g. "Paulo A. R. Lorenzo") is itself a
    # candidate single name.
    if ($i + 3 >= $j) {
        $record->([ $$SequenceMat[$i][$j] ], []);
    }

    # Three words or fewer form a non-separable unit: no split is attempted.
    if ($i + 2 >= $j) {
        return ($separator, \@FinalsequenceArr, \@FinalseparatorArr);
    }

    # Try every split point $k: words $i .. $k-1 on the left, $k .. $j on
    # the right. The left part always has at least two words.
    for my $k ($i + 2 .. $j - 1) {
        my ($leftSeparator, $leftSequenceArr, $leftSeparatorArr)
            = &SeekSep($i, $k - 1, $SequenceMat);

        # A half that cannot be split further must be a valid name pattern
        # on its own, otherwise this split point is rejected.
        next if !$leftSeparator && !&valid($$leftSequenceArr[0][0]);

        my ($rightSeparator, $rightSequenceArr, $rightSeparatorArr)
            = &SeekSep($k, $j, $SequenceMat);

        next if !$rightSeparator && !&valid($$rightSequenceArr[0][0]);

        $separator = 1;    # passed at this point

        # Options per half: every candidate of a separable half, or the
        # single unsplit pattern of a non-separable one.
        my @leftOptions = $leftSeparator
            ? map { [ $$leftSequenceArr[$_], $$leftSeparatorArr[$_] || [] ] }
                  0 .. $#$leftSequenceArr
            : ([ [ $$leftSequenceArr[0][0] ], [] ]);

        my @rightOptions = $rightSeparator
            ? map { [ $$rightSequenceArr[$_], $$rightSeparatorArr[$_] || [] ] }
                  0 .. $#$rightSequenceArr
            : ([ [ $$rightSequenceArr[0][0] ], [] ]);

        # Combine the two good sub-parts around the current $k. Iteration
        # order (left outer, right inner) matches the original code, so the
        # candidate order is preserved.
        for my $left (@leftOptions) {
            for my $right (@rightOptions) {
                $record->(
                    [ @{ $left->[0] }, @{ $right->[0] } ],
                    [ @{ $left->[1] }, "$k", @{ $right->[1] } ]
                );
            }
        }
    }

    return ($separator, \@FinalsequenceArr, \@FinalseparatorArr);
}
197
+
198
+ # this is to initialize the separator matrix
199
# Build the initial separator matrix for words 0 .. $last.
#
# Cell values are strings: "-2" marks an invalid cell (column below the
# diagonal), "-1" marks "no separator". Cells above the diagonal are left
# undefined except for the two set explicitly at the end.
#
# Returns a reference to the matrix (array of array references).
sub InitializeSepMat() {
    my $last = shift;
    my @grid = ();

    # Row r: r error markers followed by the diagonal "no separator" marker.
    foreach my $row (0 .. $last) {
        $grid[$row] = [ ("-2") x $row, "-1" ];
    }

    # The first two words never get a separator between them ...
    $grid[0][1] = "-1";

    # ... and neither do the last two, when there are more than two words.
    $grid[$last - 1][$last] = "-1" if ($last - 1) > 0;

    return (\@grid);
}
215
+
216
# Build the pattern ("sequence") matrix for a list of name tokens.
#
# Cell [$i][$i] holds the one-letter type of token $i as reported by
# NameType(). Cell [$i][$j] with $j > $i holds the concatenated types of
# tokens $i .. $j, but only when that pattern is shorter than 6 characters.
#
# Takes:   a reference to an array of name tokens.
# Returns: a reference to the (sparse) two-dimensional pattern matrix.
sub FillSequenceMatrix() {
    my $authors = shift;
    my @SequenceMat = ();
    my $lastIndex = $#$authors;

    # Step 1: the diagonal is the type of each single token.
    for my $i (0 .. $lastIndex) {
        $SequenceMat[$i][$i] = &NameType($$authors[$i]);
    }

    # Limit how far from the diagonal we combine tokens. The original test
    # read `$#authors` (an unrelated package array) instead of `$#$authors`,
    # so the cap was never applied. For long token lists the loop below then
    # read cells that had never been set and stored bogus short patterns.
    my $maxStep;
    if ($lastIndex > 4) {
        $maxStep = 3;
    } else {
        $maxStep = $lastIndex;    # indices start at 0, so this is (word count - 1)
    }

    # Step 2: extend each pattern one token at a time.
    for my $step (1 .. $maxStep) {
        for my $i (0 .. $lastIndex - $step) {
            my $tmpStr = "$SequenceMat[$i][$i+$step-1]" . "$SequenceMat[$i+$step][$i+$step]";
            if (length($tmpStr) < 6) {
                $SequenceMat[$i][$i+$step] = $tmpStr;
            }
        }
    }

    return (\@SequenceMat);
}
245
+
246
# Classify one name token by its shape and return a one-letter code:
#   "I"  exactly one upper-case letter, optionally followed by dots
#   "F"  longer than one character and starting with an upper-case letter
#   "s"  longer than one character and starting with a lower-case letter
#   "o"  anything else; the token is also reported on STDERR
sub NameType() {
    my ($token) = @_;

    return ("I") if $token =~ /^[\p{IsUpper}](\.)*$/;

    # Both remaining "regular" shapes need at least two characters.
    if (length($token) > 1) {
        return ("F") if $token =~ /^[\p{IsUpper}]+/;
        return ("s") if $token =~ /^[\p{IsLower}]+/;
    }

    print STDERR "odd $token \n";
    return ("o");
}
262
+
263
+
264
# Like NameType(), but trims surrounding whitespace first and distinguishes
# hyphenated names. Returns one of:
#   "I"   only upper-case letters, optionally followed by dots
#   "F-"  longer than one character, starts upper-case, contains "x-y"
#   "F"   longer than one character, starts upper-case, no inner hyphen
#   "s"   longer than one character and starts lower-case
#   "o"   anything else; the token is also reported on STDERR
sub RichNameType() {
    my ($token) = @_;

    $token =~ s/^\s+//g;
    $token =~ s/\s+$//g;

    return ("I") if $token =~ /^[\p{IsUpper}]+(\.)*$/;

    if (length($token) > 1) {
        if ($token =~ /^[\p{IsUpper}]+/) {
            return $token =~ /\w\-\w/ ? "F-" : "F";
        }
        return ("s") if $token =~ /^[\p{IsLower}]+/;
    }

    print STDERR "odd $token \n";
    return ("o");
}
287
+
288
+
289
# (obsolete) Dump a two-dimensional array, one row per line.
#
# Each line starts with a label ("+1 " for the row whose index equals
# $TrueIndex, "-1" otherwise), followed by every cell: true cells are
# written as "<value><>", false cells as "0".
#
# Takes:   $arr   - reference to an array of array references
#          $OutFH - filehandle to write to
# Returns: nothing meaningful.
sub printMatrix() { # obsolete
    my $TrueIndex;    # never assigned in this sub, so every row is labelled "-1"
    my $arr = shift;
    my $OutFH = shift;

    # The original printed to the bareword handle OutFH and ignored the
    # handle passed in. Use the argument when it is a handle (reference,
    # object or glob); otherwise fall back to the package handle OutFH so
    # callers that relied on the old behaviour keep working.
    my $fh = (ref($OutFH) || ref(\$OutFH) eq 'GLOB') ? $OutFH : \*OutFH;

    for my $i (0 .. $#$arr) {
        # Guard with defined() so the comparison does not warn on every row.
        if (defined($TrueIndex) && $i eq $TrueIndex) {
            print {$fh} "+1 ";
        } else {
            print {$fh} "-1";
        }
        for my $j (0 .. $#{$$arr[$i]}) {
            if ($$arr[$i][$j]) {
                print {$fh} "$$arr[$i][$j]<>";
            } else {
                print {$fh} "0";
            }
        }
        print {$fh} "\n";
    }
}
310
+
311
+
312
# (obsolete) Find the distinct rows of a two-dimensional array.
#
# Every row is flattened to a key in which each cell contributes
# "<value><>" when true and "<>" when false. The index of the first row
# producing each key is kept.
#
# Takes:   reference to an array of array references.
# Returns: reference to the list of kept row indices, in ascending order.
sub RemoveDuplicates(){ # remove duplicates(obsolete)
    my ($rows) = @_;
    my %seen;
    my @firstSeen = ();

    for my $rowIndex (0 .. $#{$rows}) {
        my $key = "";
        foreach my $cell (@{ $rows->[$rowIndex] }) {
            $key .= $cell ? "$cell<>" : "<>";
        }

        # Track the key itself, not the row index: index 0 is a false value.
        next if $seen{$key}++;
        push @firstSeen, $rowIndex;
    }

    return (\@firstSeen);
}
333
+
334
+
335
# Check whether a name array is already present in a 2-D name matrix.
#
# The candidate is flattened as "<cell><>" for every cell. Each matrix row
# is flattened the same way, except that false cells are skipped.
#
# Takes:   $matchArr - reference to the candidate array
#          $arr      - reference to an array of array references
# Returns: the index of the first matching row, or the string "-1" when no
#          row matches (a match at index 0 is a valid result).
sub Duplicate() { # see if a name array is already in a 2D name matrix
    my ($candidate, $rows) = @_;

    my $wanted = "";
    foreach my $cell (@{$candidate}) {
        $wanted .= "$cell<>";
    }

    for my $rowIndex (0 .. $#{$rows}) {
        my $key = "";
        foreach my $cell (@{ $rows->[$rowIndex] }) {
            $key .= "$cell<>" if $cell;
        }
        return ($rowIndex) if $key eq $wanted;
    }

    return ("-1");
}
361
+
362
+
363
+ #Prints the pattern (FFF, FI..), separator and corresponding names
364
+ # if $PrintPattern = 1; and $print = 1;
365
+ #if $separator is 0. we should return the whole sequence as the name???
366
+ #did not consider about this case
367
+
368
# Split the original token list into names for every candidate segmentation,
# optionally printing each candidate's patterns and separators to STDERR.
#
# Takes:   $NameArr      - ref to array of pattern rows, one per candidate
#          $SeparatorArr - ref to array of separator rows; each separator is
#                          the index of the first token of the next name
#          $OriginalName - ref to the array of original tokens
#          $PrintPattern - true to print patterns/separators to STDERR
# Returns: ref to an array with one row per candidate; each row lists the
#          names (tokens joined by single spaces) of that candidate.
sub printNameArray() {
    my $NameArr = shift;
    my $SeparatorArr = shift;
    my $OriginalName = shift;
    my $PrintPattern = shift;
    my @nameToReturn = ();

    my $print = 0;    # debug switch: echo the original tokens to STDERR

    # Join tokens $from .. $to into one name. The original repeated this loop
    # twice and the second copy wrote to an undeclared (package-global)
    # $tmpName; keeping it in one lexical closure avoids that.
    my $collect = sub {
        my ($from, $to) = @_;
        my $name = "";
        for my $k ($from .. $to) {
            if ($print) {
                print STDERR "$$OriginalName[$k] ";
            }
            $name .= "$$OriginalName[$k] ";
        }
        $name =~ s/\s+$//g;
        return $name;
    };

    for my $i (0 .. $#$NameArr) {
        if ($PrintPattern) {
            print STDERR "$i\-\- ";
            for my $j (0 .. $#{$$NameArr[$i]}) {
                if ($$NameArr[$i][$j]) {
                    print STDERR "$$NameArr[$i][$j] ";
                } else {
                    print STDERR "0 ";
                }
            }
            print STDERR "Separator \[";
            for my $j (0 .. $#{$$SeparatorArr[$i]}) {
                print STDERR "$$SeparatorArr[$i][$j] ";
            }
            print STDERR "\]\n";
        }

        # One name per separator, ending just before the separator index ...
        my $start = 0;
        my @names = ();
        for my $boundary (@{ $$SeparatorArr[$i] }) {
            push @names, $collect->($start, $boundary - 1);
            if ($print) {
                print STDERR "<>";
            }
            $start = $boundary;
        }

        # ... then whatever remains after the last separator.
        push @names, $collect->($start, $#$OriginalName);
        if ($print) {
            print STDERR "\n";
        }

        $nameToReturn[$i] = [@names];
    }

    return (\@nameToReturn);
}
436
+
437
# (obsolete) Variant of printNameArray() that drops duplicate candidates
# (via RemoveDuplicates) before printing. For every remaining candidate it
# writes the pattern row, the separator row and the original tokens split at
# the separators to STDERR.
#
# Takes:   $NameArr      - ref to array of pattern rows
#          $SeparatorArr - ref to array of separator rows
#          $OriginalName - ref to the array of original tokens
# Returns: nothing meaningful (the return of $Uncertainty is disabled).
sub printNameArray2() { #obsolete; which remove the duplicates before printing
    my $NameArr = shift;
    my $SeparatorArr = shift;
    my $OriginalName = shift;
    my $Uncertainty = 0; #if it is of multiple cases

    my $UniqueIndex = &RemoveDuplicates($NameArr);
    if ($#$UniqueIndex > 0) {
        $Uncertainty = 1;
    }

    my $lastWord = $#$OriginalName;

    for my $row (@$UniqueIndex) {
        # The row label used to go to STDOUT while the rest of the same line
        # went to STDERR; keep the whole diagnostic line on STDERR.
        print STDERR "$row\-\- ";
        for my $j (0 .. $#{$$NameArr[$row]}) {
            if ($$NameArr[$row][$j]) {
                print STDERR "$$NameArr[$row][$j] ";
            } else {
                print STDERR "0 ";
            }
        }
        print STDERR "Separator \[";
        for my $j (0 .. $#{$$SeparatorArr[$row]}) {
            print STDERR "$$SeparatorArr[$row][$j] ";
        }
        print STDERR "\]\n";

        #print original names according to the sorted separators
        my $start = 0;
        print STDERR "Original name\:";
        for my $j (0 .. $#{$$SeparatorArr[$row]}) {
            my $stop = $$SeparatorArr[$row][$j] - 1;
            for my $k ($start .. $stop) {
                print STDERR "$$OriginalName[$k] ";
            }
            print STDERR "<>";
            $start = $stop + 1;
        }
        # Tokens after the last separator form the final name.
        for my $k ($start .. $lastWord) {
            print STDERR "$$OriginalName[$k] ";
        }
        print STDERR "\n";
    }

    # return($Uncertainty);
}
483
+
484
# Tell whether a token-type pattern is an accepted name shape.
#
# Accepted patterns (F = full name, I = initial, s = lower-case word):
#   FF, FFF            full names only
#   FIF, FIIF, FIIIF   full name, one to three initials, full name
#                      (e.g. "Paulo A. R. Lorenzo")
#   IF, IIF            one or two initials followed by a full name
#   IFF                e.g. "E. Christopher Lewis"
#   FssF               e.g. "Th.P. van der Weide"
#
# Returns 1 for an accepted pattern and 0 otherwise.
sub valid() {
    my ($pattern) = @_;

    my %isValid = map { $_ => 1 } qw(FF FFF FIF FIIF FIIIF IF IIF IFF FssF);

    return $isValid{$pattern} ? (1) : (0);
}
504
+
505
+ # this is to present the macro features of the predicted names see
506
+ # if this is correct prediction;
507
+ # make it ready for training/testing or not?
508
+
509
# Placeholder: no feature representation is implemented; always returns 1.
sub NamePatternFeatureRepresent() {
    return 1;
}
514
+
515
+ #Input format: Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns
516
# Split a tagged author string into cleaned author names.
#
# Input format: Chungki <space> Lee <<sep>><</sep>> James <space> E. <space> Burns
#
# Authors are separated by <<sep>>...<</sep>> tags. Within each author the
# "<space>" markers become blanks, whitespace-delimited runs containing no
# upper-case letter are collapsed to a single blank, literal "+L" is replaced
# by a blank, and leading/trailing whitespace is trimmed.
#
# Known drawback (from the original author): lower-case particles are
# dropped, so "Th.P. van der Weide n" becomes "Th.P. Weide"; an index to the
# original word positions would be needed to avoid that.
#
# Returns a reference to the array of cleaned names.
sub GetTrueName() {
    my ($raw) = @_;
    my @names = split(/<<sep>>[^(<<)(>>)]*<<\/sep>>/, $raw);

    foreach my $name (@names) {
        $name =~ s/(<space>)/ /g;

        # Drops isolated punctuation, digits and lower-case letters such as
        # 'n' or 'x' (anything between blanks that has no capital letter).
        $name =~ s/\s+[^\p{IsUpper}]+(\s+|$)/ /g;

        $name =~ s/\+L/ /g;
        $name =~ s/^\s+//g;
        $name =~ s/\s+$//g;
    }

    return (\@names);
}
535
+
536
+
537
+ 1;
@@ -0,0 +1,68 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ #!/usr/bin/perl -w
14
+ package HeaderParse::API::Parser;
15
+ use strict;
16
+ use utf8;
17
+ use HeaderParse::API::ParserMethods;
18
+ use IO::Handle;
19
+ use HeaderParse::Config::API_Config;
20
+ use vars qw($ServerURL $repositoryLocation $algVersion);
21
+
22
+
23
# Read a text file, extract its header and parse the header into metadata.
#
# Takes:   $fileID - path of the text file to read (used as-is)
#          $jobID  - identifier handed through to ParserMethods::Parse
# Returns: the list ($status, $message, $response):
#            (1, "", $response)  on success
#            (0, $message, undef) when the file is missing or unreadable,
#                                 header extraction reports a fault, or the
#                                 parser returns "0" (treated as a timeout)
sub _parseHeader{
    my ($fileID, $jobID) = @_;
    my ($header, $faultMessage, $rResponse, $papertext);

    my $status = 1;
    my $msg = "";

    # my $file = "$repositoryLocation/$fileID";
    # print "file: $file\n";
    my $file = $fileID;

    if (! -e $file) {
        return fatal("File does not exist: $file");
    }

    # Lexical handle instead of the package-global bareword IN: it cannot
    # clash with another open IN in this package and is closed automatically
    # on every early return.
    open(my $in, "<:utf8", $file) or
        return fatal("Could not open file: $file");
    {
        local $/ = undef;    # slurp the whole file in one read
        $papertext = <$in>;
    }
    close $in;

    # The &-form calls are kept as in the original; they ignore any
    # prototypes the ParserMethods subs may declare.
    ($faultMessage, $header) =
        &HeaderParse::API::ParserMethods::ExtractHeaderInformation(\$papertext);
    if (!length($faultMessage)) {
        $rResponse =
            &HeaderParse::API::ParserMethods::Parse($header, $jobID);
        if ($rResponse eq "0") {
            return fatal("Timeout while parsing");
        }
        return ($status, $msg, $rResponse);

    } else {
        # an error occurred while extracting the header
        return fatal("file $file: $faultMessage");
    }

}
62
+
63
# Build the failure triple used by _parseHeader():
# status 0, the given message, and an undefined response.
sub fatal {
    my $reason = shift;
    return (0, $reason, undef);
}
67
+
68
+ 1;