biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,35 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
package DocFilter::Config;
##
## Static configuration for the DocFilter algorithm.
##
## Every setting is a package variable, so other modules read them by
## fully qualified name (e.g. $DocFilter::Config::serverPort).  The
## variables are declared with "our" so that this file compiles under
## strict; the fully qualified names are unchanged.
##
use strict;
use warnings;

## Global

## Name and version string identifying this algorithm.
our $algorithmName    = "BasicDocFilter";
our $algorithmVersion = "1.0";


## Repository Mappings

## Maps a repository identifier to a filesystem path.  The two entries
## shipped here are placeholders (note the 'example' keys).
our %repositories = (
    'example1' => '/',
    'example2' => '/home',
);


## WS Settings

## Address and port for the web service, and the URI of its WSDL.
## NOTE(review): presumably the bind address of the service - confirm
## against the code that consumes these values.
our $serverURL  = '127.0.0.1';
our $serverPort = 10666;
our $URI        = 'http://citeseerx.org/algorithms/docfilter/wsdl';

1;
@@ -0,0 +1,51 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package DocFilter::Filter;
14
+ ##
15
+ ## Isaac Councill, 7/31/07
16
+ ##
17
+ use strict;
18
+
19
+
20
## Reads the whole file at $filePath and checks it with hasReferences().
##
## Returns a three-element list whose last element is a human-readable
## message:
##   (0, 0, msg) - the file could not be opened (msg includes $!)
##   (1, 0, msg) - the file was read but no reference heading was found
##   (1, 1, msg) - the file was read and a reference heading was found
sub filter {
    my ($filePath) = @_;

    # Three-argument open: the path is never interpreted as a mode or a
    # pipe, whatever characters it contains.  The lexical handle is
    # released automatically on every return path.
    open(my $in, '<', $filePath)
        or return (0, 0, "Could not open file $filePath: $!");

    # Slurp the entire file; $/ is only undefined inside the do block.
    my $text = do { local $/ = undef; <$in> };
    close($in);

    if (hasReferences(\$text) <= 0) {
        return (1, 0, "No reference section is present");
    }
    return (1, 1, "All filters passed");

} # filter
38
+
39
+
40
## Tests whether the text referenced by $rText contains a reference
## section heading: one of the heading words below, optionally followed
## by a colon, with nothing but whitespace before the next newline.
##
## Takes a reference to the text (the text itself is not modified).
## Returns 1 when such a heading is found, 0 otherwise.
sub hasReferences {
    my $rText = shift;

    # Deliberately no /g: a /g match in scalar context stores pos() on
    # the caller's string, so a repeated call on the same text would
    # resume after the first heading and wrongly report 0.
    if ($$rText =~ /\b(?:REFERENCES?|References?|BIBLIOGRAPHY|Bibliography|REFERENCES AND NOTES|References and Notes)\:?\s*\n/) {
        return 1;
    }
    return 0;

} # hasReferences
49
+
50
+
51
+ 1;
@@ -0,0 +1,11 @@
1
+ FileConverter README
2
+ Isaac Councill
3
+
4
+ Check the FileConverter::Config module to make sure the settings
5
+ are correct. You will probably need to obtain the PDFBox jar file
6
+ and reference it in Config. It's better to use TET, but TET is
7
+ expensive.
8
+
9
+ If using PDFBox, make sure java is on your path.
10
+
11
+ I know this is sparse, more to come...
@@ -0,0 +1,140 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
package CSXUtil::SafeText;
##
## Methods for stripping bad (XML unsafe) characters
## from strings and performing basic HTML entity
## translations.  Also contains a utility (stripArtifacts)
## for getting rid of crazy control characters and
## other things that probably aren't proper text.
##
## Every routine takes a REFERENCE to the string, modifies
## the string in place, and returns the resulting text.
##
## Isaac Councill, 12/06/06
##
#######################################################
##
use strict;
use utf8;
require Exporter;

our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

$VERSION = 1.00;

@ISA = qw(Exporter);

## %htmlSpecialChars and %htmlCharEntities do not name any table defined
## in this module; they stay in the list only so that existing import
## lists keep compiling.  The tables that actually hold data are
## %htmlSpecialCharEncodings and %htmlSpecialCharDecodings.
@EXPORT_OK = qw(@badChars %htmlSpecialChars
                %htmlCharEntities
                %htmlSpecialCharEncodings
                %htmlSpecialCharDecodings
                &stripBadChars
                &encodeHTMLSpecialChars
                &decodeHTMLSpecialChars
                &cleanXML &cleanAll &stripArtifacts);


##
#######################################################
##
## Sharable encoding data.
##

## Hex codes for characters that should never be put into
## XML - or else parsers will barf.  Each entry is the
## literal text of a regex escape (e.g. the four characters
## \x00), not the control character itself, so entries are
## only meaningful when interpolated into a pattern.
our @badChars = qw(\x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07
                   \x08 \x0B \x0C \x0E \x0F \x10 \x11 \x12
                   \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A
                   \x1B \x1C \x1D \x1E \x1F \x7F);

## Subset of HTML characters that could be problematic
## for XML.  This is not a complete list of HTML
## special characters, but more mappings can be added
## as needed.
our %htmlSpecialCharEncodings = ("&"  => "&amp;",
                                 ">"  => "&gt;",
                                 "<"  => "&lt;",
                                 "\"" => "&quot;"
                                 );

## The reverse map (entity => character).
our %htmlSpecialCharDecodings = reverse %htmlSpecialCharEncodings;


##
#######################################################
##
## Subroutines
##

## Builds one alternation that matches any of the given literal
## strings.  Longer strings come first so that a key which is a
## prefix of another key cannot shadow it.  Returns undef for an
## empty list (an empty pattern has special meaning in Perl).
sub _literalAlternation {
    my @literals = sort { length($b) <=> length($a) } @_;
    return undef unless @literals;
    return join '|', map { quotemeta($_) } @literals;
}


## Delete all occurrences of bad characters in the referenced
## text (in place) and return the cleaned string.
sub stripBadChars {
    my $rtext = shift;
    if (@badChars) {
        # One pass over the text instead of one pass per entry.
        my $pattern = join '|', @badChars;
        $$rtext =~ s/(?:$pattern)//g;
    }
    return $$rtext;
}


## Encodes special characters in the referenced text into
## their HTML equivalents (in place) and returns the
## encoded string.
sub encodeHTMLSpecialChars {
    my $rtext = shift;
    # Single pass: every character of the input is looked at exactly
    # once, so the "&" of an entity produced for "<" can never be
    # encoded a second time, whatever order the hash yields its keys.
    my $pattern = _literalAlternation(keys %htmlSpecialCharEncodings);
    if (defined $pattern) {
        $$rtext =~ s/($pattern)/$htmlSpecialCharEncodings{$1}/g;
    }
    return $$rtext;
}


## Decodes the HTML entities in the referenced text into
## their non-HTML character equivalents (in place) and
## returns the decoded string.
sub decodeHTMLSpecialChars {
    my $rtext = shift;
    # Single pass for the same reason as above: "&amp;lt;" must decode
    # to "&lt;" and stop there, not continue on to "<".
    my $pattern = _literalAlternation(keys %htmlSpecialCharDecodings);
    if (defined $pattern) {
        $$rtext =~ s/($pattern)/$htmlSpecialCharDecodings{$1}/g;
    }
    return $$rtext;
}


## Strip out any characters that don't look like they
## belong in a proper, readable text string: everything
## that is not alphanumeric, punctuation, whitespace or a
## symbol is removed (in place).  Returns the stripped string.
##
sub stripArtifacts {
    my $rtext = shift;
    $$rtext =~ s/[^\p{IsAlnum}\p{IsPunct}\p{IsSpace}\p{IsS}]//g;
    return $$rtext;
}


## Convenience routine for executing both XML safety
## routines in a single call.  Returns the cleaned string.
##
sub cleanXML {
    my $rtext = shift;
    stripBadChars($rtext);
    return encodeHTMLSpecialChars($rtext);
}


## Clean for XML and also strip out strange characters.
## Entity encoding runs last so that the entities it
## produces are not touched by the stripping steps.
## Returns the cleaned string.
##
sub cleanAll {
    my $rtext = shift;
    stripBadChars($rtext);
    stripArtifacts($rtext);
    return encodeHTMLSpecialChars($rtext);
}


1;
@@ -0,0 +1,77 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package FileConverter::CheckSum;
14
+ #
15
+ # Container for checksum info and utilities for individual files.
16
+ #
17
+ # Isaac Councill
18
+ #
19
+ use strict;
20
+ use Digest::SHA1;
21
+ use FileConverter::Utils;
22
+
23
+
24
# Constructor.  Returns a new object blessed into $class whose
# '_fileType' and '_sha1' slots both exist and start out undefined.
sub new {
    my $class = shift;
    my %state = map { $_ => undef } ('_fileType', '_sha1');
    return bless(\%state, $class);

} # new
34
+
35
+
36
# Accessor: returns the value stored in the '_fileType' slot.
sub getFileType {
    my ($self) = @_;
    my $fileType = $self->{'_fileType'};
    return $fileType;
}
40
+
41
+
42
# Mutator: stores $fileType in the '_fileType' slot.
sub setFileType {
    my $self = shift;
    my ($fileType) = @_;
    $self->{'_fileType'} = $fileType;
}
46
+
47
+
48
# Accessor: returns the value stored in the '_sha1' slot.
sub getSHA1 {
    my ($self) = @_;
    my $sha1 = $self->{'_sha1'};
    return $sha1;
}
52
+
53
+
54
# Mutator: stores $sha1 in the '_sha1' slot.
sub setSHA1 {
    my $self = shift;
    my ($sha1) = @_;
    $self->{'_sha1'} = $sha1;
}
58
+
59
+
60
# Computes the SHA1 hex digest of a file and records it on this
# object, together with the file's extension.
#
#   $filePath - path of the file to hash.
#
# Dies if the file cannot be opened for reading; the message
# includes the operating-system error.
sub digest {
    my ($self, $filePath) = @_;

    # Three-argument open: the path is never parsed for mode
    # characters or surrounding whitespace, so the file that gets
    # hashed is exactly the one named by the caller.
    open(my $fh, '<', $filePath)
        or die("Could not open for reading: $filePath: $!");

    # The digest is taken over raw bytes, so switch off any
    # platform newline translation before reading.
    binmode($fh);

    my $digester = Digest::SHA1->new;
    $digester->addfile($fh);
    my $sha1 = $digester->hexdigest;
    close($fh);

    my $ext = FileConverter::Utils::getExtension($filePath);

    $self->setFileType($ext);
    $self->setSHA1($sha1);

} # digest
75
+
76
+ 1;
77
+
@@ -0,0 +1,137 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package FileConverter::Compression;
14
+ #
15
+ # Utilities for handling various compression formats.
16
+ #
17
+ # Isaac Councill, 09/06/07
18
+ #
19
+ use strict;
20
+ use FileConverter::Config;
21
+ use FileConverter::Utils;
22
+
23
# Extensions this module knows how to unpack.  Keys should all be
# lower case; canDecompress lower-cases the extension before the
# lookup.
my %supportedCompressionExt = map { $_ => 1 } qw(gz zip z);

# Paths of the external decompression tools, copied from
# FileConverter::Config when this file is loaded.
my ($gunzip, $uncompress, $unzip) = (
    $FileConverter::Config::gunzip,
    $FileConverter::Config::uncompress,
    $FileConverter::Config::unzip,
);
32
+
33
+
34
# Picks the decompression routine that matches the extension of
# $fn (matched case-insensitively) and returns whatever that
# routine returns.
#
#   $fn     - path of the compressed file.
#   $rTrace - array reference handed on to the chosen routine.
#
# For any other extension, returns (0, message).
sub decompress {
    my ($fn, $rTrace) = @_;
    my $ext = FileConverter::Utils::getExtension($fn);

    return gunzip($fn, $rTrace)     if $ext =~ m/^gz$/i;
    return uncompress($fn, $rTrace) if $ext =~ m/^z$/i;
    return unzip($fn, $rTrace)      if $ext =~ m/^zip$/i;

    return (0, "Unsupported compression extension: $ext");

} # decompress
49
+
50
+
51
# Returns 1 when the extension of $fn, lower-cased, is listed in
# %supportedCompressionExt, and 0 otherwise.
sub canDecompress {
    my ($fn) = @_;
    my $ext = lc(FileConverter::Utils::getExtension($fn));
    return defined($supportedCompressionExt{$ext}) ? 1 : 0;

} # canDecompress
61
+
62
+
63
# Runs the configured gunzip binary with "-f" on $fn.
#
#   $fn     - path of the file to unpack.
#   $rTrace - array reference; "gunzip" is appended on success.
#
# Returns (1, "", $newFile, $rTrace) on success, where $newFile is
# $fn passed through FileConverter::Utils::stripExtension, or
# (0, message) on failure.  Only exit code 1 counts as a failure;
# any other exit code is reported as success.
# NOTE(review): presumably because gzip reserves 2 for warnings;
# confirm against the installed tool.
sub gunzip {
    my ($fn, $rTrace) = @_;

    system($gunzip, "-f", $fn);

    return (0, "Failed to execute gunzip: $!") if $? == -1;

    my $signal = $? & 127;
    return (0, "gunzip died with signal ".$signal) if $signal;

    my $code = $? >> 8;
    return (0, "Error executing gunzip (code $code): $!") if $code == 1;

    push @$rTrace, "gunzip";

    my $newFile = FileConverter::Utils::stripExtension($fn);
    return (1, "", $newFile, $rTrace);

} # gunzip
85
+
86
+
87
# Runs the configured uncompress binary with "-f" on $fn.
#
#   $fn     - path of the file to unpack.
#   $rTrace - array reference; "uncompress" is appended on success.
#
# Returns (1, "", $newFile, $rTrace) on success, where $newFile is
# $fn passed through FileConverter::Utils::stripExtension, or
# (0, message) on failure.  Only exit code 1 counts as a failure;
# any other exit code is reported as success.
sub uncompress {
    my ($fn, $rTrace) = @_;

    system($uncompress, "-f", $fn);

    return (0, "Failed to execute uncompress: $!") if $? == -1;

    my $signal = $? & 127;
    return (0, "uncompress died with signal ".$signal) if $signal;

    my $code = $? >> 8;
    return (0, "Error executing uncompress (code $code): $!") if $code == 1;

    push @$rTrace, "uncompress";

    my $newFile = FileConverter::Utils::stripExtension($fn);
    return (1, "", $newFile, $rTrace);

} # uncompress
109
+
110
+
111
# Runs the configured unzip binary with "-qqo" on $fn, extracting
# into the directory that contains $fn.
#
#   $fn     - path of the archive to unpack.
#   $rTrace - array reference; "unzip" is appended on success.
#
# Returns (1, "", $newFile, $rTrace) on success, where $newFile is
# $fn passed through FileConverter::Utils::stripExtension, or
# (0, message) on failure.  Exit codes above 2 count as failures;
# 0, 1 and 2 are reported as success.
sub unzip {
    my ($fn, $rTrace) = @_;

    my $dir = FileConverter::Utils::getDirectory($fn);

    system($unzip, "-qqo", $fn, "-d", $dir);

    return (0, "Failed to execute unzip: $!") if $? == -1;

    my $signal = $? & 127;
    return (0, "unzip died with signal ".$signal) if $signal;

    my $code = $? >> 8;
    return (0, "Error executing unzip (code $code): $!") if $code > 2;

    push @$rTrace, "unzip";

    my $newFile = FileConverter::Utils::stripExtension($fn);
    return (1, "", $newFile, $rTrace);

} # unzip
135
+
136
+
137
+ 1;
@@ -0,0 +1,57 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
package FileConverter::Config;
#
# Static settings for the file conversion service: which
# converters to use, where the external tools live, repository
# mappings and web-service endpoints.
#
# Every setting is a package variable declared with "our", so the
# rest of the code base keeps reading them under their fully
# qualified names (for example $FileConverter::Config::PDFTOTEXT)
# while this file compiles cleanly under strict.
#
use strict;
use warnings;
use FindBin;

## Conversion utilities

# valid options are TET or PDFBOX
our $PDFTOTEXT = "PDFBOX";

# valid options are TEXT or PDF
our $PSConversion = "TEXT";

our $TETPath = "$FindBin::Bin/../converters/TET-2.2-Linux/bin/tet";

our $TETLicensePath =
    "$FindBin::Bin/../converters/TET-2.2-Linux/licensekeys.txt";

# NOTE(review): unlike the other converter paths this one is not
# relative to "$FindBin::Bin/.." -- confirm it matches the
# directory the service is launched from.
our $PDFBoxLocation = "$FindBin::Bin/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar";

our $JODConverterPath =
    "$FindBin::Bin/../converters/jodconverter-2.2.0/jodconverter-cli-2.2.0.jar";

our $PrescriptPath = "/usr/local/bin/prescript";

## Compression utilities

our $gunzip = "/usr/bin/gunzip";
our $uncompress = "/usr/bin/uncompress";
our $unzip = "/usr/bin/unzip";


## Repository Mappings

our %repositories = ('example1' => '/',
                     'example2' => '/home',
                     );


## WS settings

our $serverURL = '127.0.0.1';
our $serverPort = 10888;
our $URI = 'http://citeseerx.org/fileConversion/wsdl';

1;
@@ -0,0 +1,191 @@
1
+ #
2
+ # Copyright 2007 Penn State University
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ package FileConverter::Controller;
14
+ #
15
+ # Main interface to FileConverter. This is where all calls
16
+ # should start, and where all conversion sequences should
17
+ # be managed.
18
+ #
19
+ # Isaac Councill, 09/06/07
20
+ #
21
+ use strict;
22
+ use FileConverter::Utils;
23
+ use FileConverter::Compression;
24
+ use FileConverter::TET;
25
+ use FileConverter::PDFBox;
26
+ use FileConverter::JODConverter;
27
+ use FileConverter::PSConverter;
28
+ use FileConverter::PSToText;
29
+ use FileConverter::Prescript;
30
+ use FileConverter::CheckSum;
31
+
32
+
33
##
# Convert the given file to text.  Compression layers (as
# recognised by FileConverter::Compression) are removed first;
# what happens next depends on the extension of the unpacked file:
#
#   ps    - handled according to $FileConverter::Config::PSConversion:
#           "TEXT" extracts text straight from the PostScript file,
#           "PDF" converts to PDF first and then extracts text.
#   pdf   - text is extracted directly.
#   other - converted to PDF first, then text is extracted.
#
# Returns (0, message) when the file does not exist, has no
# extension or is of an unsupported type, and the failing
# (status, message) pair when a decompression or conversion step
# fails.  Otherwise returns whatever the text extractor returns.
##
sub extractText {
    my ($fn) = @_;

    return (0, "File does not exist: $fn") unless -e $fn;

    my (@trace, @checkSums);

    # Peel off compression layers until the name no longer carries
    # a supported compression extension.
    while (FileConverter::Compression::canDecompress($fn) > 0) {
        my ($dStatus, $dMsg, $unpacked) =
            FileConverter::Compression::decompress($fn, \@trace);
        return ($dStatus, $dMsg) if $dStatus <= 0;
        $fn = $unpacked;
    }

    my $extension = FileConverter::Utils::getExtension($fn);
    return (0, "File $fn has no extension") unless defined $extension;

    my $needsPdfConversion;
    if ($extension =~ m/^ps$/i) {
        # Convert the PostScript file according to the configuration.
        if ($FileConverter::Config::PSConversion eq "TEXT") {
            # Go from ps to text directly.  The PDF conversion is
            # still run with a throw-away trace and the shared
            # checksum list; its result is deliberately ignored.
            _convert2pdf($fn, $extension, [], \@checkSums);
            return ps2text($fn, \@trace, \@checkSums);
        }
        $needsPdfConversion =
            ($FileConverter::Config::PSConversion eq "PDF");
    }
    else {
        # Anything that is neither ps nor pdf has to become a PDF
        # before text can be extracted.
        $needsPdfConversion = ($extension !~ m/^pdf$/i);
    }

    if ($needsPdfConversion) {
        my ($cStatus, $cMsg, $pdfFile) =
            _convert2pdf($fn, $extension, \@trace, \@checkSums);
        return ($cStatus, $cMsg) if $cStatus <= 0;
        $fn = $pdfFile;
        $extension = FileConverter::Utils::getExtension($fn);
    }

    return pdf2text($fn, \@trace, \@checkSums)
        if $extension =~ m/^pdf$/i;

    return (0, "Unsupported file type: $extension");

} # extractText
103
+
104
+
105
# Produce a PDF version of the given file.  Compression layers
# are removed first.  A file that is already a PDF is returned
# as is; ps and rtf files are run through _convert2pdf.  In both
# cases a FileConverter::CheckSum object for the resulting PDF is
# appended to the checksum list.
#
# Returns (status, message, pdfPath, traceRef, checkSumsRef) on
# success.  Returns (0, message) when the file does not exist, has
# no extension or is of an unsupported type, and the failing
# (status, message) pair when a decompression or conversion step
# fails.
sub convert2pdf {
    my ($fn) = @_;

    return (0, "File does not exist: $fn") unless -e $fn;

    my (@trace, @checkSums);

    # Peel off compression layers until the name no longer carries
    # a supported compression extension.
    while (FileConverter::Compression::canDecompress($fn) > 0) {
        my ($dStatus, $dMsg, $unpacked) =
            FileConverter::Compression::decompress($fn, \@trace);
        return ($dStatus, $dMsg) if $dStatus <= 0;
        $fn = $unpacked;
    }

    my $extension = FileConverter::Utils::getExtension($fn);
    return (0, "File $fn has no extension") unless defined $extension;

    # Already a PDF: nothing to convert, just record its checksum.
    if ($extension =~ m/^pdf$/i) {
        my $sha1 = FileConverter::CheckSum->new();
        $sha1->digest($fn);
        push @checkSums, $sha1;

        return (1, "", $fn, \@trace, \@checkSums);
    }

    unless ($extension =~ m/^ps$/i || $extension =~ m/^rtf$/i) {
        return (0, "Unsupported file type: $extension");
    }

    my ($tstatus, $tmsg, $tfn, $rTrace, $rCheckSums) =
        _convert2pdf($fn, $extension, \@trace, \@checkSums);
    return ($tstatus, $tmsg) if $tstatus <= 0;

    # Record the checksum of the freshly converted PDF on the list
    # handed back by the converter.
    my $sha1 = FileConverter::CheckSum->new();
    $sha1->digest($tfn);
    push @$rCheckSums, $sha1;

    return ($tstatus, $tmsg, $tfn, $rTrace, $rCheckSums);

} # convert2pdf
158
+
159
+
160
# Extract text from a PDF with the extractor selected by
# $FileConverter::Config::PDFTOTEXT ("TET" or "PDFBOX").
#
#   $fn         - path of the PDF file.
#   $rTrace     - array reference passed on to the extractor.
#   $rCheckSums - array reference passed on to the extractor.
#
# Returns whatever the selected extractor returns, or
# (0, message) when the configured value names no known
# extractor.
sub pdf2text {
    my ($fn, $rTrace, $rCheckSums) = @_;

    my $tool = $FileConverter::Config::PDFTOTEXT;
    $tool = "" unless defined $tool;

    if ($tool eq "TET") {
        return FileConverter::TET::extractText($fn, $rTrace, $rCheckSums);
    }
    if ($tool eq "PDFBOX") {
        return FileConverter::PDFBox::extractText($fn, $rTrace, $rCheckSums);
    }

    # Without this explicit failure a bad configuration made the
    # sub fall off its end, leaving callers with no status or
    # message to report.
    return (0, "Unsupported PDF text extractor configured: '$tool'");

} # pdf2text
170
+
171
# Hand a file to the PDF converter that matches its extension
# (matched case-insensitively): rtf and doc go to
# FileConverter::JODConverter, ps and eps go to
# FileConverter::PSConverter.
#
#   $fn         - path of the file to convert.
#   $extension  - extension of $fn, as determined by the caller.
#   $rTrace     - array reference passed on to the converter.
#   $rCheckSums - array reference passed on to the converter.
#
# Returns whatever the chosen converter returns, or (0, message)
# when no converter handles the extension.
sub _convert2pdf {
    my ($fn, $extension, $rTrace, $rCheckSums) = @_;

    if (($extension =~ m/^rtf$/i) || ($extension =~ m/^doc$/i)) {
        return FileConverter::JODConverter::convertFile($fn, $rTrace,
                                                        $rCheckSums);
    }
    elsif (($extension =~ m/^ps$/i) || ($extension =~ m/^eps$/i)) {
        return FileConverter::PSConverter::convertFile($fn, $rTrace,
                                                       $rCheckSums);
    }

    # Without this explicit failure the sub fell off its end for
    # any other extension, so callers saw a false status with no
    # message explaining why.
    return (0, "Unsupported file type: $extension");

} # _convert2pdf
184
+
185
# Thin wrapper: passes the file name, trace reference and checksum
# reference to FileConverter::PSToText::extractText and returns
# its result unchanged.
sub ps2text {
    my $fn         = shift;
    my $rTrace     = shift;
    my $rCheckSums = shift;
    return FileConverter::PSToText::extractText($fn, $rTrace, $rCheckSums);

} # ps2text
190
+
191
+ 1;