biblicit 1.0 → 2.0.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (406)
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -1,61 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::JODConverter;
14
- #
15
- # Wrapper to execute the JODConverter command-line tool for converting
16
- # doc, rtf to PDF files.
17
- #
18
- # Juan Pablo Fernandez Ramirez, 10/05/07
19
- #
20
- use strict;
21
- use FileConverter::Config;
22
- use FileConverter::Utils;
23
-
24
- my $JODConverterLoc = $FileConverter::Config::JODConverterPath;
25
-
26
- ##
27
- # Execute the JODConverter utility.
28
- ##
29
- sub convertFile {
30
- my ($filePath, $rTrace, $rCheckSums) = @_;
31
- my ($status, $msg) = (1, "");
32
-
33
- if (FileConverter::Utils::checkProcess("soffice") == 0) {
34
- return (0, "Open Office Service is not running");
35
- }
36
-
37
- my $pdfFilePath = FileConverter::Utils::changeExtension($filePath, "pdf");
38
- my @commandArgs = ("java", "-jar", $JODConverterLoc, $filePath,
39
- $pdfFilePath);
40
- system(@commandArgs);
41
-
42
- if ($? == -1) {
43
- return (0, "Failed to execute JODConverter: $!");
44
- } elsif ($? & 127) {
45
- return (0, "Java died with signal ".($? & 127));
46
- }
47
-
48
- my $code = $?>>8;
49
- if ($code == 0) {
50
- push @$rTrace, "JODConverter";
51
-
52
- my $sha1 = FileConverter::CheckSum->new();
53
- $sha1->digest($filePath);
54
- push @$rCheckSums, $sha1;
55
-
56
- return ($status, $msg, $pdfFilePath, $rTrace, $rCheckSums);
57
- } else {
58
- return (0, "Error executing JODConverter (code $code): $!");
59
- }
60
- } # convertFile
61
- 1;
@@ -1,69 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::PDFBox;
14
- #
15
- # Wrapper to call the PDFBox ExtractText command-line tool
16
- # for extracting text from PDF files. It's recommended to
17
- # use TET instead, if TET is available.
18
- #
19
- # Isaac Councill, 09/06/07
20
- #
21
- use strict;
22
- use FileConverter::Config;
23
- use FileConverter::Utils;
24
-
25
- my $PDFBoxLoc = $FileConverter::Config::PDFBoxLocation;
26
-
27
- ##
28
- # Execute the PDFBox utility.
29
- ##
30
- sub extractText {
31
- my ($filePath, $rTrace, $rCheckSums) = @_;
32
- my ($status, $msg) = (1, "");
33
-
34
- if (FileConverter::Utils::checkExtension($filePath, "pdf") <= 0) {
35
- return (0, "Unexpected file extension at ".
36
- __FILE__." line ".__LINE__);
37
- }
38
-
39
- my $textFilePath =
40
- FileConverter::Utils::changeExtension($filePath, "txt");
41
- my @commandArgs = ("java", "-jar", $PDFBoxLoc,
42
- "ExtractText", "-encoding",
43
- "utf8", $filePath, $textFilePath);
44
-
45
- system(@commandArgs);
46
-
47
- if ($? == -1) {
48
- return (0, "Failed to execute PDFBox: $!");
49
- } elsif ($? & 127) {
50
- return (0, "Java died with signal ".($? & 127));
51
- }
52
-
53
- my $code = $?>>8;
54
- if ($code == 0) {
55
- push @$rTrace, "PDFBox";
56
-
57
- my $sha1 = FileConverter::CheckSum->new();
58
- $sha1->digest($filePath);
59
- push @$rCheckSums, $sha1;
60
-
61
- return ($status, $msg, $textFilePath, $rTrace, $rCheckSums);
62
- } else {
63
- return (0, "Error executing PDFBox (code $code): $!");
64
- }
65
-
66
- } # extractText
67
-
68
-
69
- 1;
@@ -1,69 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::PSConverter;
14
- #
15
- # Wrapper to execute the ps2pdf command-line tool for converting
16
- # ps to PDF files.
17
- #
18
- # Juan Pablo Fernandez Ramirez, 10/08/07
19
- #
20
- use strict;
21
- use FileConverter::Config;
22
- use FileConverter::Utils;
23
-
24
- my $timeout = 20;
25
-
26
- ##
27
- # Execute the converter utility.
28
- ##
29
- sub convertFile {
30
- my ($filePath, $rTrace, $rCheckSums) = @_;
31
- my ($status, $msg) = (1, "");
32
-
33
- my $pdfFilePath = FileConverter::Utils::changeExtension($filePath, "pdf");
34
- my @commandArgs = ("ps2pdf13", $filePath, $pdfFilePath);
35
- my $child;
36
- eval {
37
- local $SIG{'ALRM'} = sub { die "alarm\n" };
38
- alarm $timeout;
39
- $child = system(@commandArgs);
40
- alarm 0;
41
- };
42
-
43
- if ($@) {
44
- if ($@ eq "alarm\n") {
45
- if (defined $child) { kill 9, $child; }
46
- return (0, "ps2pdf timeout");
47
- }
48
- }
49
-
50
- if ($? == -1) {
51
- return (0, "Failed to execute ps2pdf: $!");
52
- } elsif ($? & 127) {
53
- return (0, "ps2pdf died with signal ".($? & 127));
54
- }
55
-
56
- my $code = $?>>8;
57
- if ($code == 0) {
58
- push @$rTrace, "ps2pdf";
59
-
60
- my $sha1 = FileConverter::CheckSum->new();
61
- $sha1->digest($filePath);
62
- push @$rCheckSums, $sha1;
63
-
64
- return ($status, $msg, $pdfFilePath, $rTrace, $rCheckSums);
65
- } else {
66
- return (0, "Error executing ps2pdf (code $code): $!");
67
- }
68
- } # convertFile
69
- 1;
@@ -1,88 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::PSToText;
14
- #
15
- # Wrapper to execute the ps2ascii command-line tool for converting
16
- # ps to text files.
17
- #
18
- # Isaac, 10/08/07
19
- #
20
- use strict;
21
- use FileConverter::Config;
22
- use FileConverter::Utils;
23
- use Encode;
24
-
25
- my $timeout = 20;
26
-
27
- ##
28
- # Execute the converter utility.
29
- ##
30
- sub extractText {
31
- my ($filePath, $rTrace, $rCheckSums) = @_;
32
- my ($status, $msg) = (1, "");
33
-
34
- my $txtFilePath = FileConverter::Utils::changeExtension($filePath, "txt");
35
-
36
- my @commandArgs = ("ps2ascii", $filePath, $txtFilePath);
37
- my $child;
38
- eval {
39
- local $SIG{'ALRM'} = sub { die "alarm\n" };
40
- alarm $timeout;
41
- $child = system(@commandArgs);
42
- alarm 0;
43
- };
44
-
45
- if ($@) {
46
- if ($@ eq "alarm\n") {
47
- if (defined $child) { kill 9, $child; }
48
- return (0, "ps2ascii timeout");
49
- }
50
- }
51
-
52
- if ($? == -1) {
53
- return (0, "Failed to execute ps2ascii: $!");
54
- } elsif ($? & 127) {
55
- return (0, "ps2ascii died with signal ".($? & 127));
56
- }
57
-
58
- my $code = $?>>8;
59
- if ($code == 0) {
60
- push @$rTrace, "ps2ascii";
61
- ascii2utf8($txtFilePath);
62
-
63
- my $sha1 = FileConverter::CheckSum->new();
64
- $sha1->digest($filePath);
65
- push @$rCheckSums, $sha1;
66
-
67
- return ($status, $msg, $txtFilePath, $rTrace, $rCheckSums);
68
- } else {
69
- return (0, "Error executing ps2ascii (code $code): $!");
70
- }
71
- } # convertFile
72
-
73
- sub ascii2utf8 {
74
- my $fn = shift;
75
-
76
- open(IN, "<$fn") or die $!;
77
- my $text;
78
- {
79
- local $/ = undef;
80
- $text = <IN>;
81
- }
82
- close IN;
83
- $text = Encode::decode_utf8($text);
84
- open(OUT, ">:utf8", $fn) or die $!;
85
- print OUT $text;
86
- close OUT;
87
- }
88
- 1;
@@ -1,68 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::Prescript;
14
- #
15
- # Wrapper to execute the Prescript command-line tool for extracting
16
- # text from PS files.
17
- #
18
- # Juan Pablo Fernandez R., 10/31/07
19
- #
20
- use strict;
21
- use FileConverter::Config;
22
- use FileConverter::Utils;
23
- use FileConverter::CheckSum;
24
-
25
- my $PrescriptPath = $FileConverter::Config::PrescriptPath;
26
-
27
- ##
28
- # Execute the Prescript utility.
29
- ##
30
- sub extractText {
31
- my ($filePath, $rTrace, $rCheckSums) = @_;
32
- my ($status, $msg) = (1, "");
33
-
34
- if (FileConverter::Utils::checkExtension($filePath, "ps") <= 0) {
35
- return (0, "Unexpected file extension at ". __FILE__." line ".__LINE__);
36
- }
37
-
38
- my $textFilePath = FileConverter::Utils::changeExtension($filePath, "txt");
39
- my @commandArgs = ($PrescriptPath, "plain", $filePath, $textFilePath);
40
-
41
- system(@commandArgs);
42
-
43
- if ($? == -1) {
44
- return (0, "Failed to execute Prescript: $!");
45
- } elsif ($? & 127) {
46
- return (0, "Prescript died with signal ".($? & 127));
47
- }
48
-
49
- my $code = $?>>8;
50
- if (($code == 0) || ($code == 1)) {
51
- if ($code == 1) {
52
- print STDERR "Prescript completed with errors: $filePath\n";
53
- }
54
-
55
- push @$rTrace, "PSLIB Prescript";
56
-
57
- my $sha1 = new FileConverter::CheckSum();
58
- $sha1->digest($filePath);
59
- push @$rCheckSums, $sha1;
60
-
61
- return ($status, $msg, $textFilePath, $rTrace, $rCheckSums);
62
-
63
- } else {
64
- return (0, "Error executing Prescript (code $code): $!");
65
- }
66
- } # extractText
67
-
68
- 1;
@@ -1,75 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::TET;
14
- #
15
- # Wrapper to execute the TET command-line tool for extracting
16
- # text from PDF files.
17
- #
18
- # Isaac Councill, 09/06/07
19
- #
20
- use strict;
21
- use FileConverter::Config;
22
- use FileConverter::Utils;
23
- use FileConverter::CheckSum;
24
-
25
- my $TETPath = $FileConverter::Config::TETPath;
26
- my $TETLicensePath = $FileConverter::Config::TETLicensePath;
27
-
28
- $ENV{'PDFLIBLICENSEFILE'} = $TETLicensePath;
29
-
30
- ##
31
- # Execute the TET utility.
32
- ##
33
- sub extractText {
34
- my ($filePath, $rTrace, $rCheckSums) = @_;
35
- my ($status, $msg) = (1, "");
36
-
37
- if (FileConverter::Utils::checkExtension($filePath, "pdf") <= 0) {
38
- return (0, "Unexpected file extension at ".
39
- __FILE__." line ".__LINE__);
40
- }
41
-
42
- my $textFilePath =
43
- FileConverter::Utils::changeExtension($filePath, "txt");
44
- my @commandArgs = ($TETPath, "-o", $textFilePath, $filePath);
45
-
46
- system(@commandArgs);
47
-
48
- if ($? == -1) {
49
- return (0, "Failed to execute TET: $!");
50
- } elsif ($? & 127) {
51
- return (0, "TET died with signal ".($? & 127));
52
- }
53
-
54
- my $code = $?>>8;
55
- if (($code == 0) || ($code == 1)) {
56
- if ($code == 1) {
57
- print STDERR "TET completed with errors: $filePath\n";
58
- }
59
-
60
- push @$rTrace, "PDFLib TET";
61
-
62
- my $sha1 = new FileConverter::CheckSum();
63
- $sha1->digest($filePath);
64
- push @$rCheckSums, $sha1;
65
-
66
- return ($status, $msg, $textFilePath, $rTrace, $rCheckSums);
67
-
68
- } else {
69
- return (0, "Error executing TET (code $code): $!");
70
- }
71
-
72
- } # extractText
73
-
74
-
75
- 1;
@@ -1,130 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package FileConverter::Utils;
14
- #
15
- # Container for subroutines that may be shared across multiple
16
- # FileConverter modules.
17
- #
18
- # Isaac Councill, 09/06/07
19
- #
20
- use strict;
21
- use Encode;
22
-
23
- ##
24
- # Returns the file extension of a file name, if there is one.
25
- ##
26
- sub getExtension {
27
- my ($fn) = @_;
28
- if ($fn =~ m/^.*\.(.*)$/) {
29
- return $1;
30
- }
31
- return undef;
32
-
33
- } # getExtension
34
-
35
-
36
- ##
37
- # Strips off the last extension of the file name.
38
- ##
39
- sub stripExtension {
40
- my ($fn) = @_;
41
- $fn =~ s/^(.*)\..*$/$1/;
42
- return $fn;
43
-
44
- } # stripExtension
45
-
46
-
47
- ##
48
- ##
49
- # Routine for checking that a filename ends with an expected
50
- # extension. Returns 1 if it does, 0 if not.
51
- ##
52
- sub checkExtension {
53
- my ($fn, $ext) = @_;
54
- if ($fn =~ m/^.*\.(.*)$/) {
55
- if ($1 =~ m/$ext/i) {
56
- return 1;
57
- }
58
- }
59
- return 0;
60
-
61
- } # checkExtension
62
-
63
-
64
- ##
65
- # Simple routine for changing the extension of a file.
66
- # Example: $newFileName = changeExtension($oldFileName, "txt");
67
- ##
68
- sub changeExtension {
69
- my ($fn, $ext) = @_;
70
- unless ($fn =~ s/^(.*)\..*$/$1\.$ext/) {
71
- $fn .= ".$ext";
72
- }
73
- return $fn;
74
-
75
- } # changeExtension
76
-
77
-
78
- ##
79
- # Returns the directory part of a file path.
80
- ##
81
- sub getDirectory {
82
- my ($filePath) = @_;
83
- if ($filePath =~ m/^(.*)\/.*$/) {
84
- return $1;
85
- } else {
86
- return $filePath;
87
- }
88
-
89
- } # getDirectory
90
-
91
- ##
92
- ##
93
- # Routine for checking if a process is running or not
94
- # Returns 1 if it is runnig, 0 if not.
95
- ##
96
- sub checkProcess {
97
- my ($process) = @_;
98
- my $cmd = "ps -ef | grep " . $process . " | grep -v grep";
99
- my $result = `$cmd`;
100
- if ($result eq '') {
101
- return 0;
102
- }
103
- else {
104
- return 1;
105
- }
106
- } # checkProcess
107
-
108
-
109
- ##
110
- # Convert an file of the specified encoding to UTF-8
111
- ##
112
- sub convertToUTF8 {
113
- my ($fn, $encoding) = @_;
114
- my $octets;
115
- open (FILE, "<$fn") or die "could not open file $fn: $!";
116
- binmode FILE, ":bytes";
117
- {
118
- local $/ = undef;
119
- $octets = <FILE>;
120
- }
121
- close FILE;
122
-
123
- Encode::from_to($octets, $encoding, "utf8");
124
- open (FILE, ">:utf8", "$fn") or die "could not open file $fn: $!";
125
- print FILE Encode::decode_utf8($octets);
126
- close FILE;
127
-
128
- }
129
-
130
- 1;
@@ -1,140 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package CSXUtil::SafeText;
14
- ##
15
- ## Methods for stripping bad (XML unsafe) characters
16
- ## from strings and performing basic HTML entity
17
- ## translations. Also contains a utility (stripArtifacts)
18
- ## for getting rid of crazy control characters and
19
- ## other things that probably aren't proper text.
20
- ##
21
- ## Isaac Councill, 12/06/06
22
- ##
23
- #######################################################
24
- ##
25
- use strict;
26
- use utf8;
27
- require Exporter;
28
-
29
- our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
30
-
31
- $VERSION = 1.00;
32
-
33
- @ISA = qw(Exporter);
34
- @EXPORT_OK = qw(@badChars %htmlSpecialChars
35
- %htmlCharEntities &stripBadChars
36
- &encodeHTMLSpecialChars
37
- &decodeHTMLSpecialChars
38
- &cleanXML &cleanAll &stripArtifacts);
39
-
40
-
41
- ##
42
- #######################################################
43
- ##
44
- ## Sharable encoding data.
45
- ##
46
-
47
- ## Hex codes for characters that should never be put into
48
- ## XML - or else parsers will barf.
49
- our @badChars = qw(\x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07
50
- \x08 \x0B \x0C \x0E \x0F \x10 \x11 \x12
51
- \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A
52
- \x1B \x1C \x1D \x1E \x1F \x7F);
53
-
54
- ## Subset of HTML characters that could be problematic
55
- ## for XML. This is not a complete list of HTML
56
- ## special characters, but more mappings can be added
57
- ## as needed.
58
- our %htmlSpecialCharEncodings = ("&" => "&amp;",
59
- ">" => "&gt;",
60
- "<" => "&lt;",
61
- "\"" => "&quot;"
62
- );
63
-
64
- ## The reverse map.
65
- our %htmlSpecialCharDecodings;
66
- foreach my $key (keys %htmlSpecialCharEncodings) {
67
- my $val = $htmlSpecialCharEncodings{$key};
68
- $htmlSpecialCharDecodings{$val} = $key;
69
- }
70
-
71
-
72
- ##
73
- #######################################################
74
- ##
75
- ## Subroutines
76
- ##
77
-
78
- ## Delete all occurences of bad characters in text,
79
- ## returns a new string that is clean.
80
- sub stripBadChars {
81
- my $rtext = shift;
82
- foreach my $char (@badChars) {
83
- $$rtext =~ s/$char//g;
84
- }
85
- }
86
-
87
-
88
- ## Encodes special characters into HTML equivalents
89
- ## and returns the encoded string.
90
- sub encodeHTMLSpecialChars {
91
- my $rtext = shift;
92
- foreach my $char (keys %htmlSpecialCharEncodings) {
93
- my $code = $htmlSpecialCharEncodings{$char};
94
- $$rtext =~ s/$char/$code/g;
95
- }
96
- }
97
-
98
-
99
- ## Decodes a HTML entities in the supplied string
100
- ## into non-HTML character equivalents and returns
101
- ## the decoded string.
102
- sub decodeHTMLSpecialChars {
103
- my $rtext = shift;
104
- foreach my $code (keys %htmlSpecialCharDecodings) {
105
- my $char = $htmlSpecialCharDecodings{$code};
106
- $$rtext =~ s/$code/$char/g;
107
- }
108
- }
109
-
110
-
111
- ## Strip out any characters that don't look like they
112
- ## belong in a proper, readable text string.
113
- ##
114
- sub stripArtifacts {
115
- my $rtext = shift;
116
- $$rtext =~ s/[^\p{IsAlnum}\p{IsPunct}\p{IsSpace}\p{IsS}]//g;
117
- }
118
-
119
-
120
- ## Convenience routine for executing both XML safety
121
- ## routines in a single call.
122
- ##
123
- sub cleanXML {
124
- my $rtext = shift;
125
- stripBadChars($rtext);
126
- encodeHTMLSpecialChars($rtext);
127
- }
128
-
129
-
130
- ## Clean for XML and also strip out strange characters.
131
- ##
132
- sub cleanAll {
133
- my $rtext = shift;
134
- stripBadChars($rtext);
135
- stripArtifacts($rtext);
136
- encodeHTMLSpecialChars($rtext);
137
- }
138
-
139
-
140
- 1;