biblicit 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/Gemfile +6 -0
- data/LICENSE.TXT +176 -0
- data/README.md +120 -0
- data/Rakefile +8 -0
- data/biblicit.gemspec +33 -0
- data/lib/biblicit/cb2bib.rb +83 -0
- data/lib/biblicit/citeseer.rb +53 -0
- data/lib/biblicit/extractor.rb +37 -0
- data/lib/biblicit.rb +6 -0
- data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
- data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
- data/perl/FileConversionService/README.TXT +11 -0
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
- data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
- data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
- data/perl/HeaderParseService/README.TXT +80 -0
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
- data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
- data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
- data/perl/HeaderParseService/resources/database/50states +60 -0
- data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
- data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
- data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
- data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
- data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
- data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
- data/perl/HeaderParseService/resources/database/README +2 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
- data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
- data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
- data/perl/HeaderParseService/resources/database/addr.txt +28 -0
- data/perl/HeaderParseService/resources/database/affi.txt +34 -0
- data/perl/HeaderParseService/resources/database/affis.bin +0 -0
- data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
- data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
- data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
- data/perl/HeaderParseService/resources/database/city.txt +3150 -0
- data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
- data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
- data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
- data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
- data/perl/HeaderParseService/resources/database/degree.txt +67 -0
- data/perl/HeaderParseService/resources/database/email.txt +3 -0
- data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
- data/perl/HeaderParseService/resources/database/female-names +4960 -0
- data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
- data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/intro.txt +2 -0
- data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
- data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
- data/perl/HeaderParseService/resources/database/male-names +3906 -0
- data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
- data/perl/HeaderParseService/resources/database/month.txt +35 -0
- data/perl/HeaderParseService/resources/database/mul +868 -0
- data/perl/HeaderParseService/resources/database/mul.label +869 -0
- data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
- data/perl/HeaderParseService/resources/database/mul.processed +762 -0
- data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
- data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
- data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
- data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
- data/perl/HeaderParseService/resources/database/note.txt +121 -0
- data/perl/HeaderParseService/resources/database/page.txt +1 -0
- data/perl/HeaderParseService/resources/database/phone.txt +9 -0
- data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
- data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
- data/perl/HeaderParseService/resources/database/statename.bin +0 -0
- data/perl/HeaderParseService/resources/database/statename.txt +73 -0
- data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
- data/perl/HeaderParseService/resources/database/stopwords +438 -0
- data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
- data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
- data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
- data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
- data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
- data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
- data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
- data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
- data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
- data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
- data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
- data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
- data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
- data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
- data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
- data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
- data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
- data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
- data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
- data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
- data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
- data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
- data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
- data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
- data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
- data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
- data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
- data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
- data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
- data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
- data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
- data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
- data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
- data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
- data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
- data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
- data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
- data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
- data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
- data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
- data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
- data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
- data/perl/HeaderParseService/resources/database/url.txt +1 -0
- data/perl/HeaderParseService/resources/database/webTopWords +225 -0
- data/perl/HeaderParseService/resources/database/words +45402 -0
- data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
- data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
- data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
- data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
- data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
- data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
- data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
- data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
- data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
- data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
- data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
- data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
- data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
- data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
- data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
- data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
- data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
- data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
- data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
- data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
- data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
- data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
- data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
- data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
- data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
- data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
- data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
- data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
- data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
- data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
- data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
- data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
- data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
- data/perl/ParsCit/README.TXT +82 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
- data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
- data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
- data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
- data/perl/extract.pl +199 -0
- data/spec/biblicit/cb2bib_spec.rb +48 -0
- data/spec/biblicit/citeseer_spec.rb +40 -0
- data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
- data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
- data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
- data/spec/spec_helper.rb +3 -0
- metadata +474 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package DocFilter::Config;
##
## Static settings for the DocFilter module.  Every setting is a package
## global, so other code reads it by its fully qualified name, e.g.
## $DocFilter::Config::serverPort.
##
use strict;
use warnings;

## Global

our $algorithmName = "BasicDocFilter";
our $algorithmVersion = "1.0";

## Repository Mappings

# Maps a repository identifier to a filesystem path.
# NOTE(review): the 'example1'/'example2' entries look like placeholders
# to be replaced per installation -- confirm.
our %repositories = ('example1' => '/',
                     'example2' => '/home',
                     );

## WS Settings

our $serverURL = '127.0.0.1';
our $serverPort = 10666;
our $URI = 'http://citeseerx.org/algorithms/docfilter/wsdl';

1;
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package DocFilter::Filter;
|
|
14
|
+
##
|
|
15
|
+
## Isaac Councill, 7/31/07
|
|
16
|
+
##
|
|
17
|
+
use strict;
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
##
## Run the document filters over the text file at $filePath.
##
## Returns a three-element list:
##   (0, 0, "Could not open file <path>: <error>")  the file could not be opened
##   (1, 0, "No reference section is present")      hasReferences reported <= 0
##   (1, 1, "All filters passed")                   otherwise
##
sub filter {
    my ($filePath) = @_;

    # Three-argument open with a lexical handle: the path is never parsed
    # for mode characters, and no package-wide handle is left behind.
    open(my $in, '<', $filePath)
        or return (0, 0, "Could not open file $filePath: $!");

    # Read the whole file at once; the record-separator override is
    # confined to the do block.
    my $text = do { local $/ = undef; <$in> };
    close($in);

    if (hasReferences(\$text) <= 0) {
        return (1, 0, "No reference section is present");
    }
    return (1, 1, "All filters passed");

} # filter
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
##
## Report whether the text behind the scalar reference $rText contains a
## reference-section heading: one of the listed heading words at a word
## boundary, optionally followed by a colon, then optional whitespace and
## a newline.
##
## Returns 1 when such a heading is found, 0 otherwise.  The text is not
## modified.
##
sub hasReferences {
    my $rText = shift;

    # No /g: in scalar context /g records pos() on the caller's string, so
    # a second call on the same text would resume after the previous match
    # and could return 0 for text that does contain a heading.
    if ($$rText =~ /\b(REFERENCES?|References?|BIBLIOGRAPHY|Bibliography|REFERENCES AND NOTES|References and Notes)\:?\s*\n/) {
        return 1;
    }
    return 0;

} # hasReferences
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
1;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
FileConverter README
|
|
2
|
+
Isaac Councill
|
|
3
|
+
|
|
4
|
+
Check the FileConverter::Config module to make sure the settings
|
|
5
|
+
are correct. You will probably need to obtain the PDFBox jar file
|
|
6
|
+
and reference it in Config. It's better to use TET, but TET is
|
|
7
|
+
expensive.
|
|
8
|
+
|
|
9
|
+
If using PDFBox, make sure java is on your path.
|
|
10
|
+
|
|
11
|
+
I know this is sparse, more to come...
|
|
Binary file
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package CSXUtil::SafeText;
|
|
14
|
+
##
|
|
15
|
+
## Methods for stripping bad (XML unsafe) characters
|
|
16
|
+
## from strings and performing basic HTML entity
|
|
17
|
+
## translations. Also contains a utility (stripArtifacts)
|
|
18
|
+
## for getting rid of crazy control characters and
|
|
19
|
+
## other things that probably aren't proper text.
|
|
20
|
+
##
|
|
21
|
+
## Isaac Councill, 12/06/06
|
|
22
|
+
##
|
|
23
|
+
#######################################################
|
|
24
|
+
##
|
|
25
|
+
use strict;
|
|
26
|
+
use utf8;
|
|
27
|
+
require Exporter;
|
|
28
|
+
|
|
29
|
+
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

$VERSION = 1.00;

@ISA = qw(Exporter);

# @EXPORT is left empty, so nothing is exported by default; callers must
# name what they import.
#
# %htmlSpecialChars and %htmlCharEntities are not defined anywhere in this
# module.  They stay in the list only so that existing import lists keep
# loading.  The tables the module actually defines are
# %htmlSpecialCharEncodings and %htmlSpecialCharDecodings.
@EXPORT_OK = qw(@badChars %htmlSpecialChars
                %htmlCharEntities
                %htmlSpecialCharEncodings
                %htmlSpecialCharDecodings
                &stripBadChars
                &encodeHTMLSpecialChars
                &decodeHTMLSpecialChars
                &cleanXML &cleanAll &stripArtifacts);
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
##
|
|
42
|
+
#######################################################
|
|
43
|
+
##
|
|
44
|
+
## Sharable encoding data.
|
|
45
|
+
##
|
|
46
|
+
|
|
47
|
+
## Characters that should never be put into XML - or else parsers will
## barf.  Each entry is the escape text itself (backslash, "x", two hex
## digits), not the character it names; stripBadChars interpolates the
## entries into a pattern, where the escape is interpreted.
## Covered codes: 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F.  Tab (0x09),
## line feed (0x0A) and carriage return (0x0D) are not in the list.
our @badChars = map { sprintf('\x%02X', $_) }
    (0x00 .. 0x08, 0x0B, 0x0C, 0x0E .. 0x1F, 0x7F);
|
|
53
|
+
|
|
54
|
+
## Subset of HTML characters that could be problematic
## for XML.  This is not a complete list of HTML
## special characters, but more mappings can be added
## as needed.
## Each value must be the entity text (ampersand, name, semicolon), not
## the literal character it stands for.
our %htmlSpecialCharEncodings = ("&" => "&amp;",
                                 ">" => "&gt;",
                                 "<" => "&lt;",
                                 "\"" => "&quot;"
                                 );

## The reverse map: entity text => literal character.
our %htmlSpecialCharDecodings = reverse %htmlSpecialCharEncodings;
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
##
|
|
73
|
+
#######################################################
|
|
74
|
+
##
|
|
75
|
+
## Subroutines
|
|
76
|
+
##
|
|
77
|
+
|
|
78
|
+
## Delete every occurrence of the @badChars entries from the string that
## the scalar reference argument points to.  The string is edited in
## place through the reference; no new string is built and the sub's
## return value carries no meaning.
sub stripBadChars {
    my ($textRef) = @_;
    # Each entry is interpolated into the pattern as regex source, one
    # substitution pass per entry.
    ${$textRef} =~ s/$_//g for @badChars;
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
## Replace every occurrence of a key of %htmlSpecialCharEncodings with its
## mapped value, editing the string behind the scalar reference $rtext in
## place.  Keys are matched as literal text.  Returns nothing useful.
##
## All keys are handled in one pass.  Substituting one key at a time in
## hash order (which Perl does not fix) lets a later pass re-encode the
## "&" that an earlier pass produced, so "<" could end up as the entity
## for "&" followed by "lt;".
sub encodeHTMLSpecialChars {
    my $rtext = shift;

    my @chars = keys %htmlSpecialCharEncodings;
    return unless @chars;    # empty table: nothing to encode

    # Longest key first, so a short key can never shadow a longer key it
    # is a prefix of; ties are ordered by name to keep the pattern stable.
    my $pattern = join '|',
                  map  { quotemeta($_) }
                  sort { length($b) <=> length($a) or $a cmp $b } @chars;

    $$rtext =~ s/($pattern)/$htmlSpecialCharEncodings{$1}/g;
    return;
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
## Replace every occurrence of a key of %htmlSpecialCharDecodings (an
## entity) with its mapped character, editing the string behind the scalar
## reference $rtext in place.  Keys are matched as literal text.  Returns
## nothing useful.
##
## All entities are handled in one pass.  Substituting one entity at a
## time in hash order (which Perl does not fix) lets a later pass decode
## text that an earlier pass produced, so an encoded ampersand followed by
## "lt;" could be decoded twice and come out as "<".
sub decodeHTMLSpecialChars {
    my $rtext = shift;

    my @codes = keys %htmlSpecialCharDecodings;
    return unless @codes;    # empty table: nothing to decode

    # Longest entity first, so a short entity can never shadow a longer
    # one it is a prefix of; ties are ordered by name to keep the pattern
    # stable.
    my $pattern = join '|',
                  map  { quotemeta($_) }
                  sort { length($b) <=> length($a) or $a cmp $b } @codes;

    $$rtext =~ s/($pattern)/$htmlSpecialCharDecodings{$1}/g;
    return;
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
## Remove, in place, every character of the string behind the scalar
## reference argument that is not alphanumeric, punctuation, whitespace
## or a symbol (the IsAlnum, IsPunct, IsSpace and IsS properties).
## Returns the result of the substitution operator.
##
sub stripArtifacts {
    my ($textRef) = @_;
    return ${$textRef} =~ s/[^\p{IsAlnum}\p{IsPunct}\p{IsSpace}\p{IsS}]//g;
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
## Apply both XML safety steps to the string behind the scalar reference
## argument: stripBadChars first, then encodeHTMLSpecialChars.  The
## string is changed in place; the value returned is whatever
## encodeHTMLSpecialChars returns.
##
sub cleanXML {
    my ($textRef) = @_;
    stripBadChars($textRef);
    return encodeHTMLSpecialChars($textRef);
}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
## Full clean-up of the string behind the scalar reference argument, in
## this order: stripBadChars, stripArtifacts, encodeHTMLSpecialChars.
## The string is changed in place; the value returned is whatever
## encodeHTMLSpecialChars returns.
##
sub cleanAll {
    my ($textRef) = @_;
    stripBadChars($textRef);
    stripArtifacts($textRef);
    return encodeHTMLSpecialChars($textRef);
}
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
1;
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package FileConverter::CheckSum;
|
|
14
|
+
#
|
|
15
|
+
# Container for checksum info and utilities for individual files.
|
|
16
|
+
#
|
|
17
|
+
# Isaac Councill
|
|
18
|
+
#
|
|
19
|
+
use strict;
|
|
20
|
+
use Digest::SHA1;
|
|
21
|
+
use FileConverter::Utils;
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Constructor.  Returns a new object blessed into the given class with
# both the file type and the SHA1 value unset.
sub new {
    my $class = shift;
    return bless {
        '_fileType' => undef,
        '_sha1'     => undef,
    }, $class;

} # new
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Accessor: returns the stored file type (undef until one is set).
sub getFileType {
    my ($self) = @_;
    my $fileType = $self->{'_fileType'};
    return $fileType;
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Mutator: stores the given file type on the object.
sub setFileType {
    my $self     = shift;
    my $fileType = shift;
    $self->{'_fileType'} = $fileType;
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Accessor: returns the stored SHA1 value (undef until one is set).
sub getSHA1 {
    my ($self) = @_;
    my $sha1 = $self->{'_sha1'};
    return $sha1;
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Mutator: stores the given SHA1 value on the object.
sub setSHA1 {
    my $self = shift;
    my $sha1 = shift;
    $self->{'_sha1'} = $sha1;
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Computes the SHA1 hex digest of the file at $filePath and records it,
# together with the file's extension, on this object.
#
# Parameters:
#   $filePath - path of the file to read.
#
# Dies if the file cannot be opened for reading.
sub digest {
    my ($self, $filePath) = @_;

    # Three-argument open with a lexical handle: the path is never
    # interpreted as an open mode and no global FILE handle is touched.
    open(my $fh, '<', $filePath)
        or die("Could not open for reading: $filePath ($!)");

    # Hash the raw bytes, without any newline translation.
    binmode($fh);

    my $digester = Digest::SHA1->new;
    $digester->addfile($fh);
    my $sha1 = $digester->hexdigest;
    close($fh);

    my $ext = FileConverter::Utils::getExtension($filePath);

    $self->setFileType($ext);
    $self->setSHA1($sha1);

} # digest
|
|
75
|
+
|
|
76
|
+
1;
|
|
77
|
+
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package FileConverter::Compression;
|
|
14
|
+
#
|
|
15
|
+
# Utilities for handling various compression formats.
|
|
16
|
+
#
|
|
17
|
+
# Isaac Councill, 09/06/07
|
|
18
|
+
#
|
|
19
|
+
use strict;
|
|
20
|
+
use FileConverter::Config;
|
|
21
|
+
use FileConverter::Utils;
|
|
22
|
+
|
|
23
|
+
# File extensions of the compression formats handled by this module.
# Keys should all be lower case (canDecompress lower-cases the
# extension before the lookup).
my %supportedCompressionExt = map { $_ => 1 } qw(gz zip z);

# External decompression programs, as configured in
# FileConverter::Config.
my ($gunzip, $uncompress, $unzip) = (
    $FileConverter::Config::gunzip,
    $FileConverter::Config::uncompress,
    $FileConverter::Config::unzip,
);
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Decompresses $fn with the routine that matches its extension
# (gz -> gunzip, z -> uncompress, zip -> unzip; case-insensitive).
#
# Parameters:
#   $fn     - path of the compressed file.
#   $rTrace - array reference passed through to the chosen routine.
#
# Returns whatever the chosen routine returns, or
# (0, error message) when the extension is not one of the above.
sub decompress {
    my ($fn, $rTrace) = @_;
    my $ext = FileConverter::Utils::getExtension($fn);

    return gunzip($fn, $rTrace)     if $ext =~ m/^gz$/i;
    return uncompress($fn, $rTrace) if $ext =~ m/^z$/i;
    return unzip($fn, $rTrace)      if $ext =~ m/^zip$/i;

    return (0, "Unsupported compression extension: $ext");

} # decompress
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Returns 1 when the (lower-cased) extension of $fn is listed in
# %supportedCompressionExt, 0 otherwise.
sub canDecompress {
    my $fn  = shift;
    my $key = lc(FileConverter::Utils::getExtension($fn));
    return defined($supportedCompressionExt{$key}) ? 1 : 0;

} # canDecompress
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Runs the configured gunzip program with "-f" on $fn.
#
# Parameters:
#   $fn     - path of the .gz file.
#   $rTrace - array reference; "gunzip" is appended on success.
#
# Returns (0, error message) when the program could not be started,
# was killed by a signal, or exited with code 1; otherwise
# (1, "", file name with its extension stripped, $rTrace).
sub gunzip {
    my ($fn, $rTrace) = @_;

    # List form: the arguments are passed without going through a shell.
    system($gunzip, "-f", $fn);
    my $waitStatus = $?;

    if ($waitStatus == -1) {
        return (0, "Failed to execute gunzip: $!");
    }

    my $signal = $waitStatus & 127;
    if ($signal) {
        return (0, "gunzip died with signal ".$signal);
    }

    my $code = $waitStatus >> 8;
    return (0, "Error executing gunzip (code $code): $!") if $code == 1;

    push @{$rTrace}, "gunzip";

    return (1, "", FileConverter::Utils::stripExtension($fn), $rTrace);

} # gunzip
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Runs the configured uncompress program with "-f" on $fn.
#
# Parameters:
#   $fn     - path of the .Z file.
#   $rTrace - array reference; "uncompress" is appended on success.
#
# Returns (0, error message) when the program could not be started,
# was killed by a signal, or exited with code 1; otherwise
# (1, "", file name with its extension stripped, $rTrace).
sub uncompress {
    my ($fn, $rTrace) = @_;

    # List form: the arguments are passed without going through a shell.
    system($uncompress, "-f", $fn);
    my $waitStatus = $?;

    if ($waitStatus == -1) {
        return (0, "Failed to execute uncompress: $!");
    }

    my $signal = $waitStatus & 127;
    if ($signal) {
        return (0, "uncompress died with signal ".$signal);
    }

    my $code = $waitStatus >> 8;
    return (0, "Error executing uncompress (code $code): $!") if $code == 1;

    push @{$rTrace}, "uncompress";

    return (1, "", FileConverter::Utils::stripExtension($fn), $rTrace);

} # uncompress
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Runs the configured unzip program ("-qqo") on $fn, extracting into
# the directory that contains $fn.
#
# Parameters:
#   $fn     - path of the .zip file.
#   $rTrace - array reference; "unzip" is appended on success.
#
# Returns (0, error message) when the program could not be started,
# was killed by a signal, or exited with a code greater than 2;
# otherwise (1, "", file name with its extension stripped, $rTrace).
sub unzip {
    my ($fn, $rTrace) = @_;

    my $targetDir = FileConverter::Utils::getDirectory($fn);

    # List form: the arguments are passed without going through a shell.
    system($unzip, "-qqo", $fn, "-d", $targetDir);
    my $waitStatus = $?;

    if ($waitStatus == -1) {
        return (0, "Failed to execute unzip: $!");
    }

    my $signal = $waitStatus & 127;
    if ($signal) {
        return (0, "unzip died with signal ".$signal);
    }

    my $code = $waitStatus >> 8;
    return (0, "Error executing unzip (code $code): $!") if $code > 2;

    push @{$rTrace}, "unzip";

    return (1, "", FileConverter::Utils::stripExtension($fn), $rTrace);

} # unzip
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
1;
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package FileConverter::Config;
|
|
14
|
+
|
|
15
|
+
use FindBin;
|
|
16
|
+
|
|
17
|
+
# Every setting below is a package variable declared with "our", so it
# stays reachable as $FileConverter::Config::NAME while the file
# compiles cleanly under strict and warnings.
use strict;
use warnings;

## Conversion utilities

# PDF text extractor; valid options are TET or PDFBOX
our $PDFTOTEXT = "PDFBOX";

# How PostScript files are handled; valid options are TEXT or PDF
our $PSConversion = "TEXT";

# Location of the TET executable
our $TETPath = "$FindBin::Bin/../converters/TET-2.2-Linux/bin/tet";

# Location of the TET license key file
our $TETLicensePath =
    "$FindBin::Bin/../converters/TET-2.2-Linux/licensekeys.txt";

# Location of the PDFBox application jar
our $PDFBoxLocation = "$FindBin::Bin/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar";

# Location of the JODConverter command-line jar
our $JODConverterPath =
    "$FindBin::Bin/../converters/jodconverter-2.2.0/jodconverter-cli-2.2.0.jar";

# Location of the prescript executable
our $PrescriptPath = "/usr/local/bin/prescript";

## Compression utilities

our $gunzip = "/usr/bin/gunzip";
our $uncompress = "/usr/bin/uncompress";
our $unzip = "/usr/bin/unzip";


## Repository Mappings

our %repositories = ('example1' => '/',
                     'example2' => '/home',
                     );


## WS settings

our $serverURL = '127.0.0.1';
our $serverPort = 10888;
our $URI = 'http://citeseerx.org/fileConversion/wsdl';
|
|
56
|
+
|
|
57
|
+
1;
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package FileConverter::Controller;
|
|
14
|
+
#
|
|
15
|
+
# Main interface to FileConverter. This is where all calls
|
|
16
|
+
# should start, and where all conversion sequences should
|
|
17
|
+
# be managed.
|
|
18
|
+
#
|
|
19
|
+
# Isaac Councill, 09/06/07
|
|
20
|
+
#
|
|
21
|
+
use strict;
|
|
22
|
+
use FileConverter::Utils;
|
|
23
|
+
use FileConverter::Compression;
|
|
24
|
+
use FileConverter::TET;
|
|
25
|
+
use FileConverter::PDFBox;
|
|
26
|
+
use FileConverter::JODConverter;
|
|
27
|
+
use FileConverter::PSConverter;
|
|
28
|
+
use FileConverter::PSToText;
|
|
29
|
+
use FileConverter::Prescript;
|
|
30
|
+
use FileConverter::CheckSum;
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
##
# Convert the given file to text.  Compression layers (zip, gz, .Z)
# are removed first; what happens next depends on the extension of
# the resulting file:
#
#   ps    - handled according to $FileConverter::Config::PSConversion:
#           "TEXT" extracts text directly from the PostScript file
#           (a PDF conversion is also run, its result is not checked),
#           "PDF" converts to PDF first and then extracts text.
#   pdf   - text is extracted directly.
#   other - the file is converted to PDF first, then text is extracted.
#
# Returns the list produced by the text extractor on success, or
# (status <= 0, error message) on failure.
##
sub extractText {
    my ($fn) = @_;

    return (0, "File does not exist: $fn") unless -e $fn;

    my @trace;
    my @checkSums;

    # Peel off compression layers until the name no longer looks compressed.
    while (FileConverter::Compression::canDecompress($fn) > 0) {
        my ($ok, $err, $unpacked) =
            FileConverter::Compression::decompress($fn, \@trace);
        return ($ok, $err) if $ok <= 0;
        $fn = $unpacked;
    }

    my $extension = FileConverter::Utils::getExtension($fn);
    return (0, "File $fn has no extension") unless defined $extension;

    if ($extension =~ m/^ps$/i) {
        # Convert the PostScript file according to the configuration.
        my $psMode = $FileConverter::Config::PSConversion;

        if ($psMode eq "TEXT") {
            # Go from ps to text directly; the PDF conversion result is
            # deliberately not inspected here.
            _convert2pdf($fn, $extension, [], \@checkSums);
            return ps2text($fn, \@trace, \@checkSums);
        }

        if ($psMode eq "PDF") {
            # Convert to PDF first, then fall through to the PDF branch.
            my ($ok, $err, $pdf) =
                _convert2pdf($fn, $extension, \@trace, \@checkSums);
            return ($ok, $err) if $ok <= 0;
            $fn        = $pdf;
            $extension = FileConverter::Utils::getExtension($fn);
        }
    }
    elsif (!($extension =~ m/^pdf$/i || $extension =~ m/^ps$/i)) {
        # Anything else has to be converted to PDF first.
        my ($ok, $err, $pdf) =
            _convert2pdf($fn, $extension, \@trace, \@checkSums);
        return ($ok, $err) if $ok <= 0;
        $fn        = $pdf;
        $extension = FileConverter::Utils::getExtension($fn);
    }

    return pdf2text($fn, \@trace, \@checkSums) if $extension =~ m/^pdf$/i;

    return (0, "Unsupported file type: $extension");

} # extractText
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Produce a PDF from the given file.  Compression layers are removed
# first.  A file that already has a pdf extension is returned as is;
# ps and rtf files are converted.  In both cases a checksum of the
# resulting PDF is appended to the returned checksum list.
#
# Returns (status, message, pdf path, trace ref, checksum list ref)
# on success, or (status <= 0, error message) on failure.
sub convert2pdf {
    my ($fn) = @_;

    unless (-e $fn) {
        return (0, "File does not exist: $fn");
    }

    my @trace;
    my @checkSums;

    # Peel off compression layers until the name no longer looks compressed.
    while (FileConverter::Compression::canDecompress($fn) > 0) {
        my ($ok, $err, $unpacked) =
            FileConverter::Compression::decompress($fn, \@trace);
        return ($ok, $err) if $ok <= 0;
        $fn = $unpacked;
    }

    my $extension = FileConverter::Utils::getExtension($fn);
    return (0, "File $fn has no extension") if !defined $extension;

    # Already a PDF: only the checksum has to be recorded.
    if ($extension =~ m/^pdf$/i) {
        my $sum = FileConverter::CheckSum->new();
        $sum->digest($fn);
        push @checkSums, $sum;

        return (1, "", $fn, \@trace, \@checkSums);
    }

    unless ($extension =~ m/^ps$/i || $extension =~ m/^rtf$/i) {
        return (0, "Unsupported file type: $extension");
    }

    my ($status, $msg, $pdf, $rTrace, $rCheckSums) =
        _convert2pdf($fn, $extension, \@trace, \@checkSums);
    return ($status, $msg) if $status <= 0;

    # Record the checksum of the freshly converted PDF.
    my $sum = FileConverter::CheckSum->new();
    $sum->digest($pdf);
    push @{$rCheckSums}, $sum;

    return ($status, $msg, $pdf, $rTrace, $rCheckSums);

} # convert2pdf
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# Extract text from a PDF file with the extractor selected by
# $FileConverter::Config::PDFTOTEXT ("TET" or "PDFBOX").
#
# Parameters:
#   $fn         - path of the PDF file.
#   $rTrace     - array reference passed through to the extractor.
#   $rCheckSums - array reference passed through to the extractor.
#
# Returns whatever the selected extractor returns.  When the setting
# names no known extractor, returns (0, error message) rather than
# falling off the end of the sub with no status at all.
sub pdf2text {
    my ($fn, $rTrace, $rCheckSums) = @_;

    my $extractor = $FileConverter::Config::PDFTOTEXT;
    $extractor = "" unless defined $extractor;

    if ($extractor eq "TET") {
        return FileConverter::TET::extractText($fn, $rTrace, $rCheckSums);
    }
    if ($extractor eq "PDFBOX") {
        return FileConverter::PDFBox::extractText($fn, $rTrace, $rCheckSums);
    }

    return (0, "Unsupported PDFTOTEXT setting: '$extractor'");

} # pdf2text
|
|
170
|
+
|
|
171
|
+
# Dispatch a file to the converter that turns it into a PDF, chosen
# by extension (case-insensitive): rtf and doc go to JODConverter,
# ps and eps go to PSConverter.
#
# Parameters:
#   $fn         - path of the file to convert.
#   $extension  - extension used to pick the converter.
#   $rTrace     - array reference passed through to the converter.
#   $rCheckSums - array reference passed through to the converter.
#
# Returns whatever the chosen converter returns.  For any other
# extension returns (0, error message), so that callers testing the
# status also receive a message instead of an empty result.
sub _convert2pdf {
    my ($fn, $extension, $rTrace, $rCheckSums) = @_;

    if (($extension =~ m/^rtf$/i) || ($extension =~ m/^doc$/i)) {
        return FileConverter::JODConverter::convertFile($fn, $rTrace,
                                                        $rCheckSums);
    }
    if (($extension =~ m/^ps$/i) || ($extension =~ m/^eps$/i)) {
        return FileConverter::PSConverter::convertFile($fn, $rTrace,
                                                       $rCheckSums);
    }

    return (0, "Unsupported file type: $extension");

} # _convert2pdf
|
|
184
|
+
|
|
185
|
+
# Extract text directly from a PostScript file by delegating to
# FileConverter::PSToText::extractText with the same three arguments;
# returns whatever that routine returns.
sub ps2text {
    my $fn         = shift;
    my $rTrace     = shift;
    my $rCheckSums = shift;

    return FileConverter::PSToText::extractText($fn, $rTrace, $rCheckSums);

} # ps2text
|
|
190
|
+
|
|
191
|
+
1;
|