biblicit 1.0 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
data/.gitmodules
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,33 +3,130 @@ biblicit
|
|
3
3
|
|
4
4
|
Extract citations from PDFs.
|
5
5
|
|
6
|
-
|
6
|
+
Note: The version is 2.x, but really should be 0.2.x.
|
7
|
+
|
8
|
+
|
9
|
+
# Usage
|
7
10
|
|
8
11
|
```ruby
|
9
|
-
# Extract metadata from a file using
|
10
|
-
Biblicit.extract(
|
12
|
+
# Extract metadata from a file using default tools and settings
|
13
|
+
result = Biblicit::Extractor.extract(content: "a string containing the content of a PDF file")
|
14
|
+
|
15
|
+
# Extract metadata from a file using all available tools
|
16
|
+
result = Biblicit::Extractor.extract(file: "myfile.pdf", tools: [:citeseer, :parshed, :cb2bib], remote: true, token: false)
|
11
17
|
|
12
|
-
#
|
13
|
-
|
18
|
+
# See reference information for "myfile.pdf"
|
19
|
+
result[:citeseer][:title]
|
20
|
+
result[:parshed][:title]
|
21
|
+
result[:citeseer][:authors]
|
22
|
+
# etc
|
14
23
|
```
|
15
24
|
|
16
|
-
|
25
|
+
|
26
|
+
# Algorithms
|
17
27
|
|
18
28
|
### CiteSeer (default)
|
19
29
|
|
20
30
|
Wrapper around Perl code extracted from [CiteSeerX](http://citeseer.ist.psu.edu/).
|
21
31
|
|
22
|
-
Uses
|
32
|
+
Uses a model trained with the [svm-light](http://svmlight.joachims.org/) Support Vector Machine library.
|
33
|
+
|
34
|
+
### ParsCit (default)
|
35
|
+
|
36
|
+
Wrapper around Perl & Ruby code from [ParsCit](http://aye.comp.nus.edu.sg/parsCit/), which is included as a Git submodule.
|
37
|
+
|
38
|
+
Uses a model trained with the [CRF++](http://code.google.com/p/crfpp/) Conditional Random Fields library.
|
23
39
|
|
24
40
|
### cb2Bib
|
25
41
|
|
26
42
|
Wrapper around [cb2Bib](http://www.molspaces.com/cb2bib/) in command-line mode.
|
27
43
|
|
28
|
-
Uses
|
44
|
+
Uses an apparently less-sophisticated parsing algorithm than the others to parse metadata, but then, if :remote=true, scrapes one of a large number of journal or public repository websites for a structured version of the citation data. Warning: sometimes it finds the wrong work!
|
45
|
+
|
46
|
+
|
47
|
+
# Requirements
|
48
|
+
|
49
|
+
There are a lot, but you may not need all of them, depending on your use case.
|
50
|
+
|
51
|
+
|
52
|
+
## Required to support various input file formats
|
53
|
+
|
54
|
+
Different tools are used for different input file formats.
|
55
|
+
|
56
|
+
#### PDF - [Poppler](http://poppler.freedesktop.org/)
|
57
|
+
|
58
|
+
This provides `pdftotext`. You could install `xpdf` instead.
|
59
|
+
|
60
|
+
##### From source
|
61
|
+
|
62
|
+
Requires fontconfig.
|
63
|
+
|
64
|
+
wget http://poppler.freedesktop.org/poppler-0.22.1.tar.gz
|
65
|
+
tar -xzf poppler-0.22.1.tar.gz
|
66
|
+
cd poppler-0.22.1
|
67
|
+
./configure
|
68
|
+
make
|
69
|
+
sudo make install
|
70
|
+
|
71
|
+
##### On Debian/Ubuntu
|
72
|
+
|
73
|
+
sudo apt-get install poppler-utils
|
74
|
+
|
75
|
+
##### On OS X with Homebrew
|
76
|
+
|
77
|
+
brew install poppler
|
78
|
+
|
79
|
+
#### Postscript - [Ghostscript](http://www.ghostscript.com/)
|
80
|
+
|
81
|
+
This provides `ps2ascii`.
|
82
|
+
|
83
|
+
##### From source
|
84
|
+
|
85
|
+
wget http://downloads.ghostscript.com/public/ghostscript-9.06.tar.gz
|
86
|
+
tar -xzf ghostscript-9.06.tar.gz
|
87
|
+
cd ghostscript-9.06
|
88
|
+
make
|
89
|
+
sudo make install
|
90
|
+
|
91
|
+
##### On Debian/Ubuntu
|
92
|
+
|
93
|
+
sudo apt-get install ghostscript
|
94
|
+
|
95
|
+
##### On OS X with Homebrew
|
96
|
+
|
97
|
+
brew install ghostscript
|
98
|
+
|
99
|
+
#### Other (e.g. docx) - [AbiWord](http://www.abisource.com/)
|
100
|
+
|
101
|
+
This provides `abiword`.
|
102
|
+
|
103
|
+
##### On Debian/Ubuntu
|
104
|
+
|
105
|
+
sudo apt-get install abiword
|
106
|
+
|
107
|
+
##### On OS X
|
108
|
+
|
109
|
+
As of writing, you're out of luck, because AbiWord doesn't compile on recent versions of OS X. According to their website, however, this is being actively worked on.
|
110
|
+
|
111
|
+
|
112
|
+
## Required to use either the ParsCit or CiteSeer algorithms
|
113
|
+
|
114
|
+
#### Perl modules
|
115
|
+
|
116
|
+
More than these might be required; this is what I had to add to my default installation.
|
29
117
|
|
30
|
-
|
118
|
+
##### From CPAN
|
119
|
+
|
120
|
+
sudo cpan install Digest::SHA1
|
121
|
+
sudo cpan install String::Approx
|
122
|
+
sudo cpan install XML::Writer::String
|
123
|
+
sudo cpan install XML::Twig
|
124
|
+
|
125
|
+
## Required to use the ParsCit algorithm
|
31
126
|
|
32
|
-
|
127
|
+
#### CRF++
|
128
|
+
|
129
|
+
You can specify where you have installed CRF++ by setting the CRFPP_HOME environment variable.
|
33
130
|
|
34
131
|
##### From source
|
35
132
|
|
@@ -44,15 +141,19 @@ Uses pdf2text from [Xpdf](http://www.foolabs.com/xpdf/download.html) to extract
|
|
44
141
|
|
45
142
|
sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
|
46
143
|
sudo apt-get update
|
47
|
-
sudo apt-get install libcrf++
|
144
|
+
sudo apt-get install libcrf++ crf++
|
48
145
|
|
49
146
|
##### On OS X with Homebrew
|
50
147
|
|
51
148
|
brew install crf++
|
52
149
|
|
53
|
-
|
150
|
+
## Required to use the CiteSeer algorithm
|
151
|
+
|
152
|
+
#### svm-light
|
54
153
|
|
55
|
-
|
154
|
+
Required for header extraction (reference information for the input work itself).
|
155
|
+
|
156
|
+
The included model requires version 5, not the current version. You can specify where you have installed svm-light by setting the SVM_LIGHT_HOME environment variable.
|
56
157
|
|
57
158
|
##### From source
|
58
159
|
|
@@ -61,22 +162,12 @@ The included model requires version 5, not the current version.
|
|
61
162
|
wget http://download.joachims.org/svm_light/v5.00/svm_light.tar.gz
|
62
163
|
tar -xzf svm_light.tar.gz
|
63
164
|
make
|
64
|
-
|
65
|
-
|
165
|
+
echo "export SVM_LIGHT_HOME=`pwd`" >> ~/.profile # or .bashrc or whatever
|
166
|
+
source ~/.profile
|
66
167
|
|
67
|
-
|
168
|
+
## Required to use the cb2bib algorithm
|
68
169
|
|
69
|
-
|
70
|
-
|
71
|
-
##### From CPAN
|
72
|
-
|
73
|
-
sudo cpan install DBI
|
74
|
-
sudo cpan install Digest::SHA1
|
75
|
-
sudo cpan install Log::Log4perl
|
76
|
-
sudo cpan install Log::Dispatch
|
77
|
-
sudo cpan install String::Approx
|
78
|
-
|
79
|
-
### cb2bib
|
170
|
+
#### cb2Bib
|
80
171
|
|
81
172
|
##### From source (Linux)
|
82
173
|
|
@@ -105,15 +196,19 @@ Requires Qt & X11, unfortunately, and still requires a hack to work on recent ve
|
|
105
196
|
|
106
197
|
sudo apt-get install cb2bib
|
107
198
|
|
108
|
-
|
199
|
+
|
200
|
+
## Other
|
201
|
+
|
202
|
+
(I'm not currently sure what this was required for; TODO figure it out!)
|
109
203
|
|
110
204
|
##### On Debian/Ubuntu
|
111
205
|
|
112
206
|
sudo apt-get install libicu-dev
|
113
207
|
|
114
|
-
## Copying
|
115
208
|
|
116
|
-
|
209
|
+
# Copying
|
210
|
+
|
211
|
+
Copyright Academia.edu or the original author(s) - see documentation in the included parscit and svm-header-parse directories.
|
117
212
|
|
118
213
|
Apache licensed (see LICENSE.TXT).
|
119
214
|
|
data/Rakefile
CHANGED
@@ -6,3 +6,25 @@ require 'rspec/core/rake_task'
|
|
6
6
|
require 'biblicit'
|
7
7
|
|
8
8
|
RSpec::Core::RakeTask.new :spec
|
9
|
+
|
10
|
+
desc "Tag #{Bundler::GemHelper.new.send(:version_tag)}, build and push to gemfury"
|
11
|
+
task :release_internal do |t|
|
12
|
+
require 'gemfury'
|
13
|
+
|
14
|
+
class ReleaseInternalGem < Bundler::GemHelper
|
15
|
+
def release_gem
|
16
|
+
guard_clean
|
17
|
+
built_gem_path = build_gem
|
18
|
+
if Bundler::VERSION =~ /1\.3\.\d/
|
19
|
+
tag_version { git_push } unless already_tagged?
|
20
|
+
else
|
21
|
+
guard_already_tagged
|
22
|
+
tag_version { git_push }
|
23
|
+
end
|
24
|
+
`fury push #{built_gem_path}`
|
25
|
+
Bundler.ui.confirm "Pushed #{name} #{version} to gemfury"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
ReleaseInternalGem.new.release_gem
|
30
|
+
end
|
data/biblicit.gemspec
CHANGED
@@ -5,14 +5,15 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
7
|
gem.name = "biblicit"
|
8
|
-
gem.version = "
|
8
|
+
gem.version = "2.0.3"
|
9
9
|
gem.authors = ["David Judd"]
|
10
10
|
gem.email = ["david@academia.edu"]
|
11
|
-
gem.description = %q{Extract citations from PDFs.}
|
12
11
|
gem.summary = %q{Extract citations from PDFs.}
|
13
12
|
gem.homepage = "http://github.com/academia-edu/biblicit"
|
14
13
|
|
15
|
-
gem.files =
|
14
|
+
gem.files =
|
15
|
+
`git ls-files`.split("\n") +
|
16
|
+
`cd parscit && git ls-files`.split("\n").map{ |f| "parscit/#{f}" }
|
16
17
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
18
|
gem.test_files = gem.files.grep(%r{^spec/})
|
18
19
|
gem.require_paths = ["lib"]
|
@@ -25,9 +26,10 @@ Gem::Specification.new do |gem|
|
|
25
26
|
gem.add_development_dependency 'pry'
|
26
27
|
gem.add_development_dependency 'pry-debugger'
|
27
28
|
|
28
|
-
gem.requirements << '
|
29
|
-
gem.requirements << '
|
30
|
-
gem.requirements << '
|
31
|
-
gem.requirements << '
|
29
|
+
gem.requirements << 'For PDFs, Poppler or XPDF (try "which pdftotext")'
|
30
|
+
gem.requirements << 'For Postscript files, Ghostscript (try "which ps2ascii")'
|
31
|
+
gem.requirements << 'For word processor files, AbiWord (try "which abiword")'
|
32
|
+
gem.requirements << 'For the :citeseer algorithm, Perl, CPAN, CRF++ (try "which crf_test"), and svm-light 5.0, aliased to svm_classify5 (try "svm_classify -h")'
|
33
|
+
gem.requirements << 'For the :cb2bib algorithm, cb2Bib (try "which cb2bib")'
|
32
34
|
|
33
35
|
end
|
data/lib/biblicit/cb2bib.rb
CHANGED
@@ -4,30 +4,28 @@ require 'tempfile'
|
|
4
4
|
|
5
5
|
module Cb2Bib
|
6
6
|
|
7
|
-
def self.extract(file, opts)
|
8
|
-
ParseOperation.new(file, opts)
|
7
|
+
def self.extract(file, opts={})
|
8
|
+
ParseOperation.new(file, opts).result
|
9
9
|
end
|
10
10
|
|
11
11
|
class ParseOperation
|
12
12
|
|
13
|
+
attr_reader :result
|
14
|
+
|
13
15
|
def initialize(file, opts)
|
14
16
|
extract_from_file(file, opts[:remote] || false, opts[:sloppy] || true)
|
15
17
|
end
|
16
18
|
|
17
|
-
def header
|
18
|
-
@result
|
19
|
-
end
|
20
|
-
|
21
19
|
private
|
22
20
|
|
23
|
-
def extract_from_file(
|
21
|
+
def extract_from_file(doc, remote=false, sloppy=true)
|
24
22
|
bib = Tempfile.new(['out','.bib'])
|
25
23
|
conf = Tempfile.new(['cb2bib','.conf']) # we'll put our custom configuration here, and then cb2bib will fill in the rest with its defaults
|
26
24
|
|
27
25
|
begin
|
28
26
|
conf.write(cb2bib_config(remote))
|
29
27
|
conf.open # not clear why we have to do this, but otherwise cb2bib doesn't read it
|
30
|
-
`cb2bib #{sloppy ? '--sloppy' : ''} --
|
28
|
+
`cb2bib #{sloppy ? '--sloppy' : ''} --txt2bib #{doc.path} #{bib.path} --conf #{conf.path}`
|
31
29
|
bibtext = bib.read
|
32
30
|
ensure
|
33
31
|
conf.close!
|
@@ -45,12 +43,13 @@ module Cb2Bib
|
|
45
43
|
end
|
46
44
|
end
|
47
45
|
end
|
48
|
-
|
49
|
-
@result[:valid] = !@result[:title].blank?
|
50
46
|
end
|
51
47
|
|
52
48
|
def cb2bib_config(remote)
|
53
|
-
"
|
49
|
+
"""
|
50
|
+
[cb2Bib]
|
51
|
+
AutomaticQuery=#{!!remote}
|
52
|
+
"""
|
54
53
|
end
|
55
54
|
|
56
55
|
def cleaned_field(field)
|
data/lib/biblicit/citeseer.rb
CHANGED
@@ -6,48 +6,36 @@ require 'nokogiri'
|
|
6
6
|
|
7
7
|
module CiteSeer
|
8
8
|
|
9
|
-
PERL_DIR = "#{File.dirname(__FILE__)}/../../
|
9
|
+
PERL_DIR = "#{File.dirname(__FILE__)}/../../svm-header-parse"
|
10
10
|
|
11
|
-
def self.extract(in_file, opts)
|
12
|
-
ParseOperation.new(in_file)
|
11
|
+
def self.extract(in_file, opts={})
|
12
|
+
ParseOperation.new(in_file).result
|
13
13
|
end
|
14
14
|
|
15
15
|
class ParseOperation
|
16
16
|
|
17
|
+
attr_reader :result
|
18
|
+
|
17
19
|
def initialize(in_file)
|
18
20
|
Dir.mktmpdir do |out_dir|
|
19
|
-
`#{PERL_DIR}/extract.pl #{in_file.
|
20
|
-
|
21
|
-
|
21
|
+
`#{PERL_DIR}/extract.pl #{in_file.path} #{out_dir}`
|
22
|
+
output = IO.read("#{out_dir}/out.header")
|
23
|
+
xml = Nokogiri::XML output
|
24
|
+
@result = parse(xml)
|
22
25
|
end
|
23
26
|
end
|
24
27
|
|
25
|
-
def header
|
26
|
-
@header ||= get_header
|
27
|
-
end
|
28
|
-
|
29
|
-
def citations
|
30
|
-
@citations ||= get_citations
|
31
|
-
end
|
32
|
-
|
33
28
|
private
|
34
29
|
|
35
|
-
def
|
36
|
-
parsed = Nokogiri::XML @header_xml
|
37
|
-
|
30
|
+
def parse(xml)
|
38
31
|
{
|
39
|
-
title:
|
40
|
-
authors:
|
41
|
-
abstract:
|
42
|
-
valid:
|
32
|
+
title: xml.css('title').text,
|
33
|
+
authors: xml.css('author > name').map { |n| n.text },
|
34
|
+
abstract: xml.css('abstract').text,
|
35
|
+
valid: xml.css('validHeader').first.text == '1',
|
43
36
|
}
|
44
37
|
end
|
45
38
|
|
46
|
-
def get_citations
|
47
|
-
# TODO
|
48
|
-
[]
|
49
|
-
end
|
50
|
-
|
51
39
|
end
|
52
40
|
|
53
41
|
end
|
data/lib/biblicit/extractor.rb
CHANGED
@@ -2,36 +2,57 @@
|
|
2
2
|
|
3
3
|
require 'biblicit/cb2bib'
|
4
4
|
require 'biblicit/citeseer'
|
5
|
+
require 'biblicit/parscit'
|
5
6
|
|
6
7
|
require 'tempfile'
|
7
8
|
|
8
9
|
module Biblicit
|
9
10
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
SH_DIR = "#{File.dirname(__FILE__)}/../../sh"
|
12
|
+
|
13
|
+
module Extractor
|
14
|
+
|
15
|
+
def self.extract(opts)
|
16
|
+
if (content = opts.delete(:content))
|
17
|
+
Tempfile.open('in') do |in_file|
|
18
|
+
in_file.binmode
|
19
|
+
in_file.write(content)
|
20
|
+
extract_from_file in_file.path, opts
|
21
|
+
end
|
22
|
+
elsif (file = opts.delete(:file))
|
23
|
+
extract_from_file file, opts
|
24
|
+
else
|
25
|
+
raise 'Either file or content is required'
|
15
26
|
end
|
16
|
-
elsif (file = opts.delete(:file))
|
17
|
-
extract_from_file file, opts
|
18
|
-
else
|
19
|
-
raise 'Either file or content is required'
|
20
27
|
end
|
21
|
-
end
|
22
28
|
|
23
|
-
private
|
29
|
+
private
|
30
|
+
|
31
|
+
def self.extract_from_file(file, opts)
|
32
|
+
file = File.realpath(file)
|
33
|
+
tools = opts.delete(:tools) || [:parshed, :citeseer]
|
34
|
+
|
35
|
+
result = {}
|
36
|
+
|
37
|
+
Tempfile.open(['in','.txt']) do |in_txt|
|
38
|
+
`#{SH_DIR}/convert_to_text.sh #{file.shellescape} #{in_txt.path}`
|
24
39
|
|
25
|
-
|
26
|
-
|
40
|
+
if tools.include?(:parshed)
|
41
|
+
result.merge!( parshed: ParsCit.extract(in_txt, opts) )
|
42
|
+
end
|
27
43
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
44
|
+
if tools.include?(:citeseer)
|
45
|
+
result.merge!( citeseer: CiteSeer.extract(in_txt, opts) )
|
46
|
+
end
|
47
|
+
|
48
|
+
if tools.include?(:cb2bib)
|
49
|
+
result.merge!( cb2bib: Cb2Bib.extract(in_txt, opts) )
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
result
|
34
54
|
end
|
55
|
+
|
35
56
|
end
|
36
57
|
|
37
58
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'shellwords'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module ParsCit
|
8
|
+
|
9
|
+
PERL_DIR = "#{File.dirname(__FILE__)}/../../parscit"
|
10
|
+
|
11
|
+
def self.extract(in_file, opts={})
|
12
|
+
ParseOperation.new(in_file, opts).result
|
13
|
+
end
|
14
|
+
|
15
|
+
class ParseOperation
|
16
|
+
|
17
|
+
attr_reader :result
|
18
|
+
|
19
|
+
def initialize(in_txt, opts={})
|
20
|
+
ENV['CRFPP_HOME'] ||= "#{File.dirname(`which crf_test`)}/../"
|
21
|
+
output = `#{PERL_DIR}/bin/citeExtract.pl -q -m extract_all #{in_txt.path}`
|
22
|
+
@result = parse(Nokogiri::XML output)
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def parse(xml)
|
28
|
+
parsed = xml.css("algorithm[name=ParsHed]")
|
29
|
+
{
|
30
|
+
title: parsed.css('title').text.gsub(/\s+/,' ').strip,
|
31
|
+
authors: parsed.css('author').map { |a| a.text.gsub(/\s+/,' ').strip },
|
32
|
+
abstract: parsed.css('abstract').text
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
data/parscit/.gitignore
ADDED
data/parscit/CHANGELOG
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
110505 (done by Huy)
|
2
|
+
- New features: Sectlabel
|
3
|
+
- New sectlabel xml model with more training data (resources/sectLabel/sectLabel.modelXml.v2)
|
4
|
+
|
5
|
+
- Major changes: Sectlabel
|
6
|
+
- Added Omni (lib/Omni/Omnitable.pm lib/Omni/Omnicell.pm lib/Omni/Omnidd.pm lib/Omni/Omniframe.pm lib/Omni/Traversal.pm)
|
7
|
+
- ParsCit now uses the reference output from Sectlabel as its input
|
8
|
+
|
9
|
+
- Minor changes: Sectlabel and ParsCit
|
10
|
+
- Cleanup all temporary files in /tmp properly
|
11
|
+
|
12
|
+
- Minor changes: bug fixes
|
13
|
+
- Biblioscript: patch from Tim Brody
|
14
|
+
|
15
|
+
110121 (done by Huy)
|
16
|
+
- Major changes: Omni classes for Omnipage XML process using XML::Twig and XML::Writer instead of regular expression
|
17
|
+
- Added Omni (lib/Omni/Config.pm lib/Omni/Omnicol.pm lib/Omni/Omnidoc.pm lib/Omni/Omniline.pm lib/Omni/Omnipage.pm lib/Omni/Omnipara.pm lib/Omni/Omnirun.pm lib/Omni/Omniword.pm)
|
18
|
+
- Use the new Omni classes in Parscit module
|
19
|
+
|
20
|
+
- Major changes: improve Parscit performance when there's no reference marker
|
21
|
+
- Added new crf++ XML model, template and train data (resources/parsCit.split.model crfpp/traindata/parsCit/parsCit.split.template crfpp/traindata/parsCit/parsCit.split.train.data)
|
22
|
+
- Use the new Parscit model when there's no reference marker
|
23
|
+
|
24
|
+
- Minor changes: bug fixes
|
25
|
+
- Parscit post-process: stripPunctuation function removes the semi-colon in XML special characters, e.g. &amp;. Thanks to Qasemizadeh, Behrang for reporting these issues.
|
26
|
+
- Parscit controller: terminate properly when no reference is found (normBodyText size 1 != posArray size 0)
|
27
|
+
- Parscit post-process: fix the volume number truncation, e.g. vol 5(1) becomes vol 5; Thanks to Lennart Borgman for reporting these issues
|
28
|
+
|
29
|
+
- Minor changes: Parscit
|
30
|
+
- Add bin/xml2train.pl: extract reference text and XML information from Omnipage and save it into Parscit train file's format
|
31
|
+
|
32
|
+
100901 (done by Thang)
|
33
|
+
- Incorporate BiblioScript (http://github.com/mromanello/BiblioScript) and BibUtils (http://www.scripps.edu/~cdputnam/software/bibutils/)
|
34
|
+
|
35
|
+
100401e (done by Min on 100725)
|
36
|
+
- Minor changes to paths and to make it work again from wing.nus directory
|
37
|
+
(moved from forecite, due to restructuring of WING server)
|
38
|
+
|
39
|
+
100401d
|
40
|
+
- Minor changes to documentation and ParsHed library updating.
|
41
|
+
|
42
|
+
100401c
|
43
|
+
- Minor changes for correcting errors with punctuation and XML
|
44
|
+
entities in reference string parsing. Reported by Cheong Chi Hong
|
45
|
+
and Mario Lipinski. Fixed by Minh-Thang Luong.
|
46
|
+
|
47
|
+
100401b
|
48
|
+
- Minor changes (bug fixes) to section labeler model.
|
49
|
+
|
50
|
+
100401 (done by Thang)
|
51
|
+
- Major Change: Added SectLabel module (due to Minh-Thang Luong and Thuy Dung Nguyen)
|
52
|
+
- Added Iconip training data from Cheong Chi Hong
|
53
|
+
- Updated default model to include Iconip data
|
54
|
+
- Updated CGI demo to call new SectLabel module as well
|
55
|
+
- Updated documentation
|
56
|
+
- Corrected small regexp error in lib/ParsCit/PreProcess.pm
|
57
|
+
- Corrected small problem with training data in mixed-humanities
|
58
|
+
|
59
|
+
- Added SectLabel (bin/sectExtract.pl, bin/sectExtract/, resource/sectExtract, lib/SectExtract)
|
60
|
+
- Modified bin/citeExtract.pl, lib/ParsCit/PostProcess.pm, lib/ParsHed/PostProcess.pm to combine ParsCit, ParsHed, SectLabel, and standardize XML output
|
61
|
+
- Added test/ for testing purposes, with 12 sample documents and the standard output of citeExtract.pl in 5 modes (citations, header, section, meta, and all) using both txt and XML inputs
|
62
|
+
- Added SectLabel annotated data doc/sectLabel.tagged.txt and doc/sectLabelXml.tagged.txt (40 documents fully annotated)
|
63
|
+
- Added in crfpp/traindata CRF++ feature files (sectLabel.train.dataXml, sectLabel.train.data) and templates (sectLabel.templateXml, sectLabel.template)
|
64
|
+
|
65
|
+
- Incorporated works by Emma
|
66
|
+
- Added GenericSect code (bin/sectLabel/genericSectExtract.rb, crfpp/traindata/genericSect.train.data, bin/sectLabel/genericSect/) into SectLabel
|
67
|
+
- Added GenericSect annotated data doc/genericSect.tagged.txt (211 documents with headers annotated)
|
68
|
+
- Added in crfpp/traindata CRF++ feature file (genericSect.train.data) and template (genericSect.template)
|
69
|
+
|
70
|
+
090625b
|
71
|
+
- Updated documentation only. no change to executables
|
72
|
+
- Released on 30 September 2009
|
73
|
+
|
74
|
+
090625 (due to Minh-Thang Luong)
|
75
|
+
- Standardized and improved ParsHed model with line-level classification instead of token-level as previously.
|
76
|
+
- Add a post-processing module for ParsHed to normalize field data, e.g. authors, email, etc.
|
77
|
+
- Detailed changes are reflected as follows:
|
78
|
+
* Added resources/parsHed - all parsHed-related including models (old models in resources/parsHed/archive), template file, and top frequent keyword files.
|
79
|
+
* Added lib/ParsHed - similar architecture to lib/ParsCit, to modularize and facilitate line-level training in ParsHed.
|
80
|
+
lib/ParsHed/Tr2crf.pm: line-level CRF feature extractor
|
81
|
+
lib/ParsHed/PostProcess.pm: post-processing of field data
|
82
|
+
* Added bin/parsHed - all parsHed-related scripts (redo.parsHed.pl, and tr2crfpp_parsHed.pl).
|
83
|
+
Includes parseXmlHeader.pl and convert2TokenLevel.pl used by redo.parsHed.pl to convert output from line to token-level.
|
84
|
+
* Updated bin/headExtract.pl - to use the new model lib/ParsHed, as well as old model (with -tokenLevel flag).
|
85
|
+
* Bug fixes in Citation.pm and Preprocess.pm (see doc/v090625-Artemy-issues.txt). Thanks to Artemy Kolchinsky for reporting these issues.
|
86
|
+
* Reordered the CHANGELOG into descending chronological order.
|
87
|
+
090625 (due to Min-Yen KAN)
|
88
|
+
- Deprecated and unified ParsHedClient.rb into ParsCitClient.rb
|
89
|
+
- Deprecated and unified ParsHedServer.rb into ParsCitServer.rb
|
90
|
+
- Added wsdl/forecite.wsdl which describes the ParsCit portion of the services
|
91
|
+
- Added bin/ParsCitClientWSDL.rb which demonstrates the use of forecite.wsdl
|
92
|
+
|
93
|
+
090316:
|
94
|
+
- Adds ParsHed module, updates to:
|
95
|
+
* resources/ - parsHed.*.model model files for binaries
|
96
|
+
* doc/ - svm_headerparse.tagged.txt (from CiteSeer; 935 headers)
|
97
|
+
* bin/ - headExtract.pl, phOutput2xml.pl, redo.parsCit.pl, redo.parsHed.pl
|
98
|
+
* crfpp/traindata/ - parsHed.train.data (converted from svm_headerparse.tagged.txt)
|
99
|
+
Changes doc/index.html, doc/parsCit.cgi to handle parsHed call (not
|
100
|
+
yet entirely integrated, still separate module).
|
101
|
+
|
102
|
+
081201:
|
103
|
+
- Bug fixes from Scienstein.org team
|
104
|
+
* Added context positions
|
105
|
+
* Handle reference patterns such as [1-5]
|
106
|
+
* Handles context references within same window
|
107
|
+
* See doc/v081201-sciensteinEmail.txt for detailed notes
|
108
|
+
- Updated ParsCit.cgi to update for context position output
|
109
|
+
|
110
|
+
080917:
|
111
|
+
- Added new training data from Matteo Romanello
|
112
|
+
- Fixed Preprocess.pm bug (thanks to Dain Kaplan)
|
113
|
+
- Upgraded CRF++ model to 0.51 (now bundled with CRF++ source just in case it is no longer available on sourceforge)
|
114
|
+
- Added bin/redo.pl script to retrain model
|
115
|
+
|
116
|
+
080402:
|
117
|
+
- Added re-tagged data from FLUX-CiM
|
118
|
+
- Added conlleval.pl evaluation script
|
119
|
+
- Added output2xml.pl transformation script
|
120
|
+
- Corrected warning in parseRefStrings.pl (thanks to Ayeh Bandeh-Ahmadi)
|
121
|
+
|
122
|
+
080310:
|
123
|
+
- First released version to Peter Weiland
|
124
|
+
- Web services working for wing machines at NUS
|
125
|
+
|