biblicit 1.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
data/perl/extract.pl
DELETED
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl -CSD
|
|
2
|
-
#use strict;
|
|
3
|
-
use FindBin;
|
|
4
|
-
|
|
5
|
-
use lib "$FindBin::Bin/FileConversionService/lib";
|
|
6
|
-
use lib "$FindBin::Bin/DocFilter/lib";
|
|
7
|
-
use lib "$FindBin::Bin/ParsCit/lib";
|
|
8
|
-
use lib "$FindBin::Bin/HeaderParseService/lib";
|
|
9
|
-
|
|
10
|
-
use DBI;
|
|
11
|
-
use File::Copy;
|
|
12
|
-
use FileConverter::Controller;
|
|
13
|
-
use DocFilter::Filter;
|
|
14
|
-
use ParsCit::Controller;
|
|
15
|
-
use HeaderParse::API::Parser;
|
|
16
|
-
use HeaderParse::Config::API_Config;
|
|
17
|
-
|
|
18
|
-
my $logDir = "$FindBin::Bin/log";
|
|
19
|
-
|
|
20
|
-
my $xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
|
|
21
|
-
|
|
22
|
-
system("mkdir","-p","$logDir");
|
|
23
|
-
|
|
24
|
-
open (LOG, ">>$logDir/prep.log");
|
|
25
|
-
open (ERR, ">>$logDir/prep.err");
|
|
26
|
-
|
|
27
|
-
my $argc = scalar(@ARGV);
|
|
28
|
-
|
|
29
|
-
if ($argc != 2) {
|
|
30
|
-
print "Usage: ./extract.pl path_to_input path_to_output\n";
|
|
31
|
-
exit 1;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
my $inputPath = $ARGV[0];
|
|
35
|
-
my $outputPath = $ARGV[1];
|
|
36
|
-
|
|
37
|
-
import($inputPath, $outputPath);
|
|
38
|
-
|
|
39
|
-
close LOG;
|
|
40
|
-
close ERR;
|
|
41
|
-
|
|
42
|
-
exit;
|
|
43
|
-
|
|
44
|
-
sub import {
|
|
45
|
-
my ($filePath, $id) = @_;
|
|
46
|
-
|
|
47
|
-
system("mkdir","-p","$id");
|
|
48
|
-
|
|
49
|
-
my ($status, $msg) = prep($filePath, $id);
|
|
50
|
-
if ($status == 0) {
|
|
51
|
-
print ERR "$id: $msg\n";
|
|
52
|
-
}
|
|
53
|
-
if ($status == 1) {
|
|
54
|
-
print LOG "$id\n";
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
sub prep {
|
|
60
|
-
my ($filePath, $id) = @_;
|
|
61
|
-
|
|
62
|
-
$filePath =~ m/^.*(\.(ps|pdf)(\.g?z)?)$/i;
|
|
63
|
-
my $ext = $1;
|
|
64
|
-
|
|
65
|
-
my $targetPath = "$outputPath/out$ext";
|
|
66
|
-
|
|
67
|
-
unless(copy($filePath, $targetPath)) {
|
|
68
|
-
return (0, "unable to copy $filePath to $targetPath: $!");
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
my $textFile;
|
|
72
|
-
my $conversionSuccess = 0;
|
|
73
|
-
|
|
74
|
-
my ($status, $msg, $textPath) = extractText($targetPath, $id);
|
|
75
|
-
if ($status > 0) {
|
|
76
|
-
$textFile = $textPath;
|
|
77
|
-
my ($fstatus, $msg) = filter($textFile);
|
|
78
|
-
if ($fstatus > 0) {
|
|
79
|
-
$conversionSuccess = 1;
|
|
80
|
-
}
|
|
81
|
-
} else {
|
|
82
|
-
return ($status, $msg);
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
my ($fstatus, $msg) = filter($textFile);
|
|
86
|
-
if ($fstatus <= 0) {
|
|
87
|
-
return ($fstatus, $msg);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
my ($ecstatus, $msg) = extractCitations($textFile, $id);
|
|
91
|
-
if ($ecstatus <= 0) {
|
|
92
|
-
return ($estatus, $msg);
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
my ($ehstatus, $msg) = extractHeader($textFile, $id);
|
|
96
|
-
if ($ehstatus <= 0) {
|
|
97
|
-
return ($ehstatus, $msg);
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
return (1, "");
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
sub checkPDF {
|
|
106
|
-
my $url = shift;
|
|
107
|
-
if ($url =~ m/pdf(\.g?z)?$/i) {
|
|
108
|
-
return 1;
|
|
109
|
-
} else {
|
|
110
|
-
return 0;
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
sub extractText {
|
|
116
|
-
my ($filePath, $id) = @_;
|
|
117
|
-
my ($status, $msg, $textFile, $rTrace, $rCheckSums) =
|
|
118
|
-
FileConverter::Controller::extractText($filePath);
|
|
119
|
-
if ($status <= 0) {
|
|
120
|
-
return ($status, $msg);
|
|
121
|
-
} else {
|
|
122
|
-
unless(open(FINFO, ">$outputPath/out.file")) {
|
|
123
|
-
return (0, "unable to write finfo: $!");
|
|
124
|
-
}
|
|
125
|
-
print FINFO $xmlHeader;
|
|
126
|
-
print FINFO "<conversionTrace>";
|
|
127
|
-
print FINFO join ",", @$rTrace;
|
|
128
|
-
print FINFO "</conversionTrace>\n";
|
|
129
|
-
print FINFO "<checksums>\n";
|
|
130
|
-
foreach my $checkSum(@$rCheckSums) {
|
|
131
|
-
print FINFO "<checksum>\n";
|
|
132
|
-
print FINFO "<fileType>".$checkSum->getFileType()."</fileType>\n";
|
|
133
|
-
print FINFO "<sha1>".$checkSum->getSHA1()."</sha1>\n";
|
|
134
|
-
print FINFO "</checksum>\n";
|
|
135
|
-
}
|
|
136
|
-
print FINFO "</checkSums>\n";
|
|
137
|
-
close FINFO;
|
|
138
|
-
}
|
|
139
|
-
return (1, "", $textFile);
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
sub filter {
|
|
144
|
-
my $textFile = shift;
|
|
145
|
-
my ($sysStatus, $filterStatus, $msg) =
|
|
146
|
-
DocFilter::Filter::filter($textFile);
|
|
147
|
-
if ($sysStatus > 0) {
|
|
148
|
-
if ($filterStatus > 0) {
|
|
149
|
-
return (1);
|
|
150
|
-
} else {
|
|
151
|
-
return (0, "document failed filtration");
|
|
152
|
-
}
|
|
153
|
-
} else {
|
|
154
|
-
return (0, "An error occurred during filtration: $msg");
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
sub extractCitations {
|
|
160
|
-
my ($textFile, $id) = @_;
|
|
161
|
-
|
|
162
|
-
my $rXML = ParsCit::Controller::extractCitations($textFile);
|
|
163
|
-
|
|
164
|
-
unless(open(CITE, ">:utf8", "$outputPath/out.parscit")) {
|
|
165
|
-
return (0, "Unable to open parscit file: $!");
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
print CITE $$rXML;
|
|
169
|
-
close CITE;
|
|
170
|
-
return (1);
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
sub extractHeader {
|
|
175
|
-
my ($textFile, $id) = @_;
|
|
176
|
-
|
|
177
|
-
my $jobID;
|
|
178
|
-
while($jobID = rand(time)) {
|
|
179
|
-
unless(-f $offlineD."$jobID") {
|
|
180
|
-
last;
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
my ($status, $msg, $rXML) =
|
|
185
|
-
HeaderParse::API::Parser::_parseHeader($textFile, $jobID);
|
|
186
|
-
|
|
187
|
-
if ($status <= 0) {
|
|
188
|
-
return ($status, $msg);
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
unless(open(HEAD, ">:utf8", "$outputPath/out.header")) {
|
|
192
|
-
return (0, "Unable to open header file: $!");
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
print HEAD $$rXML;
|
|
196
|
-
close HEAD;
|
|
197
|
-
return (1);
|
|
198
|
-
|
|
199
|
-
}
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
# encoding: UTF-8
|
|
2
|
-
|
|
3
|
-
describe Cb2Bib do
|
|
4
|
-
|
|
5
|
-
PDF_DIR = "#{File.dirname(__FILE__)}/../fixtures/pdf"
|
|
6
|
-
|
|
7
|
-
unless ENV['LOCAL']
|
|
8
|
-
|
|
9
|
-
it "parses 'Multi-scale collaborative...' headers from file" do
|
|
10
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/ICINCO_2010.pdf", tool: :cb2bib, remote: true)
|
|
11
|
-
parsed = result.header
|
|
12
|
-
|
|
13
|
-
parsed[:title].should == "Multiscale collaborative searching through swarming"
|
|
14
|
-
parsed[:authors].should == ["W. Liu", "M. B. Short", "Y. E. Taima", "A. L. Bertozzi"]
|
|
15
|
-
parsed[:year].should == 2010
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
it "parses 'Multi-scale collaborative...' headers from content" do
|
|
19
|
-
content = IO.read("#{PDF_DIR}/ICINCO_2010.pdf")
|
|
20
|
-
result = Biblicit.extract(content: content, tool: :cb2bib, remote: true)
|
|
21
|
-
parsed = result.header
|
|
22
|
-
|
|
23
|
-
parsed[:title].should == "Multiscale collaborative searching through swarming"
|
|
24
|
-
parsed[:authors].should == ["W. Liu", "M. B. Short", "Y. E. Taima", "A. L. Bertozzi"]
|
|
25
|
-
parsed[:year].should == 2010
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
it "parses 'Oligopoly, Disclosure...' headers" do
|
|
31
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/Bagnoli Watts TAR 2010.pdf", tool: :cb2bib)
|
|
32
|
-
parsed = result.header
|
|
33
|
-
|
|
34
|
-
parsed[:valid].should == false # unfortunately
|
|
35
|
-
#parsed[:title].should == 'Oligopoly, Disclosure, and Earnings Management'
|
|
36
|
-
#parsed[:authors].should == ["Mark Bagnoli", "Susan G. Watts"]
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
it "parses Google paper headers" do
|
|
40
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/10.1.1.109.4049.pdf", tool: :cb2bib)
|
|
41
|
-
parsed = result.header
|
|
42
|
-
|
|
43
|
-
parsed[:valid].should == false # unfortunately
|
|
44
|
-
#parsed[:title].should == 'The Anatomy of a Large-Scale Hypertextual Web Search Engine'
|
|
45
|
-
#parsed[:authors].should == ['Sergey Brin']
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
end
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
# encoding: UTF-8
|
|
2
|
-
|
|
3
|
-
describe CiteSeer do
|
|
4
|
-
|
|
5
|
-
PDF_DIR = "#{File.dirname(__FILE__)}/../fixtures/pdf"
|
|
6
|
-
|
|
7
|
-
it "parses 'Multi-scale collaborative...' header from file" do
|
|
8
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/ICINCO_2010.pdf", tool: :citeseer)
|
|
9
|
-
header = result.header
|
|
10
|
-
header[:valid].should be_true
|
|
11
|
-
header[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
|
|
12
|
-
header[:authors].should == ["Wangyi Liu", "Yasser E. Taima", "Martin B. Short", "Andrea L. Bertozzi"]
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
it "parses 'Multi-scale collaborative...' header from content" do
|
|
16
|
-
content = IO.read("#{PDF_DIR}/ICINCO_2010.pdf")
|
|
17
|
-
result = Biblicit.extract(content: content, tool: :citeseer)
|
|
18
|
-
header = result.header
|
|
19
|
-
header[:valid].should be_true
|
|
20
|
-
header[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
|
|
21
|
-
header[:authors].should == ["Wangyi Liu", "Yasser E. Taima", "Martin B. Short", "Andrea L. Bertozzi"]
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
it "parses 'Oligopoly, Disclosure...' headers" do
|
|
25
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/Bagnoli Watts TAR 2010.pdf", tool: :citeseer)
|
|
26
|
-
header = result.header
|
|
27
|
-
header[:valid].should be_true
|
|
28
|
-
header[:title].should == 'Oligopoly, Disclosure, and Earnings Management'
|
|
29
|
-
header[:authors].should == ["Mark Bagnoli", "Susan G. Watts"]
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
it "parses Google paper headers" do
|
|
33
|
-
result = Biblicit.extract(file: "#{PDF_DIR}/10.1.1.109.4049.pdf", tool: :citeseer)
|
|
34
|
-
header = result.header
|
|
35
|
-
header[:valid].should be_true
|
|
36
|
-
header[:title].should == 'The Anatomy of a Large-Scale Hypertextual Web Search Engine'
|
|
37
|
-
header[:authors].should == ['Sergey Brin']
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
end
|
|
File without changes
|
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html
RENAMED
|
File without changes
|
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html
RENAMED
|
File without changes
|
/data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html
RENAMED
|
File without changes
|