biblicit 1.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
# -*- cperl -*-
|
|
3
|
+
=head1 NAME
|
|
4
|
+
|
|
5
|
+
phOutput2xml.pl
|
|
6
|
+
|
|
7
|
+
=head1 SYNOPSIS
|
|
8
|
+
|
|
9
|
+
RCS:$Id$
|
|
10
|
+
|
|
11
|
+
=head1 DESCRIPTION
|
|
12
|
+
|
|
13
|
+
=head1 HISTORY
|
|
14
|
+
|
|
15
|
+
ORIGIN: created from templateApp.pl version 3.4 by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
16
|
+
|
|
17
|
+
modified from output2xml.pl for ParsCit.
|
|
18
|
+
|
|
19
|
+
RCS:$Log$
|
|
20
|
+
|
|
21
|
+
=cut
|
|
22
|
+
|
|
23
|
+
require 5.0;
|
|
24
|
+
use Getopt::Std;
|
|
25
|
+
use strict 'vars';
|
|
26
|
+
# use diagnostics;
|
|
27
|
+
|
|
28
|
+
### USER customizable section
|
|
29
|
+
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
|
30
|
+
$tmpfile .= $$ . time;
|
|
31
|
+
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
|
32
|
+
$tmpfile = "/tmp/" . $tmpfile;
|
|
33
|
+
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
34
|
+
my $outputVersion = "1.0";
|
|
35
|
+
### END user customizable section
|
|
36
|
+
|
|
37
|
+
### Ctrl-C handler
|
|
38
|
+
sub quitHandler {
|
|
39
|
+
print STDERR "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
|
|
40
|
+
exit;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
### HELP Sub-procedure
|
|
44
|
+
sub Help {
|
|
45
|
+
print STDERR "usage: $progname -h\t\t\t\t[invokes help]\n";
|
|
46
|
+
print STDERR " $progname -v\t\t\t\t[invokes version]\n";
|
|
47
|
+
print STDERR " $progname [-qEl] [-r <rankfile> -n <num>] filename(s)...\n";
|
|
48
|
+
print STDERR "Options:\n";
|
|
49
|
+
print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
|
|
50
|
+
print STDERR "\t-E\tTurn OFF error checking\n";
|
|
51
|
+
print STDERR "\t-l\tEliminate newline tags\n";
|
|
52
|
+
print STDERR "\t-r <file>\tSVM Ranking output file\n";
|
|
53
|
+
print STDERR "\t-n <num>\tNumber of choices in both ranking file and input file\n";
|
|
54
|
+
print STDERR "\n";
|
|
55
|
+
print STDERR "Will accept input on STDIN as a single file.\n";
|
|
56
|
+
print STDERR "\n";
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
### VERSION Sub-procedure
|
|
60
|
+
sub Version {
|
|
61
|
+
if (system ("perldoc $0")) {
|
|
62
|
+
die "Need \"perldoc\" in PATH to print version information";
|
|
63
|
+
}
|
|
64
|
+
exit;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
sub License {
|
|
68
|
+
print STDERR "# Copyright 2009 \251 by Min-Yen Kan\n";
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
###
|
|
72
|
+
### MAIN program
|
|
73
|
+
###
|
|
74
|
+
|
|
75
|
+
my $cmdLine = $0 . " " . join (" ", @ARGV);
|
|
76
|
+
if ($#ARGV == -1) { # invoked with no arguments, possible error in execution?
|
|
77
|
+
print STDERR "# $progname info\t\tNo arguments detected, waiting for input on command line.\n";
|
|
78
|
+
print STDERR "# $progname info\t\tIf you need help, stop this program and reinvoke with \"-h\".\n";
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
$SIG{'INT'} = 'quitHandler';
|
|
82
|
+
getopts ('Ehlqr:n:v');
|
|
83
|
+
|
|
84
|
+
our ($opt_q, $opt_v, $opt_h, $opt_r, $opt_n, $opt_E, $opt_l);
|
|
85
|
+
# use (!defined $opt_X) for options with arguments
|
|
86
|
+
if (!$opt_q) { License(); } # call License, if asked for
|
|
87
|
+
if ($opt_v) { Version(); exit(0); } # call Version, if asked for
|
|
88
|
+
if ($opt_h) { Help(); exit (0); } # call help, if asked for
|
|
89
|
+
my $errorChecking = (defined $opt_E) ? 0 : 1;
|
|
90
|
+
my $ignoreNewlines = (defined $opt_l) ? 1 : 0;
|
|
91
|
+
my $svmRankFile = (defined $opt_r) ? $opt_r : undef;
|
|
92
|
+
my $rankChoices = (defined $opt_n) ? $opt_n : undef;
|
|
93
|
+
if ((defined $rankChoices && !defined $svmRankFile) ||
|
|
94
|
+
(!defined $rankChoices && defined $svmRankFile)) {
|
|
95
|
+
die "# $progname fatal\t\t-n and -r are mutually necessary switches";
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
## standardize input stream (either STDIN or first arg on command line)
|
|
99
|
+
my $fh;
|
|
100
|
+
my $filename;
|
|
101
|
+
if ($filename = shift) {
|
|
102
|
+
NEWFILE:
|
|
103
|
+
if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
|
|
104
|
+
open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
|
|
105
|
+
$fh = "IF";
|
|
106
|
+
} else {
|
|
107
|
+
$filename = "<STDIN>";
|
|
108
|
+
$fh = "STDIN";
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
# open rank file info, if applicable
|
|
112
|
+
my $rfh;
|
|
113
|
+
my @max = ();
|
|
114
|
+
if (defined $rankChoices && defined $svmRankFile) {
|
|
115
|
+
open (*RFH, $svmRankFile) || die "# $progname crash\t\tCan't open rankfile \"$svmRankFile\"!";
|
|
116
|
+
$rfh = "RFH";
|
|
117
|
+
my $line = 0;
|
|
118
|
+
my $curLine = 0;
|
|
119
|
+
my $max = 0;
|
|
120
|
+
my $maxLine = 0;
|
|
121
|
+
while (<$rfh>) {
|
|
122
|
+
chomp; # strip only a trailing newline; chop would eat the last digit of an unterminated final score line
|
|
123
|
+
$line++;
|
|
124
|
+
$curLine++;
|
|
125
|
+
if ($_ > $max) { # advance max if applicable
|
|
126
|
+
$max = $_;
|
|
127
|
+
$maxLine = $curLine-1;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
if ($line % $rankChoices == 0) { # save data at fencepost
|
|
131
|
+
$max[int($line/$rankChoices)-1] = $maxLine;
|
|
132
|
+
# print "$line $max $maxLine\n";
|
|
133
|
+
|
|
134
|
+
$curLine = 0; # reset values
|
|
135
|
+
$max = 0;
|
|
136
|
+
$maxLine = 0;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
close ($rfh);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
## output XML file for display
|
|
143
|
+
my $line = 0;
|
|
144
|
+
my $buf = "";
|
|
145
|
+
my $buf2 = "";
|
|
146
|
+
my $lastTag = "";
|
|
147
|
+
my $variant = "";
|
|
148
|
+
my $confidence = "1.0";
|
|
149
|
+
print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
|
|
150
|
+
print "<?xml-stylesheet href=\"bibxml.xsl\" type=\"text/xsl\" ?>\n";
|
|
151
|
+
print "<file>\n";
|
|
152
|
+
while (<$fh>) {
|
|
153
|
+
if (/^\# (\d+) ([\.\d]+)/) {
|
|
154
|
+
$variant = $1;
|
|
155
|
+
$confidence = $2;
|
|
156
|
+
next;
|
|
157
|
+
}
|
|
158
|
+
elsif (/^\#/) { next; } # skip comments
|
|
159
|
+
|
|
160
|
+
if (/^\s*$/) {
|
|
161
|
+
$buf =~ s/&/&amp;/g; # escape bare ampersands so the emitted XML stays well-formed
|
|
162
|
+
|
|
163
|
+
if ($variant eq "") {
|
|
164
|
+
print "<entry no=\"$line\">\n";
|
|
165
|
+
if ($ignoreNewlines) {
|
|
166
|
+
$buf =~ s/\- ([a-z])/$1/g;
|
|
167
|
+
$buf =~ s/>\s+/>/g;
|
|
168
|
+
$buf =~ s/\s+</</g;
|
|
169
|
+
$buf =~ s/\s+$//g;
|
|
170
|
+
$buf =~ s/^\s+/</g;
|
|
171
|
+
# $buf =~ s/PARSHED</\n </g; # replace with newline and spaces for formatting
|
|
172
|
+
$buf =~ s/PARSHED</\n</g; # replace each PARSHED marker with a bare newline (no indent spaces) before the tag
|
|
173
|
+
}
|
|
174
|
+
print "<variant no=\"0\" confidence=\"$confidence\">" . $buf . "</$lastTag>\n</variant>\n";
|
|
175
|
+
print "</entry>\n";
|
|
176
|
+
$line++;
|
|
177
|
+
} else {
|
|
178
|
+
if ($variant eq "0" && $buf2 ne "") {
|
|
179
|
+
print "<entry no=\"$line svmRank: $max[$line]\">\n" . $buf2 . " </entry>\n";
|
|
180
|
+
$buf2 = "";
|
|
181
|
+
$line++;
|
|
182
|
+
}
|
|
183
|
+
$buf2 .= "<variant no=\"$variant\" confidence=\"$confidence\">\n" . $buf . "</$lastTag>\n</variant>\n";
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
$lastTag = "";
|
|
187
|
+
$buf = "";
|
|
188
|
+
} else {
|
|
189
|
+
chomp; # strip only a trailing newline; chop would truncate the system tag on an unterminated final line
|
|
190
|
+
|
|
191
|
+
my @tokens = split (/\t/);
|
|
192
|
+
|
|
193
|
+
my $token = $tokens[0];
|
|
194
|
+
my $sys = $tokens[-1];
|
|
195
|
+
my $gold = $tokens[-2];
|
|
196
|
+
if ($sys ne $lastTag) {
|
|
197
|
+
if ($lastTag ne "") { $buf .= "</$lastTag>\n"; }
|
|
198
|
+
$buf .= "PARSHED<$sys>";
|
|
199
|
+
# $buf .= "<$sys>";
|
|
200
|
+
}
|
|
201
|
+
if ($token eq "+L+" && $ignoreNewlines) {
|
|
202
|
+
next;
|
|
203
|
+
}
|
|
204
|
+
if ($gold ne $sys && $errorChecking) {
|
|
205
|
+
$buf .= "<error correct=\"$gold\" taggedAs=\"$sys\">$token </error>";
|
|
206
|
+
} else {
|
|
207
|
+
$buf .= "$token ";
|
|
208
|
+
}
|
|
209
|
+
$lastTag = $sys;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
# print " <entry no=\"$line\">\n" . $buf2 . " </entry>\n";
|
|
213
|
+
print "</file>\n";
|
|
214
|
+
|
|
215
|
+
close ($fh);
|
|
216
|
+
|
|
217
|
+
if ($filename = shift) {
|
|
218
|
+
goto NEWFILE;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
###
|
|
222
|
+
### END of main program
|
|
223
|
+
###
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
# -*- cperl -*-
|
|
3
|
+
|
|
4
|
+
### USER customizable section
|
|
5
|
+
my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
|
|
6
|
+
$tmpfile .= $$ . time;
|
|
7
|
+
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
|
|
8
|
+
$tmpfile = "/tmp/" . $tmpfile;
|
|
9
|
+
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
10
|
+
my $outputVersion = "1.0";
|
|
11
|
+
my $parscitHome = "/home/wing.nus/services/parscit/tools/";
|
|
12
|
+
my $tr2crfppLoc = "$parscitHome/bin/tr2crfpp.pl";
|
|
13
|
+
my $crf_learnLoc = "$ENV{'CRFPP_HOME'}/bin/crf_learn";
|
|
14
|
+
my $crf_testLoc = "$ENV{'CRFPP_HOME'}/bin/crf_test";
|
|
15
|
+
my $conllevalLoc = "$parscitHome/bin/conlleval.pl";
|
|
16
|
+
my $crfTemplateLoc = "$parscitHome/crfpp/traindata/parsCit.template";
|
|
17
|
+
### END user customizable section
|
|
18
|
+
|
|
19
|
+
my $trainingFile = $ARGV[0];
|
|
20
|
+
my $folds = $ARGV[1];
|
|
21
|
+
|
|
22
|
+
# construct test data
|
|
23
|
+
open (IF, $trainingFile) || die "# $progname fatal\tTraining file cannot be opened \"$trainingFile\"!";
|
|
24
|
+
my $i = 0;
|
|
25
|
+
while (<IF>) {
|
|
26
|
+
open (OF, ">>$tmpfile.$i.test.src") || die "$progname fatal\tCan't append to file \"$tmpfile.$i.test.src\"!";
|
|
27
|
+
print OF $_;
|
|
28
|
+
$i++;
|
|
29
|
+
$i = $i % $folds;
|
|
30
|
+
}
|
|
31
|
+
close (IF);
|
|
32
|
+
for (my $i = 0; $i < $folds; $i++) {
|
|
33
|
+
`$tr2crfppLoc $tmpfile.$i.test.src> $tmpfile.$i.test`;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
# construct training data
|
|
37
|
+
for (my $i = 0; $i < $folds; $i++) {
|
|
38
|
+
for (my $j = 0; $j < $folds; $j++) {
|
|
39
|
+
if ($j == $i) {next; }
|
|
40
|
+
else {
|
|
41
|
+
`cat $tmpfile.$j.test >> $tmpfile.$i.train`;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
# train
|
|
47
|
+
for (my $i = 0; $i < $folds; $i++) {
|
|
48
|
+
my $cmd = "$crf_learnLoc -f 2 -c 3 $crfTemplateLoc $tmpfile.$i.train $tmpfile.$i.model ";
|
|
49
|
+
print "$cmd\n";
|
|
50
|
+
system ($cmd);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# test
|
|
54
|
+
for (my $i = 0; $i < $folds; $i++) {
|
|
55
|
+
my $cmd = "$crf_testLoc -m $tmpfile.$i.model $tmpfile.$i.test > $tmpfile.$i.out";
|
|
56
|
+
print "$cmd\n";
|
|
57
|
+
system ($cmd);
|
|
58
|
+
my $cmd = "cat $tmpfile.$i.out >> $tmpfile.all.out ";
|
|
59
|
+
print "$cmd\n";
|
|
60
|
+
system ($cmd);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
# eval
|
|
64
|
+
#for (my $i = 0; $i < $folds; $i++) {
|
|
65
|
+
# my $cmd = "$conllevalLoc -r -d \" \" < $tmpfile.$i.out";
|
|
66
|
+
# print "$cmd\n";
|
|
67
|
+
# system ($cmd);
|
|
68
|
+
#}
|
|
69
|
+
my $cmd = "$conllevalLoc -r -d \" \" < $tmpfile.all.out";
|
|
70
|
+
print "$cmd\n";
|
|
71
|
+
system ($cmd);
|
|
72
|
+
|
|
73
|
+
# clean up
|
|
74
|
+
`rm -f $tmpfile*`;
|
|
75
|
+
|
|
76
|
+
######################################################################
|
|
77
|
+
# .51
|
|
78
|
+
# on head (first 500 lines of tagged.txt)
|
|
79
|
+
# f=2, c=3 2fold: 92.86
|
|
80
|
+
# f=2, c=3 2fold (more unigram): 93.23
|
|
81
|
+
# 93.19 (with B features)
|
|
82
|
+
# 93.35 without B features
|
|
83
|
+
#
|
|
84
|
+
# on tagged.txt
|
|
85
|
+
# f=2, c=3 2fold cv: 95.24 / 93.99 => 94.61
|
|
86
|
+
# f=2, c=5 2fold cv: => 94.55
|
|
87
|
+
# f=2, c=3 2fold cv = 94.77
|
|
88
|
+
#
|
|
89
|
+
# .48
|
|
90
|
+
# on tagged.txt (cat of all *tagged.txt):
|
|
91
|
+
# normal, 2fold cv: 95.12 / 93.33
|
|
92
|
+
# c=1.5, 2fold cv: 95.14 / 93.38
|
|
93
|
+
# f=2, 2fold cv: 95.29 / 93.93
|
|
94
|
+
# f=2, c=1.5 2fold cv: 95.31 / 93.82
|
|
95
|
+
# f=2, c=3 2fold cv: 95.31 / 93.82
|
|
96
|
+
# f=3, 2fold cv: 95.25 / 93.69
|
|
97
|
+
#
|
|
98
|
+
#
|
|
99
|
+
# a=CRF-L1, f=2 2fold cv: 88.25 / 91.29
|
|
100
|
+
# a=CRF-L1 2fold cv: 80.63 / -- didn't complete
|
|
101
|
+
# a=MIRA 2fold cv: 94.48 / 92.69
|
|
102
|
+
# a=MIRA, f=2 2fold cv: 94.31 / 93.60
|
|
103
|
+
|
|
104
|
+
# 100326 .51 normal, 2fold cv, over all data (including iconip)
|
|
105
|
+
# accuracy: 94.83%; precision: 94.83%; recall: 94.83%; FB1: 94.83
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
#!/usr/bin/perl -wT
|
|
2
|
+
|
|
3
|
+
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Wed, 03 Mar 2010 00:36:36
|
|
4
|
+
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
+
|
|
6
|
+
require 5.0;
|
|
7
|
+
use strict;
|
|
8
|
+
use Getopt::Long;
|
|
9
|
+
|
|
10
|
+
# I do not know a better solution to find a lib path in -T mode.
|
|
11
|
+
# So if you know a better solution, I'd be glad to hear.
|
|
12
|
+
# See this http://www.perlmonks.org/?node_id=585299 for why I
|
|
13
|
+
# used the below code
|
|
14
|
+
use FindBin;
|
|
15
|
+
|
|
16
|
+
my $path;
|
|
17
|
+
BEGIN
|
|
18
|
+
{
|
|
19
|
+
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
use lib "$path/../lib";
|
|
23
|
+
|
|
24
|
+
use SectLabel::Config;
|
|
25
|
+
use SectLabel::Controller;
|
|
26
|
+
|
|
27
|
+
### USER customizable section
|
|
28
|
+
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
29
|
+
my $outputVersion = "1.0";
|
|
30
|
+
### END user customizable section
|
|
31
|
+
|
|
32
|
+
sub License {
|
|
33
|
+
print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
### HELP Sub-procedure
|
|
37
|
+
sub Help {
|
|
38
|
+
print STDERR "usage: $progname -h\t[invokes help]\n";
|
|
39
|
+
print STDERR " $progname -in inFile [-out outFile -no-xmlInput -no-xmlOutput -log -new]\n";
|
|
40
|
+
print STDERR "Options:\n";
|
|
41
|
+
print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
|
|
42
|
+
print STDERR "\t-out: indicate output file (if not specified output to STDOUT)\n";
|
|
43
|
+
print STDERR "\t-no-xmlInput: indicate that input is normal text file (default: assume XML file from Omnipage-multiple pages concatenated)\n";
|
|
44
|
+
print STDERR "\t-no-xmlOutput: do not wrap results in XML format (default: xmlOutput)\n";
|
|
45
|
+
print STDERR "\t-log: output debugging messages\n";
|
|
46
|
+
}
|
|
47
|
+
my $QUIET = 0;
|
|
48
|
+
my $HELP = 0;
|
|
49
|
+
my $inFile = undef;
|
|
50
|
+
my $outFile = undef;
|
|
51
|
+
my $isXmlInput = 1;
|
|
52
|
+
my $isXmlOutput = 1;
|
|
53
|
+
my $isDebug = 0;
|
|
54
|
+
my $isNew = 0; # if = 1, run processOmniXMLv2.pl instead of processOmniXML.pl
|
|
55
|
+
$HELP = 1 unless GetOptions('in=s' => \$inFile,
|
|
56
|
+
'out=s' => \$outFile,
|
|
57
|
+
'xmlInput!' => \$isXmlInput,
|
|
58
|
+
'xmlOutput!' => \$isXmlOutput,
|
|
59
|
+
'log' => \$isDebug,
|
|
60
|
+
'new' => \$isNew,
|
|
61
|
+
'h' => \$HELP,
|
|
62
|
+
'q' => \$QUIET);
|
|
63
|
+
|
|
64
|
+
if ($HELP || !defined $inFile) {
|
|
65
|
+
Help();
|
|
66
|
+
exit(0);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (!$QUIET) {
|
|
70
|
+
License();
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
### Untaint ###
|
|
74
|
+
$inFile = untaintPath($inFile);
|
|
75
|
+
my $envPath = $ENV{'PATH'};
|
|
76
|
+
$envPath = untaintPath($envPath);
|
|
77
|
+
$ENV{'PATH'} = $envPath;
|
|
78
|
+
### End untaint ###
|
|
79
|
+
|
|
80
|
+
my $modelFile = $isXmlInput? $SectLabel::Config::modelXmlFile : $SectLabel::Config::modelFile;
|
|
81
|
+
$modelFile = "$path/../$modelFile";
|
|
82
|
+
my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
|
|
83
|
+
$configFile = "$path/../$configFile";
|
|
84
|
+
|
|
85
|
+
if($isXmlInput){
|
|
86
|
+
my $xmlInFile = newTmpFile();
|
|
87
|
+
$xmlInFile = untaintPath($xmlInFile);
|
|
88
|
+
my $cmd = "$path/sectLabel/";
|
|
89
|
+
$cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
|
|
90
|
+
$cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
|
|
91
|
+
execute($cmd);
|
|
92
|
+
$inFile = $xmlInFile;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
my $dictFile = $SectLabel::Config::dictFile;
|
|
96
|
+
$dictFile = "$path/../$dictFile";
|
|
97
|
+
|
|
98
|
+
my $funcFile = $SectLabel::Config::funcFile;
|
|
99
|
+
$funcFile = "$path/../$funcFile";
|
|
100
|
+
my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
|
|
101
|
+
|
|
102
|
+
if($isXmlInput){
|
|
103
|
+
unlink($inFile);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (defined $outFile) {
|
|
107
|
+
$outFile = untaintPath($outFile);
|
|
108
|
+
|
|
109
|
+
open (OUT, ">:utf8", $outFile) or die "Could not open $outFile for writing: $!";
|
|
110
|
+
print OUT $$rXML;
|
|
111
|
+
close OUT;
|
|
112
|
+
} else {
|
|
113
|
+
print "$$rXML";
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
sub untaintPath {
|
|
117
|
+
my ($path) = @_;
|
|
118
|
+
|
|
119
|
+
if ( $path =~ /^([-_\/\w\.\d: ]+)$/ ) {
|
|
120
|
+
$path = $1;
|
|
121
|
+
} else {
|
|
122
|
+
die "Bad path $path\n";
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
return $path;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
sub untaint {
|
|
129
|
+
my ($s) = @_;
|
|
130
|
+
if ($s =~ /^([\w \-\@\(\),\.\/<>]+)$/) {
|
|
131
|
+
$s = $1; # $s now untainted (taken from the regex capture)
|
|
132
|
+
} else {
|
|
133
|
+
die "Bad data in $s"; # log this somewhere
|
|
134
|
+
}
|
|
135
|
+
return $s;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
sub execute {
|
|
139
|
+
my ($cmd) = @_;
|
|
140
|
+
print STDERR "Executing: $cmd\n";
|
|
141
|
+
$cmd = untaint($cmd);
|
|
142
|
+
system($cmd);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
sub newTmpFile {
|
|
146
|
+
my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
|
|
147
|
+
chomp($tmpFile);
|
|
148
|
+
return $tmpFile;
|
|
149
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
README for sectLabel module (v100401)
|
|
2
|
+
|
|
3
|
+
CONTENTS
|
|
4
|
+
[0] Directory structure
|
|
5
|
+
[1] Command line Usage
|
|
6
|
+
[1.1] SectLabel
|
|
7
|
+
[1.2] GenericSect
|
|
8
|
+
[3] Known issues
|
|
9
|
+
|
|
10
|
+
------------------------------------------------------------
|
|
11
|
+
[0] DIRECTORY STRUCTURE
|
|
12
|
+
|
|
13
|
+
* processOmniXML.pl: Process Omnipage XML output (concatenated results
|
|
14
|
+
from all pages of a PDF file), and extract text lines together with
|
|
15
|
+
other XML infos
|
|
16
|
+
Note: the current script is complicated since it mixes 2 things: processing Omnipage XML and extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into an internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
|
|
17
|
+
|
|
18
|
+
* redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
|
|
19
|
+
* tr2crfpp.pl: Generate SectLabel features for CRF++
|
|
20
|
+
* single2multi.pl: Convert SectLabel training file
|
|
21
|
+
(e.g. doc/sectLabel.tagged.txt) from single- to multi-line
|
|
22
|
+
format. This script is called by tr2crfpp.pl
|
|
23
|
+
* genericSectExtract.rb: given a list of section headers of a
|
|
24
|
+
scientific document in an input file, assign generic headers for the
|
|
25
|
+
section headers.
|
|
26
|
+
* genericSect/
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------
|
|
29
|
+
[1] COMMAND LINE USAGE
|
|
30
|
+
|
|
31
|
+
------------------------------
|
|
32
|
+
[1.1] SectLabel
|
|
33
|
+
* Process Omnipage XML output
|
|
34
|
+
|
|
35
|
+
** Usage: processOmniXML.pl -h [invokes help]
|
|
36
|
+
processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
|
|
37
|
+
Options:
|
|
38
|
+
-q Quiet Mode (don't echo license)
|
|
39
|
+
-xmlFeature: append XML feature together with text extracted
|
|
40
|
+
-decode: decode HTML entities and then output, to avoid double
|
|
41
|
+
entity encoding later
|
|
42
|
+
-tag tagFile: count XML tags/values for statistics
|
|
43
|
+
-markup: add factor infos (bold, italic etc) per word using
|
|
44
|
+
the format "word|||(b|nb)|||(i|ni)", useful in extracting
|
|
45
|
+
bold/italic phrases
|
|
46
|
+
|
|
47
|
+
* Perform stratified cross-validation
|
|
48
|
+
|
|
49
|
+
** Usage: redo.sectLabel.pl -h [invokes help]
|
|
50
|
+
redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
|
|
51
|
+
|
|
52
|
+
Options:
|
|
53
|
+
|
|
54
|
+
-in: training file in the format as in
|
|
55
|
+
doc/sectLabel.tagged.txt
|
|
56
|
+
-dir: output directory, containing all intermediate
|
|
57
|
+
files and outputs
|
|
58
|
+
-n: num of cross validation folds
|
|
59
|
+
-c: config file to extract features and automatically
|
|
60
|
+
generate CRF++ template
|
|
61
|
+
|
|
62
|
+
-p: CRF++ num of CPUs (default = 6)
|
|
63
|
+
-iter: CRF++ max iteration (default = 100)
|
|
64
|
+
-f: CRF++ frequency cut-off (default = 3)
|
|
65
|
+
|
|
66
|
+
** E.g.:
|
|
67
|
+
./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
|
|
68
|
+
-dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
|
|
69
|
+
|
|
70
|
+
* Extract features
|
|
71
|
+
|
|
72
|
+
** Usage: tr2crfpp.pl -h [invokes help]
|
|
73
|
+
tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
|
|
74
|
+
|
|
75
|
+
Options:
|
|
76
|
+
-q Quiet Mode (don't echo license)
|
|
77
|
+
-in inFile: labeled input file
|
|
78
|
+
-c configFile: to specify which feature set to use.
|
|
79
|
+
-out outFile: output file for CRF++ training.
|
|
80
|
+
-template: to output a template used by CRF++ according to the
|
|
81
|
+
config file.
|
|
82
|
+
-single: indicate that each input document is in single-line
|
|
83
|
+
format (e.g., ./doc/sectLabel.tagged.txt)
|
|
84
|
+
|
|
85
|
+
------------------------------
|
|
86
|
+
[1.2] GenericSect
|
|
87
|
+
* Create feature file
|
|
88
|
+
|
|
89
|
+
** Usage: ruby extractFeature.rb filePath
|
|
90
|
+
filePath: path to the labeled data file which lists the actual
|
|
91
|
+
section headers and their corresponding manually assigned generic
|
|
92
|
+
section headers (if it exists)
|
|
93
|
+
syntax: generic_header ||| actual_header
|
|
94
|
+
|
|
95
|
+
* Generate generic section headers for a document
|
|
96
|
+
|
|
97
|
+
** Usage: ruby genericSectExtract.rb filePath
|
|
98
|
+
|
|
99
|
+
where filePath is a file which lists the actual headers of a
|
|
100
|
+
document (automatically extracted by another module of SectLabel)
|
|
101
|
+
|
|
102
|
+
* Perform stratified cross-validation
|
|
103
|
+
|
|
104
|
+
** Usage: ruby crossValidation.rb dataFile numFold
|
|
105
|
+
|
|
106
|
+
Note that data file has the format as in doc/genericSect.tagged.txt
|
|
107
|
+
|
|
108
|
+
------------------------------------------------------------
|
|
109
|
+
[3] KNOWN ISSUES
|
|
110
|
+
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
README for sectLabel module (v100401)
|
|
2
|
+
|
|
3
|
+
CONTENTS
|
|
4
|
+
[0] Directory structure
|
|
5
|
+
[1] Command line Usage
|
|
6
|
+
[1.1] SectLabel
|
|
7
|
+
[1.2] GenericSect
|
|
8
|
+
[3] Known issues
|
|
9
|
+
|
|
10
|
+
------------------------------------------------------------
|
|
11
|
+
[0] DIRECTORY STRUCTURE
|
|
12
|
+
|
|
13
|
+
* processOmniXML.pl: Process Omnipage XML output (concatenated results
|
|
14
|
+
from all pages of a PDF file), and extract text lines together with
|
|
15
|
+
other XML infos
|
|
16
|
+
Note: the current script is complicated since it mixes 2 things: processing Omnipage XML and extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into an internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
|
|
17
|
+
|
|
18
|
+
* redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
|
|
19
|
+
* tr2crfpp.pl: Generate SectLabel features for CRF++
|
|
20
|
+
* single2multi.pl: Convert SectLabel training file
|
|
21
|
+
(e.g. doc/sectLabel.tagged.txt) from single- to multi-line
|
|
22
|
+
format. This script is called by tr2crfpp.pl
|
|
23
|
+
* genericSectExtract.rb: given a list of section headers of a
|
|
24
|
+
scientific document in an input file, assign generic headers for the
|
|
25
|
+
section headers.
|
|
26
|
+
* genericSect/
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------
|
|
29
|
+
[1] COMMAND LINE USAGE
|
|
30
|
+
|
|
31
|
+
------------------------------
|
|
32
|
+
[1.1] SectLabel
|
|
33
|
+
* Process Omnipage XML output
|
|
34
|
+
|
|
35
|
+
** Usage: processOmniXML.pl -h [invokes help]
|
|
36
|
+
processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
|
|
37
|
+
Options:
|
|
38
|
+
-q Quiet Mode (don't echo license)
|
|
39
|
+
-xmlFeature: append XML feature together with text extracted
|
|
40
|
+
-decode: decode HTML entities and then output, to avoid double
|
|
41
|
+
entity encoding later
|
|
42
|
+
-tag tagFile: count XML tags/values for statistics
|
|
43
|
+
-markup: add factor infos (bold, italic etc) per word using
|
|
44
|
+
the format "word|||(b|nb)|||(i|ni)", useful in extracting
|
|
45
|
+
bold/italic phrases
|
|
46
|
+
|
|
47
|
+
* Perform stratified cross-validation
|
|
48
|
+
|
|
49
|
+
** Usage: redo.sectLabel.pl -h [invokes help]
|
|
50
|
+
redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
|
|
51
|
+
|
|
52
|
+
Options:
|
|
53
|
+
|
|
54
|
+
-in: training file in the format as in
|
|
55
|
+
doc/sectLabel.tagged.txt
|
|
56
|
+
-dir: output directory, containing all intermediate
|
|
57
|
+
files and outputs
|
|
58
|
+
-n: num of cross validation folds
|
|
59
|
+
-c: config file to extract features and automatically
|
|
60
|
+
generate CRF++ template
|
|
61
|
+
|
|
62
|
+
-p: CRF++ num of CPUs (default = 6)
|
|
63
|
+
-iter: CRF++ max iteration (default = 100)
|
|
64
|
+
-f: CRF++ frequency cut-off (default = 3)
|
|
65
|
+
|
|
66
|
+
** E.g.:
|
|
67
|
+
./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
|
|
68
|
+
-dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
|
|
69
|
+
|
|
70
|
+
* Extract features
|
|
71
|
+
|
|
72
|
+
** Usage: tr2crfpp.pl -h [invokes help]
|
|
73
|
+
tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
|
|
74
|
+
|
|
75
|
+
Options:
|
|
76
|
+
-q Quiet Mode (don't echo license)
|
|
77
|
+
-in inFile: labeled input file
|
|
78
|
+
-c configFile: to specify which feature set to use.
|
|
79
|
+
-out outFile: output file for CRF++ training.
|
|
80
|
+
-template: to output a template used by CRF++ according to the
|
|
81
|
+
config file.
|
|
82
|
+
-single: indicate that each input document is in single-line
|
|
83
|
+
format (e.g., ./doc/sectLabel.tagged.txt)
|
|
84
|
+
|
|
85
|
+
------------------------------
|
|
86
|
+
[1.2] GenericSect
|
|
87
|
+
* Create feature file
|
|
88
|
+
|
|
89
|
+
** Usage: ruby extractFeature.rb filePath
|
|
90
|
+
filePath: path to the labeled data file which lists the actual
|
|
91
|
+
section headers and their corresponding manually assigned generic
|
|
92
|
+
section headers (if it exists)
|
|
93
|
+
syntax: generic_header ||| actual_header
|
|
94
|
+
|
|
95
|
+
* Generate generic section headers for a document
|
|
96
|
+
|
|
97
|
+
** Usage: ruby genericSectExtract.rb filePath
|
|
98
|
+
|
|
99
|
+
where filePath is a file which lists the actual headers of a
|
|
100
|
+
document (automatically extracted by another module of SectLabel)
|
|
101
|
+
|
|
102
|
+
* Perform stratified cross-validation
|
|
103
|
+
|
|
104
|
+
** Usage: ruby crossValidation.rb dataFile numFold
|
|
105
|
+
|
|
106
|
+
Note that data file has the format as in doc/genericSect.tagged.txt
|
|
107
|
+
|
|
108
|
+
------------------------------------------------------------
|
|
109
|
+
[3] KNOWN ISSUES
|
|
110
|
+
|