biblicit 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/biblicit.gemspec +0 -1
- data/lib/biblicit/extractor.rb +2 -7
- data/lib/biblicit/parscit.rb +18 -6
- data/lib/biblicit/version.rb +1 -1
- data/parscit/bin/citeExtract.pl +16 -4
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/Function.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/LoadInformation.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/MultiClassChunking.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/Parser.pm +21 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/ParserMethods.pm +0 -0
- data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/Config/API_Config.pm +11 -10
- data/{svm-header-parse/HeaderParseService → parscit/lib/HeaderParse}/README.TXT +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/50states +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AddrTopWords.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AffiTopWords.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AffiTopWordsAll.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/ChineseSurNames.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/Csurnames.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/Csurnames_spec.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/DomainSuffixes.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/LabeledHeader +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/README +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/TrainMulClassLines +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/TrainMulClassLines1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/abstract.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/abstractTopWords +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/addr.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/affi.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/affis.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/all_namewords_spec.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/allnamewords.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cities_US.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cities_world.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/city.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cityname.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/country_abbr.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/countryname.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/dateTopWords +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/degree.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/email.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/excludeWords.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/female-names +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstNames.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstnames.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstnames_spec.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/intro.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/keyword.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/keywordTopWords +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/male-names +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/middleNames.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/month.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.label +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.label.old +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.processed +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mulAuthor +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mulClassStat +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/nickname.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/nicknames.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/note.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/page.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/phone.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/postcode.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/pubnum.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/statename.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/statename.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/states_and_abbreviations.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/stopwords +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/stopwords.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surNames.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surnames.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surnames_spec.bin +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/A.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/B.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/C.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/D.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/E.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/F.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/G.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/H.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/I.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/J.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/K.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/L.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/M.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/N.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/O.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/P.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Q.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/R.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/S.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/T.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/U.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/V.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/W.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/WCSelect.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/X.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Y.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Z.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ae.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/am.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ar.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/at.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/au.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bd.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/be.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bg.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bh.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/blueribbon.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bm.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bn.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/br.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ca.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ch.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cl.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cn.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/co.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cy.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cz.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/de.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/dean-mainlink.jpg +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/dk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ec.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ee.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/eg.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/es.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/et.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/faq.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fi.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fj.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fo.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/geog.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/gr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/gu.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hu.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/id.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ie.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/il.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/in.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/is.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/it.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jm.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jo.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jp.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kaplan.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kw.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lb.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/linkbw2.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lt.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lu.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lv.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ma.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/maczynski.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mirror.tar +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mo.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mseawdm.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mt.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mx.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/my.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ni.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/nl.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/no.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/nz.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pa.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pe.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ph.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pl.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pointcom.gif +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ps.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pt.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/recognition.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/results.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ro.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ru.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sd.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/se.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sg.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/si.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/th.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/tr.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/tw.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ua.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/uk.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/univ-full.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/univ.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/uy.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ve.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/yu.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/za.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/zm.html +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/url.txt +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/webTopWords +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/words +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/10ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/10Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/11ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/11Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/12ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/12Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/13ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/13Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/14ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/14Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/15ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/15Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/1ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/1Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/2ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/2Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/3ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/3Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/4ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/4Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/5ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/5Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/6ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/6Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/7ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/7Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/8ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/8Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/9ContextModelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/9Modelfold1 +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/NameSpaceModel +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/NameSpaceTrainF +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperBaseFeaDict +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperContextFeaDict +0 -0
- data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperSpaceAuthorFeaDict +0 -0
- data/sh/convert_to_text.sh +2 -1
- metadata +267 -282
- data/lib/biblicit/citeseer.rb +0 -42
- data/svm-header-parse/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +0 -4
- data/svm-header-parse/extract.pl +0 -75
data/lib/biblicit/citeseer.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
require 'tmpdir'
|
4
|
-
require 'shellwords'
|
5
|
-
require 'nokogiri'
|
6
|
-
|
7
|
-
module CiteSeer
|
8
|
-
|
9
|
-
PERL_DIR = "#{File.dirname(__FILE__)}/../../svm-header-parse"
|
10
|
-
|
11
|
-
def self.extract(in_file, opts={})
|
12
|
-
ParseOperation.new(in_file).result
|
13
|
-
end
|
14
|
-
|
15
|
-
class ParseOperation
|
16
|
-
|
17
|
-
attr_reader :result
|
18
|
-
|
19
|
-
def initialize(in_file)
|
20
|
-
Dir.mktmpdir do |out_dir|
|
21
|
-
`#{PERL_DIR}/extract.pl #{in_file.path} #{out_dir}`
|
22
|
-
output = IO.read("#{out_dir}/out.header")
|
23
|
-
xml = Nokogiri::XML output
|
24
|
-
@result = parse(xml)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def parse(xml)
|
31
|
-
{
|
32
|
-
title: xml.css('title').text,
|
33
|
-
authors: xml.css('author > name').map { |n| n.text.strip }.reject(&:blank?).uniq,
|
34
|
-
author_emails: xml.css('author > email').map { |n| n.text.strip }.reject(&:blank?).uniq,
|
35
|
-
abstract: xml.css('abstract').text,
|
36
|
-
valid: xml.css('validHeader').first.text == '1',
|
37
|
-
}
|
38
|
-
end
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
@@ -1,140 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright 2007 Penn State University
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
# you may not use this file except in compliance with the License.
|
5
|
-
# You may obtain a copy of the License at
|
6
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
-
# Unless required by applicable law or agreed to in writing, software
|
8
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
-
# See the License for the specific language governing permissions and
|
11
|
-
# limitations under the License.
|
12
|
-
#
|
13
|
-
package CSXUtil::SafeText;
|
14
|
-
##
|
15
|
-
## Methods for stripping bad (XML unsafe) characters
|
16
|
-
## from strings and performing basic HTML entity
|
17
|
-
## translations. Also contains a utility (stripArtifacts)
|
18
|
-
## for getting rid of crazy control characters and
|
19
|
-
## other things that probably aren't proper text.
|
20
|
-
##
|
21
|
-
## Isaac Councill, 12/06/06
|
22
|
-
##
|
23
|
-
#######################################################
|
24
|
-
##
|
25
|
-
use strict;
|
26
|
-
use utf8;
|
27
|
-
require Exporter;
|
28
|
-
|
29
|
-
# Package globals consumed by Exporter.
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

$VERSION = 1.00;

@ISA = qw(Exporter);

# Nothing is exported by default; callers ask for what they need.
#
# %htmlSpecialCharEncodings and %htmlSpecialCharDecodings are the maps
# this module actually defines, so they are importable by name.
# %htmlSpecialChars and %htmlCharEntities stay in the list only so that
# existing import lists keep loading; nothing in this module fills them.
@EXPORT_OK = qw(@badChars
                %htmlSpecialCharEncodings %htmlSpecialCharDecodings
                %htmlSpecialChars %htmlCharEntities
                &stripBadChars
                &encodeHTMLSpecialChars
                &decodeHTMLSpecialChars
                &cleanXML &cleanAll &stripArtifacts);
|
39
|
-
|
40
|
-
|
41
|
-
##
|
42
|
-
#######################################################
|
43
|
-
##
|
44
|
-
## Sharable encoding data.
|
45
|
-
##
|
46
|
-
|
47
|
-
## Regex escapes, stored as literal text such as '\x0B', for the
## characters that should never be put into XML.  Tab (\x09), line
## feed (\x0A) and carriage return (\x0D) are not in the list.
our @badChars = map { sprintf '\x%02X', $_ }
    (0x00 .. 0x08, 0x0B, 0x0C, 0x0E .. 0x1F, 0x7F);
|
53
|
-
|
54
|
-
## Characters that are problematic in XML, mapped to the entity text
## that replaces them.  This is not a complete list of HTML special
## characters; more mappings can be added as needed.
our %htmlSpecialCharEncodings = (
    '&' => '&amp;',
    '>' => '&gt;',
    '<' => '&lt;',
    '"' => '&quot;',
);
|
63
|
-
|
64
|
-
## The reverse map: each value of %htmlSpecialCharEncodings becomes a
## key that points back at the character it stands for.
our %htmlSpecialCharDecodings;
@htmlSpecialCharDecodings{ values %htmlSpecialCharEncodings }
    = keys %htmlSpecialCharEncodings;
|
70
|
-
|
71
|
-
|
72
|
-
##
|
73
|
-
#######################################################
|
74
|
-
##
|
75
|
-
## Subroutines
|
76
|
-
##
|
77
|
-
|
78
|
-
## Deletes every occurrence of each @badChars entry from the string
## behind the supplied scalar reference.  The string is edited in
## place; callers in this module ignore the return value.
sub stripBadChars {
    my ($text_ref) = @_;
    # Each entry is interpolated into the pattern as a regex escape.
    ${$text_ref} =~ s/$_//g for @badChars;
}
|
86
|
-
|
87
|
-
|
88
|
-
## Replaces each character that is a key of %htmlSpecialCharEncodings
## with its entity text.  The argument is a reference to the string,
## which is edited in place.
##
## All replacements happen in one pass over the text.  Substituting one
## key at a time in hash order is not safe: if '<' is turned into '&lt;'
## before '&' is handled, the result is '&amp;lt;'.
sub encodeHTMLSpecialChars {
    my $rtext = shift;

    # With no mappings there is nothing to do (and an empty alternation
    # would match at every position).
    return unless %htmlSpecialCharEncodings;

    # Longest keys first so a longer key is never shadowed by a prefix.
    my $pattern = join '|',
        map  { quotemeta($_) }
        sort { length($b) <=> length($a) || $a cmp $b }
        keys %htmlSpecialCharEncodings;

    $$rtext =~ s/($pattern)/$htmlSpecialCharEncodings{$1}/g;
    return;
}
|
97
|
-
|
98
|
-
|
99
|
-
## Replaces each entity that is a key of %htmlSpecialCharDecodings with
## the character it stands for.  The argument is a reference to the
## string, which is edited in place.
##
## All replacements happen in one pass over the text.  Substituting one
## entity at a time in hash order is not safe: if '&amp;' is decoded
## first, '&amp;lt;' becomes '&lt;' and is then decoded again to '<'.
sub decodeHTMLSpecialChars {
    my $rtext = shift;

    # With no mappings there is nothing to do (and an empty alternation
    # would match at every position).
    return unless %htmlSpecialCharDecodings;

    # Longest entities first so a longer one is never shadowed by a prefix.
    my $pattern = join '|',
        map  { quotemeta($_) }
        sort { length($b) <=> length($a) || $a cmp $b }
        keys %htmlSpecialCharDecodings;

    $$rtext =~ s/($pattern)/$htmlSpecialCharDecodings{$1}/g;
    return;
}
|
109
|
-
|
110
|
-
|
111
|
-
## Removes, in place, every character of the referenced string that is
## not alphanumeric, punctuation, whitespace or a symbol.
##
sub stripArtifacts {
    my ($text_ref) = @_;
    my $unwanted = qr/[^\p{IsAlnum}\p{IsPunct}\p{IsSpace}\p{IsS}]/;
    ${$text_ref} =~ s/$unwanted//g;
}
|
118
|
-
|
119
|
-
|
120
|
-
## Runs both XML safety routines on the referenced string in a single
## call: stripBadChars first, then encodeHTMLSpecialChars.
##
sub cleanXML {
    my ($text_ref) = @_;
    stripBadChars($text_ref);
    return encodeHTMLSpecialChars($text_ref);
}
|
128
|
-
|
129
|
-
|
130
|
-
## Full clean-up of the referenced string: stripBadChars, then
## stripArtifacts, then encodeHTMLSpecialChars, in that order.
##
sub cleanAll {
    my ($text_ref) = @_;
    stripBadChars($text_ref);
    stripArtifacts($text_ref);
    return encodeHTMLSpecialChars($text_ref);
}
|
138
|
-
|
139
|
-
|
140
|
-
1;
|
data/svm-header-parse/extract.pl
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
#!/usr/bin/perl -CSD
|
2
|
-
use strict;
|
3
|
-
use FindBin;
|
4
|
-
|
5
|
-
use lib "$FindBin::Bin/HeaderParseService/lib";
|
6
|
-
|
7
|
-
use HeaderParse::API::Parser;
|
8
|
-
use HeaderParse::Config::API_Config;
|
9
|
-
|
10
|
-
# Exactly two arguments are required: the input path and the output path.
unless (@ARGV == 2) {
    print "Usage: ./extract.pl path_to_input path_to_output\n";
    exit 1;
}

# $outputPath is also read by extractHeader further down in this file.
my ($inputPath, $outputPath) = @ARGV;

import($inputPath, $outputPath);

exit;
|
23
|
-
|
24
|
-
# Creates the output directory $id, runs prep() on $filePath and
# reports the outcome: the directory name on STDOUT when prep returns
# status 1, or "$id: <message>" on STDERR when it reports a failure.
#
# prep() and extractHeader() both treat any status <= 0 as a failure,
# so negative statuses are reported here as well instead of being
# dropped silently.
sub import {
    my ($filePath, $id) = @_;

    system("mkdir", "-p", "$id");
    # Nothing can be written without the directory, so stop early with
    # a message rather than running the parser for nothing.
    unless (-d $id) {
        print STDERR "$id: Unable to create output directory\n";
        return;
    }

    my ($status, $msg) = prep($filePath, $id);
    if ($status <= 0) {
        print STDERR "$id: $msg\n";
    }
    elsif ($status == 1) {
        print STDOUT "$id\n";
    }
    return;
}
|
37
|
-
|
38
|
-
|
39
|
-
# Runs extractHeader on $textFile for output directory $id.
# Returns extractHeader's (status, message) pair when its status is
# zero or negative, and (1, "") otherwise.
sub prep {
    my ($textFile, $id) = @_;

    my ($ehstatus, $msg) = extractHeader($textFile, $id);

    return $ehstatus <= 0 ? ($ehstatus, $msg) : (1, "");
}
|
49
|
-
|
50
|
-
# Parses the header of $textFile and writes the resulting XML to
# "$id/out.header".
#
# Returns (1) on success.  When the parser reports a status <= 0 its
# (status, message) pair is returned unchanged; when the output file
# cannot be opened or written, (0, message) is returned.
sub extractHeader {
    my ($textFile, $id) = @_;

    # Pick a random job id that does not collide with an existing file.
    # $offlineD is not declared in this script; presumably it is exported
    # by HeaderParse::Config::API_Config -- TODO confirm.
    my $jobID;
    while ($jobID = rand(time)) {
        last unless -f $offlineD . "$jobID";
    }

    my ($status, $msg, $rXML) =
        HeaderParse::API::Parser::_parseHeader($textFile, $jobID);

    return ($status, $msg) if $status <= 0;

    # $id is the output directory handed down by the caller; the script
    # passes its output path through import() and prep() to this argument.
    open(my $head, ">:utf8", "$id/out.header")
        or return (0, "Unable to open header file: $!");

    # Buffered write errors may only surface at close, so check both.
    unless (print {$head} $$rXML) {
        my $err = $!;
        close($head);
        return (0, "Unable to write header file: $err");
    }
    close($head)
        or return (0, "Unable to write header file: $!");

    return (1);
}
|