biblicit 1.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
|
@@ -0,0 +1,1041 @@
|
|
|
1
|
+
package ParsCit::PreProcess;
|
|
2
|
+
|
|
3
|
+
###
|
|
4
|
+
# Utilities for finding and normalizing citations within
|
|
5
|
+
# text files, including separating citation text from
|
|
6
|
+
# body text and segmenting citations.
|
|
7
|
+
#
|
|
8
|
+
# Isaac Councill, 7/19/07
|
|
9
|
+
###
|
|
10
|
+
|
|
11
|
+
use utf8;
|
|
12
|
+
use strict;
|
|
13
|
+
|
|
14
|
+
use Omni::Config;
|
|
15
|
+
use ParsCit::Citation;
|
|
16
|
+
|
|
17
|
+
# Regular-expression fragments, keyed by marker type name, describing the
# marker that opens an entry of the reference list.  SplitCitationsByMarker
# interpolates the fragment of the selected type into its line-matching regex.
my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
                     'PAREN' => '\\(.+?\\)',
                     'NAKEDNUM' => '\\d+',
                     'NAKEDNUMDOT' => '\\d+\\.',
                     #'NAKEDNUM' => '\\d{1,3}', # Modified by Artemy Kolchinsky (v090625)
                     #'NAKEDNUMDOT' => '\\d{1,3}\\.' # Modified by Artemy Kolchinsky (v090625)
                   );

# Omnilib configuration: object name
my $obj_list = $Omni::Config::obj_list;
|
|
27
|
+
|
|
28
|
+
###
|
|
29
|
+
# Huydhn: similar to findCitationText, find the citation portion using regular expression.
|
|
30
|
+
# However the input is an omnipage xml document object, not the raw text
|
|
31
|
+
###
|
|
32
|
+
sub FindCitationTextXML
{
    # Locate the reference section inside a document object whose content is
    # organised as pages -> columns -> paragraphs -> lines (each level is
    # reached through get_objs_ref()).
    #
    # Returns (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs) where the
    # two hashes hold the page/column/paragraph/line indices under the keys
    # 'L1'..'L4' and @cit_addrs lists the address of every reference line.
    # When no section header is found, or the section is implausibly long,
    # only the first three values are returned.
    my ($doc) = @_;

    my @cit_addrs = ();
    my %start_ref = ();
    my %end_ref   = ();

    my $header_re = qr/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*$/;
    my $end_re    = qr/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)$/;

    # Index of the last child at each of the next $depth levels below $node,
    # always following the last child on the way down.
    my $last_indices = sub
    {
        my ($node, $depth) = @_;

        my @indices = ();
        while ($depth-- > 0)
        {
            my $children = $node->get_objs_ref();
            push @indices, scalar(@{ $children }) - 1;
            $node = $children->[ -1 ] if $depth > 0;
        }
        return @indices;
    };

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # Walk the document backwards so that the last section header wins; the
    # reference text starts on the line right after that header.
    HEADER_SCAN:
    for my $x (reverse 0 .. $#{ $pages })
    {
        my $columns = $pages->[ $x ]->get_objs_ref();

        for my $y (reverse 0 .. $#{ $columns })
        {
            my $paras = $columns->[ $y ]->get_objs_ref();

            for my $z (reverse 0 .. $#{ $paras })
            {
                my $lines = $paras->[ $z ]->get_objs_ref();

                for my $t (reverse 0 .. $#{ $lines })
                {
                    next unless $lines->[ $t ]->get_content() =~ $header_re;

                    if ($t < $#{ $lines })
                    {
                        # Next line of the same paragraph
                        %start_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t + 1);
                    }
                    elsif ($z < $#{ $paras })
                    {
                        # First line of the next paragraph
                        %start_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z + 1, 'L4' => 0);
                    }
                    elsif ($y < $#{ $columns })
                    {
                        # First line of the next column
                        %start_ref = ('L1' => $x, 'L2' => $y + 1, 'L3' => 0, 'L4' => 0);
                    }
                    elsif ($x < $#{ $pages })
                    {
                        # First line of the next page
                        %start_ref = ('L1' => $x + 1, 'L2' => 0, 'L3' => 0, 'L4' => 0);
                    }
                    # Otherwise the header is the very last line of the
                    # document: %start_ref stays empty.

                    last HEADER_SCAN;
                }
            }
        }
    }

    # Reference length
    my $reference_length = 0;
    # Citation
    my $reference_text   = "";

    # Reference not found
    return (\%start_ref, \%end_ref, \$reference_text) if ! exists $start_ref{ 'L1' };

    my ($first_page, $first_column, $first_para, $first_line) = @start_ref{ 'L1', 'L2', 'L3', 'L4' };

    # Walk forward from the start position, collecting lines until a line that
    # looks like the heading of another section is met.
    REFERENCE_SCAN:
    for my $x ($first_page .. $#{ $pages })
    {
        my $columns       = $pages->[ $x ]->get_objs_ref();
        my $on_first_page = ($x == $first_page);

        for my $y (($on_first_page ? $first_column : 0) .. $#{ $columns })
        {
            my $paras           = $columns->[ $y ]->get_objs_ref();
            my $in_first_column = ($on_first_page && ($y == $first_column));

            for my $z (($in_first_column ? $first_para : 0) .. $#{ $paras })
            {
                my $lines         = $paras->[ $z ]->get_objs_ref();
                my $in_first_para = ($in_first_column && ($z == $first_para));

                for my $t (($in_first_para ? $first_line : 0) .. $#{ $lines })
                {
                    my $ln_content = $lines->[ $t ]->get_content();

                    if ($ln_content =~ $end_re)
                    {
                        # The reference section ends on the line just before
                        # this one; find that line's address.
                        if ($t > 0)
                        {
                            %end_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t - 1);
                        }
                        elsif ($z > 0)
                        {
                            @end_ref{ 'L1', 'L2', 'L3' } = ($x, $y, $z - 1);
                            ($end_ref{ 'L4' })           = $last_indices->($paras->[ $z - 1 ], 1);
                        }
                        elsif ($y > 0)
                        {
                            @end_ref{ 'L1', 'L2' } = ($x, $y - 1);
                            @end_ref{ 'L3', 'L4' } = $last_indices->($columns->[ $y - 1 ], 2);
                        }
                        elsif ($x > 0)
                        {
                            $end_ref{ 'L1' }             = $x - 1;
                            @end_ref{ 'L2', 'L3', 'L4' } = $last_indices->($pages->[ $x - 1 ], 3);
                        }
                        # Otherwise the end marker is the first line of the
                        # document: %end_ref stays empty and the fallback
                        # below applies.

                        last REFERENCE_SCAN;
                    }

                    # Not an end marker, so the line belongs to the references
                    push @cit_addrs, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };

                    $reference_length += length($ln_content);
                    $reference_text   .= $ln_content . "\n";
                }
            }
        }
    }

    # End of the reference not found, assume that it is the end of the document
    if (! exists $end_ref{ 'L1' })
    {
        $end_ref{ 'L1' }             = scalar(@{ $pages }) - 1;
        @end_ref{ 'L2', 'L3', 'L4' } = $last_indices->($pages->[ -1 ], 3);
    }

    # Odd case: the reference section is too large compared with the whole
    # document, so the detection is discarded.
    if (1.8 * $reference_length >= 0.8 * length($doc->get_content()))
    {
        print STDERR "Citation text longer than article body: ignoring\n";

        %start_ref      = ();
        %end_ref        = ();
        $reference_text = "";
        return (\%start_ref, \%end_ref, \$reference_text);
    }

    # Now we have the citation text
    return (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs);
}
|
|
263
|
+
|
|
264
|
+
###
|
|
265
|
+
# Looks for reference section markers in the supplied text and
|
|
266
|
+
# separates the citation text from the body text based on these
|
|
267
|
+
# indicators. If it looks like there is a reference section marker
|
|
268
|
+
# too early in the document, this procedure will try to find later
|
|
269
|
+
# ones. If the final reference section is still too long, an empty
|
|
270
|
+
# citation text string will be returned. Returns references to
|
|
271
|
+
# the citation text, normalized body text, and original body text.
|
|
272
|
+
###
|
|
273
|
+
sub FindCitationText
{
    # Split raw text into body text and citation text at the last reference
    # section header.  Returns three scalar references: the citation text,
    # the normalized body text and the original body text.  $pos_array is
    # handed on to NormalizeBodyText unchanged.
    my ($rtext, $pos_array) = @_;

    my $text     = $$rtext;
    my $bodytext = "";
    my $citetext = "";

    # The header alternatives were corrected by Cheong Chi Hong
    # <chcheong@cse.cuhk.edu.hk> (2 Feb 2010) and by Huy Do (15 Jan 2011).
    # Every match overwrites the previous split, so the last header wins.
    while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*\n+/sg)
    {
        my $split_at = pos($text);

        $bodytext = substr($text, 0, $split_at);
        if ($split_at >= 1) { $citetext = substr($text, $split_at); }
    }

    # No citation
    if ($citetext eq "")
    {
        print STDERR "Citation text cannot be found: ignoring", "\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Odd case: the citation text is too long compared with the body text
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Citation stops when another section starts: keep only the text in
    # front of the first heading-like line, unless that text is empty.
    my ($before_next_section) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citetext);

    $citetext = $before_next_section if length($before_next_section) > 0;

    # No citation exists
    if (! defined $citetext || $citetext eq '0')
    {
        print STDERR "warning: no citation text found\n";
    }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
|
|
323
|
+
|
|
324
|
+
###
|
|
325
|
+
# Huydhn: find citation section in raw text
|
|
326
|
+
# This function is used exclusively when the citation
|
|
327
|
+
# section is provided by sectlabel
|
|
328
|
+
sub FindCitationText2
{
    # Build citation and body text from raw text when the indices of the
    # citation lines are already known ($rcit_lines).  Returns the same three
    # scalar references as FindCitationText.
    my ($rtext, $rcit_lines, $pos_array) = @_;

    # All lines in the document
    my @lines = split(/\n/, $$rtext);

    # Citation text: every listed line, in the listed order
    my $citetext = join("", map { $_ . "\n" } @lines[ @{ $rcit_lines } ]);

    # Body text: every line in front of the first listed citation line
    my $bodytext = join("", map { $_ . "\n" } @lines[ 0 .. $rcit_lines->[ 0 ] - 1 ]);

    # Odd case: the citation text is too long compared with the body text
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
|
|
361
|
+
|
|
362
|
+
##
|
|
363
|
+
# Removes lines that appear to be junk from the citation text.
|
|
364
|
+
##
|
|
365
|
+
sub NormalizeCiteText
{
    # Trim every line of the citation text and drop the junk ones.  Takes and
    # returns a reference to a string.
    #
    # A line is junk when it is empty after trimming, or when it consists of
    # digits only and the previous line does not end with a hyphen (Artemy
    # Kolchinsky, v090625: a page range broken as "p. 23-" / "85" must keep
    # its second half).  Trimming was added by Thang (v090625).
    my ($rcitetext) = @_;

    my @kept     = ();
    my $previous = "";

    foreach my $raw_line (split "\n", $$rcitetext)
    {
        my $line = $raw_line;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;

        my $is_junk = ($line =~ m/^\s*$/) || ($previous !~ m/\-$/ && $line =~ m/^\d*$/);

        # The previous line is tracked even when it gets dropped
        $previous = $line;

        push @kept, $line unless $is_junk;
    }

    my $newtext = join "\n", @kept;
    return \$newtext;
}
|
|
405
|
+
|
|
406
|
+
###
|
|
407
|
+
# Thang May 2010
|
|
408
|
+
# Address the problem Nick mentioned in method normalizeBodyText()
|
|
409
|
+
# This method handles multiple bracket references in a line, e.g "abc [1, 2-5, 11] def [1-3, 5] ghi jkl"
|
|
410
|
+
# + this method maps the position of tokens in normalized body text --> positions of tokens in body text (for later retrieve context positions)
|
|
411
|
+
###
|
|
412
|
+
sub ExpandBracketMarker
{
    # Expand numeric ranges inside bracket markers of one line, e.g.
    # "abc [1, 2-5, 11] def" becomes "abc [1, 2, 3, 4, 5, 11] def".
    #
    # Parameters:
    #   $line        - the text line to process
    #   $pos_array   - array reference; one entry is appended for every
    #                  whitespace-separated token of the returned line, giving
    #                  the index of the token of the input text it stems from
    #   $token_count - index of the first token of $line in the input text
    #
    # Returns ($newline, $token_count): the expanded line and the token index
    # at which the next line starts.
    my ($line, $pos_array, $token_count) = @_;

    my $newline    = "";
    my $space_flag = 0;

    # The lists in front of and behind the range are captured as a whole.
    # A quantified capture group such as (\d+[,;] *)* only remembers its last
    # iteration, which silently dropped numbers from markers like
    # "[1, 2, 5-7, 9, 10]".
    my $marker_re = qr/\[((?:\d+[,;] *)*)(\d+)-(\d+)((?:[,;] *\d+)*)\]/;

    # $line is cut down to the text behind the marker on every iteration, so
    # the search always restarts at the beginning of the remaining text.
    while ($line =~ $marker_re)
    {
        my ($prefix, $low, $high, $suffix) = ($1, $2, $3, $4);

        # Offsets are used instead of $`, $& and $' to avoid their cost
        my $front = substr($line, 0, $-[0]);
        my $match = substr($line, $-[0], $+[0] - $-[0]);
        $line     = substr($line, $+[0]);

        # Handle front part
        if ($space_flag == 1) { $newline .= " "; }
        $newline .= $front;

        # The last token of the front part only counts as a token of its own
        # when a blank separates it from the marker
        my @tokens   = split(/\s+/, $front);
        my $complete = scalar(@tokens);
        if ($complete > 0 && $front !~ / $/) { $complete--; }
        for (1 .. $complete) { push(@{ $pos_array }, $token_count++); }

        # Handle match part
        my $num_new_tokens = $high - $low;
        if ($num_new_tokens > 0)
        {
            $match = "[" . $prefix . TransformMarker($low, $high) . $suffix . "]";
        }
        else
        {
            # Not an ascending range: leave the marker untouched
            $num_new_tokens = 0;
        }
        $newline .= $match;

        @tokens = split(/\s+/, $match);
        my $length            = scalar(@tokens);
        my $followed_by_space = ($line =~ /^ /) ? 1 : 0;

        for (my $i = 0; $i < $length; $i++)
        {
            # The last token is shared with the following text unless a blank
            # comes after the marker
            next unless ($i < ($length - 1) || $followed_by_space);

            if ($i >= ($length - $num_new_tokens - 1) && $i < ($length - 1))
            {
                # Tokens created by the expansion do not advance the
                # position in the input text
                push(@{ $pos_array }, $token_count);
            }
            else
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        if ($followed_by_space)
        {
            $space_flag = 1;
            $line =~ s/^\s+//;
        }
        else
        {
            $space_flag = 0;
        }
    }

    # Remaining text behind the last marker
    if ($space_flag == 1) { $newline .= " "; }
    $newline .= $line;

    my @remaining_tokens = split(/\s+/, $line);
    for (1 .. scalar(@remaining_tokens)) { push(@{ $pos_array }, $token_count++); }

    return ($newline, $token_count);
}
|
|
510
|
+
|
|
511
|
+
###
|
|
512
|
+
# Removes lines that appear to be junk from the body text,
|
|
513
|
+
# de-hyphenates words where a hyphen occurs at the end of
|
|
514
|
+
# a line, and normalizes strings of blank spaces to only
|
|
515
|
+
# single blanks.
|
|
516
|
+
#
|
|
517
|
+
# HISTORY: Nick (v081201)
|
|
518
|
+
#
|
|
519
|
+
# In some publications markers with a range such as [1-5] or [1-12, 16]
|
|
520
|
+
# are used. ParsCit cannot find these markers. I added a simple
|
|
521
|
+
# workaround to PreProcess::normalizeBodyText. The markers with range
|
|
522
|
+
# are replaced by markers containing every number of the range
|
|
523
|
+
# (e.g. [1-5] replaced by [1, 2, 3, 4, 5]).
|
|
524
|
+
###
|
|
525
|
+
sub NormalizeBodyText
{
    # Join the lines of the body text into one string: blank lines are
    # dropped, range markers are expanded by ExpandBracketMarker, words
    # hyphenated at a line end are merged and runs of white space collapse to
    # one blank.  Token positions reported by ExpandBracketMarker are appended
    # to @$pos_array.  Takes and returns a reference to a string.
    my ($rtext, $pos_array) = @_;

    my @lines       = split "\n", $$rtext;
    my $text        = "";
    my $token_count = 0;

    foreach my $line (@lines)
    {
        # Thang May 2010: strip leading spaces
        $line =~ s/^\s+//;

        # Thang May 2010: expand markers and collect the token positions
        my @tmp_pos_array = ();
        ($line, $token_count) = ExpandBracketMarker($line, \@tmp_pos_array, $token_count);

        # Sanity check: exactly one position per token of the expanded line
        my @tokens = split(/\s+/, $line);
        if (scalar(@tokens) != scalar(@tmp_pos_array))
        {
            die "scalar(@tokens) != scalar(@tmp_pos_array)\n$line\n";
        }

        next if $line =~ m/^\s*$/;

        # Artemy Kolchinsky (v090625): a hyphen is only removed when it
        # follows a letter, hyphens after numbers are kept.
        my $dehyphenated = ($text =~ s/([A-Za-z])\-$/$1/);

        if ($dehyphenated)
        {
            # The first token of this line completes the previous word
            shift(@tmp_pos_array);
        }
        elsif ($text ne "" && $text !~ m/\-\s+$/)
        {
            # Thang May 2010: m/\-\s*$/ was changed to m/\-\s+$/
            $text .= " ";
        }

        $text .= $line;

        push(@{ $pos_array }, @tmp_pos_array);
    }

    $text =~ s/\s{2,}/ /g;
    return \$text;
}
|
|
573
|
+
|
|
574
|
+
#
|
|
575
|
+
sub TransformMarker
{
    # Spell out the range $first_number-$second_number as a comma separated
    # list, e.g. (2, 5) gives "2, 3, 4, 5".  When the second number is not
    # larger than the first one, only the first number is returned.
    my ($first_number, $second_number) = @_;

    my $expanded = $first_number;
    my $value    = $first_number + 1;

    while ($value <= $second_number)
    {
        $expanded .= ", " . $value;
        $value++;
    }

    return $expanded;
}
|
|
583
|
+
|
|
584
|
+
###
|
|
585
|
+
# Controls the process by which citations are segmented, based
|
|
586
|
+
# on the result of trying to guess the type of citation marker
|
|
587
|
+
# used in the reference section. Returns a reference to a list
|
|
588
|
+
# of citation objects.
|
|
589
|
+
###
|
|
590
|
+
sub SegmentCitations
{
    # Segment the citation text (passed by reference) and return what the
    # chosen splitter returns: the marker based one when GuessMarkerType
    # recognises a marker type, the heuristic one when it answers 'UNKNOWN'.
    my ($rcite_text) = @_;

    my $marker_type = GuessMarkerType($rcite_text);

    my $rcitations = ($marker_type eq 'UNKNOWN')
                   ? SplitUnmarkedCitations($rcite_text)
                   : SplitCitationsByMarker($rcite_text, $marker_type);

    return $rcitations;
}
|
|
608
|
+
|
|
609
|
+
###
|
|
610
|
+
# Segments citations that have explicit markers in the
|
|
611
|
+
# reference section. Whenever a new line starts with an
|
|
612
|
+
# expression that matches what we'd expect of a marker,
|
|
613
|
+
# a new citation is started. Returns a reference to a
|
|
614
|
+
# list of citation objects.
|
|
615
|
+
###
|
|
616
|
+
sub SplitCitationsByMarker
{
    # Cut the citation text (passed by reference) into citations.  A line
    # that begins with a marker of type $marker_type opens a new citation,
    # every other line continues the current one.  Returns a reference to an
    # array of ParsCit::Citation objects.
    my ($rcite_text, $marker_type) = @_;

    my @citations               = ();
    my $current_citation        = ParsCit::Citation->new();
    my $current_citation_string = undef;

    # Leading marker, then the rest of the line
    my $entry_re = qr/^\s*($marker_types{ $marker_type })\s*(.*)$/;

    # TODO: Might want to add a check that marker number is
    # increasing as we'd expect, if the marker is numeric.

    foreach my $line (split "\n", $$rcite_text)
    {
        if ($line =~ $entry_re)
        {
            my ($marker, $cite_string) = ($1, $2);

            # Close the citation collected so far
            if (defined $current_citation_string)
            {
                $current_citation->setString($current_citation_string);
                push @citations, $current_citation;
                $current_citation_string = undef;
            }

            # Open the next one
            $current_citation = ParsCit::Citation->new();
            $current_citation->setMarkerType($marker_type);
            $current_citation->setMarker($marker);
            $current_citation_string = $cite_string;

            next;
        }

        # Continuation line.  Artemy Kolchinsky (v090625): the hyphen is only
        # removed when it follows a letter, hyphens after numbers are kept.
        if ((defined $current_citation_string) && ($current_citation_string =~ m/[A-Za-z]\-$/))
        {
            # Merge words when lines are hyphenated
            $current_citation_string =~ s/\-$//;
        }
        elsif ((! defined $current_citation_string) || ($current_citation_string !~ m/\-\s*$/))
        {
            $current_citation_string .= " ";
        }

        $current_citation_string .= $line;
    }

    # Last citation
    if (defined $current_citation && defined $current_citation_string)
    {
        $current_citation->setString($current_citation_string);
        push @citations, $current_citation;
    }

    # Now, we have an array of separated citations
    return \@citations;
}
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
###
|
|
681
|
+
# Uses several heuristics to decide where individual citations
|
|
682
|
+
# begin and end based on the length of previous lines, strings
|
|
683
|
+
# that look like author lists, and punctuation. Returns a
|
|
684
|
+
# reference to a list of citation objects.
|
|
685
|
+
#
|
|
686
|
+
# HISTORY: Modified in 081201 by Nick and J\"{o}ran.
|
|
687
|
+
#
|
|
688
|
+
# There was an error with unmarkedCitations. ParsCit ignored the last
|
|
689
|
+
# citation in the reference section due to a simple error in a for loop.
|
|
690
|
+
# In PreProcess::splitUnmarkedCitations (line 241; line 258 in my
|
|
691
|
+
# modified file) "$k<$#citeStarts" is used as exit condition. It should
|
|
692
|
+
# be "<=" and not "<" because $#citeStarts provides the last index and
|
|
693
|
+
# not the length of the array.
|
|
694
|
+
#
|
|
695
|
+
# HISTORY: Modified in 081201 by Min to remove superfluous print statements
|
|
696
|
+
###
|
|
697
|
+
sub SplitUnmarkedCitations
{
    # Cut citation text without explicit markers into citations.  Takes a
    # reference to the citation text and returns a reference to an array of
    # ParsCit::Citation objects.
    #
    # Every line that contains something looking like a year triggers a
    # backwards search for the line on which its citation starts; the lines
    # between two consecutive start lines form one citation.
    my ($rcite_text) = @_;

    my @content = split "\n", $$rcite_text;

    my $cite_start  = 0;
    my @cite_starts = ();
    my @citations   = ();

    ###
    # Huydhn: when a line is an author line (the line at the start of
    # a citation with a long list of authors), the next line cannot be
    # the start of another (consecutive) citation. This next line should
    # be the next part of the current citation after the author line.
    ###
    my $last_author_line = undef;

    for (my $i = 0; $i <= $#content; $i++)
    {
        # Only lines that contain a year are of interest
        next unless $content[ $i ] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s;

        for (my $k = $i; $k > $cite_start; $k--)
        {
            # No /g here: in scalar context it would leave pos() set on the
            # line and make the next test of the same line start behind the
            # previous match.
            next unless $content[ $k ] =~ m/\s*[\p{IsUpper}]/;

            # Huydhn: the previous line is an author line, so this line
            # cannot be the start of another citation.  The defined() test
            # keeps the initial undef from being compared as line 0.
            if (defined $last_author_line && $last_author_line == $k - 1) { next; }

            # If length of previous line is extremely
            # small, then start a new citation here.
            if (length($content[ $k - 1 ]) < 2)
            {
                $cite_start = $k;
                last;
            }

            # Start looking backwards for lines that could
            # be author lists - these usually start the
            # citation, have several separation characters (,;),
            # and shouldn't contain any numbers.
            my $beginning_author_line = -1;

            for (my $j = $k - 1; $j > $cite_start; $j--)
            {
                last if $content[ $j ] =~ m/\d/;

                # Count the separators without touching the global $_
                my $n_sep = ($content[ $j ] =~ tr/,;//);
                last if $n_sep < 3;

                # $j is always greater than 0 here, so a previous line exists
                if ($content[ $j - 1 ] =~ m/\.\s*$/)
                {
                    $beginning_author_line = $j;
                }
            }

            if ($beginning_author_line >= 0)
            {
                $cite_start = $beginning_author_line;

                # Huydhn: see $last_author_line
                $last_author_line = $beginning_author_line;

                last;
            }

            ###
            # The backwards author search failed to find any extra lines.
            #
            # Modified by Artemy Kolchinsky (v090625)
            # A new citation is started if the previous line ended with
            # a period, but not if it ended with a period, something else,
            # and then a period. This is to avoid assuming that abbreviations,
            # like U.S.A. , indicate the end of a cite. Also, a new cite is
            # started only if the current line does not begin with a series of
            # 4 digits.
            ###
            if ($content[ $k - 1 ] =~ m/[^\.].\.\s*$/ && $content[ $k ] !~ m/^\d\d\d\d/)
            {
                $cite_start = $k;
                last;
            }
        }

        # Record every start line once.  Line 0 used to be recorded again for
        # every year found before the second citation, which produced empty
        # citations.
        if (! @cite_starts || $cite_start > $cite_starts[ -1 ])
        {
            push @cite_starts, $cite_start;
        }
    }

    # HISTORY (081201, Nick and Joeran): the loop has to include the last
    # index, otherwise the last citation is lost.
    for (my $k = 0; $k <= $#cite_starts; $k++)
    {
        my $first_line = $cite_starts[ $k ];
        my $last_line  = ($k == $#cite_starts) ? $#content : ($cite_starts[ $k + 1 ] - 1);

        my $cite_string = MergeLines(join "\n", @content[ $first_line .. $last_line ]);

        my $citation = ParsCit::Citation->new();
        $citation->setString($cite_string);
        push @citations, $citation;
    }

    # And then from nothing came everything
    return \@citations;
}
|
|
816
|
+
|
|
817
|
+
###
|
|
818
|
+
# Controls the process by which citations are segmented.
|
|
819
|
+
# Input includes XML information.
|
|
820
|
+
# Returns a reference to a list of citation objects.
|
|
821
|
+
#
|
|
822
|
+
# Added by Huydhn, 13 Jan 2011
|
|
823
|
+
###
|
|
824
|
+
sub SegmentCitationsXML
{
    # Segment the citation text (passed by reference) and return what the
    # chosen splitter returns.  When GuessMarkerType recognises a marker type
    # the text is split by marker, otherwise the crf++ based splitter is run
    # on $tmp_file.
    my ($rcite_text_from_xml, $tmp_file) = @_;

    # TODO: Need to be removed
    my $marker_type = GuessMarkerType($rcite_text_from_xml);

    my $rcitations = ($marker_type eq 'UNKNOWN')
                   # Huydhn: split reference using crf++ model
                   ? SplitUnmarkedCitations2($tmp_file)
                   # TODO: Need to be removed
                   : SplitCitationsByMarker($rcite_text_from_xml, $marker_type);

    return $rcitations;
}
|
|
845
|
+
|
|
846
|
+
###
|
|
847
|
+
# Replace heuristics rules with crf++ model based on both textual
|
|
848
|
+
# and XML features from Omnipage.
|
|
849
|
+
#
|
|
850
|
+
# HISTORY: Added in 100111 by Huy Do
|
|
851
|
+
###
|
|
852
|
+
sub SplitUnmarkedCitations2
{
    # Split the reference section with the crf++ model.  $infile is handed to
    # ParsCit::Tr2crfpp::SplitReference, which writes the labelled lines to
    # "${infile}_split.dec".  Returns a reference to an array of
    # ParsCit::Citation objects; both files are deleted before returning.
    #
    # NOTE(review): when the output file cannot be opened, fatal() is called
    # and the function returns nothing without deleting the two files -
    # confirm that this is intended.
    my ($infile) = @_;

    # Citation list
    my @citations = ();

    # Turn the collected lines of one citation into a citation object
    my $store_citation = sub
    {
        my ($text) = @_;
        return if $text eq "";

        my $citation = ParsCit::Citation->new();

        # Clean up the citation text first
        my $one_cit_str = MergeLines($text);

        $citation->setString($one_cit_str);
        push @citations, $citation;
        return;
    };

    # Run the crf++
    my $outfile = $infile . "_split.dec";
    if (ParsCit::Tr2crfpp::SplitReference($infile, $outfile))
    {
        my $file_handle = undef;
        unless (open($file_handle, "<:utf8", $outfile))
        {
            fatal("Could not open file: $!");
            return;
        }

        my $cit_str = "";

        # A lexical line variable is used so that the caller's $_ is not
        # overwritten
        while (my $line = <$file_handle>)
        {
            chomp($line);

            # The last column is the class of the line: "parsCit_begin",
            # "parsCit_continue", or "parsCit_end"
            my @tokens = split(/\s+/, $line);
            my $class  = $tokens[ $#tokens ];

            # The first column is the line content, with ||| standing for
            # a blank
            my $ln_con = $tokens[ 0 ];
            $ln_con =~ s/\|\|\|/ /g;

            if ($class eq "parsCit_begin")
            {
                # Beginning of a citation: save the previous one and
                # start over
                $store_citation->($cit_str);
                $cit_str = $ln_con;
            }
            elsif ($class ne "parsCit_unknown")
            {
                # Inside a citation
                $cit_str = $cit_str . "\n" . $ln_con;
            }
        }
        close $file_handle;

        # Last citation
        $store_citation->($cit_str);
    }

    unlink($infile);
    unlink($outfile);

    # Our work here is done
    return \@citations;
}
|
|
938
|
+
|
|
939
|
+
###
# Joins the newline-separated lines of $text into a single string.
# Every line is trimmed first. A line break that follows a letter and
# a hyphen is treated as a hyphenated word: the hyphen is dropped and
# the next line is glued on directly. Any other line break becomes a
# single space, except that no space is added after a trailing hyphen.
# The result is trimmed before it is returned.
###
sub MergeLines
{
    my ($text) = shift;

    my $merged_text = "";

    for my $raw_line (split "\n", $text)
    {
        my $piece = Trim($raw_line);

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # # !!! merge without removing "-" if preceded by numbers...
        ###
        if ($merged_text =~ m/[A-Za-z]\-$/)
        {
            # Merge words when lines are hyphenated
            $merged_text =~ s/\-$//;
        }
        elsif ($merged_text !~ m/\-\s*$/)
        {
            # Ordinary line break. The space added before the very
            # first line is removed by the final Trim. #!!!
            $merged_text .= " ";
        }
        ###
        # End modified by Artemy Kolchinsky (v090625)
        ###

        $merged_text .= $piece;
    }

    return Trim($merged_text);
}
|
|
976
|
+
|
|
977
|
+
###
# Uses a list of regular expressions that match common citation
# markers to count the number of matches for each type in the
# text. If a sufficient number of matches to a particular type
# are found, we can be reasonably sure of the type.
#
# Arguments:
#   $rcite_text - reference to the citation text.
#
# Returns the key of %marker_types with the most observations, provided
# that count is positive and at least one sixth of the number of
# newline characters in the text; otherwise returns 'UNKNOWN'. Ties are
# broken by type name so the result does not depend on hash ordering.
###
sub GuessMarkerType
{
    my ($rcite_text) = @_;

    my $marker_type = 'UNKNOWN';
    my %marker_observations = ();

    foreach my $type (keys %marker_types)
    {
        $marker_observations{$type} = 0;
    }

    # A newline is prepended so the first line can match the patterns
    # below, which all anchor on "\n".
    my $cite_text = "\n" . $$rcite_text;

    # Count newlines on the lexical copy (tr/// with an empty
    # replacement only counts), leaving the caller's $_ untouched.
    # The prepended newline is subtracted again.
    my $n_lines = ($cite_text =~ tr/\n//) - 1;

    # Every pattern looks for a marker after a newline and optional
    # whitespace, followed by at least ten more characters on that line.
    while ($cite_text =~ m/\n\s*($marker_types{'SQUARE'}([^\n]){10})/sg)
    {
        $marker_observations{'SQUARE'}++;
    }

    while ($cite_text =~ m/\n\s*($marker_types{'PAREN'}([^\n]){10})/sg)
    {
        $marker_observations{'PAREN'}++;
    }

    ###
    # Modified by Artemy Kolchinsky (v090625): remove space after {10})
    ###
    while ($cite_text =~ m/\n\s*($marker_types{'NAKEDNUM'} [^\n]{10})/sg)
    {
        $marker_observations{'NAKEDNUM'}++;
    }

    while ($cite_text =~ m/\n\s*$marker_types{'NAKEDNUMDOT'}([^\n]){10}/sg)
    {
        $marker_observations{'NAKEDNUMDOT'}++;
    }

    # Most frequently observed type first; equal counts are ordered by
    # name so the choice is deterministic.
    my @sorted_observations =
        sort {
            $marker_observations{ $b } <=> $marker_observations{ $a }
                or $a cmp $b
        } keys %marker_observations;

    my $min_markers = $n_lines / 6;
    my $best_type   = $sorted_observations[0];

    # A type that was never observed must not win: without the "> 0"
    # check a text containing no newline (threshold 0) would be assigned
    # a marker type despite having no matching marker at all.
    if (defined $best_type
        && $marker_observations{ $best_type } > 0
        && $marker_observations{ $best_type } >= $min_markers)
    {
        $marker_type = $best_type;
    }

    return $marker_type;
}
|
|
1032
|
+
|
|
1033
|
+
###
# Returns a copy of the given string with leading and trailing
# whitespace removed. The caller's value is not modified.
###
sub Trim
{
    my ($text) = @_;

    # Topicalise the private copy so both substitutions act on it;
    # foreach restores $_ when the loop ends.
    for ($text)
    {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
|
|
1040
|
+
|
|
1041
|
+
# The file ends with a true value so that loading it via require/use succeeds.
1;
|