biblicit 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/Gemfile +6 -0
- data/LICENSE.TXT +176 -0
- data/README.md +120 -0
- data/Rakefile +8 -0
- data/biblicit.gemspec +33 -0
- data/lib/biblicit/cb2bib.rb +83 -0
- data/lib/biblicit/citeseer.rb +53 -0
- data/lib/biblicit/extractor.rb +37 -0
- data/lib/biblicit.rb +6 -0
- data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
- data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
- data/perl/FileConversionService/README.TXT +11 -0
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
- data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
- data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
- data/perl/HeaderParseService/README.TXT +80 -0
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
- data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
- data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
- data/perl/HeaderParseService/resources/database/50states +60 -0
- data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
- data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
- data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
- data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
- data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
- data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
- data/perl/HeaderParseService/resources/database/README +2 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
- data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
- data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
- data/perl/HeaderParseService/resources/database/addr.txt +28 -0
- data/perl/HeaderParseService/resources/database/affi.txt +34 -0
- data/perl/HeaderParseService/resources/database/affis.bin +0 -0
- data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
- data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
- data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
- data/perl/HeaderParseService/resources/database/city.txt +3150 -0
- data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
- data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
- data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
- data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
- data/perl/HeaderParseService/resources/database/degree.txt +67 -0
- data/perl/HeaderParseService/resources/database/email.txt +3 -0
- data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
- data/perl/HeaderParseService/resources/database/female-names +4960 -0
- data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
- data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/intro.txt +2 -0
- data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
- data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
- data/perl/HeaderParseService/resources/database/male-names +3906 -0
- data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
- data/perl/HeaderParseService/resources/database/month.txt +35 -0
- data/perl/HeaderParseService/resources/database/mul +868 -0
- data/perl/HeaderParseService/resources/database/mul.label +869 -0
- data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
- data/perl/HeaderParseService/resources/database/mul.processed +762 -0
- data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
- data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
- data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
- data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
- data/perl/HeaderParseService/resources/database/note.txt +121 -0
- data/perl/HeaderParseService/resources/database/page.txt +1 -0
- data/perl/HeaderParseService/resources/database/phone.txt +9 -0
- data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
- data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
- data/perl/HeaderParseService/resources/database/statename.bin +0 -0
- data/perl/HeaderParseService/resources/database/statename.txt +73 -0
- data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
- data/perl/HeaderParseService/resources/database/stopwords +438 -0
- data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
- data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
- data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
- data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
- data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
- data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
- data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
- data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
- data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
- data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
- data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
- data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
- data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
- data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
- data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
- data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
- data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
- data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
- data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
- data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
- data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
- data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
- data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
- data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
- data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
- data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
- data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
- data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
- data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
- data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
- data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
- data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
- data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
- data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
- data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
- data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
- data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
- data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
- data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
- data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
- data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
- data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
- data/perl/HeaderParseService/resources/database/url.txt +1 -0
- data/perl/HeaderParseService/resources/database/webTopWords +225 -0
- data/perl/HeaderParseService/resources/database/words +45402 -0
- data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
- data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
- data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
- data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
- data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
- data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
- data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
- data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
- data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
- data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
- data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
- data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
- data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
- data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
- data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
- data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
- data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
- data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
- data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
- data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
- data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
- data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
- data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
- data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
- data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
- data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
- data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
- data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
- data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
- data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
- data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
- data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
- data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
- data/perl/ParsCit/README.TXT +82 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
- data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
- data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
- data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
- data/perl/extract.pl +199 -0
- data/spec/biblicit/cb2bib_spec.rb +48 -0
- data/spec/biblicit/citeseer_spec.rb +40 -0
- data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
- data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
- data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
- data/spec/spec_helper.rb +3 -0
- metadata +474 -0
#
# Copyright 2007 Penn State University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
package HeaderParse::API::ParserMethods;

# 06/27/2003: adapted to handle real data -- there is no evaluation step,
#             and the offline classifiers must be trained beforehand.
# 02/10/2004: applied to CiteSeer data (same format as the EbizSearch data).

use utf8;
use Data::Dumper;
use FindBin;
use HeaderParse::API::NamePatternMatch;
use HeaderParse::API::MultiClassChunking; # default to use all exports of this module
use HeaderParse::API::LoadInformation;
use HeaderParse::Config::API_Config;
use HeaderParse::API::AssembleXMLMetadata;
use vars qw($debug %dictH %nameH %firstnameH %lastnameH %BasicFeatureDictH %InverseTagMap);
use vars qw($Classifier $offlineD $Tmp_Dir $nMinHeaderLength $nMaxHeaderLength);
use HeaderParse::API::Function qw(&AddrMatch &printDict &GenTrainVecMatrix &LineFeatureRepre &FillSpace &SeparatePunc);

# Feature dictionaries; the first three are (re)loaded below from
# Data::Dumper snapshot files under the offline directory $offlineD.
my $FeatureDictH = \%BasicFeatureDictH;
my $ContextFeatureDictH;
my $SpaceAuthorFeatureDictH;
my $PuncAuthorFeatureDictH;
my $NameSpaceTrainVecH;
my $NameSpaceTrainF   = "$offlineD" . "NameSpaceTrainF";
my $SVMNameSpaceModel = "$offlineD" . "NameSpaceModel";
my $TestH;
my $TrainH;
my $TotalHea = 0;

my $timestamp;  # per-call id used to name temporary files; set in Parse()

my $FeatureDict             = "$offlineD" . "WrapperBaseFeaDict";
my $ContextFeatureDict      = "$offlineD" . "WrapperContextFeaDict";
my $SpaceAuthorFeatureDictF = "$offlineD" . "WrapperSpaceAuthorFeaDict";
my $PuncAuthorFeatureDictF  = "$offlineD" . "WrapperPuncAuthorFeaDict";

my $linear = 1; # linear SVM kernel -- just want to be fast

my %evalH; # records classification results for baseline, each context round and IE
my $norm  = 1;
my $testp = 1; # this is only to make the program run, no meaning.

my %TestDataIndex; # indexes the header number in the testing dataset

# Slurp one Data::Dumper snapshot file and return the reconstructed
# structure. Replaces three copy-pasted 2-arg/bareword-filehandle open
# blocks that also mutated the global $/ without localizing it.
# NOTE(security): this string-evals the file contents ($VAR1 = {...};),
# so the dictionary files under $offlineD must be trusted.
sub _load_dumped_hash {
    my ($path) = @_;
    open(my $fh, '<', $path)
        || die "SVMHeaderParse: could not open $path to read: $!";
    my $dump = do { local $/; <$fh> };  # slurp; $/ restored automatically
    close($fh);
    local $VAR1;
    eval $dump;   # sets $VAR1, Data::Dumper's default variable
    return $VAR1;
}

# Read dictionary files.
$FeatureDictH            = _load_dumped_hash($FeatureDict);
$ContextFeatureDictH     = _load_dumped_hash($ContextFeatureDict);
$SpaceAuthorFeatureDictH = _load_dumped_hash($SpaceAuthorFeatureDictF);
# End read dictionary files. ($PuncAuthorFeatureDictH is intentionally
# left unloaded, as in the original code.)
# Parse a raw header string into reference XML metadata.
#
# Arguments: $header (the raw header text), $timestamp (unique id used to
# name this call's temporary classifier files under $Tmp_Dir).
# Returns the XML produced by &ExportRDF, or 0 if information extraction
# timed out.
sub Parse{
    my $header = shift;
    $timestamp = shift;

    # Per-call temporary files for SVM feature vectors and results.
    my $tmpCacheVec  = "$Tmp_Dir/tmpVec" . "\_$timestamp\_";
    my $SVMTmpResult = "$Tmp_Dir/tmpresult" . "\_$timestamp\_";

    $TestH = &HashEbizHeader(\$header);
    $TestH = &VectorizeUnknownHeaderLine($TestH);

    # Phase 1: baseline per-line classification, then two rounds of
    # refinement with contextual features.
    my $baseline = 1;
    $TestH = &LineClassify($testp, "", $baseline, $FeatureDictH,
                           $TestH, $tmpCacheVec, $SVMTmpResult);
    $TestH = &UpdatePretag($TestH);

    my $maxLoop = 2;
    for my $loop(1 .. $maxLoop) {
        $baseline = 0;
        my $NowContext = "context"."$loop";
        $TestH = &LineClassify($testp, $NowContext, $baseline,
                               $ContextFeatureDictH, $TestH,
                               $tmpCacheVec, $SVMTmpResult);
        $TestH = &UpdatePretag($TestH);
    }

    # Phase 2: Extraction Information from Multi-Class Lines and Author Lines Chunks
    my $LastContext = "context"."$maxLoop";

    # BUG: InfoExtract hangs on some documents.
    # This is reproducible with data extracted using TET from doc 654835
    # from the legacy citeseer system -- so guard it with a 15s alarm.
    eval {
        local $SIG{'ALRM'} = sub { die "alarm\n"; };
        alarm 15;
        $TestH = &InfoExtract($testp, $TestH, $SpaceAuthorFeatureDictH, $PuncAuthorFeatureDictH, $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult);
        alarm 0;
    };
    if ($@) {
        if ($@ eq "alarm\n") {
            return 0;
        }
    }

    # Was an undeclared package global ($rXML) in the original; now lexical.
    my $rXML = &ExportRDF($TestH);

    # Clean up this call's temporary files.
    for my $i(1..15){
        unlink "$Tmp_Dir/tmpVec\_$timestamp\_test$i";
        unlink "$Tmp_Dir/tmpresult\_$timestamp\_$i";
    }
    return $rXML;
}
# This is the header extraction module from CiteSeer.
# Only the parts related to header extraction are used.
#
# Takes a reference to the full paper text. Returns ('<error message>') on
# failure, or ('', $header) on success, where $header is the leading part of
# the document: everything up to an Introduction/Contents marker plus six
# further lines, or -- when no marker is found -- the first 150 non-blank
# lines. The result is clipped to $nMaxHeaderLength characters and must be
# at least $nMinHeaderLength characters long (both from the config module).
sub ExtractHeaderInformation {
    my $papertext = shift;

    return ('Paper text is empty') unless length($$papertext);

    my $header = '';
    if ($$papertext =~ /^(.*?\b(?:Introduction|INTRODUCTION|Contents|CONTENTS)(?:.*?\n){6})/s) {
        # Prefix up to the section marker, with six lines of trailing context.
        $header = $1;
    }
    else {
        # No marker: accumulate lines until 150 non-blank ones are collected.
        my $wanted   = 150;
        my $nonBlank = 0;
        for my $line (split '\n', $$papertext) {
            $nonBlank++ if $line !~ m/^\s*$/;
            $header .= $line . "\n";
            last if $nonBlank >= $wanted;
        }
    }

    if (length($header) > $nMaxHeaderLength) {
        $header = substr($header, 0, $nMaxHeaderLength) . '...';
    }
    if (length($header) < $nMinHeaderLength) {
        return ('Header could not be extracted');
    }
    return ('', $header);
}
# Rebuild each line's {Pretag} hash from the latest classification round so
# it can seed the next round's contextual features.
#
# For a single-class line (PClass 's') the single predicted class name
# (PSClsName) becomes the only pretag; for a multi-class line (PClass 'm')
# every class in PClsName is carried over. Any previous Pretag entries are
# discarded first.
#
# Takes and returns the per-line header hashref ($testH->{$lineNo}{...}).
#
# Fix: the original declared this as "sub UpdatePretag()" -- an empty
# prototype is wrong for a one-argument sub (callers only got away with it
# via the prototype-bypassing &UpdatePretag(...) form) -- so it is removed.
sub UpdatePretag {
    my $testH = shift;
    foreach my $LN (sort {$a <=> $b} keys %{$testH}) {
        delete($$testH{$LN}{Pretag});
        if ($$testH{$LN}{PClass} eq "s") {
            $$testH{$LN}{Pretag}{$$testH{$LN}{PSClsName}} = 1;
        }elsif ($$testH{$LN}{PClass} eq "m") {
            foreach my $mytag (keys %{$$testH{$LN}{PClsName}}) {
                $$testH{$LN}{Pretag}{$mytag} = 1;
            }
        }
    }
    return($testH);
}
|
204
|
+
|
205
|
+
|
206
|
+
#input: the file with all Training and testing samples
|
207
|
+
#output: $HeaderH{$HeaNO}{$LineNO} = "";
|
208
|
+
# Read a tagged training/testing sample file and hash each content line
# by (header number, line number).
# Args:    $simulateHeaNum - stop once this many headers have been seen
#                            (<= 0 means no limit);
#          $tagF           - path to the tag file.
# Returns: ($HeaNO, \%HeaH) where $HeaH{$HeaNO}{$LineNO}{RawContent}
#          holds the trimmed line text. Header numbering starts at 1.
sub HashAllHeader() {
    my $simulateHeaNum = shift;
    my $tagF = shift;
    my %HeaH = ();
    my $HeaNO = 1; #start from 1
    my $LineNO = 1;

    # 3-arg open with a lexical handle: the old 2-arg bareword open was
    # vulnerable to mode injection via the filename and leaked a global handle.
    open(my $tagFH, '<', $tagF) || die "SVMHeaderParse: could not open tag file\: $tagF to read: $!";
    while (my $line = <$tagFH>) {
        $line =~ s/\+L\+//g;   # strip literal "+L+" line markers
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;

        if ($line =~ /^\s*\<NEW\_HEADER\>/) {
            # a new header record begins; restart the per-header line counter
            $HeaNO++;
            $LineNO = 1;
            #remove the line with only tag like </author>
        }elsif (($line =~ /^\s*$/) || ($line =~ /^\<(\/)*(\w+)\>$/)) {
            next;
        }else {
            $HeaH{$HeaNO}{$LineNO}{RawContent} = $line;
            $LineNO++;
        }

        # simulation cap: stop early once enough headers were collected
        if ($simulateHeaNum > 0 && $HeaNO >= $simulateHeaNum) {
            last;
        }
    }
    close($tagFH);
    return($HeaNO, \%HeaH);
}
|
239
|
+
|
240
|
+
|
241
|
+
#HEADER_DID[1]
|
242
|
+
#TRECS: Developing a Web-based e-Commerce Business Simulation
|
243
|
+
#TRECS: Developing a Web-based
|
244
|
+
# Split a raw header string into lines and hash each one by its
# 1-based line number: $HeaH{$LineNO}{RawContent} = trimmed line.
# Blank interior lines are kept (they still receive a line number).
# Args:    $headerRef - scalar ref to the header text.
# Returns: hashref of the per-line records.
sub HashEbizHeader() {
    my $headerRef = shift;
    my %lineHash;
    my $lineNo = 1;

    for my $rawLine (split /\n/, $$headerRef) {
        # trim leading/trailing whitespace without touching the original text
        (my $trimmed = $rawLine) =~ s/^\s+//;
        $trimmed =~ s/\s+$//;
        $lineHash{ $lineNo++ }{RawContent} = $trimmed;
    }

    return \%lineHash;
}
|
271
|
+
|
272
|
+
|
273
|
+
# Baseline training pass: build the feature dictionary from all training
# headers, prune rare features, and render each line's feature vector in
# SVM-light "id:value" form. Also builds the space-separated name-pattern
# training vectors used by the author-name classifier.
# Args:    $HeaderH      - training headers, {$HeaNO}{$line}{...};
#          $FeatureDictH - (possibly pre-seeded) feature dictionary.
# Returns: ($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH,
#           \%NameSpaceTrainVecH).
# NOTE(review): relies on the file-level flag $norm to decide whether
# vectors are max-normalized — confirm it is set before training.
sub BaseLineTrainSys() {
    my $HeaderH = shift;
    my $FeatureDictH = shift;

    my %InitialHash = ();
    $InitialHash{FeatureCounter} = 0;

    # punctuation-based author dictionary is returned but never populated here
    my $PuncAuthorDictH = \%InitialHash;
    my $SpaceAuthorDictH;
    # this is the place to generate the feature dictionary and name-pattern dictionary
    ($HeaderH, $FeatureDictH, $SpaceAuthorDictH) = &FormFeaDict($HeaderH, $FeatureDictH);
    # prune features in the dictionary with document frequency < 3
    $FeatureDictH = &PruneDict($FeatureDictH);

    # prune features not in the pruned dict from each line's feature vector
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {
                if (! $$FeatureDictH{$fea}{ID}) {
                    delete ($$HeaderH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            # render the surviving features as "id:value" pairs, ordered by
            # dictionary ID as SVM-light requires
            if ($$HeaderH{$HeaNO}{$line}{FeaVec} ne "") {
                my $tmpFeaVec = "";
                foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {
                    if ($norm) {
                        # normalization by the feature's maximum observed value
                        $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }
                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} ";
                }
                $$HeaderH{$HeaNO}{$line}{SVMFeaVec} = "$tmpFeaVec";
            }
        }
    }

    my %NameSpaceTrainVecH = (); # a separate hash for later printing
    my $Lcount = 0;
    # prune name-pattern features the same way, then render the name-pattern
    # training vectors (numeric form and readable text form for debugging)
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            if (exists $$HeaderH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}}) {
                    foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        if (! $$SpaceAuthorDictH{$fea}{ID}) {
                            delete($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }
                    }

                    # normalization and rendering of the surviving features
                    if ($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} ne "") {
                        $Lcount++;
                        # both vectors are prefixed with the pattern's training tag
                        my $tmpFeaVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";
                        my $tmpTextVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";

                        foreach my $fea(sort{$$SpaceAuthorDictH{$a}{ID} <=> $$SpaceAuthorDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                            $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea}/$$SpaceAuthorDictH{$fea}{max});
                            $tmpFeaVec .= "$$SpaceAuthorDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                            $tmpTextVec .= "$fea\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                        }
                        $NameSpaceTrainVecH{$Lcount}{SpaceNameVec}=$tmpFeaVec;
                        $NameSpaceTrainVecH{$Lcount}{SpaceTextNameVec}=$tmpTextVec; # for debugging
                    }
                }
            }
        }
    }

    return($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH, \%NameSpaceTrainVecH);
}
|
347
|
+
|
348
|
+
# Context training pass: for every header, assign each line the tags of
# its neighbouring lines via TrainAssignLineTag, threading the feature
# dictionary through each call so new context features accumulate.
# Args:    $FeatureDictH - feature dictionary; $HeaderH - training headers.
# Returns: the updated ($FeatureDictH, $HeaderH) pair.
sub ContextTrainSys() {
    my ($FeatureDictH, $HeaderH) = (shift, shift);

    for my $headerNo (sort { $a <=> $b } keys %{$HeaderH}) {
        # neighbour-line tag assignment, one header at a time
        ($FeatureDictH, $$HeaderH{$headerNo})
            = &TrainAssignLineTag($FeatureDictH, $$HeaderH{$headerNo});
    }

    return ($FeatureDictH, $HeaderH);
}
|
358
|
+
|
359
|
+
#this is to write all the testing lines into one file to speed up
|
360
|
+
# Classify every header line with the 15 per-class SVM models.
# Pipeline: (1) build per-line feature vectors (baseline) or context
# vectors; (2) write one test file per class; (3) run the external SVM
# classifier on each; (4) read the 15 score files back and assign each
# line a single class ("s") or multiple classes ("m").
# Args: $testp (fold id), $nowLoop (context iteration), $baseline (flag),
#       $FeatureDictH, $HeaderH (lines keyed by line number),
#       $tmpCacheVec / $SVMTmpResult (scratch-file prefixes).
# Returns: $HeaderH with {PClass}/{PSClsName}/{PClsName}/{ClassifyResult} set.
# NOTE(review): uses file-level globals $norm, $debug, $offlineD and
# $Classifier — confirm they are configured before calling.
sub LineClassify() {
    my ($testp, $nowLoop, $baseline, $FeatureDictH,
        $HeaderH, $tmpCacheVec, $SVMTmpResult) = @_;
    my %memoryH = ();
    my $GlobalLineNO = 0;

    # step1: collect all test data; %memoryH maps the global line number
    # back to the header number and local line number
    if ($baseline) {
        # filter the feature vector by the feature dictionary
        $HeaderH = &FormTestFeaVec($FeatureDictH, $HeaderH);
    }else {
        $HeaderH = &TestAssignLineTag($FeatureDictH, $HeaderH);
    }

    foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
        if (! $baseline) {
            # to make the iteration correct, clear all single- and
            # multi-class predictions left over from the previous loop
            delete($$HeaderH{$LN}{PClass});
            delete($$HeaderH{$LN}{PSClsName});
            delete($$HeaderH{$LN}{PClsName});
        }elsif ($baseline && ($$HeaderH{$LN}{FeaVec} ne "")) {
            # rewrite the feature vector as "id:value" pairs (normalized when $norm)
            my $tmpFeaVec = "";
            foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$LN}{FeaVec}}) {
                if (exists ($$FeatureDictH{$fea}{ID})) {
                    if ($norm) {
                        if ($debug) {
                            if ($$FeatureDictH{$fea}{max} == 0) {
                                print STDERR "fea $fea has max value 0! \n";
                            }
                        }
                        $$HeaderH{$LN}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$LN}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }
                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$LN}{FeaVec}{$fea} ";
                }
            }

            $$HeaderH{$LN}{SVMFeaVec} = "$tmpFeaVec";

            # be careful here: lines with empty vectors are skipped and
            # therefore never get a global line number
            if ($$HeaderH{$LN}{SVMFeaVec} eq "") {
                if ($debug) {
                    # NOTE(review): $testHea is not defined in this sub (leftover
                    # from a commented-out per-header loop); this debug line
                    # interpolates an undef header number.
                    print STDERR "header($testHea) -- Line($LN) has a null feature vector ($$HeaderH{$testHea}{$LN}{RawContent}) \n";
                }
                next;
            }
        }

        $GlobalLineNO++;
        # NOTE(review): $testHea is undef here as well, so {HeaNO} is
        # recorded as undef for every line — verify against the callers.
        $memoryH{$GlobalLineNO}{HeaNO} = $testHea;
        $memoryH{$GlobalLineNO}{LocalLineNO} = $LN;
    }

    # step2: write 15 files with labelled feature vectors (one per class)
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        open(testFH, ">$testF") || die "SVMHeaderParse: could not open $testF to write: $!";
        foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
            my $tag = 1; # dummy label, just to conform to the SVM-light format
            if ($baseline) {
                print testFH "$tag $$HeaderH{$LN}{SVMFeaVec}\n";
            }else {
                print testFH "$tag $$HeaderH{$LN}{ContextSVMFeaVec}\n";
            }
        }
        close(testFH);
    }

    # step3: run the external SVM classifier once per class
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $SVMModelF;
        my $printstr = "";
        if ($baseline) {
            $printstr = "baseline";
            $SVMModelF = "$offlineD"."$clsNO"."Model"."fold"."$testp";
        }else {
            $printstr = "context"."$nowLoop";
            $SVMModelF = "$offlineD"."$clsNO"."ContextModel"."fold"."$testp";
        }
        system("$Classifier -v 0 $testF $SVMModelF $mySVMResult");
    }

    # step4: read all 15 result files into a hash of per-line scores
    my %SVMResultHash = ();
    my %OrphanTagAssignHash = (); # records how many lines needed a fallback tag
    my %NegMeanH = (); # mean of the negative scores each classifier produced
    my %PosMinH = ();  # smallest positive score each classifier produced

    for my $clsNO(1 .. 15) {
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $myLineNO = 0;

        # initialize %PosMinH with a sentinel larger than any real score
        $PosMinH{$clsNO} = 100;

        open(mySVMResultFH, "$mySVMResult") || die "SVMHeaderParse: could not open $mySVMResult to read: $!";
        while (my $myline = <mySVMResultFH>) {
            $myline =~ s/^\s+//g;
            $myline =~ s/\s+$//g;
            if ($myline !~ /^\s*$/) {
                $myLineNO++;
                if ($debug) {
                    print STDERR " current lineNo is $myLineNO and score for class $clsNO is $myline \n";
                }
                $SVMResultHash{$myLineNO}{$clsNO} = $myline;
                if ($myline < 0) {
                    $NegMeanH{$clsNO} += $myline;
                }else {
                    if ($PosMinH{$clsNO} > $myline) {
                        $PosMinH{$clsNO} = $myline;
                    }
                }
            }
        }

        if ($myLineNO < 1) {
            if ($debug) {
                print STDERR "yahoo: $mySVMResult has myLineNO 0 \n";
            }
        }else {
            # NOTE(review): divides the negative-score *sum* by the total line
            # count (not the negative-line count) — presumably intentional as a
            # damped mean; confirm.
            $NegMeanH{$clsNO} = sprintf("%.8f", $NegMeanH{$clsNO}/$myLineNO);
        }

        close(mySVMResultFH);
    }

    my $PredTagbyMinNeg = 0;
    my $PredValbyMinNeg = 100;
    my $PredTagbyMinPos = 0;
    my $PredValbyMinPos = 100;

    # analyze the scores and fill the test hash with predictions
    for my $myline(1 .. $GlobalLineNO) {
        my @PredictTags = ();
        my $minVal = 100;
        my $CandidateTag = -1;
        my $myHeaNO = $memoryH{$myline}{HeaNO};
        my $myLineNO = $memoryH{$myline}{LocalLineNO};

        for my $clsNO(1 .. 15) {
            my $myresult = $SVMResultHash{$myline}{$clsNO};
            # keep the raw per-class scores for multi-class lines
            $$HeaderH{$myLineNO}{ClassifyResult}{$clsNO} = $myresult;
            if ($debug) {
                # NOTE(review): $result is undefined — presumably $myresult was meant.
                print STDERR "\t\t result by class $clsNO -- $result \n";
            }
            my $myRelDiv = 10;

            if ($myresult > 0) {
                # positive margin: the line belongs to this class
                push @PredictTags, $clsNO;
            }else {
                # negative margin: track the class whose score is least bad,
                # relative to that classifier's mean negative score
                $myRelDiv = sprintf("%.8f", $myresult/$NegMeanH{$clsNO});
                if ($myRelDiv < $minVal) {
                    $minVal = $myRelDiv;
                    $CandidateTag = $clsNO;
                }
                if ( (0 - $myresult) < $PredValbyMinNeg) {
                    $PredValbyMinNeg = -$myresult;
                    $PredTagbyMinNeg = $clsNO;
                }
                if (($PosMinH{$clsNO}- $myresult) < $PredValbyMinPos) {
                    $PredValbyMinPos = $PosMinH{$clsNO}- $myresult;
                    $PredTagbyMinPos = $clsNO;
                }
            }
        }

        # orphan line (no positive score anywhere): assign ONLY the class
        # nearest to the hyperplane
        if ($#PredictTags < 0) {
            push @PredictTags, $CandidateTag;
            $OrphanTagAssignHash{TotalLineNum}++;
        }

        # fill the hash with the classification result
        if ($#PredictTags eq 0) {
            $$HeaderH{$myLineNO}{PClass} = "s";
            $$HeaderH{$myLineNO}{PSClsName} = $PredictTags[0];
        }elsif ($#PredictTags > 0) {
            $$HeaderH{$myLineNO}{PClass} = "m";
            # the multiple tags predicted on one line carry no ordering
            for my $i(0 .. $#PredictTags) {
                $$HeaderH{$myLineNO}{PClsName}{$PredictTags[$i]} = 1;
                if ($debug) {
                    print STDERR "hea($myHeaNO)-- line($myLineNO) is classified as multi-class $PredictTags[$i] \n";
                }
            }
        }else { # impossible: an orphan line always receives one fallback tag
            if ($debug) {
                print STDERR "hea($myHeaNO)-- line($myLineNO) is orphan\n";
            }
        }
    }
    return($HeaderH);
}
|
568
|
+
|
569
|
+
|
570
|
+
#this is to
|
571
|
+
#(1) populate the predicted items(done in the LineClassify)
|
572
|
+
#(2) Extract related information from multi-author line and multi-classline
|
573
|
+
#all information to be extracted comes from {Pchunk}
|
574
|
+
#all word distribution information comes from {Pline} word dist.;
|
575
|
+
|
576
|
+
# Populate {Pchunk} on every classified line:
# (1) predicted single-class lines become one chunk; predicted author
#     lines are split into individual names (punctuation split, name-
#     pattern matching, or an SVM pick of the best name pattern);
# (2) multi-class lines are segmented into per-class chunks (email/URL
#     first, then 1-, 2-, and 3&4-class chunking).
# Chunks live in $$TestH{$LN}{Pchunk}{$pos}{cls|content}, positions 1-based.
# Args: $testp (fold id), $TestH (classified lines), $PuncAuthorDictH,
#       $SpaceAuthorDictH, $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult.
# Returns: $TestH with {Pchunk} filled in.
# NOTE(review): $FeatureDictH is passed to Cont2ClassChunking /
# Disc2ClassChunking_2chunk but is never a parameter of, nor declared in,
# this sub — it resolves to a package global; confirm it is populated.
sub InfoExtract() {
    my $testp = shift;
    my $TestH = shift;
    my $PuncAuthorDictH = shift;
    my $SpaceAuthorDictH = shift;
    my $SVMNameSpaceModel = shift;
    my $tmpCacheVec = shift;
    my $SVMTmpResult = shift;

    foreach my $LN(sort {$a <=> $b} keys %{$TestH}) {
        if ($$TestH{$LN}{'PClass'} eq "s") { # single class
            if ($$TestH{$LN}{PSClsName} ne '2') { # non-author single class: one chunk
                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = $$TestH{$LN}{PSClsName};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
            }else {
                if ($$TestH{$LN}{SClsWordCount} < 4) { # obvious single name
                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                }else { # multi-authors: split the line into individual names
                    my $Tline = $$TestH{$LN}{RawContent};
                    $Tline =~ s/<(\/)*author>//g;
                    if ($debug) {
                        print STDERR "predicted Multi-Author line -- $Tline \n";
                    }
                    my $NamePunc = 0;
                    # decide whether this is a punctuated line or pure text-space:
                    # any char outside letters/space/-/./digits, or a literal "and"
                    if (($$TestH{$LN}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])/) || ($$TestH{$LN}{PureText} =~ /\band\b/i)) {
                        $NamePunc = 1;
                    }else {
                        $NamePunc = 0;
                    }

                    if ($NamePunc) {
                        # heuristics-based separation using learned features
                        if (($$TestH{$LN}{PureText} =~ /Jr|Dr/) && ($$TestH{$LN}{SClsWordCount} <5)) {
                            # short line with an honorific: treat as one name
                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                        }else {
                            my $nameStr = $$TestH{$LN}{PureText};
                            $nameStr =~ s/^\s+//g;
                            $nameStr =~ s/\s+$//g;
                            # split candidate names on comma, ampersand or "and"
                            my @GuessedNames = split(/\,|\&|and/, $nameStr);
                            for my $i(0 .. $#GuessedNames) {
                                # chunk positions start from 1
                                $GuessedNames[$i] =~ s/^\s+//g;
                                $GuessedNames[$i] =~ s/\s+$//g;
                                if ($GuessedNames[$i] !~ /^\s*$/) {
                                    my @Nameparts = split(/\s+/, $GuessedNames[$i]);
                                    if ($#Nameparts < 3) {
                                        # at most 3 words: accept as one name
                                        $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                        my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                    }else {
                                        # space-separated names [name1 name2 name3 and name4]
                                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($GuessedNames[$i]);
                                        if ($#$PredictedNames < 1){
                                            # only 1/0 reasonable name patterns: take it as-is
                                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                        }else { # several candidate patterns: classify to predict
                                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                                            my @names = split(/<>/, $BestNamePattern);
                                            for my $i(0 .. $#names) {
                                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }else {
                        # space-separated name list (no punctuation at all)
                        my $nameStr = $$TestH{$LN}{PureText};
                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                        if ($#$PredictedNames < 1){
                            # only 1/0 reasonable name patterns: take the parser-decided chunks
                            my $tmp_name_container = $$PredictedNames[0];
                            if ($#$tmp_name_container > 0) {
                                for my $kk(0 .. $#$tmp_name_container) {
                                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$tmp_name_container[$kk];
                                }
                            }else {
                                # this branch is original: keep the whole string as one name
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $nameStr;
                            }
                        }else {
                            # several candidate patterns: classify to predict the best
                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                            my @names = split(/<>/, $BestNamePattern);
                            for my $i(0 .. $#names) {
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                            }
                        }
                    }
                }
            }
        # multiple class
        }elsif ($$TestH{$LN}{PClass} eq "m"){
            my (%TagH, $emailChunkH, $URLChunkH, @ArrayofHash);
            # collect the set of predicted tags; {counter} tracks how many remain
            foreach my $tag(keys %{$$TestH{$LN}{PClsName}}) {
                $TagH{counter}++;
                $TagH{$tag}++;
            }
            my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($$TestH{$LN}{PureText});
            # preprocess: extract email (class 6) and URL (class 12) chunks first
            if ($$TestH{$LN}{PClsName}{6}) {
                # $component gets holes of "-1" after extracting the email chunk
                ($emailChunkH, $component) = &LocateEmailFromComponent($component);
                delete($TagH{6});
                $TagH{counter}--;
                push @ArrayofHash, $emailChunkH;
            }
            if ($$TestH{$LN}{PClsName}{12}) {
                ($URLChunkH, $component) = &LocateURLFromComponent($component);
                delete($TagH{12});
                $TagH{counter}--;
                push @ArrayofHash, $URLChunkH;
            }

            if($TagH{counter} <1){ # no additional class left
                # exception: what if there is still text left ???????
                $$TestH{$LN} = &FillChunkH($$TestH{$LN},$component, \@ArrayofHash);
                # tag each word
            }elsif ($TagH{counter} == 1){
                # only one class left: everything unidentified gets that class
                my $lastTag = "";
                foreach my $tag(keys %TagH) {
                    if ($tag ne "counter") {
                        $lastTag = $tag;
                    }
                }
                # get the remaining chunks separated by the email and URL
                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                # tag every leftover chunk as the only remaining class
                foreach my $chunkNO(sort{$a<=>$b} keys %{$UnIdentifiedChunk}) {
                    $$UnIdentifiedChunk{$chunkNO}{cls} = $lastTag;
                }
                push @ArrayofHash, $UnIdentifiedChunk; # must be a hashref
                # fill the TestH chunks in order and tag each word
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            # two-class module
            }elsif ($TagH{counter} == 2) {
                # the two remaining tags, in sorted order
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if ($mytag ne "counter") {
                        push @TagsArray, $mytag;
                    }
                }

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};
                my $IdentifiedChunk;
                # continuous: a single unidentified span
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;
                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        # shift the span to start at index 0: rebase $component and $SepH
                        $offset = $chunk1start;
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        # NOTE(review): rebasing mutates the shared $SepH in place
                        # ($newSepH aliases it) — confirm callers do not reuse it.
                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    # choose punctuation- or space-based two-class chunking
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }
                    # map the chunk positions back to the original coordinates
                    if ($offset > 0) {
                        foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} == 2) { # two discrete spans
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} > 2) { # 3+ discrete spans: unhandled
                    if ($debug) {
                        print STDERR "2 classes with 3+ chunks\n";
                    }
                }
            # three classes where 3 and 4 co-occur: treat 3&4 as one class first
            }elsif (($TagH{counter} == 3) && $TagH{3} && $TagH{4}) {
                # tag array includes only 4 (standing in for 3&4) and the other tag
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if (($mytag ne "3") && ($mytag ne "4") && ($mytag ne "counter")) {
                        push @TagsArray, $mytag;
                    }
                }
                push @TagsArray, 4;

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};

                my $IdentifiedChunk;
                my $startPos34 = 0;
                my $endPos34 = 0;
                # continuous: one unidentified span
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;

                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        # rebase $component and $SepH to start at index 0
                        $offset = $chunk1start;
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    # find the boundary between the combined 3&4 block and the other tag
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }

                    # map positions back, and pull the 3&4 block out for a second pass
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($offset > 0) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos}; # absolute pos
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;

                }else { # two discrete spans
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos};
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                }

                # second pass: find the boundary between classes 3 and 4 within
                # the extracted 3&4 block
                my $newComponent = (); # modified by Hui 03/19
                my $newSepH = $SepH;
                my $newPuncNum = 0;
                my $offset = $startPos34;
                for (my $tmpi=$startPos34; $tmpi<=$endPos34; $tmpi++) {
                    # modified by Hui 03/19/03: subtract $offset to rebase at 0
                    $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                    if ($$newComponent[$tmpi-$offset] =~ /^\W+$/) {
                        $newPuncNum++;
                    }
                }

                # rebase whichever separator map (punc/space) will be used
                if ($newPuncNum > 1) {
                    foreach my $tmpSep(sort keys %{$$newSepH{punc}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{punc}{$newSep} = $$newSepH{punc}{$tmpSep};
                        }
                        delete($$newSepH{punc}{$tmpSep});
                    }
                }else {
                    foreach my $tmpSep(sort keys %{$$newSepH{space}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{space}{$newSep} = $$newSepH{space}{$tmpSep};
                        }
                        delete($$newSepH{space}{$tmpSep});
                    }
                }

                my @NewTagsArray = ();
                push @NewTagsArray, 3;
                push @NewTagsArray, 4;
                if ($newPuncNum > 1) {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }else {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }

                # map the 3-vs-4 chunk positions back to absolute coordinates
                if ($offset > 0) {
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                        $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                    }
                }
                push @ArrayofHash, $IdentifiedChunk;
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }elsif ($TagH{counter} > 2) { # 3+ classes (other combinations): unhandled
                # TODO: consider 3 discrete chunks for 3 tags
                if ($debug) {
                    print STDERR "do not care yet -- here is the case for 3+ classes after preprocessing \n";
                    # find the most likely position and expand to around some (like 3) words
                }
            }
        }
    }

    return($TestH);
}
|
941
|
+
|
942
|
+
|
943
|
+
# Dump every extracted chunk (class + content) of every classified line
# to the hard-coded file "output.txt" in the current directory.
# Args:    $TestH - hashref of classified lines, keyed by line number;
#          each line's {Pchunk} maps chunk position -> {cls, content}.
# Returns: nothing meaningful; the unit is its side effect on output.txt.
sub ExportInfo(){
    my $TestH = shift;
    my $outF = "output.txt";
    # 3-arg open with a lexical handle (was a 2-arg bareword open, which is
    # vulnerable to mode injection and leaks a global filehandle)
    open(my $writer, '>', $outF) || die "SVMHeaderParse: could not open $outF to write: $!";
    # NOTE(review): $testHea is never defined in this sub (leftover from a
    # commented-out per-header loop), so this prints "headerno() -- ".
    print $writer "headerno($testHea) -- ";
    foreach my $LN(sort {$a <=> $b} keys %{$TestH}) {
        print $writer "lineno($LN)\: \n ";
        foreach my $chunk(sort {$a <=> $b} keys %{$$TestH{$LN}{Pchunk}}) {
            # "ChunkCounter" is a bookkeeping key, not a real chunk position
            if ($chunk ne "ChunkCounter") {
                print $writer "\t chunk($chunk) -- class($$TestH{$LN}{Pchunk}{$chunk}{cls} <> content($$TestH{$LN}{Pchunk}{$chunk}{content} \n";
            }
        }
    }
    # check close on a write handle: buffered write errors surface here
    close($writer) || die "SVMHeaderParse: could not close $outF: $!";
}
|
960
|
+
|
961
|
+
|
962
|
+
# Serialize every extracted chunk as "<tag>content</tag>" lines (tag names
# come from the file-level %InverseTagMap) and hand the result to
# HeaderParse::API::AssembleXMLMetadata::assemble for final XML assembly.
# Args:    $TestH - hashref of classified lines with {Pchunk} filled in.
# Returns: the assembled XML metadata from assemble().
sub ExportRDF(){
    my $TestH = shift;
    my $str = '';
    foreach my $LN (sort {$a <=> $b} keys %{$TestH}) {
        foreach my $chunk (sort {$a <=> $b} keys %{$$TestH{$LN}{Pchunk}}) {
            # "ChunkCounter" is the per-line chunk count, not a chunk record;
            # the old code dereferenced it as a hashref (symbolic-ref bug)
            next if $chunk eq "ChunkCounter";
            my $tag = $InverseTagMap{$$TestH{$LN}{Pchunk}{$chunk}{cls}};
            my $content = $$TestH{$LN}{Pchunk}{$chunk}{content};
            # only emit chunks with at least one word character
            if ($content =~ /\w+/) {
                $str .= "<$tag>$content</$tag>\n";
            }
        }
    }

    # lexical $rXML: the old code leaked it as a package global
    my $rXML = &HeaderParse::API::AssembleXMLMetadata::assemble(\$str);
    return $rXML;
}
|
984
|
+
|
985
|
+
|
986
|
+
#Basic function: popuate information from line -- feature vector and class assignment and name patterns.
|
987
|
+
#no dictionary would be formed here
|
988
|
+
#Basic function: popuate information from line -- feature vector and class assignment and name patterns.
#no dictionary would be formed here
#
# For every line of a tagged training header ($HeaderH, keyed by line number,
# each line carrying {RawContent} with inline <tag>...</tag> markup) this sub:
#   1. strips markup to build {PureText} and a numeric feature vector {FeaVec}
#      (zero-weight features deleted; readable copy kept in {TextFeaVec});
#   2. decides whether the line is single-class ({TClass}="s", class name in
#      {SClsName}) or multi-class ({TClass}="m", tag order in {MClsName});
#   3. for multi-class lines, rewrites the markup into <<sep>>...<</sep>>
#      separators and assigns per-chunk/per-word truth tags;
#   4. for single-class author lines (class "2"), extracts true name chunks and
#      candidate name patterns via HeaderParse::API::NamePatternMatch.
# Lines with no explicit tag inherit the last tag of the previous tagged line
# (carried across iterations in %curState).
# Returns the mutated $HeaderH.
sub PopulateLineInfo4Header_unit() {
    my $HeaderH = shift;
    # %curState deliberately lives outside the loop: untagged lines reuse the
    # tag state left over from the previous tagged line.
    my %curState = ();

    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PureTextLine = $$HeaderH{$line}{RawContent};
        $PureTextLine =~ s/(\<)*\<(\/)*(\w+)\>(\>)*/ /g; # remove the tags
        # "+L+" is an internal line-break marker; drop it.
        $PureTextLine =~ s/\+L\+//g;
        $PureTextLine =~ s/^\s+//g;
        $PureTextLine =~ s/\s+$//g;
        #should make punctuation separate!
        $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);
        $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
        #add the position of the line here!!!!
        $$HeaderH{$line}{FeaVec}{Clinepos} = $line;
        my $textFeaVec = "";
        foreach my $fea(keys %{$$HeaderH{$line}{FeaVec}}) {
            # Sparse representation: zero-valued features are removed outright.
            if($$HeaderH{$line}{FeaVec}{$fea} == 0) {
                delete ($$HeaderH{$line}{FeaVec}{$fea});
            }else {
                $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
            }
        }
        $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug

        #assign class tag to each line -- not separator <<sep>><</sep>> here
        if ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/) {
            %curState = ();
            my $tmpIndex = 0; # the order of this tag showed up last time
            my $preTag = -1;
            my $mul = 0;
            # Walk every tag occurrence; seeing two different mapped tags on
            # one line marks the line multi-class.
            while ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/g) {
                $tmpIndex++;
                my $tmptag = $4;
                if (($preTag > 0) && ($preTag ne $tagMap{$tmptag})) {
                    $mul = 1;
                }
                $curState{$tagMap{$tmptag}} = $tmpIndex;
                $preTag = $tagMap{$tmptag};
            }

            if ($mul) {
                $$HeaderH{$line}{TClass} = "m";
                my $order = 1;
                # Record the left-to-right order in which each class appears.
                foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                    $$HeaderH{$line}{MClsName}{$tag} = $order;
                    $order++;
                }

                #represent the class distribution only for this multi-class case.
                my $Tline = $$HeaderH{$line}{RawContent};
                #main purpose is to combine </phone><email> as one <s>
                $Tline =~ s/\<(\/)*(\w+)\>/<s>/g; #replace the tags with <s>
                $Tline =~ s/^\s*<s>\s*//g;
                $Tline =~ s/\s*<s>\s*$//g;
                $Tline =~ s/<s>\s*<s>/<s>/g;
                $Tline =~ s/\s+/ /g;

                $Tline = &SeparatePunc($Tline);

                # Fold punctuation adjacent to an <s> boundary into the
                # separator itself: "punc <s>" -> "<<sep>>punc<</sep>>".
                # '|' is temporarily swapped for "!!!" because $whole is
                # interpolated into the substitution pattern unescaped.
                while ($Tline =~ /(\s+(\W+)\s+<s>)/g) {
                    my $whole = $1;
                    my $punc = $2;
                    $punc =~ s/^\s+//g;
                    $punc =~ s/\s+$//g;

                    if ($punc eq "\|") {
                        $Tline =~ s/\|/\!\!\!/g;
                        $whole =~ s/\|/\!\!\!/g;
                    }
                    # NOTE(review): $whole is used as an unquoted regex here, so
                    # other metacharacters ((, ), + ...) would also misbehave —
                    # only '|' is worked around.  TODO confirm intended inputs.
                    $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
                    if ($punc eq "\|") {
                        # NOTE(review): this restores by deleting "!!!" rather
                        # than mapping it back to '|' (the mirror loop below
                        # maps back) — looks asymmetric; verify.
                        $Tline =~ s/\!\!\!//g;
                        $whole =~ s/\!\!\!//g;
                    }
                }
                # Mirror case: "<s> punc" -> "<<sep>>punc<</sep>>".
                while ($Tline =~ /(<s>\s+(\W+)\s+)/g) {
                    my $whole = $1;
                    my $punc = $2;
                    $punc =~ s/^\s+//g;
                    $punc =~ s/\s+$//g;
                    if ($punc eq "\|") {
                        $Tline =~ s/\|/\!\!\!/g;
                        $whole =~ s/\|/\!\!\!/g;
                    }
                    $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
                    if ($punc eq "\|") {
                        $Tline =~ s/\!\!\!/\|/g;
                        $whole =~ s/\!\!\!/\|/g;
                    }
                }
                # Any remaining bare <s> becomes an empty separator.
                $Tline =~ s/<s>/<<sep>><<\/sep>>/g;
                my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
                #Populate Truth Hash by the chunk and word-class distribution
                $$HeaderH{$line} = &AssignWordTagFromChunk($$HeaderH{$line}, $SepH, $component);
            }else {
                $$HeaderH{$line}{TClass} = "s";
                my @Tarr = split(/\s+/, $PureTextLine);
                $$HeaderH{$line}{SClsWordCount} = $#Tarr +1;
                # Single-class line: %curState holds one tag; this loop simply
                # copies it into SClsName.
                foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                    $$HeaderH{$line}{SClsName} = $tag;
                }

                #Fill in the word-class distribution for single class line
                my $lineContent = &SeparatePunc($$HeaderH{$line}{PureText});
                my @wordArray = split(/\s+/, $lineContent);
                undef $lineContent;

                $$HeaderH{$line} = &AssignWordTag4SingleClassLine("truth", $$HeaderH{$line}{SClsName}, $$HeaderH{$line}, \@wordArray);

                #but only multi-author has multiple chunks
                #all reasonable name patterns for space separated names
                #feature vec for each space namepatterns and puncutation separators
                #Test/prediction will base on the predicted line tag in another module

                #single author
                # Class "2" is the author class (per the tag maps used above) —
                # name-pattern extraction only applies here.
                if ($$HeaderH{$line}{SClsName} eq "2") {
                    #From Truth
                    if ($$HeaderH{$line}{RawContent} !~ /<<sep>>/) {
                        #could we save space by indicating the pure text directly
                        # NOTE(review): $i is not defined in this scope, so the
                        # whole-line chunk lands under an empty-string chunk key.
                        # Looks like it should be chunk index 1 — verify.
                        $$HeaderH{$line}{Tchunk}{$i}{cls} = 2;
                        $$HeaderH{$line}{Tchunk}{$i}{content} = $$HeaderH{$line}{PureText};
                    #multiple authors
                    }else {
                        my $Tline = $$HeaderH{$line}{RawContent};
                        $Tline =~ s/<(\/)*author>//g;

                        my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
                        my $nameStr = join(" ", @$component);

                        #judge this is punctuated line or pure text-space
                        if ($$HeaderH{$line}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])|(\W+and\W+)/ig) {
                            #multi-class needs while ... $punc++;
                            $$HeaderH{$line}{NamePunc} = 1;
                        }else {
                            $$HeaderH{$line}{NameSpace} = 1;
                        }

                        #{NamePuncFeaVec} and {NameSpaceFeaVec} based on number of puncs (>2)
                        #{MulClsPuncFeaVec}

                        ######common to both name space and name punc ######
                        my $TrueNames = &HeaderParse::API::NamePatternMatch::GetTrueName($nameStr);
                        for my $i(0 .. $#$TrueNames) {
                            my $j = $i+1; #chunk should start from 1
                            $$HeaderH{$line}{Tchunk}{$j}{cls} = 2;
                            $$HeaderH{$line}{Tchunk}{$j}{content} = "$$TrueNames[$i]";
                        }
                        ################################################

                        # Name-pattern candidates are only generated for
                        # space-separated (non-punctuated) author lines.
                        if ($$HeaderH{$line}{NamePunc}) {
                        }else {
                            my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                            if ($#$PredictedNames < 1) {
                                #only one pattern -- do not fill name pattern
                            }else {
                                my $TrueIndex = &HeaderParse::API::NamePatternMatch::Duplicate($TrueNames, $PredictedNames);
                                #must solve the problem
                                if ($TrueIndex eq "-1") {
                                    if ($debug) {
                                        # NOTE(review): $content is not defined
                                        # here; this debug line prints an empty
                                        # placeholder — verify intent.
                                        print STDERR "here the true name($TrueNames) is null from the line $content \n";
                                    }
                                }else {
                                    #populate all reasonable name patterns
                                    for my $i(0 .. $#$PredictedNames) {
                                        my $candidateName = "";
                                        for my $j(0 .. $#{$$PredictedNames[$i]}) {
                                            if ($$PredictedNames[$i][$j]) {
                                                $candidateName .= "$$PredictedNames[$i][$j]<>";
                                            }
                                        }
                                        # print "candidate name\: $candidateName ";
                                        $$HeaderH{$line}{NamePattern}{$candidateName}{content} = $candidateName;
                                        ($$HeaderH{$line}{NamePattern}{$candidateName}{SpaceNameVec}) = &SpaceNameLnFeaRepre_unit($candidateName);
                                        # tag=1 marks the candidate that matches
                                        # the true segmentation, -1 otherwise.
                                        if ($i eq $TrueIndex) {
                                            $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = 1;
                                        }else {
                                            $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = -1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }else { #if there is no explicit tag for this line, this line only belongs to the last class of the previous line
            my $tmpI = 0;
            my $tmpI = 0;
            foreach my $state (sort {$curState{$b} <=> $curState{$a}} keys %curState) {
                if ($tmpI > 0) {
                    delete ($curState{$state});
                } #only keep the last tag
                $tmpI++;
            }
            $$HeaderH{$line}{TClass} = "s";
            foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                $$HeaderH{$line}{SClsName} = $tag;
            }
        }
    }

    return($HeaderH);
}
|
1191
|
+
|
1192
|
+
|
1193
|
+
# Build feature vectors for an UNLABELED (test-time) header: for each line of
# $HeaderH (keyed by line number, {RawContent} holding the raw text with no
# truth markup), populate {PureText}, {SClsWordCount}, the sparse numeric
# feature vector {FeaVec} (zero-weight entries deleted, line position added as
# feature "Clinepos"), and a human-readable copy {TextFeaVec}.
# Returns the mutated $HeaderH.  Relies on sibling subs SeparatePunc and
# LineFeatureRepre.
sub VectorizeUnknownHeaderLine () {
    my $HeaderH = shift;

    # NOTE(review): %curState is never used in this sub (copied from the
    # training-side twin PopulateLineInfo4Header_unit).
    my %curState = ();
    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PureTextLine = $$HeaderH{$line}{RawContent};
#        print "LINE $line: $PureTextLine\n";
        $PureTextLine =~ s/^\s+//g;
        $PureTextLine =~ s/\s+$//g;
        #should make punctuation separate!
        $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);

        # Word count is taken before punctuation separation, i.e. on the raw
        # whitespace-split tokens.
        my @Tarr = split(/\s+/, $PureTextLine);
        $$HeaderH{$line}{SClsWordCount} = $#Tarr +1;
        $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
#        foreach my $key (keys %{$$HeaderH{$line}{FeaVec}}) {
#            print "$key :: ".${$$HeaderH{$line}{FeaVec}}{$key}."\n";
#        }
#        print "\n";
        #add the position of the line here!!!!
        $$HeaderH{$line}{FeaVec}{Clinepos} = $line;

        my $textFeaVec = "";
        foreach my $fea(keys %{$$HeaderH{$line}{FeaVec}}) {
            # Keep the vector sparse: drop zero-valued features.
            if($$HeaderH{$line}{FeaVec}{$fea} == 0) {
                delete ($$HeaderH{$line}{FeaVec}{$fea});
            }else {
                $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
            }
        }
        $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug
    }

    return($HeaderH);
}
|
1228
|
+
|
1229
|
+
|
1230
|
+
#training data are assigned the true neighbour lines' tag
|
1231
|
+
#training data are assigned the true neighbour lines' tag
#
# For each line, look at up to 4 preceding and 4 following lines and turn
# their TRUE class tags into context features named "P<dist><tag>" /
# "N<dist><tag>" with fixed weight 0.5.  New context features are registered
# in $FeatureDictH (ID from the running FeatureCounter, max=0.5, DF counted).
# Single-class neighbours ({TClass} eq "s") contribute their {SClsName};
# multi-class neighbours contribute every tag in {MClsName}, in tag order.
# Finally the context features are appended (dictionary-ID order) to the
# line's {SVMFeaVec} string and stored as {ContextSVMFeaVec}.
# Returns ($FeatureDictH, $HeaderH), both mutated in place.
sub TrainAssignLineTag() {
    my $FeatureDictH = shift;
    my $HeaderH = shift;
    # NOTE(review): %curState is unused here.
    my %curState = ();

    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PC = 1; # 0 means the tag for current line (which might be useful)
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { #previous line
            if (exists $$HeaderH{$Pline}{TClass}) {
                if ($$HeaderH{$Pline}{TClass} eq "s") {
                    my $ContextFea = "P"."$PC"."$$HeaderH{$Pline}{SClsName}";
                    # Register the feature on first sight.
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }

                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }else { # consider the order of the tag
                    foreach my $tag(sort {$$HeaderH{$Pline}{MClsName}{$a} <=> $$HeaderH{$Pline}{MClsName}{$b}} keys %{$$HeaderH{$Pline}{MClsName}}){
                        my $ContextFea = "P"."$PC"."$tag";
                        if (! $$FeatureDictH{$ContextFea}{ID}) {
                            $$FeatureDictH{FeatureCounter}++;
                            $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                            $$FeatureDictH{$ContextFea}{max} = 0.5;
                        }
                        if ($$FeatureDictH{$ContextFea}{ID}) {
                            $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                            $$FeatureDictH{$ContextFea}{DF}++;
                        }
                    }
                }
                $PC++;
                $Pline = $line - $PC;
            }else {
                # Stop at the first untagged line going backwards.
                last;
            }
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { #next line
            if ($$HeaderH{$Nline}{TClass} eq "s") {
                my $ContextFea = "N"."$NC"."$$HeaderH{$Nline}{SClsName}";
                if (! $$FeatureDictH{$ContextFea}{ID}) {
                    $$FeatureDictH{FeatureCounter}++;
                    $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                    $$FeatureDictH{$ContextFea}{max} = 0.5;
                }
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    $$FeatureDictH{$ContextFea}{DF}++;
                }
            }else { # consider the order of the tag
                foreach my $tag(sort {$$HeaderH{$Nline}{MClsName}{$a} <=> $$HeaderH{$Nline}{MClsName}{$b}} keys %{$$HeaderH{$Nline}{MClsName}}){
                    my $ContextFea = "N"."$NC"."$tag";
                    if (! $$FeatureDictH{$ContextFea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
                        $$FeatureDictH{$ContextFea}{max} = 0.5;
                    }
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                        $$FeatureDictH{$ContextFea}{DF}++;
                    }
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        #assemble features and their weight into string without normalization
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};
        foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        # NOTE(review): unlike TestAssignLineTag, the trailing space is NOT
        # trimmed here — confirm downstream consumers tolerate it.
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
|
1317
|
+
|
1318
|
+
# Test-time twin of TrainAssignLineTag: for each line, turn the PREDICTED
# tags ({Pretag}) of up to 4 preceding / 4 following lines into context
# features "P<dist><tag>" / "N<dist><tag>" with weight 0.5, keeping only
# features already present in $FeatureDictH (no new dictionary entries are
# created at test time).  Any stale {ContextFeaVec} is cleared first.  The
# surviving context features are appended in dictionary-ID order to
# {SVMFeaVec} and stored, with trailing whitespace trimmed, as
# {ContextSVMFeaVec}.
#
# Args:    $FeatureDictH - feature dictionary ({$fea}{ID} => numeric id)
#          $HeaderH      - hashref of lines keyed by line number
# Returns: ($FeatureDictH, $HeaderH), $HeaderH mutated in place.
#
# Fixes vs. original: the loop variable $line was an undeclared package
# global; it is now lexical.  The unused %curState scratch hash is removed.
sub TestAssignLineTag() {
    my $FeatureDictH = shift;
    my $HeaderH = shift;

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        #Initialize-remove the $$HeaderH{$line}{ContextFeaVec}
        if (exists ($$HeaderH{$line}{ContextFeaVec})) {
            delete($$HeaderH{$line}{ContextFeaVec});
        }

        my $PC = 1; # 0 means the tag for current line (which might be useful)
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) { #previous line
            if (exists $$HeaderH{$Pline}{Pretag}) {
                foreach my $tag (sort keys %{$$HeaderH{$Pline}{Pretag}}) {
                    my $ContextFea = "P"."$PC"."$tag";
                    # Only known dictionary features get a weight.
                    if ($$FeatureDictH{$ContextFea}{ID}) {
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    }
                }
            }
            $PC++;
            $Pline = $line - $PC;
        }

        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) { #next line
            foreach my $tag (sort keys %{$$HeaderH{$Nline}{Pretag}}) {
                my $ContextFea = "N"."$NC"."$tag";
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        #assemble features and their weight into string without normalization
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};

        foreach my $fea (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        $tmpFeaVec =~ s/\s+$//g;
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
|
1370
|
+
|
1371
|
+
#given a line, check the number and the position of punctuation/space it contains
|
1372
|
+
#given a line, check the number and the position of punctuation/space it contains
#
# Tokenizes $line (after trimming and running it through the sibling sub
# FillSpace, which inserts explicit <space> tokens) and classifies each token
# position:
#   - "<<sep>>punc<</sep>>" : explicit separator; weight 2 in {space} (and in
#     {punc} too when it wraps real punctuation);
#   - "<space>"             : soft space separator, weight 1;
#   - pure punctuation      : weight 1 in both {punc} and {space}.
# Returns ($PuncNum, \%SeparatorH, \@component): the punctuation count, the
# position->weight maps, and the token list (separator tokens unwrapped to
# their punctuation, empty separators normalized back to "<<sep>><</sep>>").
sub GetSeparatorIndex() {
    my $line = shift;
    my %SeparatorH = ();

    my $PuncNum = 0;
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    #punc means this line contains punc or only space
    #each space occupies a position and punctuations are separate
    my ($punc, $spaceLine) = &FillSpace($line);

    #punctuation is specific; space separator contains punctuation separators.
    my @component = split(/\s+/, $spaceLine);
    foreach my $i(0 .. $#component) {
        if ($component[$i] =~ /<<sep>>(\W+|\s*)<<\/sep>>/) {
            # Unwrap the separator; $1 is its punctuation payload (may be "").
            $component[$i] = $1;
            if ($component[$i] eq "") {
                # Empty separator: restore the marker token, weight 2.
                $component[$i] = "<<sep>><<\/sep>>";
                $SeparatorH{space}{$i} = 2;
            }else {
                $SeparatorH{punc}{$i} = 2;
                $PuncNum++;
                $SeparatorH{space}{$i} = 2;
            }
        }elsif ($component[$i] =~ /<space>/) {
            $SeparatorH{space}{$i} = 1;
        }elsif ($component[$i] =~ /^[^\p{IsLower}\p{IsUpper}\s+\-\d+]+$/) {
            # Token made entirely of punctuation (letters, digits, '-' excluded).
            $SeparatorH{punc}{$i} = 1; #position(not what punc)
            $PuncNum++;
            $SeparatorH{space}{$i} = 1;
        }
    }
    return($PuncNum, \%SeparatorH, \@component);
}
|
1407
|
+
|
1408
|
+
|
1409
|
+
#multi-Authors line still has only one class, although 1+ authors
|
1410
|
+
#multi-Authors line still has only one class, although 1+ authors
#
# Split a multi-class line into chunks at strong separators and assign each
# chunk (and each word) the class tag for its position.
#
# Args:    $LineH     - one line record; {MClsName}{$tag}=order gives the
#                       left-to-right class order on the line
#          $SepH      - separator map from GetSeparatorIndex; {space}{$i} > 1
#                       marks a chunk boundary at token position $i
#          $component - arrayref of tokens for the line
# Fills:   {Tchunk}{$n}{cls|content} per chunk (1-based) and
#          {Tline}{$pos}{cls|OriginalWord} per real word (1-based).
# Returns: the mutated $LineH.
#
# Fix vs. original: the initial tag was taken as $tags[$tagP] where $tagP was
# never declared (it only worked by undef numifying to 0, with a warning);
# the first chunk's tag is explicitly $tags[0] now.
sub AssignWordTagFromChunk() {
    my ($LineH, $SepH, $component) = @_;
    my @tags = ();
    # Class tags in the order they appear on the line.
    foreach my $tag (sort {$$LineH{MClsName}{$a} <=> $$LineH{MClsName}{$b}} keys %{$$LineH{MClsName}}) {
        push @tags, $tag;
    }

    my $ChunkNO = 1;
    my $curTag = $tags[0];
    my $WordPos = 1;
    my $chunk = "";
    for my $i (0 .. $#$component) {
        #we do not assign class to separators
        if ($$SepH{space}{$i} > 1) {
            # Strong separator: close the current chunk (if non-empty) and
            # advance to the next class tag.
            if ($chunk ne "") {
                $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
                $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;
                $chunk = "";
                $curTag = $tags[$ChunkNO];
                $ChunkNO++;
            }
        }elsif ($$component[$i] !~ /<space>|^\W+$|\<\<.*\>\>/) { # such as <<sep>> <</sep>>
            $chunk .= "$$component[$i] ";
            $$LineH{Tline}{$WordPos}{cls} = $curTag;
            $$LineH{Tline}{$WordPos}{OriginalWord} = $$component[$i];
            $WordPos++;
        }
    }

    #Fill in the last chunk
    $$LineH{Tchunk}{$ChunkNO}{cls} = $curTag;
    $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;

    return ($LineH);
}
|
1445
|
+
|
1446
|
+
|
1447
|
+
# Tag every real word of a single-class line with that line's class.
#
# Args:    $type      - "truth" writes into {Tline}, "predict" into {Pline};
#                       any other value only advances positions.
#          $curTag    - the class tag shared by every word on the line
#          $LineH     - the line record to fill
#          $component - arrayref of tokens
# Tokens that are markers (<space>, <<sep>> pairs) or pure punctuation are
# skipped and do not consume a word position.  Positions are 1-based.
# Returns the mutated $LineH.
sub AssignWordTag4SingleClassLine() {
    my ($type, $curTag, $LineH, $component) = @_;

    # Pick the destination slot once; undef means "count positions only".
    my $slot = $type eq "truth"   ? "Tline"
             : $type eq "predict" ? "Pline"
             :                      undef;

    my $WordPos = 1;
    foreach my $word (@$component) {
        # Skip separator/markup tokens such as <space>, <<sep>> <</sep>>,
        # and tokens that are nothing but punctuation.
        next if $word =~ /<space>|^\W+$|\<\<.*\>\>/;
        if (defined $slot) {
            $$LineH{$slot}{$WordPos}{cls} = $curTag;
            #added 01/08 the original word in a position
            $$LineH{$slot}{$WordPos}{OriginalWord} = $word;
        }
        $WordPos++;
    }

    return ($LineH);
}
|
1467
|
+
|
1468
|
+
|
1469
|
+
# Read an SVM result file and return its first line with trailing
# whitespace stripped.
#
# Arg:     $resultF - path to the result file
# Returns: the first line of the file, chomped of trailing whitespace.
# Dies if the file cannot be opened.
#
# Fix vs. original: bareword filehandle and 2-arg open replaced with a
# lexical handle and the safe 3-argument open.
sub Analyze() {
    my $resultF = shift;
    open(my $resultFH, '<', $resultF) || die "SVMHeaderParse: could not open $resultF to read: $!";
    my $result = <$resultFH>;   # only the first line is meaningful
    close($resultFH);
    $result =~ s/\s+$//g;
    return($result);
}
|
1477
|
+
|
1478
|
+
|
1479
|
+
# Load a feature dictionary file.  Each line is "<ID><>name<>max<>DF";
# a line whose name contains "FeatureCounter" stores the total count under
# {FeatureCounter}{num} instead of a regular entry.
#
# Arg:     $Fname - path to the dictionary file
# Returns: hashref: {$fea}{ID|max|DF}, plus {FeatureCounter}{num}.
#
# Fixes vs. original: the trailing-whitespace trim was applied to the typo'd
# variable "$Df" instead of "$DF", so every DF value kept its newline; also
# replaced the bareword 2-arg open with a lexical 3-arg open.
sub ReadFeatureDict() {
    my $Fname = shift;
    my %FeatureDictH;

    open(my $fh, '<', $Fname) || die "SVMHeaderParse: could not open $Fname to read: $!";
    while (my $line = <$fh>) {
        my ($ID, $fea, $max, $DF) = split(/<>/, $line);
        $ID =~ s/^\s+//g;
        $ID =~ s/\s+$//g;

        if ($fea =~ /FeatureCounter/) {
            # Special bookkeeping record: first field is the feature count.
            $FeatureDictH{$fea}{num} = $ID;
            next;
        }

        $fea =~ s/^\s+//g;
        $fea =~ s/\s+$//g;
        $max =~ s/^\s+//g;
        $max =~ s/\s+$//g;
        $DF =~ s/^\s+//g;
        $DF =~ s/\s+$//g;
        $FeatureDictH{$fea}{ID} = $ID;
        $FeatureDictH{$fea}{max} = $max;
        $FeatureDictH{$fea}{DF} = $DF;
    }
    close($fh);
    return(\%FeatureDictH);
}
|
1507
|
+
|
1508
|
+
|
1509
|
+
# Write one SVM training file per class (1..15) under the directory prefix
# held in the file-level global $offlineD, named "<offlineD><cls>.<affix>".
# For $affix "train" each line with a non-empty {SVMFeaVec} is emitted as
# "+1 <vec>" when the line's true class ({SClsName} or a {MClsName} key)
# matches the file's class, else "-1 <vec>".  For $affix "context" the same
# labeling is applied to {ContextSVMFeaVec}.  Any other affix only prints a
# warning to STDOUT.
#
# Args: $affix ("train" or "context"), $HeaderH keyed header->line->record.
#
#Sometimes $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} is not null, but
#$$HeaderH{$HeaNO}{$LN}{SVMFeaVec} is null so they have different file length!
sub printTrainData() {
    my $affix = shift;
    my $HeaderH = shift;

    # NOTE(review): the class range 1..15 is hard-coded here; presumably it
    # matches the tag map used elsewhere in this module — verify.
    for my $clsNO(1 .. 15) {
        my $F = "$offlineD"."$clsNO"."\."."$affix";
        open(FH, ">$F") || die "SVMHeaderParse: could not open $F to write: $!";
        foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
            foreach my $LN(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
                # Lines without a feature vector are skipped entirely; see the
                # header comment about the resulting length mismatch.
                if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                    if ($affix eq "train") {
                        # NOTE(review): this inner test repeats the guard above
                        # and is redundant.
                        if ($$HeaderH{$HeaNO}{$LN}{SVMFeaVec} ne "") {
                            if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                                print FH "1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }else {
                                print FH "-1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                            }
                        }
                    }elsif ($affix eq "context") {
                        #if ($$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec} ne "") {
                        if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                            print FH "1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }else {
                            print FH "-1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                        }
                    }else {
                        print "weired -- $affix is not context nor train \n";
                    }
                }
            }
        }
        close(FH);
    }
}
|
1545
|
+
|
1546
|
+
|
1547
|
+
# Write the space-separated-name training vectors to $printF, one
# {SpaceNameVec} string per line, in ascending numeric key order.
#
# Args:    $printF             - output file path
#          $NameSpaceTrainVecH - hashref keyed by line counter with
#                                {SpaceNameVec} strings
# Dies if the file cannot be opened.
#
# Fix vs. original: bareword filehandle and interpolated 2-arg open replaced
# with a lexical handle and the safe 3-argument open.
sub printNameSpaceTrainData(){
    my $printF = shift;
    my $NameSpaceTrainVecH = shift;

    open(my $fh, '>', $printF) || die "SVMHeaderParse: could not open $printF to write: $!";
    foreach my $Lcount (sort {$a <=> $b} keys %{$NameSpaceTrainVecH}) {
        print $fh "$$NameSpaceTrainVecH{$Lcount}{SpaceNameVec}\n";
    }
    close($fh);
}
|
1557
|
+
|
1558
|
+
|
1559
|
+
# Turn a "<>"-delimited candidate name pattern into a feature hash and,
# depending on $type, either grow the name-feature dictionary ("train") or
# normalize against it and render SVM vector strings (any other type).
#
# Args:    $type           - "train" or test mode
#          $NamePatternStr - names separated by "<>", words by spaces
#          $NameDictH      - name-feature dictionary ({$fea}{ID|max},
#                            {FeatureCounter})
# Returns: train: (\%FeatureH, $NameDictH); test: ($SpaceNameFeaVec,
#          $SpaceNameTextFeaVec) vector strings.
# Uses file-level globals %firstnameH, %lastnameH, %dictH and the external
# RichNameType classifier.
sub SpaceNameLnFeaRepre() {
    my $type = shift;
    my $NamePatternStr = shift;
    my $NameDictH = shift;

    #feature generation and representation
    #It is good to make each of the apple's feature(color, shape..) separate.
    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; #remove the last <>
    my @Names = split(/<>/, $NamePatternStr);

    #try making features binary
    for my $i(0 .. $#Names) {
        my @NameComponent = split(/\s+/, $Names[$i]);
        for my $j(0 .. $#NameComponent){

            #feature generation($i = 0 is the first one)
            # Per word: its surface form type, its position class
            # (Last / SecLast / index), and name-dictionary membership.
            $FeatureH{"Name"."$i"."part"."$j"."form"} = &HeaderParse::API::NamePatternMatch::RichNameType($NameComponent[$j]);
            if ($j eq $#NameComponent) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "Last";
            }elsif ($j eq $#NameComponent -1) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "SecLast";
            }else {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = $j;
            }

            #firstname, lastname information
#            print "hello: ".lc($NameComponent[$j])."\n";
            if (($firstnameH{lc($NameComponent[$j])}) && (!$lastnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."FN"} = 1;
            }elsif (($lastnameH{lc($NameComponent[$j])}) && (!$firstnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."LN"} = 1;
            }elsif (! $dictH{lc($NameComponent[$j])}) {
                $FeatureH{"Name"."$i"."part"."$j"."NonDict"} = 1;
            }

            #space for more features
        }
    }

    #Build up FeatureVec
    #code for the attribute ID separately so that the ID for features would be continuous
    if ($type eq "train") {
        foreach my $fea(sort {$a <=> $b} keys %FeatureH) {
            if (! $$NameDictH{$fea}{ID}) {
                $$NameDictH{FeatureCounter}++;
                $$NameDictH{$fea}{ID} = $$NameDictH{FeatureCounter};
            }

            # Non-numeric feature VALUES ("Last", form names, ...) are also
            # interned in the dictionary and replaced by their numeric ID.
            # NOTE(review): called here without '&' unlike the test branch;
            # with IsNumber's empty () prototype this relies on call order —
            # verify it parses as intended in the full file.
            if (! IsNumber($FeatureH{$fea})) {
                if (! exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $$NameDictH{FeatureCounter}++;
                    $$NameDictH{$FeatureH{$fea}}{ID} = $$NameDictH{FeatureCounter};
                }
                $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
            }

            if ($FeatureH{$fea} == 0) {
                delete($FeatureH{$fea});
            }else {
                # Track the per-feature maximum for later normalization.
                if ((! exists $$NameDictH{$fea}{max}) || ($$NameDictH{$fea}{max} < $FeatureH{$fea})) {
                    $$NameDictH{$fea}{max} = $FeatureH{$fea};
                }
            }
        }
        return(\%FeatureH, $NameDictH);
        #test
    }else {
        my $SpaceNameFeaVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$NameDictH{$a}{ID} <=> $$NameDictH{$b}{ID}} keys %FeatureH) {
            if (! &IsNumber($FeatureH{$fea})) {
                if (exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
                }else {
                    # Unknown categorical value at test time: drop the feature.
                    delete($FeatureH{$fea});
                }
            }

            if (! ($FeatureH{$fea} && $$NameDictH{$fea}{ID})) {
                delete($FeatureH{$fea});
            }else {
                # Normalize by the training-time maximum of this feature.
                $FeatureH{$fea} = sprintf("%.8f", $FeatureH{$fea}/$$NameDictH{$fea}{max});
                $SpaceNameFeaVec .= "$$NameDictH{$fea}{ID}\:$FeatureH{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$FeatureH{$fea} ";
            }
        }
        return($SpaceNameFeaVec, $SpaceNameTextFeaVec);
    }
}
|
1649
|
+
|
1650
|
+
|
1651
|
+
# Dictionary-free variant of SpaceNameLnFeaRepre: generate the raw feature
# hash for a "<>"-delimited candidate name pattern without touching any
# feature dictionary (IDs and normalization are handled later, e.g. in
# FormFeaDict).
#
# Arg:     $NamePatternStr - names separated by "<>", words by spaces
# Returns: \%FeatureH mapping feature names ("Name<i>part<j>form|pos|FN|LN|
#          NonDict") to values.
# Uses file-level globals %firstnameH, %lastnameH, %dictH and the external
# RichNameType classifier.
sub SpaceNameLnFeaRepre_unit() {
    my $NamePatternStr = shift;

    #feature generation and representation
    #It is good to make each of the apple's feature(color, shape..) separate.
    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; #remove the last <>
    my @Names = split(/<>/, $NamePatternStr);

    #try making features binary
    for my $i(0 .. $#Names) {
        my @NameComponent = split(/\s+/, $Names[$i]);
        for my $j(0 .. $#NameComponent){
            #feature generation($i = 0 is the first one)
            $FeatureH{"Name"."$i"."part"."$j"."form"} = &HeaderParse::API::NamePatternMatch::RichNameType($NameComponent[$j]);
            # Position class of the word inside its name.
            if ($j eq $#NameComponent) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "Last";
            }elsif ($j eq $#NameComponent -1) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "SecLast";
            }else {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = $j;
            }
            #firstname, lastname information
#            print "hello2: ".lc($NameComponent[$j])."\n";
            # Unambiguous first-name / last-name dictionary hits; otherwise
            # flag words absent from the general dictionary.
            if (($firstnameH{lc($NameComponent[$j])}) && (!$lastnameH{lc($NameComponent[$j])})) {
#                print "NAME MATCH: ".lc($NameComponent[$j])."\n";
                $FeatureH{"Name"."$i"."part"."$j"."FN"} = 1;
            }elsif (($lastnameH{lc($NameComponent[$j])}) && (!$firstnameH{lc($NameComponent[$j])})) {
#                print "NAME MATCH: ".lc($NameComponent[$j])."\n";

                $FeatureH{"Name"."$i"."part"."$j"."LN"} = 1;
            }elsif (! $dictH{lc($NameComponent[$j])}) {
#                print "NAME MATCH: ".lc($NameComponent[$j])."\n";

                $FeatureH{"Name"."$i"."part"."$j"."NonDict"} = 1;
            }

            #space for more features
        }
    }
    return(\%FeatureH);
}
|
1693
|
+
|
1694
|
+
|
1695
|
+
# Return 1 iff the argument looks like an unsigned decimal number: one run
# of digits optionally followed by one or more ".digits" groups (so "1.2.3"
# also passes, exactly as the original pattern allowed).  Signed values,
# leading dots, and empty strings return 0.
sub IsNumber ()
{
    my $candidate = shift;
    return ($candidate =~ m/^(\d+)(\.\d+)*$/) ? 1 : 0;
}
|
1704
|
+
|
1705
|
+
|
1706
|
+
# Build (and update) two feature dictionaries from the full training data:
#  - $FeatureDictH for the per-line feature vectors {FeaVec}: assigns IDs from
#    the running FeatureCounter, tracks per-feature max and document frequency,
#    and prunes zero-weight / unregistered features from the data in place;
#  - a fresh %NameSpaceFeaDictH for the candidate-name features under
#    {NamePattern}{...}{SpaceNameVec}: interns both feature names and
#    non-numeric feature values (via sibling IsNumber), replacing categorical
#    values with their numeric IDs and tracking per-feature maxima.
# Returns ($DataH, $FeatureDictH, \%NameSpaceFeaDictH); the first two are
# mutated in place.
sub FormFeaDict() {
    my $DataH = shift;
    my $FeatureDictH = shift;
    my %NameSpaceFeaDictH = ();

    foreach my $HeaNO (sort {$a <=> $b} keys %{$DataH}) {
        foreach my $line (sort {$a <=> $b} keys %{$$DataH{$HeaNO}}) {
            foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{FeaVec}}) {
                if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0) {
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                    next;
                }else {
                    if (! $$FeatureDictH{$fea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$fea}{ID} = $$FeatureDictH{FeatureCounter};
                    }
                    # Track the maximum observed weight for normalization.
                    if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} > $$FeatureDictH{$fea}{max}) {
                        $$FeatureDictH{$fea}{max} = $$DataH{$HeaNO}{$line}{FeaVec}{$fea};
                    }
                    $$FeatureDictH{$fea}{DF}++;
                }
                #test needs this line!
                # NOTE(review): after the branch above the feature always has
                # an ID and a non-zero weight, so this guard appears to be
                # dead in training; kept for parity with the test path.
                if ((! $$FeatureDictH{$fea}{ID}) || ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0)) { #some basic feature defined in initialization such as pubnumber could be 0
                    delete ($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            #form the Name Space Feature Dict
            if (exists $$DataH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$DataH{$HeaNO}{$line}{NamePattern}}) {
                    foreach my $fea(keys %{$$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        my $wt = $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea};
                        if (! $NameSpaceFeaDictH{$fea}{ID}) {
                            $NameSpaceFeaDictH{FeatureCounter}++;
                            $NameSpaceFeaDictH{$fea}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                        }
                        # Categorical values (e.g. "Last") get their own IDs
                        # and the stored value is replaced by that ID.
                        if (! &IsNumber($wt)) {
                            if (! exists $NameSpaceFeaDictH{$wt}{ID}) {
                                $NameSpaceFeaDictH{FeatureCounter}++;
                                $NameSpaceFeaDictH{$wt}{ID} = $NameSpaceFeaDictH{$wt}{ID} ? $NameSpaceFeaDictH{$wt}{ID} : $NameSpaceFeaDictH{FeatureCounter};
                            }
                            $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = $NameSpaceFeaDictH{$wt}{ID};
                        }

                        if ($wt == 0) {
                            delete($$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }else {
                            # NOTE(review): max is compared against the RAW $wt,
                            # even after the stored value was remapped to an ID —
                            # presumably intentional for numeric features only.
                            if ((! exists $NameSpaceFeaDictH{$fea}{max}) || ($NameSpaceFeaDictH{$fea}{max} < $wt)) {
                                $NameSpaceFeaDictH{$fea}{max} = $wt;
                            }
                        }
                    }
                }
            }
            #end of form the dictionary for the name
        }
    }
    return($DataH, $FeatureDictH, \%NameSpaceFeaDictH);
}
|
1765
|
+
|
1766
|
+
|
1767
|
+
# Filter a test header's feature vectors against the training feature
# dictionary: a feature that is absent from the dictionary, or whose
# weight is zero, carries no information for the SVM and is removed
# in place.
#
# Parameters:
#   $FeatureDictH - hashref of feature => {ID, max, DF, ...} plus a
#                   scalar "FeatureCounter" entry
#   $TestHeaderH  - hashref of line number => {FeaVec => {feature => weight}}
# Returns the (modified) $TestHeaderH.
sub FormTestFeaVec {    # was "sub FormTestFeaVec()": the empty prototype
                        # forbids arguments and breaks non-& call sites
    my $FeatureDictH = shift;
    my $TestHeaderH  = shift;

    foreach my $line (sort { $a <=> $b } keys %{$TestHeaderH}) {
        foreach my $fea (keys %{$$TestHeaderH{$line}{FeaVec}}) {
            # Drop features unknown to the dictionary or with zero weight.
            if ((! $$FeatureDictH{$fea}{ID}) || ($$TestHeaderH{$line}{FeaVec}{$fea} == 0)) {
                delete($$TestHeaderH{$line}{FeaVec}{$fea});
            }
        }
    }
    return ($TestHeaderH);
}
|
1780
|
+
|
1781
|
+
|
1782
|
+
# Prune the feature dictionary and renumber the surviving feature IDs
# densely from 1. A feature is discarded when it never fires (max == 0)
# or occurs in fewer than two documents (DF < 2). The scalar
# "FeatureCounter" entry is rewritten to the number of surviving features.
#
# Parameter: $FeatureDictH - hashref of feature => {ID, max, DF}.
# Returns the (modified) $FeatureDictH.
sub PruneDict {    # prototype "()" removed: it forbids arguments on non-& calls
    my $FeatureDictH = shift;
    my $Recount      = 1;

    # Iterate real feature entries only. "FeatureCounter" holds a plain
    # scalar; the old code dereferenced it as a hashref (in the sort block
    # and the DF test), which dies under "use strict 'refs'". Excluding it
    # here leaves the final state unchanged, since it is rewritten below.
    foreach my $DictFea (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                         grep { $_ ne "FeatureCounter" } keys %{$FeatureDictH}) {
        if (($$FeatureDictH{$DictFea}{max} == 0) || ($$FeatureDictH{$DictFea}{DF} < 2)) {
            delete($$FeatureDictH{$DictFea});
        }
        else {
            $$FeatureDictH{$DictFea}{ID} = $Recount;
            $Recount++;
        }
    }

    $$FeatureDictH{FeatureCounter} = $Recount - 1;

    return ($FeatureDictH);
}
|
1799
|
+
|
1800
|
+
#input is an array of name patterns
|
1801
|
+
#return a string of the best name pattern
|
1802
|
+
# Choose the most probable author-name pattern among candidate
# segmentations by scoring each with the external name-space SVM model
# and keeping the highest-scoring one.
#
# Parameters:
#   $PredictedNames    - arrayref of candidates; each candidate is an
#                        arrayref of name parts
#   $SVMNameSpaceModel - path to the trained SVM model file
#   $SpaceNameDictH    - name-space feature dictionary (feature => {ID, max})
#   $tmpCacheVec       - scratch file the feature vector is written to
#   $SVMTmpResult      - scratch file the classifier writes its score into
# Returns the best candidate as a single "<>"-joined string ("" if no
# candidate scores above the floor). Relies on the file-level $Classifier
# path and the helpers SpaceNameLnFeaRepre_unit, IsNumber and Analyze.
sub PredictBestNamePattern {    # "()" prototype dropped: it forbids arguments
    my $PredictedNames    = shift;
    my $SVMNameSpaceModel = shift;
    my $SpaceNameDictH    = shift;
    my $tmpCacheVec       = shift;
    my $SVMTmpResult      = shift;

    my $MaxVal          = -10;    # floor below any plausible SVM decision value
    my $BestNamePattern = "";

    for my $i (0 .. $#$PredictedNames) {
        # Flatten the candidate's parts into a "<>"-separated string.
        my $candidateName = "";
        for my $j (0 .. $#{$$PredictedNames[$i]}) {
            if ($$PredictedNames[$i][$j]) {
                $candidateName .= "$$PredictedNames[$i][$j]<>";
            }
        }

        my ($RawNameFeaVec) = &SpaceNameLnFeaRepre_unit($candidateName);

        # Keep only dictionary features; normalize weights by the feature max.
        my $SpaceNameVec        = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea (sort { $$SpaceNameDictH{$a}{ID} <=> $$SpaceNameDictH{$b}{ID} } keys %{$RawNameFeaVec}) {
            my $wt = $$RawNameFeaVec{$fea};
            if (! &IsNumber($wt)) {
                # Symbolic weights are mapped to their dictionary IDs.
                if (exists $$SpaceNameDictH{$wt}{ID}) {
                    $$RawNameFeaVec{$fea} = $$SpaceNameDictH{$wt}{ID};
                }
                else {
                    delete($$RawNameFeaVec{$fea});
                }
            }

            if (! (($$RawNameFeaVec{$fea} > 0) && $$SpaceNameDictH{$fea}{ID})) {
                delete($$RawNameFeaVec{$fea});
            }
            else {
                $$RawNameFeaVec{$fea} = sprintf("%.8f", $$RawNameFeaVec{$fea} / $$SpaceNameDictH{$fea}{max});
                $SpaceNameVec        .= "$$SpaceNameDictH{$fea}{ID}\:$$RawNameFeaVec{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$$RawNameFeaVec{$fea} ";
            }
        }

        # Was: 2-arg open onto the bareword handle "testVec". A 3-arg open
        # with a lexical handle avoids mode injection via the file name and
        # does not clobber a package-wide handle.
        open(my $vec_fh, '>', $tmpCacheVec)
            || die "SVMHeaderParse: could not open $tmpCacheVec to write: $!";
        print $vec_fh "$SpaceNameVec";
        close($vec_fh);

        `$Classifier -v 0 $tmpCacheVec $SVMNameSpaceModel $SVMTmpResult`;
        my $result = &Analyze($SVMTmpResult);
        if ($result > $MaxVal) {
            $MaxVal          = $result;
            $BestNamePattern = $candidateName;
        }
    }

    unlink $tmpCacheVec;
    unlink $SVMTmpResult;

    $BestNamePattern =~ s/\<\>$//g;    # strip the trailing "<>" separator

    return ($BestNamePattern);
}
|
1863
|
+
|
1864
|
+
|
1865
|
+
# Count the whitespace-separated tokens of a string, ignoring tokens that
# consist solely of punctuation (non-word characters).
#
# Parameter: $inStr - input string.
# Returns the word count (0 for an empty/whitespace-only string).
sub WordCount {    # "()" prototype dropped: it forbids arguments on non-& calls
    my $inStr = shift;
    $inStr =~ s/^\s+//g;
    $inStr =~ s/\s+$//g;

    my $senLen = 0;
    my @words  = split(/\s+/, $inStr);
    for my $i (0 .. $#words) {
        # BUG FIX: the original tested $words[0] on every iteration, so the
        # punctuation filter only ever looked at the first token and the
        # count depended solely on whether the string started with punctuation.
        if ($words[$i] !~ /^\W+\s*$/) {
            $senLen++;
        }
    }
    return ($senLen);
}
|
1879
|
+
|
1880
|
+
1;
|