biblicit 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/Gemfile +6 -0
- data/LICENSE.TXT +176 -0
- data/README.md +120 -0
- data/Rakefile +8 -0
- data/biblicit.gemspec +33 -0
- data/lib/biblicit/cb2bib.rb +83 -0
- data/lib/biblicit/citeseer.rb +53 -0
- data/lib/biblicit/extractor.rb +37 -0
- data/lib/biblicit.rb +6 -0
- data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
- data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
- data/perl/FileConversionService/README.TXT +11 -0
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
- data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
- data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
- data/perl/HeaderParseService/README.TXT +80 -0
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
- data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
- data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
- data/perl/HeaderParseService/resources/database/50states +60 -0
- data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
- data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
- data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
- data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
- data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
- data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
- data/perl/HeaderParseService/resources/database/README +2 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
- data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
- data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
- data/perl/HeaderParseService/resources/database/addr.txt +28 -0
- data/perl/HeaderParseService/resources/database/affi.txt +34 -0
- data/perl/HeaderParseService/resources/database/affis.bin +0 -0
- data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
- data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
- data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
- data/perl/HeaderParseService/resources/database/city.txt +3150 -0
- data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
- data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
- data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
- data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
- data/perl/HeaderParseService/resources/database/degree.txt +67 -0
- data/perl/HeaderParseService/resources/database/email.txt +3 -0
- data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
- data/perl/HeaderParseService/resources/database/female-names +4960 -0
- data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
- data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/intro.txt +2 -0
- data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
- data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
- data/perl/HeaderParseService/resources/database/male-names +3906 -0
- data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
- data/perl/HeaderParseService/resources/database/month.txt +35 -0
- data/perl/HeaderParseService/resources/database/mul +868 -0
- data/perl/HeaderParseService/resources/database/mul.label +869 -0
- data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
- data/perl/HeaderParseService/resources/database/mul.processed +762 -0
- data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
- data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
- data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
- data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
- data/perl/HeaderParseService/resources/database/note.txt +121 -0
- data/perl/HeaderParseService/resources/database/page.txt +1 -0
- data/perl/HeaderParseService/resources/database/phone.txt +9 -0
- data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
- data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
- data/perl/HeaderParseService/resources/database/statename.bin +0 -0
- data/perl/HeaderParseService/resources/database/statename.txt +73 -0
- data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
- data/perl/HeaderParseService/resources/database/stopwords +438 -0
- data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
- data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
- data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
- data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
- data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
- data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
- data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
- data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
- data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
- data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
- data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
- data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
- data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
- data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
- data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
- data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
- data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
- data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
- data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
- data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
- data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
- data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
- data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
- data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
- data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
- data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
- data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
- data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
- data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
- data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
- data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
- data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
- data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
- data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
- data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
- data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
- data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
- data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
- data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
- data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
- data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
- data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
- data/perl/HeaderParseService/resources/database/url.txt +1 -0
- data/perl/HeaderParseService/resources/database/webTopWords +225 -0
- data/perl/HeaderParseService/resources/database/words +45402 -0
- data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
- data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
- data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
- data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
- data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
- data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
- data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
- data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
- data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
- data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
- data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
- data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
- data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
- data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
- data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
- data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
- data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
- data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
- data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
- data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
- data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
- data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
- data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
- data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
- data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
- data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
- data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
- data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
- data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
- data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
- data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
- data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
- data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
- data/perl/ParsCit/README.TXT +82 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
- data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
- data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
- data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
- data/perl/extract.pl +199 -0
- data/spec/biblicit/cb2bib_spec.rb +48 -0
- data/spec/biblicit/citeseer_spec.rb +40 -0
- data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
- data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
- data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
- data/spec/spec_helper.rb +3 -0
- metadata +474 -0
@@ -0,0 +1,2016 @@
|
|
1
|
+
#
|
2
|
+
# Copyright 2007 Penn State University
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
#
|
13
|
+
package HeaderParse::API::Function;
|
14
|
+
|
15
|
+
use utf8;
|
16
|
+
use HeaderParse::Config::API_Config qw($Database_Dir);
|
17
|
+
use HeaderParse::API::LoadInformation;
|
18
|
+
require Exporter;
|
19
|
+
use Storable qw(nfreeze thaw);
|
20
|
+
use Data::Dumper;
|
21
|
+
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
22
|
+
use vars qw(%dictH %nameH %monthH %affiH %addrH %conjH %prepH %postcodeH %cityH %stateH %countryH %abstractH);
|
23
|
+
|
24
|
+
@ISA = qw(Exporter); # important!!
|
25
|
+
@EXPORT = qw(&weired_author &AddrMatch &printDict &GenTrainVecMatrix &GetBorderLine &LineFeatureRepre &LineFeatureRepre2 &OfflineFillSpace &FillSpace &SeparatePunc &hash_stopwords &hash_nickname &hash_affi_stopwords &hash_addrwords &hash_statewords &str_space_clean &dump_hash_to_file &nfreeze_hash_to_file &read_hash_from_file &thaw_hash_from_file &rand_split_samples_to2parts &rand_split_samples_toNparts &rand_split_hash_index_toNparts &ExtractBinaryNfoldSVMResult &GetNameVariations &get_university_emails &compute_std);
|
26
|
+
|
27
|
+
# Scan a whitespace-separated line for two-word state / country / city
# names and replace each match with a ":state:" / ":country:" / ":city:"
# placeholder token, while counting the non-punctuation tokens.
#
# Arguments:
#   $inline - one line of header text.
#
# Returns a two-element list:
#   ($inline, $senLen) - the (possibly rewritten) line with normalized
#   spacing, and the number of non-punctuation tokens seen in the input.
#
# NOTE(review): lookups consult the package globals %stateH, %countryH
# and %cityH declared in this module's preamble; they are presumably
# populated by HeaderParse::API::LoadInformation — confirm before reuse.
#
# Fixes vs. original: dropped the bogus empty prototype "()" (the sub is
# called with an argument, which a honored zero-arg prototype would reject
# at compile time), and an empty or all-whitespace line no longer counts
# a phantom first token in $senLen.
sub AddrMatch {
    my $inline = shift;

    my @words  = split(/\s+/, $inline);
    my $senLen = 0;

    # A token consisting purely of punctuation does not count toward the
    # sentence length.  Guard against the undef/empty leading field that
    # split() produces for empty or leading-whitespace input.
    if (defined $words[0] && length $words[0] && $words[0] !~ /^\W+\s*$/) {
        $senLen++;
    }

    # Match state/country/city names using pairs of adjacent words; this
    # step can be time consuming.  Both words must start with an
    # uppercase letter before we pay for the lowercased hash lookups.
    for my $i (1 .. $#words) {
        if ($words[$i] !~ /^\W+\s*$/) {
            $senLen++;    # not pure punctuation
        }

        if (($words[$i - 1] =~ /^[\p{IsUpper}]/) && ($words[$i] =~ /^[\p{IsUpper}]/)) {
            my $pre = lc($words[$i - 1]);
            my $now = lc($words[$i]);

            # Order matters: state wins over country wins over city,
            # exactly as in the original implementation.
            if (exists $stateH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":state:";
            }
            elsif (exists $countryH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":country:";
            }
            elsif (exists $cityH{"$pre $now"}) {
                $words[$i - 1] = "";
                $words[$i]     = ":city:";
            }
        }
    }

    # Re-join (interpolation inserts single spaces) and trim the
    # leading/trailing whitespace left behind by blanked-out words.
    $inline = "@words";
    $inline =~ s/^\s+//g;
    $inline =~ s/\s+$//g;

    return ($inline, $senLen);
}
|
87
|
+
|
88
|
+
|
89
|
+
# Normalize the per-feature statistics in %dictH and write the feature
# dictionary file.
#
# For every feature with an assigned {ID}:
#   - {mean} is divided by $TotalTrainLineCount (before-norm mean),
#   - ANmean is that mean further normalized by the feature's {max},
# and one line per feature is written to $dictF, ordered by ID, as
#   "<df> <ID> <feature>: max(<max>) BNmean(<mean>) ANmean(<ANmean>)"
#
# Arguments:
#   $TotalTrainLineCount - number of training lines used to average {mean}.
#   $dictF               - path of the dictionary file to (over)write.
#   %dictH               - feature => { ID, df, max, mean } statistics.
#
# Returns the updated %dictH (each written feature's {mean} replaced by
# its averaged value).
#
# Fixes vs. original: three-arg open with a lexical filehandle instead of
# a two-arg bareword open; dropped the bogus empty prototype "()" which a
# honored zero-arg prototype would reject this sub's own argument list
# with; a feature with max == 0 no longer dies of division by zero right
# after printing its diagnostic — ANmean falls back to 0.
sub printDict {
    my ($TotalTrainLineCount, $dictF, %dictH) = @_;

    open(my $dict_fh, '>', $dictF)
        || die "SVMHeaderParse: could not open dictfile\: $dictF to write\n";

    # Emit features ordered by numeric ID; skip entries never assigned one.
    foreach my $feature (sort { $dictH{$a}{ID} <=> $dictH{$b}{ID} } keys %dictH) {
        next unless defined $dictH{$feature}{ID};

        $dictH{$feature}{mean} =
            sprintf("%.8f", $dictH{$feature}{mean} / $TotalTrainLineCount);

        my $ANmean;
        if ($dictH{$feature}{max} == 0) {
            # Diagnostic message preserved verbatim from the original.
            print STDERR "$feature Yahoo1 \n";
            $ANmean = sprintf("%.8f", 0);
        }
        else {
            $ANmean = sprintf("%.8f",
                $dictH{$feature}{mean} / $dictH{$feature}{max});
        }

        print {$dict_fh} "$dictH{$feature}{df} $dictH{$feature}{ID} $feature\: max\($dictH{$feature}{max}\) BNmean\($dictH{$feature}{mean}\) ANmean\($ANmean\)\n";
    }
    close($dict_fh);

    return (%dictH);
}
|
109
|
+
|
110
|
+
# Write SVM training feature vectors to $TrainFeatureVec, one output line per
# training line, in the form "(tag1 tag2 ) id:value id:value ...", with a
# "<NEW_HEADER>" separator after each sample.  When $GenMatrix is true it
# additionally writes a sparse matrix file ($TrainMatrixF: one
# " lineNo featureID value" triple per row) and a tag-index file ($TrainTagInd).
#
# Args:
#   $FeatureDictH     - hashref: feature => {ID, max, mean, ...}
#   $TrainFeatureVecH - hashref: sample => line => {tag => {...},
#                       content => {feature => value}}; emptied before
#                       returning to release memory
#   $TrainFeatureVec  - output path for the SVM vector file
#   $TrainMatrixF     - output path for the sparse matrix (used only if $GenMatrix)
#   $TrainTagInd      - output path for the tag index (used only if $GenMatrix)
#   $GenMatrix        - boolean: also emit matrix + tag-index files
#   $norm             - boolean: divide each feature value by its dictionary max
#   $center           - boolean: additionally subtract mean/max (full-dictionary pass)
#
# Fixes vs. the original: 3-arg open on lexical filehandles; the tag-index
# handle is only written/closed when it was actually opened (the original
# printed to an unopened bareword handle whenever $GenMatrix was false); the
# misleading empty prototype () is dropped (callers using &GenTrainVecMatrix(...)
# are unaffected).
sub GenTrainVecMatrix {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    open(my $vecFH, '>', $TrainFeatureVec)
        or die "SVMHeaderParse: could not open TrainFeatureVec $TrainFeatureVec to write\n";
    my ($matrixFH, $tagFH);
    if ($GenMatrix) {
        open($matrixFH, '>', $TrainMatrixF)
            or die "SVMHeaderParse: could not open TrainMatrF: $TrainMatrixF to write\n";
        open($tagFH, '>', $TrainTagInd)
            or die "SVMHeaderParse: could not open TrainTagInd: $TrainTagInd to write\n";
    }

    my $TmpTrainLineNo = 0;
    foreach my $s (sort { $a <=> $b } keys %{$TrainFeatureVecH}) {
        foreach my $li (sort { $a <=> $b } keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo++;

            # Multi-class: emit every tag of this line inside "( ... ) ".
            print {$vecFH} "(";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print {$vecFH} "$tmpCurState ";
                print {$tagFH} "$tmpCurState " if $tagFH;
            }
            print {$vecFH} ") ";
            print {$tagFH} "\n" if $tagFH;

            if ($GenMatrix == 0) {
                if ($norm) {
                    if ($center == 1) {
                        # Normalize AND center: loop over the FULL dictionary so
                        # features absent on this line get an explicit centered value.
                        foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                             keys %{$FeatureDictH}) {
                            if (!exists $$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
                                $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                            }
                            # norm by the feature's max, then center by mean/max
                            my $featureVal = sprintf("%.8f",
                                $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                            $featureVal -= sprintf("%.8f",
                                $$FeatureDictH{$feature}{mean} / $$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print {$vecFH} "\n";
                    }
                    else {
                        # Normalize only: loop over the features present on this line.
                        foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                             keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                            next if !defined $$FeatureDictH{$feature}{ID};
                            my $featureVal = sprintf("%.8f",
                                $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                            if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                                print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                            }
                        }
                        print {$vecFH} "\n";
                    }
                }
                else {    # $norm == 0: raw values, non-zero only
                    foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                         keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                        next if !defined $$FeatureDictH{$feature}{ID};
                        if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {    # must be != 0
                            print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        }
                    }
                    print {$vecFH} "\n";
                }
            }
            else {    # $GenMatrix: also emit sparse-matrix triples for this line
                foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                     keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    next if !defined $$FeatureDictH{$feature}{ID};
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f",
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
                        print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # matrix row for the training samples: <line> <featureID> <value>
                        print {$matrixFH} " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print {$vecFH} "\n";
            }
        }    # end foreach li (line)
        print {$vecFH} "<NEW_HEADER>\n";
        print {$tagFH} "<NEW_HEADER>\n" if $tagFH;
    }    # end foreach s (sample)

    close($vecFH);
    undef(%{$TrainFeatureVecH});    # release the training vector hash
    our $endTrain;
    $endTrain = 0;    # preserve original's reset of this package global
    if ($GenMatrix) {
        close($tagFH);
        close($matrixFH);
    }
}
|
206
|
+
|
207
|
+
# this is for the plaintext class -- no difference from GenTrainVecMatrix
|
208
|
+
# Plaintext-class variant of GenTrainVecMatrix (per the original's comment,
# "no difference" — but note one structural quirk preserved below): writes SVM
# training vectors to $TrainFeatureVec and, when $GenMatrix is true, a sparse
# matrix file and a tag-index file.  Arguments are identical to
# GenTrainVecMatrix.
#
# Quirk preserved from the original: when $GenMatrix==0 and $norm==0 no branch
# matches, so a line gets only its "(tags ) " prefix — no feature values and
# no terminating newline.
#
# Fixes vs. the original: 3-arg open on lexical filehandles; tag-index handle
# only written/closed when actually opened; the dead `if (0) { ... }` debug
# block removed; misleading empty prototype () dropped.
sub GenOriginalTrainVecMatrix {
    my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

    open(my $vecFH, '>', $TrainFeatureVec)
        or die "SVMHeaderParse: here1...could not open TrainFeatureVec $TrainFeatureVec to write\n";
    my ($matrixFH, $tagFH);
    if ($GenMatrix) {
        open($matrixFH, '>', $TrainMatrixF)
            or die "SVMHeaderParse: here2...could not open TrainMatrF: $TrainMatrixF to write\n";
        open($tagFH, '>', $TrainTagInd)
            or die "SVMHeaderParse: here3...could not open TrainTagInd: $TrainTagInd to write\n";
    }

    my $TmpTrainLineNo = 0;
    foreach my $s (sort { $a <=> $b } keys %{$TrainFeatureVecH}) {
        foreach my $li (sort { $a <=> $b } keys %{$$TrainFeatureVecH{$s}}) {
            $TmpTrainLineNo++;

            # Multi-class: emit every tag of this line inside "( ... ) ".
            print {$vecFH} "(";
            foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
                print {$vecFH} "$tmpCurState ";
                print {$tagFH} "$tmpCurState " if $tagFH;
            }
            print {$vecFH} ") ";
            print {$tagFH} "\n" if $tagFH;

            if (($GenMatrix == 0) && ($norm == 1) && ($center == 1)) {
                # Normalize AND center over the FULL dictionary so absent
                # features get an explicit centered value too.
                foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                     keys %{$FeatureDictH}) {
                    if (!exists $$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
                    }
                    # norm by the feature's max, then center by mean/max
                    my $featureVal = sprintf("%.8f",
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                    $featureVal -= sprintf("%.8f",
                        $$FeatureDictH{$feature}{mean} / $$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print {$vecFH} "\n";
            }
            elsif (($GenMatrix == 0) && ($norm == 1)) {
                # Normalize only, over the features present on this line.
                foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                     keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    next if !defined $$FeatureDictH{$feature}{ID};
                    my $featureVal = sprintf("%.8f",
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                    }
                }
                print {$vecFH} "\n";
            }
            elsif ($GenMatrix == 1) {
                # Emit vector entries plus sparse-matrix triples.
                foreach my $feature (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                                     keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
                    next if !defined $$FeatureDictH{$feature}{ID};
                    if ($norm == 1) {
                        my $featureVal = sprintf("%.8f",
                            $$TrainFeatureVecH{$s}{$li}{content}{$feature} / $$FeatureDictH{$feature}{max});
                        $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
                    }
                    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
                        print {$vecFH} "$$FeatureDictH{$feature}{ID}:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
                        # matrix row for the training samples: <line> <featureID> <value>
                        print {$matrixFH} " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
                    }
                }
                print {$vecFH} "\n";
            }    # end if
        }    # end foreach li (line)
        print {$vecFH} "<NEW_HEADER>\n";
        print {$tagFH} "<NEW_HEADER>\n" if $tagFH;
    }    # end foreach s (sample)

    close($vecFH);
    undef(%{$TrainFeatureVecH});    # release the training vector hash
    our $endTrain;
    $endTrain = 0;    # preserve original's reset of this package global
    if ($GenMatrix) {
        close($tagFH);
        close($matrixFH);
    }
}
|
298
|
+
|
299
|
+
|
300
|
+
|
301
|
+
# Scan $InFile (a vector file whose samples are separated by "<NEW_HEADER>"
# lines) and mark which counted lines sit on a sample border.
#
# Returns a hashref: line number => "N" (start of a sample: only has a Next
# neighbour) or "P" (end of a sample: only has a Previous neighbour).  Blank
# lines are skipped entirely and do not advance the line counter; the
# separator line itself IS counted.  The "N" entry projected past the final
# line is deleted before returning.
#
# Fixes vs. the original: 3-arg open on a lexical filehandle; misleading empty
# prototype () dropped (callers using &GetBorderLine(...) are unaffected).
sub GetBorderLine {
    my $InFile = shift;    # this file contains the sample separator
    my %BorderLineH;
    my $LineNO = 0;

    open(my $inFH, '<', $InFile)
        or die "SVMHeaderParse: could not open Infile: $InFile to read \n";
    while (my $li = <$inFH>) {
        $li =~ s/^\s+//g;
        $li =~ s/\s+$//g;
        next if $li =~ /^\s*$/;    # blank lines are not counted

        $LineNO++;
        if ($LineNO == 1) {
            $BorderLineH{$LineNO} = "N";    # very first line only has a next line
        }
        elsif ($li =~ /^<NEW_HEADER>/) {
            # The separator's neighbours are the borders of two samples.
            $BorderLineH{$LineNO - 1} = "P";
            $BorderLineH{$LineNO + 1} = "N";
        }
    }
    close($inFH);
    delete $BorderLineH{$LineNO + 1};    # drop the entry projected past the last line

    return (\%BorderLineH);
}
|
325
|
+
|
326
|
+
#all relevant domain databases are imported as shown at the beginning of this
|
327
|
+
#program
|
328
|
+
#useful for OfflineSeparateMultiClassLine.pl esp. for printing
|
329
|
+
#all relevant domain databases are imported as shown at the beginning of this
#program
#useful for OfflineSeparateMultiClassLine.pl esp. for printing
#
# Represent one text line as an SVM feature-vector string.
#
# Positional args:
#   $label        - class label written at the head of the returned vector
#   $line         - raw input line to featurize
#   $FeatureDictH - hashref feature => {ID => int, max => num, ...}; only
#                   features present in this dictionary enter the vector
#   $FiletoPrint  - if non-empty, the vector is also written (">", truncating)
#                   to this path
#
# Returns a string "label id:value id:value ... " with feature IDs emitted in
# dictionary-ID order.
#
# NOTE(review): relies on package globals populated elsewhere in this module:
# $stem, %degreeH, %pubnumH, %noteH, %affiH, %conjH, %prepH, %postcodeH,
# %abstractH, %keywordH, %introH, %phoneH, %monthH, %addrH, %cityH, %stateH,
# %countryH, %nameH, %dictH, plus the helpers &AddrMatch and &PSTEM::stem.
sub LineFeatureRepre2() {
    my $label = shift;
    my $line = shift;
    my $FeatureDictH = shift;
    my $FiletoPrint = shift;

    # Hard-wired featurization switches (not caller-controllable).
    my $neutral = 1;
    my $neutralAddName = 0;    # declared but never read below
    my $norm = 1;

    my %TestFeatureVecH = (); #very important

    #some of these features might not work for single word case such as
    #senLen, so might just take this factor out for word case
    #########categorical features################
    my $senLen = 0;
    my $dateNum = 0;
    my $DictWordNum = 0;
    my $NonDictWordNum = 0;
    my $Cap1DictWordNum = 0;
    my $Cap1NonDictWordNum = 0;
    my $digitNum = 0;
    my $others = 0;
    my $affiNum = 0;
    my $addrNum = 0; # let city, state, country all counted as the addr
    # for word case, we might need more specific recognition
    my $capNum = 0;
    my $introNum = 0;
    my $phoneNum = 0;
    my $degreeNum = 0;
    my $pubNum = 0;
    my $noteNum = 0;
    my $pageNum = 0;
    ###

    my $TokenLine;
    if (length($line) > 1) {
        ($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
        #transformed features
    }else {
        $TokenLine = $line;
    }

    my @words = split(/\s+/, $TokenLine);
    #now start the AddrNameConfu, shared among address and people's name
    #normally do not use this representation

    # First pass: collapse page markers into the :page: token.
    for my $i(0 .. $#words) {
        if ($words[$i] =~ /\+PAGE\+/) {
            $words[$i] = ":page:";
            $pageNum++;
        }
    } # end with for each word

    #match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)
    if (($neutral) && (length($line) > 1)) {
        for my $i(1 .. $#words) {
            my $pre = lc($words[$i-1]);
            my $now = lc($words[$i]);
            my $prestem;
            my $nowstem;
            my $degreeMatch;
            my $pubnumMatch;
            my $noteMatch;
            my $affiMatch;

            if ($stem) {
                $prestem = &PSTEM::stem($pre);
                $nowstem = &PSTEM::stem($now);
                $degreeMatch = $degreeH{lc("$prestem $nowstem")};
                $pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
                $noteMatch = $noteH{lc("$prestem $nowstem")};
                $affiMatch = $affiH{lc("$prestem $nowstem")};
            }else { # for bigram match, we do not request both to be capitalized
                $degreeMatch = $degreeH{lc("$pre $now")};
                $pubnumMatch = $pubnumH{lc("$pre $now")};
                $noteMatch = $noteH{lc("$pre $now")};
                $affiMatch = $affiH{lc("$pre $now")};
            }

            # Skip when the previous word is empty or already a :token:.
            if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }

            # One indicator slot per category: 1=degree 2=pubnum 3=note 4=affi.
            my %Confuse4BiGram = (
                1 => 0,
                2 => 0,
                3 => 0,
                4 => 0
            );
            my $match = 0;
            if ($degreeMatch) {
                $Confuse4BiGram{1} = 1;
                $match = 1;
            }
            if ($pubnumMatch) {
                $Confuse4BiGram{2} = 1;
                $match = 1;
            }
            if ($noteMatch) {
                $Confuse4BiGram{3} = 1;
                $match = 1;
            }

            if ($affiMatch) {
                $Confuse4BiGram{4} = 1;
                $match = 1;
            }

            if ($match == 0) { next; }

            # Build ":Confuse4BiGramXXXX:"; an unambiguous (single-1) pattern
            # is then collapsed to the plain category token below.
            $words[$i] = "\:Confuse4BiGram";
            foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
                $words[$i] .= "$Confuse4BiGram{$ind}";
            }
            $words[$i] .= "\:";

            if ($words[$i] eq "\:Confuse4BiGram1000\:") {
                $words[$i-1] = "";
                $words[$i] = ":degree:";
                $degreeNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
                $words[$i-1] = "";
                $words[$i] = ":pubnum:";
                $pubNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
                $words[$i-1] = "";
                $words[$i] = ":note:";
                $noteNum++;
            }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
                $words[$i-1] = "";
                $words[$i] = ":affi:";
                $affiNum++;
            }
        }
    }#end with neutral bigram

    # single words match on Pubnum, notes and degree!
    for my $i(0 .. $#words) {
        # Only untokenized, non-punctuation words are classified.
        if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
            if ($neutral) {
                # Same indicator scheme as the bigram pass, per single word.
                my %Confuse4Single = (
                    1 => 0,
                    2 => 0,
                    3 => 0,
                    4 => 0
                );
                my $match = 0;
                my $degreeMatch;
                my $pubnumMatch;
                my $noteMatch;
                my $affiMatch;
                my $stemword;

                if ($stem) {
                    # FIXME(review): $stemword is still undef at this point, so
                    # this stems undef rather than the current word, and the
                    # lookups below use that degenerate key.  Presumably this
                    # was meant to be &PSTEM::stem(lc($words[$i])) — confirm
                    # before changing, as the trained models may depend on it.
                    $stemword = &PSTEM::stem($stemword);
                    $degreeMatch = $degreeH{$stemword};
                    $pubnumMatch = $pubnumH{$stemword};
                    $noteMatch = $noteH{$stemword};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
                }else {
                    $degreeMatch = $degreeH{lc($words[$i])};
                    $pubnumMatch = $pubnumH{lc($words[$i])};
                    $noteMatch = $noteH{lc($words[$i])};
                    $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
                }

                #because hhan@cse.psu.edu will become hhan.psu.edu after stemming
                #and $stemword is lower case
                if ($degreeMatch) {
                    $Confuse4Single{1} = 1;
                    $match = 1;
                }
                if ($pubnumMatch) {
                    $Confuse4Single{2} = 1;
                    $match = 1;
                }
                if ($noteMatch) {
                    $Confuse4Single{3} = 1;
                    $match = 1;
                }
                if ($affiMatch) {
                    $Confuse4Single{4} = 1;
                    $match = 1;
                }

                if ($match) {
                    $words[$i] = "\:Confuse4Single";
                    foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
                        $words[$i] .= "$Confuse4Single{$ind}";
                    }
                    $words[$i] .= "\:";
                    # Unambiguous single-category hits collapse to plain tokens.
                    if ($words[$i] eq "\:Confuse4Single1000\:") {
                        $words[$i] = ":degree:";
                        $degreeNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
                        $words[$i] = ":pubnum:";
                        $pubNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
                        $words[$i] = ":note:";
                        $noteNum++;
                    }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
                        $words[$i] = ":affi:";
                        $affiNum++;
                    }
                }
            }# end with neutral

            # Still untokenized: classify by word shape and the domain lexicons.
            if ($words[$i] !~ /\:\w+\:/) {
                if (exists($conjH{$words[$i]})) {
                    $words[$i] = ":conj:";
                }elsif (exists($prepH{$words[$i]})) {
                    $words[$i] = ":prep:";
                }elsif ($words[$i] =~ /\@/) {
                    $words[$i] = "\:Email\:";
                }elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
                    $words[$i] = "\:http\:";
                }elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
                    if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
                        $words[$i] = ":SingleCap:"; #like M
                        $capNum ++; # actually only the number of single cap
                    }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
                        $words[$i] = ":postcode:";
                    }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
                        $words[$i] = ":abstract:";
                    }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
                        $words[$i] = ":keyword:";
                    }elsif ($introH{lc($words[$i])}) {
                        $words[$i] = ":intro:";
                        $introNum++;
                    }elsif ($phoneH{lc($words[$i])}) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif ($monthH{lc($words[$i])}) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }else {
                        if ($neutral) {
                            if ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) { #If not neutral class
                                $words[$i] = ":city:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ($dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                # NOTE(review): `for $i` (no `my`) — foreach
                                # implicitly localizes $i, so the outer loop
                                # variable is not clobbered.
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }else {
                            # Non-neutral variant of the same chain: the
                            # category lexicons are consulted directly.
                            if ($degreeH{lc($words[$i])}) {
                                $words[$i] = ":degree:";
                                $degreeNum++;
                            }elsif ($pubnumH{lc($words[$i])}) {
                                $words[$i] = ":pubnum:";
                                $pubNum++;
                            }elsif ($noteH{lc($words[$i])}) {
                                $words[$i] = ":note:";
                                $noteNum++;
                            }elsif ($monthH{lc($words[$i])}) {
                                $words[$i] = ":month:";
                                $dateNum++;
                            }elsif ($affiH{lc($words[$i])}) {
                                $words[$i] = ":affi:";
                                $affiNum++;
                            }elsif ($addrH{lc($words[$i])}) {
                                $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($cityH{lc($words[$i])}) { #If not neutral class
                                $words[$i] = ":city:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($stateH{lc($words[$i])}) {
                                $words[$i] = ":state:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($countryH{lc($words[$i])}) {
                                $words[$i] = ":country:";
                                # $words[$i] = ":addr:";
                                $addrNum++;
                            }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
                                $words[$i] = ":MayName:";
                                $Cap1NonDictWordNum ++;
                            }elsif ( $dictH{lc($words[$i])}) {
                                $words[$i] = ":Cap1DictWord:";
                                $Cap1DictWordNum ++;
                            }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
                                my @Parts = split(/\W+|\-/, $words[$i]);
                                # NOTE(review): same implicit localization of $i
                                # as in the neutral branch above.
                                for $i(0 .. $#Parts) {
                                    if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:LowerWord"."$len"."\:";
                                        $Parts[$i] = "\:LowerWords\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:CapWord"."$len"."\:";
                                        $Parts[$i] = "\:CapWords\:";
                                    }elsif ($Parts[$i] =~ /^\d+$/) {
                                        my $len = length($Parts[$i]);
                                        # $Parts[$i] = "\:Dig\[$len\]\:";
                                        $Parts[$i] = "\:Digs\:";
                                    }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
                                        $Parts[$i] = "\:MixCaseWords\:";
                                    }else {
                                        my $len = length($Parts[$i]);
                                        $Parts[$i] = "\:Mix\[$len\]\:";
                                    }
                                }
                                $words[$i] = join("\-", @Parts);
                            }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
                                my $len = length($words[$i]);
                                $words[$i] = "\:CapWord"."$len"."\:";
                                # $words[$i] = "\:CapWords\:";
                            }else {
                                $words[$i] = ":Cap1NonDictWord:";
                                $Cap1NonDictWordNum ++;
                            }
                        }
                    }#end with neutral
                }elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
                    if (exists ($phoneH{$words[$i]})) {
                        $words[$i] = ":phone:";
                        $phoneNum++;
                    }elsif (exists ($monthH{lc($words[$i])})) {
                        $words[$i] = ":month:";
                        $dateNum++;
                    }elsif ($keywordH{lc($words[$i])}) {
                        $words[$i] = ":keyword:";
                    }elsif (exists $dictH{lc($words[$i])}) {
                        $words[$i] = ":DictWord:";
                        $DictWordNum ++;
                    }else {# should consider the mixure of digit and letters
                        $words[$i] = ":NonDictWord:";
                        $NonDictWordNum ++;
                    }
                }elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
                    # Replace each digit run with " :Dig[len]: " (spaces included).
                    my $newword = $words[$i];
                    while ($words[$i] =~ /(\d+)/g) {
                        my $dig = $1;
                        my $diglen = length($dig);
                        $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
                    }
                    $words[$i] = $newword;
                    $digitNum++;
                }elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
                    # Consume the remainder run-by-run: caps, lowers, digits,
                    # or a single literal character at a time.
                    my $nonword = $1;
                    my $rest = $2;
                    $words[$i] = $nonword;
                    while (length($rest) > 0) {
                        if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
                        }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:LowerWords\:".length($tmp);
                        }elsif ($rest =~ /^(\d+)(.*)$/) {
                            my $tmp = $1;
                            $rest = $2;
                            $words[$i] .= "\:Digs\:".length($tmp);
                        }else { #get the head character
                            my $restLen = length($rest);
                            $restLen--;
                            $words[$i] .= substr($rest, 0, 1);
                            $rest = substr($rest, 1, $restLen);
                        }
                    }
                }else {
                    $others++;
                }
            }
        }else {
            # print " already token or punctuation\: $words[$i] \n";
        }
    }

    # Count unigram features that exist in the dictionary.
    for my $i(0 .. $#words) {
        if (exists ($$FeatureDictH{$words[$i]}{ID})) {
            $TestFeatureVecH{$words[$i]}++;
        }
    }

    # here we add in the bigrams
    if (length($line) > 1) {
        for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
            my $pre = $words[$i-1];
            my $now = $words[$i];
            # add bigram into dict and train or test vector
            if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
                $TestFeatureVecH{"$pre $now"}++;
            }
        } # end with bigram features
    }

    # try to normalize using F1
    # Continuous "C*" features: per-word ratios of the category counters.
    $TestFeatureVecH{CsenLen} = $senLen;
    if ($senLen > 0) {
        $TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
        $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
        $TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
        $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
        $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
        $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
        $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
        $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
        $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
        $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
        $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
        $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
        $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
        $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
        $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
        $TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
        #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
    }else {
        #print "null line\: $line \n";
    }

    if ($FiletoPrint ne "") {
        open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: here4...could not open $FiletoPrint to write\n";
        print PFH "$label ";
    }

    # Assemble "label id:value ..." in dictionary-ID order, normalizing each
    # value by its dictionary max (when non-zero).
    my $SVMFeaVec = "$label "; #this is a string
    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
        if ($TestFeatureVecH{$feature} != 0){
            if ($norm) {
                if ($$FeatureDictH{$feature}{max} != 0) {
                    # print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
                    my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
                    $TestFeatureVecH{$feature} = $tmpval;
                    #print " $TestFeatureVecH{$feature} \n";
                }else {
                    #print "zero max\: $feature \n";
                }
            }
            if ($FiletoPrint ne "") {
                print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
            }
            $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
        }else {
            #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
        }
    }
    if ($FiletoPrint ne "") {
        print PFH "\n";
        close(PFH);
    }

    # Computed but unused: alternate return values are commented out below.
    my $convertedStr = join(" ", @words);
    #return(\%TestFeatureVecH);
    return($SVMFeaVec);
    #return($convertedStr);
}
|
821
|
+
|
822
|
+
|
823
|
+
sub LineFeatureRepre() {
|
824
|
+
my $line = shift;
|
825
|
+
my $neutral = 1;
|
826
|
+
my $neutralAddName = 0;
|
827
|
+
my $norm = 1;
|
828
|
+
my %TestFeatureVecH = (); #very important
|
829
|
+
|
830
|
+
#some of these features might not work for single word case such as
|
831
|
+
#senLen, so might just take this factor out for word case
|
832
|
+
#########categorical features################
|
833
|
+
my $senLen = 0;
|
834
|
+
my $dateNum = 0;
|
835
|
+
my $DictWordNum = 0;
|
836
|
+
my $NonDictWordNum = 0;
|
837
|
+
my $Cap1DictWordNum = 0;
|
838
|
+
my $Cap1NonDictWordNum = 0;
|
839
|
+
my $digitNum = 0;
|
840
|
+
my $others = 0;
|
841
|
+
my $affiNum = 0;
|
842
|
+
my $addrNum = 0; # let city, state, country all counted as the addr
|
843
|
+
# for word case, we might need more specific recognition
|
844
|
+
my $capNum = 0;
|
845
|
+
my $introNum = 0;
|
846
|
+
my $phoneNum = 0;
|
847
|
+
my $degreeNum = 0;
|
848
|
+
my $pubNum = 0;
|
849
|
+
my $noteNum = 0;
|
850
|
+
my $pageNum = 0;
|
851
|
+
###
|
852
|
+
|
853
|
+
my $TokenLine;
|
854
|
+
if (length($line) > 1) {
|
855
|
+
($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
|
856
|
+
#transformed features
|
857
|
+
}else {
|
858
|
+
$TokenLine = $line;
|
859
|
+
}
|
860
|
+
my @words = split(/\s+/, $TokenLine);
|
861
|
+
#now start the AddrNameConfu, shared among address and people's name
|
862
|
+
#normally do not use this representation
|
863
|
+
|
864
|
+
for my $i(0 .. $#words) {
|
865
|
+
if ($words[$i] =~ /\+PAGE\+/) {
|
866
|
+
$words[$i] = ":page:";
|
867
|
+
$pageNum++;
|
868
|
+
}
|
869
|
+
} # end with for each word
|
870
|
+
|
871
|
+
#match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)
|
872
|
+
if (($neutral) && (length($line) > 1)) {
|
873
|
+
for my $i(1 .. $#words) {
|
874
|
+
my $pre = lc($words[$i-1]);
|
875
|
+
my $now = lc($words[$i]);
|
876
|
+
my $prestem;
|
877
|
+
my $nowstem;
|
878
|
+
my $degreeMatch;
|
879
|
+
my $pubnumMatch;
|
880
|
+
my $noteMatch;
|
881
|
+
my $affiMatch;
|
882
|
+
|
883
|
+
if ($stem) {
|
884
|
+
$prestem = &PSTEM::stem($pre);
|
885
|
+
$nowstem = &PSTEM::stem($now);
|
886
|
+
$degreeMatch = $degreeH{lc("$prestem $nowstem")};
|
887
|
+
$pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
|
888
|
+
$noteMatch = $noteH{lc("$prestem $nowstem")};
|
889
|
+
$affiMatch = $affiH{lc("$prestem $nowstem")};
|
890
|
+
}else { # for bigram match, we do not request both to be capitalized
|
891
|
+
$degreeMatch = $degreeH{lc("$pre $now")};
|
892
|
+
$pubnumMatch = $pubnumH{lc("$pre $now")};
|
893
|
+
$noteMatch = $noteH{lc("$pre $now")};
|
894
|
+
$affiMatch = $affiH{lc("$pre $now")};
|
895
|
+
}
|
896
|
+
|
897
|
+
|
898
|
+
if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }
|
899
|
+
|
900
|
+
my %Confuse4BiGram = (
|
901
|
+
1 => 0,
|
902
|
+
2 => 0,
|
903
|
+
3 => 0,
|
904
|
+
4 => 0
|
905
|
+
);
|
906
|
+
my $match = 0;
|
907
|
+
if ($degreeMatch) {
|
908
|
+
$Confuse4BiGram{1} = 1;
|
909
|
+
$match = 1;
|
910
|
+
}
|
911
|
+
if ($pubnumMatch) {
|
912
|
+
$Confuse4BiGram{2} = 1;
|
913
|
+
$match = 1;
|
914
|
+
}
|
915
|
+
if ($noteMatch) {
|
916
|
+
$Confuse4BiGram{3} = 1;
|
917
|
+
$match = 1;
|
918
|
+
}
|
919
|
+
|
920
|
+
if ($affiMatch) {
|
921
|
+
$Confuse4BiGram{4} = 1;
|
922
|
+
$match = 1;
|
923
|
+
}
|
924
|
+
|
925
|
+
if ($match == 0) { next; }
|
926
|
+
|
927
|
+
$words[$i] = "\:Confuse4BiGram";
|
928
|
+
foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
|
929
|
+
$words[$i] .= "$Confuse4BiGram{$ind}";
|
930
|
+
}
|
931
|
+
$words[$i] .= "\:";
|
932
|
+
|
933
|
+
if ($words[$i] eq "\:Confuse4BiGram1000\:") {
|
934
|
+
$words[$i-1] = "";
|
935
|
+
$words[$i] = ":degree:";
|
936
|
+
$degreeNum++;
|
937
|
+
}elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
|
938
|
+
$words[$i-1] = "";
|
939
|
+
$words[$i] = ":pubnum:";
|
940
|
+
$pubNum++;
|
941
|
+
}elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
|
942
|
+
$words[$i-1] = "";
|
943
|
+
$words[$i] = ":note:";
|
944
|
+
$noteNum++;
|
945
|
+
}elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
|
946
|
+
$words[$i-1] = "";
|
947
|
+
$words[$i] = ":affi:";
|
948
|
+
$affiNum++;
|
949
|
+
}
|
950
|
+
}
|
951
|
+
}#end with neutral bigram
|
952
|
+
|
953
|
+
# single words match on Pubnum, notes and degree!
|
954
|
+
for my $i(0 .. $#words) {
|
955
|
+
if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
|
956
|
+
if ($neutral) {
|
957
|
+
my %Confuse4Single = (
|
958
|
+
1 => 0,
|
959
|
+
2 => 0,
|
960
|
+
3 => 0,
|
961
|
+
4 => 0
|
962
|
+
);
|
963
|
+
my $match = 0;
|
964
|
+
my $degreeMatch;
|
965
|
+
my $pubnumMatch;
|
966
|
+
my $noteMatch;
|
967
|
+
my $affiMatch;
|
968
|
+
my $stemword;
|
969
|
+
|
970
|
+
if ($stem) {
|
971
|
+
$stemword = &PSTEM::stem($stemword);
|
972
|
+
$degreeMatch = $degreeH{$stemword};
|
973
|
+
$pubnumMatch = $pubnumH{$stemword};
|
974
|
+
$noteMatch = $noteH{$stemword};
|
975
|
+
$affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
|
976
|
+
}else {
|
977
|
+
$degreeMatch = $degreeH{lc($words[$i])};
|
978
|
+
$pubnumMatch = $pubnumH{lc($words[$i])};
|
979
|
+
$noteMatch = $noteH{lc($words[$i])};
|
980
|
+
$affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
|
981
|
+
}
|
982
|
+
|
983
|
+
#because hhan@cse.psu.edu will become hhan.psu.edu after stemming
|
984
|
+
#and $stemword is lower case
|
985
|
+
if ($degreeMatch) {
|
986
|
+
$Confuse4Single{1} = 1;
|
987
|
+
$match = 1;
|
988
|
+
}
|
989
|
+
if ($pubnumMatch) {
|
990
|
+
$Confuse4Single{2} = 1;
|
991
|
+
$match = 1;
|
992
|
+
}
|
993
|
+
if ($noteMatch) {
|
994
|
+
$Confuse4Single{3} = 1;
|
995
|
+
$match = 1;
|
996
|
+
}
|
997
|
+
if ($affiMatch) {
|
998
|
+
$Confuse4Single{4} = 1;
|
999
|
+
$match = 1;
|
1000
|
+
}
|
1001
|
+
|
1002
|
+
if ($match) {
|
1003
|
+
$words[$i] = "\:Confuse4Single";
|
1004
|
+
foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
|
1005
|
+
$words[$i] .= "$Confuse4Single{$ind}";
|
1006
|
+
}
|
1007
|
+
$words[$i] .= "\:";
|
1008
|
+
if ($words[$i] eq "\:Confuse4Single1000\:") {
|
1009
|
+
$words[$i] = ":degree:";
|
1010
|
+
$degreeNum++;
|
1011
|
+
}elsif ($words[$i] eq "\:Confuse4Single0100\:") {
|
1012
|
+
$words[$i] = ":pubnum:";
|
1013
|
+
$pubNum++;
|
1014
|
+
}elsif ($words[$i] eq "\:Confuse4Single0010\:") {
|
1015
|
+
$words[$i] = ":note:";
|
1016
|
+
$noteNum++;
|
1017
|
+
}elsif ($words[$i] eq "\:Confuse4Single0001\:") {
|
1018
|
+
$words[$i] = ":affi:";
|
1019
|
+
$affiNum++;
|
1020
|
+
}
|
1021
|
+
}
|
1022
|
+
}# end with neutral
|
1023
|
+
|
1024
|
+
if ($words[$i] !~ /\:\w+\:/) {
|
1025
|
+
if (exists($conjH{$words[$i]})) {
|
1026
|
+
$words[$i] = ":conj:";
|
1027
|
+
}elsif (exists($prepH{$words[$i]})) {
|
1028
|
+
$words[$i] = ":prep:";
|
1029
|
+
}elsif ($words[$i] =~ /\@/) {
|
1030
|
+
$words[$i] = "\:Email\:";
|
1031
|
+
}elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
|
1032
|
+
$words[$i] = "\:http\:";
|
1033
|
+
}elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
|
1034
|
+
if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
|
1035
|
+
$words[$i] = ":SingleCap:"; #like M
|
1036
|
+
$capNum ++; # actually only the number of single cap
|
1037
|
+
}elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
|
1038
|
+
$words[$i] = ":postcode:";
|
1039
|
+
}elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
|
1040
|
+
$words[$i] = ":abstract:";
|
1041
|
+
}elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
|
1042
|
+
$words[$i] = ":keyword:";
|
1043
|
+
}elsif ($introH{lc($words[$i])}) {
|
1044
|
+
$words[$i] = ":intro:";
|
1045
|
+
$introNum++;
|
1046
|
+
}elsif ($phoneH{lc($words[$i])}) {
|
1047
|
+
$words[$i] = ":phone:";
|
1048
|
+
$phoneNum++;
|
1049
|
+
}elsif ($monthH{lc($words[$i])}) {
|
1050
|
+
$words[$i] = ":month:";
|
1051
|
+
$dateNum++;
|
1052
|
+
}else {
|
1053
|
+
if ($neutral) {
|
1054
|
+
if ($addrH{lc($words[$i])}) {
|
1055
|
+
$words[$i] = ":addr:";
|
1056
|
+
$addrNum++;
|
1057
|
+
}elsif ($cityH{lc($words[$i])}) { #If not neutral class
|
1058
|
+
$words[$i] = ":city:";
|
1059
|
+
$addrNum++;
|
1060
|
+
}elsif ($stateH{lc($words[$i])}) {
|
1061
|
+
$words[$i] = ":state:";
|
1062
|
+
$addrNum++;
|
1063
|
+
}elsif ($countryH{lc($words[$i])}) {
|
1064
|
+
$words[$i] = ":country:";
|
1065
|
+
$addrNum++;
|
1066
|
+
}elsif ($nameH{lc($words[$i])}) { # end with not neutral class
|
1067
|
+
$words[$i] = ":MayName:";
|
1068
|
+
$Cap1NonDictWordNum ++;
|
1069
|
+
}elsif ($dictH{lc($words[$i])}) {
|
1070
|
+
$words[$i] = ":Cap1DictWord:";
|
1071
|
+
$Cap1DictWordNum ++;
|
1072
|
+
}elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
|
1073
|
+
my @Parts = split(/\W+|\-/, $words[$i]);
|
1074
|
+
for $i(0 .. $#Parts) {
|
1075
|
+
if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
|
1076
|
+
my $len = length($Parts[$i]);
|
1077
|
+
# $Parts[$i] = "\:LowerWord"."$len"."\:";
|
1078
|
+
$Parts[$i] = "\:LowerWords\:";
|
1079
|
+
}elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
|
1080
|
+
my $len = length($Parts[$i]);
|
1081
|
+
# $Parts[$i] = "\:CapWord"."$len"."\:";
|
1082
|
+
$Parts[$i] = "\:CapWords\:";
|
1083
|
+
}elsif ($Parts[$i] =~ /^\d+$/) {
|
1084
|
+
my $len = length($Parts[$i]);
|
1085
|
+
# $Parts[$i] = "\:Dig\[$len\]\:";
|
1086
|
+
$Parts[$i] = "\:Digs\:";
|
1087
|
+
}elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
|
1088
|
+
$Parts[$i] = "\:MixCaseWords\:";
|
1089
|
+
}else {
|
1090
|
+
my $len = length($Parts[$i]);
|
1091
|
+
$Parts[$i] = "\:Mix\[$len\]\:";
|
1092
|
+
}
|
1093
|
+
}
|
1094
|
+
$words[$i] = join("\-", @Parts);
|
1095
|
+
}elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
|
1096
|
+
my $len = length($words[$i]);
|
1097
|
+
$words[$i] = "\:CapWord"."$len"."\:";
|
1098
|
+
# $words[$i] = "\:CapWords\:";
|
1099
|
+
}else {
|
1100
|
+
$words[$i] = ":Cap1NonDictWord:";
|
1101
|
+
$Cap1NonDictWordNum ++;
|
1102
|
+
}
|
1103
|
+
}else {#end with neutral
|
1104
|
+
|
1105
|
+
if ($degreeH{lc($words[$i])}) {
|
1106
|
+
$words[$i] = ":degree:";
|
1107
|
+
$degreeNum++;
|
1108
|
+
}elsif ($pubnumH{lc($words[$i])}) {
|
1109
|
+
$words[$i] = ":pubnum:";
|
1110
|
+
$pubNum++;
|
1111
|
+
}elsif ($noteH{lc($words[$i])}) {
|
1112
|
+
$words[$i] = ":note:";
|
1113
|
+
$noteNum++;
|
1114
|
+
}elsif ($monthH{lc($words[$i])}) {
|
1115
|
+
$words[$i] = ":month:";
|
1116
|
+
$dateNum++;
|
1117
|
+
}elsif ($affiH{lc($words[$i])}) {
|
1118
|
+
$words[$i] = ":affi:";
|
1119
|
+
$affiNum++;
|
1120
|
+
}elsif ($addrH{lc($words[$i])}) {
|
1121
|
+
$words[$i] = ":addr:";
|
1122
|
+
$addrNum++;
|
1123
|
+
}elsif ($cityH{lc($words[$i])}) { #If not neutral class
|
1124
|
+
$words[$i] = ":city:";
|
1125
|
+
# $words[$i] = ":addr:";
|
1126
|
+
$addrNum++;
|
1127
|
+
}elsif ($stateH{lc($words[$i])}) {
|
1128
|
+
$words[$i] = ":state:";
|
1129
|
+
# $words[$i] = ":addr:";
|
1130
|
+
$addrNum++;
|
1131
|
+
}elsif ($countryH{lc($words[$i])}) {
|
1132
|
+
$words[$i] = ":country:";
|
1133
|
+
# $words[$i] = ":addr:";
|
1134
|
+
$addrNum++;
|
1135
|
+
}elsif ($nameH{lc($words[$i])}) { # end with not neutral class
|
1136
|
+
$words[$i] = ":MayName:";
|
1137
|
+
$Cap1NonDictWordNum ++;
|
1138
|
+
}elsif ( $dictH{lc($words[$i])}) {
|
1139
|
+
$words[$i] = ":Cap1DictWord:";
|
1140
|
+
$Cap1DictWordNum ++;
|
1141
|
+
}elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
|
1142
|
+
my @Parts = split(/\W+|\-/, $words[$i]);
|
1143
|
+
for $i(0 .. $#Parts) {
|
1144
|
+
if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
|
1145
|
+
my $len = length($Parts[$i]);
|
1146
|
+
# $Parts[$i] = "\:LowerWord"."$len"."\:";
|
1147
|
+
$Parts[$i] = "\:LowerWords\:";
|
1148
|
+
}elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
|
1149
|
+
my $len = length($Parts[$i]);
|
1150
|
+
# $Parts[$i] = "\:CapWord"."$len"."\:";
|
1151
|
+
$Parts[$i] = "\:CapWords\:";
|
1152
|
+
}elsif ($Parts[$i] =~ /^\d+$/) {
|
1153
|
+
my $len = length($Parts[$i]);
|
1154
|
+
# $Parts[$i] = "\:Dig\[$len\]\:";
|
1155
|
+
$Parts[$i] = "\:Digs\:";
|
1156
|
+
}elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
|
1157
|
+
$Parts[$i] = "\:MixCaseWords\:";
|
1158
|
+
}else {
|
1159
|
+
my $len = length($Parts[$i]);
|
1160
|
+
$Parts[$i] = "\:Mix\[$len\]\:";
|
1161
|
+
}
|
1162
|
+
}
|
1163
|
+
$words[$i] = join("\-", @Parts);
|
1164
|
+
}elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
|
1165
|
+
my $len = length($words[$i]);
|
1166
|
+
$words[$i] = "\:CapWord"."$len"."\:";
|
1167
|
+
# $words[$i] = "\:CapWords\:";
|
1168
|
+
}else {
|
1169
|
+
$words[$i] = ":Cap1NonDictWord:";
|
1170
|
+
$Cap1NonDictWordNum ++;
|
1171
|
+
}
|
1172
|
+
}
|
1173
|
+
}#end with else neutral
|
1174
|
+
}elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
|
1175
|
+
if (exists ($phoneH{$words[$i]})) {
|
1176
|
+
$words[$i] = ":phone:";
|
1177
|
+
$phoneNum++;
|
1178
|
+
}elsif (exists ($monthH{lc($words[$i])})) {
|
1179
|
+
$words[$i] = ":month:";
|
1180
|
+
$dateNum++;
|
1181
|
+
}elsif ($keywordH{lc($words[$i])}) {
|
1182
|
+
$words[$i] = ":keyword:";
|
1183
|
+
}elsif (exists $dictH{lc($words[$i])}) {
|
1184
|
+
$words[$i] = ":DictWord:";
|
1185
|
+
$DictWordNum ++;
|
1186
|
+
}else {# should consider the mixure of digit and letters
|
1187
|
+
$words[$i] = ":NonDictWord:";
|
1188
|
+
$NonDictWordNum ++;
|
1189
|
+
}
|
1190
|
+
}elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
|
1191
|
+
my $newword = $words[$i];
|
1192
|
+
while ($words[$i] =~ /(\d+)/g) {
|
1193
|
+
my $dig = $1;
|
1194
|
+
my $diglen = length($dig);
|
1195
|
+
$newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
|
1196
|
+
}
|
1197
|
+
$words[$i] = $newword;
|
1198
|
+
$digitNum++;
|
1199
|
+
}elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
|
1200
|
+
my $nonword = $1;
|
1201
|
+
my $rest = $2;
|
1202
|
+
$words[$i] = $nonword;
|
1203
|
+
while (length($rest) > 0) {
|
1204
|
+
if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
|
1205
|
+
my $tmp = $1;
|
1206
|
+
$rest = $2;
|
1207
|
+
$words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
|
1208
|
+
}elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
|
1209
|
+
my $tmp = $1;
|
1210
|
+
$rest = $2;
|
1211
|
+
$words[$i] .= "\:LowerWords\:".length($tmp);
|
1212
|
+
}elsif ($rest =~ /^(\d+)(.*)$/) {
|
1213
|
+
my $tmp = $1;
|
1214
|
+
$rest = $2;
|
1215
|
+
$words[$i] .= "\:Digs\:".length($tmp);
|
1216
|
+
}else { #get the head character
|
1217
|
+
my $restLen = length($rest);
|
1218
|
+
$restLen--;
|
1219
|
+
$words[$i] .= substr($rest, 0, 1);
|
1220
|
+
$rest = substr($rest, 1, $restLen);
|
1221
|
+
}
|
1222
|
+
}
|
1223
|
+
}else {
|
1224
|
+
$others++;
|
1225
|
+
}
|
1226
|
+
}
|
1227
|
+
}else {
|
1228
|
+
# print " already token or punctuation\: $words[$i] \n";
|
1229
|
+
}
|
1230
|
+
}
|
1231
|
+
|
1232
|
+
for my $i(0 .. $#words) {
|
1233
|
+
# if (exists ($$FeatureDictH{$words[$i]}{ID})) {
|
1234
|
+
$TestFeatureVecH{$words[$i]}++;
|
1235
|
+
# }
|
1236
|
+
}
|
1237
|
+
|
1238
|
+
# here we add in the bigrams
|
1239
|
+
if (length($line) > 1) {
|
1240
|
+
for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
|
1241
|
+
my $pre = $words[$i-1];
|
1242
|
+
my $now = $words[$i];
|
1243
|
+
# add bigram into dict and train or test vector
|
1244
|
+
# if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
|
1245
|
+
$TestFeatureVecH{"$pre $now"}++;
|
1246
|
+
# }
|
1247
|
+
} # end with bigram features
|
1248
|
+
}
|
1249
|
+
|
1250
|
+
# try to normalize using F1
|
1251
|
+
$TestFeatureVecH{CsenLen} = $senLen;
|
1252
|
+
if ($senLen > 0) {
|
1253
|
+
$TestFeatureVecH{CdateNumPer} = sprintf("%.8f", $dateNum/$senLen);
|
1254
|
+
$TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
|
1255
|
+
$TestFeatureVecH{CNonDictWordNumPer} = sprintf("%.8f", $NonDictWordNum/$senLen);
|
1256
|
+
$TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
|
1257
|
+
$TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
|
1258
|
+
$TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
|
1259
|
+
$TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
|
1260
|
+
$TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
|
1261
|
+
$TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
|
1262
|
+
$TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
|
1263
|
+
$TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
|
1264
|
+
$TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
|
1265
|
+
$TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
|
1266
|
+
$TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
|
1267
|
+
$TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
|
1268
|
+
$TestFeatureVecH{CothersPer} = sprintf("%.8f", $others/$senLen);
|
1269
|
+
#$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
|
1270
|
+
}else {
|
1271
|
+
#print "null line\: $line \n";
|
1272
|
+
}
|
1273
|
+
|
1274
|
+
if ($FiletoPrint ne "") {
|
1275
|
+
open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: could not open $FiletoPrint to write: $!";
|
1276
|
+
print PFH "$label ";
|
1277
|
+
}
|
1278
|
+
|
1279
|
+
if (0) {
|
1280
|
+
my $SVMFeaVec = ""; #this is a string
|
1281
|
+
foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
|
1282
|
+
if ($TestFeatureVecH{$feature} != 0){
|
1283
|
+
if ($norm) {
|
1284
|
+
if ($$FeatureDictH{$feature}{max} != 0) {
|
1285
|
+
# print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
|
1286
|
+
my $tmpval = sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
|
1287
|
+
$TestFeatureVecH{$feature} = $tmpval;
|
1288
|
+
#print " $TestFeatureVecH{$feature} \n";
|
1289
|
+
}else {
|
1290
|
+
#print "zero max\: $feature \n";
|
1291
|
+
}
|
1292
|
+
}
|
1293
|
+
if ($FiletoPrint ne "") {
|
1294
|
+
print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
|
1295
|
+
}
|
1296
|
+
$SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
|
1297
|
+
}else {
|
1298
|
+
#print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
|
1299
|
+
}
|
1300
|
+
}
|
1301
|
+
}
|
1302
|
+
|
1303
|
+
|
1304
|
+
if ($FiletoPrint ne "") {
|
1305
|
+
print PFH "\n";
|
1306
|
+
close(PFH);
|
1307
|
+
}
|
1308
|
+
|
1309
|
+
my $convertedStr = join(" ", @words);
|
1310
|
+
return(\%TestFeatureVecH);
|
1311
|
+
# return($SVMFeaVec);
|
1312
|
+
#return($convertedStr);
|
1313
|
+
}
|
1314
|
+
|
1315
|
+
|
1316
|
+
# Map a line to its word-level feature representation.
# Currently a stub: the body was never implemented, so it always returns
# a reference to an empty feature list.
#
# Args:    $line - input text line; $dict - feature dictionary (hash ref).
# Returns: array ref of features (always empty at present).
#
# FIX(review): the original `sub WordFeatureRepre()` carried an empty
# prototype, which declares a zero-argument sub yet two arguments are
# shifted; the prototype is dropped (calls via &WordFeatureRepre bypassed
# it anyway, so this is backward compatible).
sub WordFeatureRepre {
    my $line = shift;
    my $dict = shift;
    my @FeatureLine = ();

    return(\@FeatureLine);
}
|
1324
|
+
|
1325
|
+
#Given a line, make the space explicit
|
1326
|
+
# Given a line, make the inter-word spacing explicit by rewriting each
# whitespace run as a " <space> " token, while leaving <<sep>>...<</sep>>
# marker spans untouched (this variant recognizes <<sep>>, not <sep>).
#
# Args:    $content - input line, possibly containing <<sep>> spans.
# Returns: ($punc, $content) where $punc is 1 when the line appears to
#          contain punctuation (or the word "and"), else 0, and $content
#          is the rewritten line.
#
# FIX(review): dropped the empty `()` prototype that contradicted the one
# shifted argument, and removed the unused local $lineNO.
sub FillSpace { #recognize <<sep>>, instead of <sep>
    my $content = shift;

    # Trim whitespace that touches the separator markers.
    $content =~ s/\s+<<sep>>/<<sep>>/g;
    $content =~ s/<<\/sep>>\s+/<<\/sep>>/g;

    my $punc = 0; # space is the only separator
    # NOTE(review): the bracketed part is a single character class, so the
    # embedded (<<sep>>) pieces act as literal characters, not groups; kept
    # byte-identical to preserve the historical matching behavior.
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<<sep>>)(<<\/sep>>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # The capturing parens keep the <<sep>> spans themselves in @Seq.
    my @Seq = split(/(<<sep>>[^\<\>]*<<\/sep>>)/, $content);
    for my $i (0 .. $#Seq) {
        if ($Seq[$i] =~ /<<sep>>/) {
            # separator span: left untouched
        }else {
            # Plain text: trim around markers, make every whitespace run an
            # explicit <space> token, then drop the tokens around punctuation.
            $Seq[$i] =~ s/\s+<<sep>>/<<sep>>/g;
            $Seq[$i] =~ s/<<\/sep>>\s+/<<\/sep>>/g;
            $Seq[$i] =~ s/\s+/ \<space\> /g;
            $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
        }
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
|
1357
|
+
|
1358
|
+
|
1359
|
+
#Given a line, make the space explicit
|
1360
|
+
# Offline twin of FillSpace: identical logic but recognizes <sep>...</sep>
# markers instead of <<sep>>...<</sep>>.
#
# Args:    $content - input line, possibly containing <sep> spans.
# Returns: ($punc, $content); $punc is 1 when punctuation (or "and") is
#          detected, else 0; $content has whitespace made explicit as
#          " <space> " tokens outside the <sep> spans.
#
# FIX(review): dropped the empty `()` prototype that contradicted the one
# shifted argument, and removed the unused local $lineNO.
sub OfflineFillSpace { #recognize <sep>
    my $content = shift;

    # Trim whitespace that touches the separator markers.
    $content =~ s/\s+<sep>/<sep>/g;
    $content =~ s/<\/sep>\s+/<\/sep>/g;

    my $punc = 0; # space is the only separator
    # NOTE(review): character class with embedded literal (<sep>) characters,
    # kept byte-identical to preserve historical behavior.
    if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<sep>)(<\/sep>)])|(\W+and\W+)/ig) {
        $punc = 1; #contains punctuation
    }

    # The capturing parens keep the <sep> spans themselves in @Seq.
    my @Seq = split(/(<sep>[^\<\>]*<\/sep>)/, $content);
    for my $i (0 .. $#Seq) {
        if ($Seq[$i] =~ /<sep>/) {
            # separator span: left untouched
        }else {
            # Plain text: trim around markers, make whitespace explicit,
            # then drop the <space> tokens around punctuation.
            $Seq[$i] =~ s/\s+<sep>/<sep>/g;
            $Seq[$i] =~ s/<\/sep>\s+/<\/sep>/g;
            $Seq[$i] =~ s/\s+/ \<space\> /g;
            $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
        }
    }

    $content = join(" ", @Seq);
    return($punc, $content);
}
|
1391
|
+
|
1392
|
+
|
1393
|
+
# Archived (backup) version of SeparatePunc, kept for reference.
# Trims the line, pads punctuation with spaces, detaches a trailing dot
# from words of three or more characters, and squeezes the FIRST
# whitespace run only.
#
# KNOWN DIFFERENCE from SeparatePunc: the dot-detaching substitution does
# not capture the word, so the word is lost ("Dept. of" -> " . of");
# SeparatePunc fixed this by capturing $1.  Behavior is preserved here
# because this is an archived variant.
#
# FIX(review): only the empty `()` prototype was dropped (it contradicted
# the shifted argument); everything else is byte-identical.
sub SeparatePunc0108bak {
    my $line = shift;

    #added 12/16 -- trim leading/trailing whitespace
    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # Pad punctuation (anything not letter/space/+/-/digit/<>/dot) with spaces.
    $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    # Detach a trailing dot from a >=3-char word (drops the word -- see above).
    $line =~ s/[\w+]{3,}(\.)\s+/ $1 /g;
    # NOTE(review): no /g, so only the first whitespace run is squeezed;
    # kept as-is to preserve the archived behavior.
    $line =~ s/\s+/ /;

    return($line);
}
|
1406
|
+
|
1407
|
+
# Separate punctuation from words in a line: trims the line, pads
# punctuation with spaces, detaches a trailing dot from words of three or
# more characters ("Dept. of" -> "Dept . of"), and normalizes whitespace.
#
# Args:    $line - input text line.
# Returns: the rewritten line.
#
# FIX(review): the final squeeze was `s/\s+/ /` without /g, so only the
# FIRST whitespace run was collapsed even though the padding substitutions
# above deliberately insert multi-space runs; /g added.  The empty `()`
# prototype (contradicting the shifted argument) was also dropped.
sub SeparatePunc {
    my $line = shift;

    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # Pad punctuation (anything not letter/space/+/-/digit/<>/dot) with spaces.
    $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
    $line =~ s/([\w+]{3,})(\.)\s+/$1 $2 /g; #"Dept. of" becomes "Dept . of"
    #How about blah, blah, ... blah. And ....
    #Dr. Smith will be keep the small dot
    #Sep. will keep the small dot as well.
    #But how about removing every dot, including Dr. and Sep. ?

    # $line =~ s/\W+$//g; #remove last punctuation
    $line =~ s/\s+/ /g;

    return($line);
}
|
1425
|
+
|
1426
|
+
|
1427
|
+
# Heuristic check of whether an extracted author-name string is "weird"
# (i.e. probably not a real person's name), and normalization of the name.
#
# Args:    $str - candidate author string.
# Returns: ($weired, $str) where $weired is 1 when the string looks like a
#          non-name (institution word, too many tokens, too many lowercase
#          or malformed tokens, or only single letters), else 0; $str is
#          the cleaned name with each kept word ucfirst-lowercased.
#
# NOTE(review): %weired_words lists the 'Submitted' key twice (harmless --
# the later entry overwrites the earlier).  The keys are joined unescaped
# into an unanchored, case-SENSITIVE alternation matched against $str, so
# e.g. "Paper" flags but "paper" does not -- presumably intentional; confirm.
sub weired_author(){
    my $str = shift;

    my $weired = 0;
    # Substrings whose presence marks the string as a non-name.
    my %weired_words = (
        'Departamento' =>1,
        'IN PRESS'=>1,
        'PRESS'=>1,
        'Center'=>1,
        'Ltd' =>1,
        'Universidad'=>1,
        'chair' =>1,
        'Submitted'=>1,
        'pp'=>1,
        'Version'=>1,
        'Thesis' =>1,
        'Proposal' =>1,
        'University'=>1,
        'Universiteit'=>1,
        'Institut'=>1,
        'extended'=>1,
        'abstract'=>1,
        'Laboratoire'=>1,
        'COVER PAGE'=>1,
        'COVER'=>1,
        'Page' => 1,
        'Job Title'=>1,
        'Job'=>1,
        'Title'=>1,
        'Case Study'=>1,
        'Case Sludy'=>1,
        'Case'=>1,
        'Report'=>1,
        'Reply'=>1,
        'A Report'=>1,
        'A Reply'=>1,
        'Research'=>1,
        'Paper'=>1,
        'Research Paper'=>1,
        'Research Project'=>1,
        'Project'=>1,
        'Retrospective'=>1,
        'Roadmap'=>1,
        'Tutorial'=>1,
        'WORKING PAPER'=>1,
        'Working' =>1,
        'White Paper'=>1,
        'in honor of'=>1,
        'international' =>1,
        'Dataset' =>1,
        'Sample' =>1,
        'Network'=>1,
        'Networks'=>1,
        'Academiae'=>1,
        'company'=>1,
        'Submitted'=>1,
    );

    # Filler tokens dropped from the name (compared case-insensitively).
    my %filter_words = (
        'honor'=>1,
        'ed'=>1,
        'eds'=>1,
        'jr'=>1,
        'jr\.'=>1,
        'authors'=>1,
        'author' =>1,
        'editor'=>1,
        'editors'=>1,
        'with'=>1,
        'by'=>1,
    );

    #if separate authors into individuals.
    ## $str =~ s/^\s*[^\p{IsLower}\p{IsUpper}\d\-\.]//g;
    # $str =~ s/[^\p{IsLower}\p{IsUpper}\d\-\.]\s*$//g;

    my @weired_words_arr = keys %weired_words;
    my $weired_words_str = join("|", @weired_words_arr);

    #print "\n\nbefore: $str\n";
    # Normalize: space after dots, strip digits, strip leading/trailing
    # non-word runs, collapse whitespace.
    $str =~ s/\./\. /g;
    $str =~ s/\d+//g;
    $str =~ s/^\s*\W+//g;
    $str =~ s/\W+\s*$//g;
    $str =~ s/\s+/ /g;
    $str = &str_space_clean($str);
    #print "after: $str \n";

    my @words = split(/\s+/, $str);
    my $lcase_num = 0;          # count of all-lowercase tokens
    my $weired_form = 0;        # count of tokens with odd characters
    my @new_name = ();
    my $pure_single_letter = 1; # stays 1 only if every token is 1 char or "X."
    for my $i(0 .. $#words) {
        if ( (length($words[$i]) > 1) && ($words[$i] !~ /^\w\.$/)) {
            $pure_single_letter = 0;
        }
        if ($filter_words{lc($words[$i])} || ($words[$i] !~ /\w/)) {
            next;   # drop filler words and pure punctuation tokens
        }else {
            if ($words[$i] =~ /^[\p{IsLower}\-]+$/) {
                $lcase_num++;
            }elsif ($words[$i] =~ /[^\p{IsLower}\p{IsUpper}\-\.]/) {
                $weired_form++;
            }
            #make the first letter capitalized
            $words[$i] = ucfirst(lc($words[$i]));
            push @new_name, $words[$i];
        }
    }
    # Flag as weird: only single letters, an institution word present,
    # more than 5 tokens, fewer than 2 kept tokens, fewer than 2 normal
    # tokens, or more than 2 lowercase tokens.
    if (($pure_single_letter) || ($str =~ /$weired_words_str/) || ($#words > 4) || ($#new_name <1) || (($#words +1 - $weired_form) < 2) || ($lcase_num>2)) {
        $weired = 1;
    }
    #print "weired:? $weired \n";
    $str = join(' ', @new_name);
    #print "final str $str\n";

    return($weired, $str);
}
|
1546
|
+
|
1547
|
+
|
1548
|
+
|
1549
|
+
#turn array into hash map { $hash_name{$_} =$some_value } @array_name;
|
1550
|
+
# Load the stopword list from "$Database_Dir/stopwords" into a hash.
# Each line is whitespace-cleaned and used as a key (values are counts).
#
# Returns: hash ref mapping stopword => occurrence count.
#
# FIX(review): replaced the bareword global filehandle and 2-arg open with
# a lexical handle and 3-arg open (2-arg open is mode-injectable).
sub hash_stopwords {
    my $stopword = "$Database_Dir/stopwords";
    my %stopH = ();
    open(my $fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$fh>) {
        $line = &str_space_clean($line);
        $stopH{$line}++;
    }
    close($fh);
    return(\%stopH);
}
|
1561
|
+
|
1562
|
+
# Load affiliation stopwords from "$Database_Dir/affi.txt" into a hash.
# Lines are whitespace-cleaned, any leading numeric id is stripped, and
# the lowercased remainder becomes the key (values are counts).
#
# Returns: hash ref mapping lc(term) => occurrence count.
#
# FIX(review): replaced the bareword global filehandle and 2-arg open with
# a lexical handle and 3-arg open.
sub hash_affi_stopwords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/affi.txt";
    my %stopH = ();
    open(my $fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;   # strip a leading numeric id
        $stopH{lc($line)}++;
    }
    close($fh);
    return(\%stopH);
}
|
1575
|
+
|
1576
|
+
# Load the nickname table from "$Database_Dir/nickname.txt".
# Each line holds a canonical name followed by its nicknames, separated by
# "<>" or commas; builds a two-level hash keyed by the lowercased
# canonical name, then by each lowercased nickname.
#
# Returns: hash ref of { lc(name) => { lc(nickname) => 1, ... } }.
#
# FIX(review): replaced the bareword global filehandle and 2-arg open with
# a lexical handle and 3-arg open.
sub hash_nickname {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/nickname.txt";
    my %stopH = ();
    open(my $fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$fh>) {
        $line = &str_space_clean($line);
        my @names = split(/<>|\s*\,\s*/, $line);
        for my $i(1 .. $#names) {
            $stopH{lc($names[0])}{lc($names[$i])} = 1;
        }
    }
    close($fh);
    return(\%stopH);
}
|
1591
|
+
|
1592
|
+
# Load US state names from "$Database_Dir/statename.txt".
# Each line is "StateName, ABBR"; builds a hash from abbreviation to full
# state name (note: keys are the abbreviations, case preserved).
#
# Returns: hash ref of { abbr => state_name }.
#
# FIX(review): replaced the bareword global filehandle and 2-arg open with
# a lexical handle and 3-arg open.
sub hash_statewords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/statename.txt";
    my %stopH = ();
    open(my $fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$fh>) {
        $line = &str_space_clean($line);
        my ($state, $abbr) = split(/\s*\,\s*/, $line);
        $stopH{$abbr} = $state;
    }
    close($fh);
    return(\%stopH);
}
|
1605
|
+
|
1606
|
+
|
1607
|
+
# Load address keywords from "$Database_Dir/addr.txt" into a hash.
# Lines are whitespace-cleaned, any leading numeric id is stripped, and
# the lowercased remainder becomes the key (values are counts).
#
# Returns: hash ref mapping lc(term) => occurrence count.
#
# FIX(review): replaced the bareword global filehandle and 2-arg open with
# a lexical handle and 3-arg open.
sub hash_addrwords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/addr.txt";
    my %stopH = ();
    open(my $fh, '<', $stopword) || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <$fh>) {
        $line = &str_space_clean($line);
        $line =~ s/^\d+\s+//g;   # strip a leading numeric id
        $stopH{lc($line)}++;
    }
    close($fh);
    return(\%stopH);
}
|
1620
|
+
|
1621
|
+
# Normalize whitespace in a string: collapse internal runs to one space
# and strip leading/trailing whitespace.
#
# Args:    $str - input string (undef is treated as the empty string).
# Returns: the cleaned string.
#
# FIX(review): dropped the empty `()` prototype (the sub shifts one
# argument; calls via &str_space_clean bypassed it anyway) and added a
# guard so an undef argument no longer triggers uninitialized warnings.
sub str_space_clean {
    my $str = shift;
    $str = '' unless defined $str;

    $str =~ s/\s+/ /g;
    $str =~ s/^\s+//g;
    $str =~ s/\s+$//g;
    return($str);
}
|
1629
|
+
|
1630
|
+
# Serialize a hash ref to a file using Storable's portable binary format.
#
# Args: $H - hash ref to freeze; $F - output file path.
#
# FIX(review): dropped the empty `()` prototype; replaced the bareword
# global filehandle and 2-arg open with a lexical handle and 3-arg open;
# added binmode since nfreeze output is binary (no-op on Unix, prevents
# CRLF corruption elsewhere); print the frozen string directly instead of
# via an interpolating "$mystring".
sub nfreeze_hash_to_file {
    my $H = shift;
    my $F = shift;

    my $mystring = nfreeze($H);
    open(my $fh, '>', $F) || die "SVMHeaderParse: could not open $F to write: $!";
    binmode($fh);
    print $fh $mystring;
    close($fh) || die "SVMHeaderParse: could not close $F: $!";
}
|
1639
|
+
|
1640
|
+
# Serialize a hash ref to a file as Data::Dumper text ($VAR1 = {...};),
# readable back via read_hash_from_file.
#
# Args: $H - hash ref to dump; $F - output file path.
#
# FIX(review): $d and $mystring were undeclared package globals -- now
# lexical; dropped the empty `()` prototype; replaced the bareword global
# filehandle and 2-arg open with a lexical handle and 3-arg open.
sub dump_hash_to_file {
    my $H = shift;
    my $F = shift;

    my $d = Data::Dumper->new([$H]);
    my $mystring = $d->Dump;
    open(my $fh, '>', $F) || die "SVMHeaderParse: could not open $F to write: $!";
    print $fh $mystring;
    close($fh) || die "SVMHeaderParse: could not close $F: $!";
}
|
1650
|
+
|
1651
|
+
|
1652
|
+
# Read a Data::Dumper text file (as written by dump_hash_to_file) and
# rebuild the data structure by evaluating its contents.
#
# Args:    $file - path to a file containing `$VAR1 = ...;` Dumper output.
# Returns: the reconstructed structure (hash ref), or undef with a warning
#          if the eval fails.
#
# SECURITY NOTE: this string-evals file contents; only use on files this
# application wrote itself, never on untrusted input.
#
# FIX(review): $/ was clobbered globally (undef then reset to "\n") -- now
# properly localized; the result is taken from eval's return value instead
# of the leaked package global $VAR1; bareword FH / 2-arg open replaced;
# eval failure is no longer silently ignored; empty prototype dropped.
sub read_hash_from_file {
    my $file = shift;

    my $string = do {
        local $/;   # slurp mode; automatically restored on scope exit
        open(my $fh, '<', $file) || die "SVMHeaderParse: could not open $file to read. \n";
        my $s = <$fh>;
        close($fh);
        $s;
    };

    # Dumper output is `$VAR1 = ...;` -- eval returns the assigned value.
    my $data = eval($string);
    if ($@) {
        warn "SVMHeaderParse: failed to eval contents of $file: $@";
        return(undef);
    }
    return($data);
}
|
1664
|
+
|
1665
|
+
# Read a Storable-frozen file (as written by nfreeze_hash_to_file) and
# thaw it back into the original data structure.
#
# Args:    $file - path to a file containing nfreeze output.
# Returns: the reconstructed structure (hash ref).
#
# FIX(review): $/ was clobbered globally -- now localized; added binmode
# since frozen data is binary; bareword FH / 2-arg open replaced with a
# lexical handle and 3-arg open; empty prototype dropped.
sub thaw_hash_from_file {
    my $file = shift;

    my $string = do {
        local $/;   # slurp mode; automatically restored on scope exit
        open(my $fh, '<', $file) || die "SVMHeaderParse: could not open $file to read. \n";
        binmode($fh);
        my $s = <$fh>;
        close($fh);
        $s;
    };

    my $VAR1 = thaw($string);
    return($VAR1);
}
|
1677
|
+
|
1678
|
+
# Randomly split @$samples into two parts.  The first int($#$samples*$ratio)
# draws remove a random element into @part1 (mutating @$samples); every
# remaining iteration copies a random survivor into @part2.
#
# Args:    $samples - array ref (mutated); $ratio - fraction for part 1.
# Returns: (\@part1, \@part2).
#
# NOTE(review): $total_num is the last INDEX, not the count, and @part2 is
# sampled without removal, so it may contain duplicates -- both quirks are
# preserved; callers may depend on the historical split sizes.
#
# FIX(review): $t, $j and $r were undeclared package globals -- now
# lexical; empty `()` prototype dropped.
sub rand_split_samples_to2parts {
    my $samples = shift; #array ref
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 0; $j--) {
        my $r = int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            # delete the chosen sample by shifting the tail left
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }else {
            push @part2, $$samples[$r];
        }
    }
    return(\@part1, \@part2);
}
|
1704
|
+
|
1705
|
+
# Variant of rand_split_samples_to2parts: removes int($#$samples*$ratio)
# random elements from @$samples into @part1 and returns the (mutated)
# remainder as the second part, avoiding v1's duplicate sampling.
#
# Args:    $samples - array ref (mutated); $ratio - fraction for part 1.
# Returns: (\@part1, $samples) -- the second element is the same ref the
#          caller passed in, now shortened.
#
# FIX(review): $t, $j and $r were undeclared package globals -- now
# lexical; empty `()` prototype dropped.  Loop bounds and size arithmetic
# (based on the last index, not the count) preserved.
sub rand_split_samples_to2parts_v2 {
    my $samples = shift; #array ref
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 0; $j--) {
        my $r = int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            # delete the chosen sample by shifting the tail left
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }
    }
    return(\@part1, $samples);
}
|
1729
|
+
|
1730
|
+
# Randomly partition @$samples into $fold consecutive buckets of size
# int($#$samples/$fold + 1), removing elements from @$samples as they are
# assigned.  Buckets are stored at @data indices 1..$fold ($data[0] is
# unused).
#
# NOTE(review): the loop runs for $j = last-index down to 1, so it assigns
# one element FEWER than the array size (one sample is left behind in
# @$samples).  Preserved as-is: fixing it would change bucket sizes that
# downstream code may depend on -- TODO confirm with callers.
#
# FIX(review): $t, $j and $r were undeclared package globals -- now
# lexical; empty `()` prototype dropped.
sub rand_split_samples_toNparts {
    my $samples = shift; #array ref
    my $fold = shift;

    my $total_num = $#$samples;
    my $unit = int($total_num/$fold +1);
    my $last_fold = $total_num - $unit*($fold-1);

    my @data = ();
    my $t = time;
    srand($t); #seed
    for (my $j = $total_num; $j >= 1; $j--) {
        my $r = int(rand($j));
        my $subfold = int(($total_num - $j)/$unit) + 1;
        push @{$data[$subfold]}, $$samples[$r];
        # delete the chosen sample by shifting the tail left
        for my $k($r .. $#$samples-1) {
            $$samples[$k] = $$samples[$k+1];
        }
        pop @$samples;
    }
    return(@data);
}
|
1753
|
+
|
1754
|
+
# Randomly partition the samples of a keyed hash into $fold sublists of
# labeled snippet records.
#
# Args:
#   $sample_hash - hash ref: name => {
#                    label => "i1<>i2<>..." (positive snippet indices, or -1),
#                    name  => { "<name>__<num>" => { snippet => ... } } }
#   $fold        - number of partitions
# Returns: @data, where $data[1] .. $data[$fold] hold array refs of strings
#          formatted "<+1|-1><>file_name<>snippet".
#
# NOTE(review): as in rand_split_samples_toNparts, the loop stops at
# $j >= 1, so one hash key is never assigned to any fold - preserved.
sub rand_split_hash_index_toNparts {
    my $sample_hash = shift;    # hash ref (see above)
    my $fold        = shift;

    my @sample_arr = keys %{$sample_hash};
    my $total_num  = $#sample_arr;
    my $unit       = int($total_num / $fold + 1);    # keys per fold

    my @data = ();
    srand(time);                # seed
    for (my $j = $total_num; $j >= 1; $j--) {
        my $r = int(rand($j));
        my $subfold = int(($total_num - $j) / $unit) + 1;

        my $name = $sample_arr[$r];
        # Positive snippet indices are encoded in {label} as "i1<>i2<>...";
        # a label of -1 means no positives.
        my %pos = ();
        if ($$sample_hash{$name}{label} > -1) {
            my @tmp = split(/\<\>/, $$sample_hash{$name}{label});
            $pos{$_} = 1 for @tmp;
        }
        foreach my $file_name (keys %{ $$sample_hash{$name}{name} }) {
            # File names look like "<name>__<num>"; <num> selects the snippet.
            my ($tmp, $num) = split(/\_\_/, $file_name);
            my $label = "-1";
            if ($pos{$num}) {
                $label = "+1";
            }
            push @{ $data[$subfold] },
                "$label<>$file_name<>$$sample_hash{$name}{name}{$file_name}{snippet}";
        }

        # Delete the selected key by shifting the tail left.
        for my $k ($r .. $#sample_arr - 1) {
            $sample_arr[$k] = $sample_arr[$k + 1];
        }
        pop @sample_arr;
    }
    return (@data);
}
|
1792
|
+
|
1793
|
+
# Average accuracy / precision / recall over an N-fold SVM result file.
#
# Scans $in for lines of the form
#   "Accuracy on test set: NN.NN%"
#   "Precision/recall on test set: NN.NN%/NN.NN%"
# accumulates each metric, then prints the per-metric averages to STDERR.
#
# Args:    $in - path to the concatenated SVM output
# Returns: nothing meaningful (results are only printed to STDERR)
sub ExtractBinaryNfoldSVMResult {
    my $in      = shift;
    my %ResultH = ();

    open(my $in_fh, '<', $in)
        || die "SVMHeaderParse: could not open $in to read \n";
    while (my $line = <$in_fh>) {
        if ($line =~ /Accuracy on test set: (\d+\.\d+)\%/) {
            $ResultH{A}{count}++;
            $ResultH{A}{sum} += $1;
        }
        if ($line =~ /Precision\/recall on test set\: (.*)\%\/(.*)\%/) {
            my $P = $1;
            my $R = $2;
            # Only accumulate well-formed numbers (presumably the tool can
            # emit non-numeric fields for degenerate folds - TODO confirm).
            if ($P =~ /\d+\.\d+/) {
                $ResultH{P}{count}++;
                $ResultH{P}{sum} += $P;
            }
            if ($R =~ /\d+\.\d+/) {
                $ResultH{R}{count}++;
                $ResultH{R}{sum} += $R;
            }
        }
    }
    close($in_fh);

    print STDERR "average result from cross validation \n";
    # Keys are the letters A/P/R, so compare as strings; the original used a
    # numeric sort, which is meaningless for letters.
    foreach my $eval (sort keys %ResultH) {
        $ResultH{$eval}{avg} =
            sprintf("%.8f", $ResultH{$eval}{sum} / $ResultH{$eval}{count});
        print STDERR "evaluation($eval) -- $ResultH{$eval}{avg}\n";
    }
}
|
1824
|
+
|
1825
|
+
## get alias file
|
1826
|
+
# Build a hash of plausible written variations of a personal name
# (first/last name, initials, initial+last-name combinations, ...).
#
# Args:    $personalName - e.g. "Chris S. Mellish"
# Returns: hash ref mapping each variation string to its type tag
#          ("FN", "LN", "FILN", "FNLI", "all_initial", "FIMI1LN", ...)
sub GetNameVariations1 {
    my $personalName = shift;    # like _Chris_S._Mellish__1.txt

    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;

    my $FirstName = $QueryNameParts[0];
    my $LastName  = $QueryNameParts[$#QueryNameParts];

    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName}  = "LN";

    my $FI = substr($FirstName, 0, 1);
    my $LI = substr($LastName, 0, 1);

    my $FI_LI = $FI . $LI;
    my $FILN  = $FI . $LastName;
    my $FNLI  = $FirstName . $LI;
    $NameVariations{$FILN} = "FILN";
    $NameVariations{$FNLI} = "FNLI";

    # Strip punctuation from every part (e.g. "S." -> "S").
    for my $i (0 .. $#QueryNameParts) {
        $QueryNameParts[$i] =~ s/\W+//g;
    }

    # Depends on whether this name contains 2, 3 or 4 parts.
    if ($#QueryNameParts < 1) {
        # Single-token name: nothing more to combine.  (The original used
        # `next` here, which is an error outside a loop.)
        return (\%NameVariations);
    }
    if ($#QueryNameParts == 1) {
        $NameVariations{$FI_LI} = "all_initial";
    }
    else {
        $NameVariations{$FI_LI} = "FILI";
        # First name + first middle name.  (The original referenced the
        # never-assigned $FN here; $FirstName matches GetNameVariations.)
        $NameVariations{ $FirstName . $QueryNameParts[1] } = "FNMN";
        if ($#QueryNameParts == 2) {
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            $NameVariations{ $FI . $MI1 . $LI }       = "all_initial";
            $NameVariations{ $FI . $MI1 . $LastName } = "FIMI1LN";
        }
        elsif ($#QueryNameParts == 3) {
            # The original tested $#NameParts (a typo for $#QueryNameParts),
            # so this branch could never fire.
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            my $MI2 = substr($QueryNameParts[2], 0, 1);
            $NameVariations{ $FI . $MI1 . $MI2 . $LI }       = "all_initial";
            $NameVariations{ $FI . $MI1 . $LastName }        = "FIMI1LN";
            $NameVariations{ $FI . $MI1 . $MI2 . $LastName } = "FIMI1MI2LN";
        }
    }

    ## It will take a chance for this exact match.
    # NOTE(review): takes the first 5 chars of a last name already shorter
    # than 4 chars - i.e. the whole name; preserved from the original.
    if (length($QueryNameParts[$#QueryNameParts]) < 4) {
        my $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
        $NameVariations{$PartLastName} = "partial_LN";
    }
    return (\%NameVariations);
}
|
1886
|
+
|
1887
|
+
# Build a hash of plausible written variations of a personal name,
# including regex-style patterns (keys may contain literal \w*) and
# nickname aliases supplied by the caller.
#
# Args:
#   $personalName - e.g. "Chris S. Mellish"
#   $nickname     - hash ref: lc(first name) => { alias => ... }
# Returns: hash ref mapping each variation string/pattern to its type tag
#          ("FN", "LN", "FILN", "full_name", "all_initial", ...)
sub GetNameVariations {
    my $personalName = shift;    # like _Chris_S._Mellish__1.txt
    my $nickname     = shift;

    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;

    my $FirstName = $QueryNameParts[0];
    my $LastName  = $QueryNameParts[$#QueryNameParts];
    # (An earlier experiment truncating first/last names to 5 letters
    # decreased performance without substring matching and was disabled.)

    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName}  = "LN";
    # Nickname aliases count as first-name variations.
    foreach my $alias (keys %{ $$nickname{ lc($FirstName) } }) {
        $NameVariations{$alias} = "FN";
    }
    my $FI = substr($FirstName, 0, 1);
    my $LI = substr($LastName, 0, 1);

    my $FI_LI = $FI . $LI;
    my $FILN  = $FI . '\w*' . $LastName;    # pattern: initial, anything, last name
    my $FNLI  = $FirstName . $LI;
    my $LNFI  = $LastName . $FI;
    $NameVariations{$FILN} = "FILN";
    $NameVariations{ $FI . '.' . $LastName } = "FILN";
    $NameVariations{$FNLI} = "FNLI";
    $NameVariations{$LNFI} = "LNFI";

    # Strip punctuation and build the concatenated full name.
    my $AllName = '';
    for my $i (0 .. $#QueryNameParts) {
        $QueryNameParts[$i] =~ s/\W+//g;
        $AllName .= $QueryNameParts[$i];
    }
    $NameVariations{$AllName} = "full_name";
    $NameVariations{ '\w*' . $FirstName . '\w*' . $LastName . '\w*' } = "FNLN";
    $NameVariations{ $FirstName . '.' . $LastName } = "FNLN";
    $NameVariations{ $LastName . $FirstName } = "LNFN";

    # Depends on whether this name contains 2, 3 or 4 parts.
    if ($#QueryNameParts < 1) {
        # Single-token name: nothing more to combine.  (The original used
        # `next` here, which is an error outside a loop.)
        return (\%NameVariations);
    }
    elsif ($#QueryNameParts == 1) {
        $NameVariations{$FI_LI} = "all_initial";
    }
    else {
        $NameVariations{$FI_LI} = "FILI";
        $NameVariations{ $FirstName . $QueryNameParts[1] } = "FNMN";
        if ($#QueryNameParts == 2) {
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            $NameVariations{ $FI . $MI1 . $LI }       = "all_initial";
            $NameVariations{ $FI . $MI1 . $LastName } = "FIMI1LN";
        }
        elsif ($#QueryNameParts == 3) {
            my $MI1 = substr($QueryNameParts[1], 0, 1);
            my $MI2 = substr($QueryNameParts[2], 0, 1);
            $NameVariations{ $FI . $MI1 . $MI2 . $LI }        = "all_initial";
            $NameVariations{ $FI . $MI1 . $LastName }         = "FIMI1LN";
            $NameVariations{ $FI . $MI1 . $LI }               = "FIMI1LI";
            $NameVariations{ $FI . $MI2 . $LI }               = "FIMI2LI";
            $NameVariations{ $QueryNameParts[2] . $LastName } = "MI2LN";
            $NameVariations{ $FI . $MI1 . $MI2 . $LastName }  = "FIMI1MI2LN";
        }
    }

    ## It will take a chance for this exact match.
    # NOTE(review): takes the first 5 chars of a last name already shorter
    # than 4 chars - i.e. the whole name; preserved from the original.
    if (length($QueryNameParts[$#QueryNameParts]) < 4) {
        my $PartLastName = substr($QueryNameParts[$#QueryNameParts], 0, 5);
        $NameVariations{$PartLastName} = "partial_LN";
    }
    return (\%NameVariations);
}
|
1970
|
+
|
1971
|
+
# Parse the cached university list ($Database_Dir/university_list/univ-full.html)
# and write a simple "college<>url" line per entry to
# $Database_Dir/university_list.txt.
#
# Returns: ref to %H, which is never populated - callers always receive an
#   empty hash ref.  NOTE(review): presumably a leftover from an earlier
#   version; confirm before relying on the return value.
sub get_university_emails {
    my $univ          = "$Database_Dir/university_list/univ-full.html";
    my $simple_format = "$Database_Dir/university_list.txt";

    my %H = ();
    open(my $univ_fh, '<', $univ)
        || die "SVMHeaderParse: could not open $univ to read. \n";
    my @content = <$univ_fh>;
    close($univ_fh);

    open(my $simple_fh, '>', $simple_format)
        || die "SVMHeaderParse: could not open $simple_format to write: $!";
    for my $i (0 .. $#content) {
        # Match lines like: <LI> <A HREF="http://...">College Name</A>
        if ($content[$i] =~ /\<LI\>\s+\<A\s+HREF\=\"([^\"]*)\"\>(.*)\<\/A\>/) {
            my $url     = $1;
            my $college = $2;
            print $simple_fh "$college<>$url\n";
        }
    }
    close($simple_fh);
    return (\%H);
}
|
1991
|
+
|
1992
|
+
#input: an array of value
|
1993
|
+
# Mean and sample standard deviation of a list of numbers.
#
# Args:    $arr - array ref of numeric values
# Returns: ($mean, $std)
#   $mean - mean, formatted with sprintf "%.3f"
#   $std  - sample standard deviation (divides by n-1); 0 for a single
#           value, (0, 0) for an empty list (the original divided by zero
#           in both cases)
sub compute_std {
    my $arr  = shift;
    my $mean = 0;
    my $std  = 0;

    my $n = $#$arr + 1;
    return (0, 0) if $n == 0;    # guard: empty input

    # Mean.
    for my $i (0 .. $#$arr) {
        $mean += $$arr[$i];
    }
    $mean = sprintf("%.3f", $mean / $n);

    # Sum of squared deviations from the (rounded) mean.
    for my $i (0 .. $#$arr) {
        $std += ($$arr[$i] - $mean)**2;
    }
    # Sample variance divides by n-1; a single observation has zero spread.
    if ($n > 1) {
        my $temp = sprintf("%.8f", $std / ($n - 1));
        $std = sqrt($temp);
    }
    else {
        $std = 0;
    }

    return ($mean, $std);
}
|
2012
|
+
|
2013
|
+
|
2014
|
+
|
2015
|
+
|
2016
|
+
1;
|