biblicit 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/Gemfile +6 -0
- data/LICENSE.TXT +176 -0
- data/README.md +120 -0
- data/Rakefile +8 -0
- data/biblicit.gemspec +33 -0
- data/lib/biblicit/cb2bib.rb +83 -0
- data/lib/biblicit/citeseer.rb +53 -0
- data/lib/biblicit/extractor.rb +37 -0
- data/lib/biblicit.rb +6 -0
- data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
- data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
- data/perl/FileConversionService/README.TXT +11 -0
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
- data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
- data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
- data/perl/HeaderParseService/README.TXT +80 -0
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
- data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
- data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
- data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
- data/perl/HeaderParseService/resources/database/50states +60 -0
- data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
- data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
- data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
- data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
- data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
- data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
- data/perl/HeaderParseService/resources/database/README +2 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
- data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
- data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
- data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
- data/perl/HeaderParseService/resources/database/addr.txt +28 -0
- data/perl/HeaderParseService/resources/database/affi.txt +34 -0
- data/perl/HeaderParseService/resources/database/affis.bin +0 -0
- data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
- data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
- data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
- data/perl/HeaderParseService/resources/database/city.txt +3150 -0
- data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
- data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
- data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
- data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
- data/perl/HeaderParseService/resources/database/degree.txt +67 -0
- data/perl/HeaderParseService/resources/database/email.txt +3 -0
- data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
- data/perl/HeaderParseService/resources/database/female-names +4960 -0
- data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
- data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/intro.txt +2 -0
- data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
- data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
- data/perl/HeaderParseService/resources/database/male-names +3906 -0
- data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
- data/perl/HeaderParseService/resources/database/month.txt +35 -0
- data/perl/HeaderParseService/resources/database/mul +868 -0
- data/perl/HeaderParseService/resources/database/mul.label +869 -0
- data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
- data/perl/HeaderParseService/resources/database/mul.processed +762 -0
- data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
- data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
- data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
- data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
- data/perl/HeaderParseService/resources/database/note.txt +121 -0
- data/perl/HeaderParseService/resources/database/page.txt +1 -0
- data/perl/HeaderParseService/resources/database/phone.txt +9 -0
- data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
- data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
- data/perl/HeaderParseService/resources/database/statename.bin +0 -0
- data/perl/HeaderParseService/resources/database/statename.txt +73 -0
- data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
- data/perl/HeaderParseService/resources/database/stopwords +438 -0
- data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
- data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
- data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
- data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
- data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
- data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
- data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
- data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
- data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
- data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
- data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
- data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
- data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
- data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
- data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
- data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
- data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
- data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
- data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
- data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
- data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
- data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
- data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
- data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
- data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
- data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
- data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
- data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
- data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
- data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
- data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
- data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
- data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
- data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
- data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
- data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
- data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
- data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
- data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
- data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
- data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
- data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
- data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
- data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
- data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
- data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
- data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
- data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
- data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
- data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
- data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
- data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
- data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
- data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
- data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
- data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
- data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
- data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
- data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
- data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
- data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
- data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
- data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
- data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
- data/perl/HeaderParseService/resources/database/url.txt +1 -0
- data/perl/HeaderParseService/resources/database/webTopWords +225 -0
- data/perl/HeaderParseService/resources/database/words +45402 -0
- data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
- data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
- data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
- data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
- data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
- data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
- data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
- data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
- data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
- data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
- data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
- data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
- data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
- data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
- data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
- data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
- data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
- data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
- data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
- data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
- data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
- data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
- data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
- data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
- data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
- data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
- data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
- data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
- data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
- data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
- data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
- data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
- data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
- data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
- data/perl/ParsCit/README.TXT +82 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
- data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
- data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
- data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
- data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
- data/perl/extract.pl +199 -0
- data/spec/biblicit/cb2bib_spec.rb +48 -0
- data/spec/biblicit/citeseer_spec.rb +40 -0
- data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
- data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
- data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
- data/spec/spec_helper.rb +3 -0
- metadata +474 -0
|
@@ -0,0 +1,1880 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2007 Penn State University
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
#
|
|
13
|
+
package HeaderParse::API::ParserMethods;
|
|
14
|
+
|
|
15
|
+
#06/27/2003, start to make this program to handle real data. So there is no evaluation, and off line classifiers should be trained be trained beforehand.
|
|
16
|
+
#02/10/2004 Apply to citeseer data (with the same format of EbizSearch data)
|
|
17
|
+
|
|
18
|
+
use utf8;
|
|
19
|
+
use Data::Dumper;
|
|
20
|
+
use FindBin;
|
|
21
|
+
use HeaderParse::API::NamePatternMatch;
|
|
22
|
+
use HeaderParse::API::MultiClassChunking; #default to use all export by this module
|
|
23
|
+
use HeaderParse::API::LoadInformation;
|
|
24
|
+
use HeaderParse::Config::API_Config;
|
|
25
|
+
use HeaderParse::API::AssembleXMLMetadata;
|
|
26
|
+
use vars qw($debug %dictH %nameH %firstnameH %lastnameH %BasicFeatureDictH %InverseTagMap);
|
|
27
|
+
use vars qw($Classifier $offlineD $Tmp_Dir $nMinHeaderLength $nMaxHeaderLength);
|
|
28
|
+
use HeaderParse::API::Function qw(&AddrMatch &printDict &GenTrainVecMatrix &LineFeatureRepre &FillSpace &SeparatePunc);
|
|
29
|
+
|
|
30
|
+
my $FeatureDictH = \%BasicFeatureDictH;
|
|
31
|
+
my $ContextFeatureDictH;
|
|
32
|
+
my $SpaceAuthorFeatureDictH; #do not know if it is OK to define a hash
|
|
33
|
+
my $PuncAuthorFeatureDictH;
|
|
34
|
+
my $NameSpaceTrainVecH;
|
|
35
|
+
my $NameSpaceTrainF = "$offlineD"."NameSpaceTrainF";
|
|
36
|
+
my $SVMNameSpaceModel = "$offlineD"."NameSpaceModel";
|
|
37
|
+
my $TestH;
|
|
38
|
+
my $TrainH;
|
|
39
|
+
my $TotalHea = 0;
|
|
40
|
+
|
|
41
|
+
my $timestamp;
|
|
42
|
+
|
|
43
|
+
#my $offlineD = "../../offline/";
|
|
44
|
+
#my $TestOutF = "$TestF"."\.parsed";
|
|
45
|
+
#my $tmpCacheVecB = "$Tmp_Dir/tmpVec";
|
|
46
|
+
#my $SVMTmpResultB = "$Tmp_Dir/tmpresult";
|
|
47
|
+
|
|
48
|
+
my $FeatureDict = "$offlineD"."WrapperBaseFeaDict";
|
|
49
|
+
my $ContextFeatureDict = "$offlineD"."WrapperContextFeaDict";
|
|
50
|
+
my $SpaceAuthorFeatureDictF = "$offlineD"."WrapperSpaceAuthorFeaDict";
|
|
51
|
+
my $PuncAuthorFeatureDictF = "$offlineD"."WrapperPuncAuthorFeaDict";
|
|
52
|
+
|
|
53
|
+
my $linear = 1; # just want to be fast
|
|
54
|
+
|
|
55
|
+
my %evalH; # global hash to record classification result for baseline, each context round and IE
|
|
56
|
+
my $norm = 1;
|
|
57
|
+
my $testp = 1; # this is only to make the program run, no meaning.
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
my %TestDataIndex; #It indexes the header no in the testing dataset
|
|
61
|
+
|
|
62
|
+
#Read dictionary files
|
|
63
|
+
undef $/;
|
|
64
|
+
open(dumpFH, "$FeatureDict") || die "SVMHeaderParse: could not open $FeatureDict to read: $!";
|
|
65
|
+
my $string = <dumpFH>;
|
|
66
|
+
close(dumpFH);
|
|
67
|
+
eval $string;
|
|
68
|
+
$FeatureDictH = $VAR1;
|
|
69
|
+
$string ="";
|
|
70
|
+
|
|
71
|
+
open(dumpFH, "$ContextFeatureDict") || die "SVMHeaderParse: could not open $ContextFeatureDict to read: $!";
|
|
72
|
+
$string = <dumpFH>;
|
|
73
|
+
close(dumpFH);
|
|
74
|
+
eval $string;
|
|
75
|
+
$ContextFeatureDictH = $VAR1;
|
|
76
|
+
$string ="";
|
|
77
|
+
|
|
78
|
+
open(dumpFH, "$SpaceAuthorFeatureDictF") || die "SVMHeaderParse: could not open $SpaceAuthorFeatureDictF to read: $!";
|
|
79
|
+
$string = <dumpFH>;
|
|
80
|
+
close(dumpFH);
|
|
81
|
+
eval $string;
|
|
82
|
+
$SpaceAuthorFeatureDictH = $VAR1;
|
|
83
|
+
$string ="";
|
|
84
|
+
$/ = "\n";
|
|
85
|
+
#End read dictionary files
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
sub Parse{
|
|
89
|
+
my $header=shift;
|
|
90
|
+
$timestamp = shift;
|
|
91
|
+
my $success = 0;
|
|
92
|
+
# $tmpCacheVec = $tmpCacheVec . "\_$timestamp\_";
|
|
93
|
+
# $SVMTmpResult = $SVMTmpResult . "\_$timestamp\_";
|
|
94
|
+
my $tmpCacheVec = "$Tmp_Dir/tmpVec"."\_$timestamp\_";
|
|
95
|
+
|
|
96
|
+
my $SVMTmpResult = "$Tmp_Dir/tmpresult"."\_$timestamp\_";
|
|
97
|
+
$TestH = &HashEbizHeader(\$header);
|
|
98
|
+
$TestH = &VectorizeUnknownHeaderLine($TestH);
|
|
99
|
+
|
|
100
|
+
my $baseline = 1;
|
|
101
|
+
$TestH = &LineClassify($testp, "", $baseline, $FeatureDictH,
|
|
102
|
+
$TestH, $tmpCacheVec, $SVMTmpResult);
|
|
103
|
+
$TestH = &UpdatePretag($TestH);
|
|
104
|
+
|
|
105
|
+
my $maxLoop = 2;
|
|
106
|
+
for my $loop(1 .. $maxLoop) {
|
|
107
|
+
$baseline = 0;
|
|
108
|
+
my $NowContext = "context"."$loop";
|
|
109
|
+
|
|
110
|
+
$TestH = &LineClassify($testp, $NowContext, $baseline,
|
|
111
|
+
$ContextFeatureDictH, $TestH,
|
|
112
|
+
$tmpCacheVec, $SVMTmpResult);
|
|
113
|
+
$TestH = &UpdatePretag($TestH);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
#Phase 2: Extraction Information from Multi-Class Lines and Author Lines Chunks
|
|
117
|
+
my $LastContext = "context"."$maxLoop";
|
|
118
|
+
|
|
119
|
+
# BUG: InfoExtract hangs on some documents.
|
|
120
|
+
# this is reproducible with data extracted using TET from doc 654835
|
|
121
|
+
# from the legacy citeseer system.
|
|
122
|
+
eval {
|
|
123
|
+
local $SIG{'ALRM'} = sub { die "alarm\n"; };
|
|
124
|
+
alarm 15;
|
|
125
|
+
$TestH = &InfoExtract($testp, $TestH,$SpaceAuthorFeatureDictH, $PuncAuthorFeatureDictH, $SVMNameSpaceModel, $tmpCacheVec, $SVMTmpResult);
|
|
126
|
+
alarm 0;
|
|
127
|
+
};
|
|
128
|
+
if ($@) {
|
|
129
|
+
if ($@ eq "alarm\n") {
|
|
130
|
+
return 0;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
$rXML = &ExportRDF($TestH);
|
|
134
|
+
|
|
135
|
+
for my $i(1..15){
|
|
136
|
+
unlink "$Tmp_Dir/tmpVec\_$timestamp\_test$i";
|
|
137
|
+
unlink "$Tmp_Dir/tmpresult\_$timestamp\_$i";
|
|
138
|
+
}
|
|
139
|
+
return $rXML;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# This is the header extraction module from CiteSeer.
|
|
144
|
+
# Only the parts related to header extraction is used.
|
|
145
|
+
sub ExtractHeaderInformation {
|
|
146
|
+
my $papertext = shift;
|
|
147
|
+
my $header='';
|
|
148
|
+
|
|
149
|
+
if (!(length($$papertext))){
|
|
150
|
+
return ('Paper text is empty');
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
# $$papertext =~ s/<[SEFC][\d\.e\+\-]*>//sgi; # remove S|E|F|C tags
|
|
154
|
+
|
|
155
|
+
if ($$papertext =~ /^(.*?\b(?:Introduction|INTRODUCTION|Contents|CONTENTS)(?:.*?\n){6})/s) {
|
|
156
|
+
$header = $1;
|
|
157
|
+
} else {
|
|
158
|
+
my $nLines = 150;
|
|
159
|
+
my @lines = split '\n', $$papertext;
|
|
160
|
+
my $contentLines = 0;
|
|
161
|
+
for (my $i=0; $i<=$#lines; $i++) {
|
|
162
|
+
if ($lines[$i] !~ m/^\s*$/) {
|
|
163
|
+
$contentLines++;
|
|
164
|
+
}
|
|
165
|
+
$header .= $lines[$i]."\n";
|
|
166
|
+
if ($contentLines >= $nLines) {
|
|
167
|
+
last;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
# if ($$papertext =~ /^(.*?)\b(?:Abstract|ABSTRACT|Introduction|INTRODUCTION|Contents|CONTENTS|[Tt]his\s+(paper|memo|technical|article|document|report|dissertation))\b/s) { $header = $1; }
|
|
173
|
+
# elsif ($$papertext =~ /^(.*?)\n[\d\.\s]*(Reference|Bibliography)/si) { $header = $1; }
|
|
174
|
+
# else{
|
|
175
|
+
# return ('Header could not be extracted');
|
|
176
|
+
# }
|
|
177
|
+
|
|
178
|
+
if ((defined $header) && (length ($header) > $nMaxHeaderLength)) {
|
|
179
|
+
$header = substr ($header, 0, $nMaxHeaderLength) . '...';
|
|
180
|
+
}
|
|
181
|
+
if (length($header) < $nMinHeaderLength) {
|
|
182
|
+
return ('Header could not be extracted');
|
|
183
|
+
}
|
|
184
|
+
return ('',$header);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
sub UpdatePretag() {
|
|
189
|
+
my $testH = shift;
|
|
190
|
+
# foreach my $testHea(sort {$a <=> $b} keys %{$testH}) {
|
|
191
|
+
foreach my $LN(sort {$a <=> $b} keys %{$testH}) {
|
|
192
|
+
delete($$testH{$LN}{Pretag});
|
|
193
|
+
if ($$testH{$LN}{PClass} eq "s") {
|
|
194
|
+
$$testH{$LN}{Pretag}{$$testH{$LN}{PSClsName}} = 1;
|
|
195
|
+
}elsif ($$testH{$LN}{PClass} eq "m") {
|
|
196
|
+
foreach my $mytag(keys %{$$testH{$LN}{PClsName}}) {
|
|
197
|
+
$$testH{$LN}{Pretag}{$mytag} = 1;
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
# }
|
|
202
|
+
return($testH);
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
#input: the file with all Training and testing samples
#output: $HeaderH{$HeaNO}{$LineNO} = "";
# Reads a tagged sample file into a hash of headers.  Headers are
# separated by <NEW_HEADER> marker lines; blank lines and lines that
# consist only of an opening/closing tag (e.g. "</author>") are skipped.
# Literal "+L+" markers are stripped and each stored line is trimmed.
# When $simulateHeaNum > 0, reading stops once that many headers have
# been seen.  Returns ($HeaNO, \%HeaH).
sub HashAllHeader() {
    my $simulateHeaNum = shift;   # stop after this many headers (0 = no limit)
    my $tagF = shift;             # path of the tagged sample file
    my %HeaH = ();
    my $HeaNO = 1;                # header counter, starts from 1
    my $LineNO = 1;               # line counter inside the current header

    # Fixed: 3-argument open with a lexical filehandle (the original used
    # the 2-arg form with a bareword package handle, which is unsafe if
    # the path ever contains mode characters and leaks a global handle).
    open(my $tagFH, '<', $tagF) || die "SVMHeaderParse: could not open tag file\: $tagF to read: $!";
    while (my $line = <$tagFH>) {
        $line =~ s/\+L\+//g;      # strip literal "+L+" layout markers
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;

        if ($line =~ /^\s*\<NEW\_HEADER\>/) {
            $HeaNO++;
            $LineNO = 1;
        #remove the line with only tag like </author>
        }elsif (($line =~ /^\s*$/) || ($line =~ /^\<(\/)*(\w+)\>$/)) {
            next;
        }else {
            $HeaH{$HeaNO}{$LineNO}{RawContent} = $line;
            $LineNO++;
        }

        if ($simulateHeaNum > 0 && $HeaNO >= $simulateHeaNum) {
            last;
        }
    }
    close($tagFH);
    return($HeaNO, \%HeaH);
}
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
#HEADER_DID[1]
|
|
242
|
+
#TRECS: Developing a Web-based e-Commerce Business Simulation
|
|
243
|
+
#TRECS: Developing a Web-based
|
|
244
|
+
# Split a single header's raw text (passed by reference) into numbered
# lines.  Every line — including empty ones — is trimmed of leading and
# trailing whitespace and stored under a 1-based line number as
# {RawContent}.  Returns a reference to the resulting hash.
sub HashEbizHeader() {
    my $headerRef = shift;    # scalar ref holding the raw header text
    my %lineHash;
    my $n = 0;
    for my $raw (split /\n/, $$headerRef) {
        (my $trimmed = $raw) =~ s/^\s+//;
        $trimmed =~ s/\s+$//;
        $lineHash{ ++$n }{RawContent} = $trimmed;
    }
    return (\%lineHash);
}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# Baseline training pipeline:
#   1. build the feature dictionary and name-pattern dictionary from the
#      training headers (&FormFeaDict);
#   2. prune rare features from the dictionary (&PruneDict);
#   3. drop pruned features from every line's {FeaVec} and render each
#      surviving vector as an SVM-light style "id:value " string in
#      {SVMFeaVec} — values divided by the dictionary's per-feature max
#      when the package-level flag $norm is set;
#   4. apply the same prune/normalise step to the space-separated author
#      name-pattern vectors, collecting the printable training vectors
#      into %NameSpaceTrainVecH.
# Returns ($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH,
#          \%NameSpaceTrainVecH).
# NOTE(review): $PuncAuthorDictH is initialised here but never populated
# in this sub — presumably filled elsewhere or kept only for interface
# compatibility; confirm before removing.
# NOTE(review): &FormFeaDict, &PruneDict and $norm are defined elsewhere
# in this package and are not visible in this chunk.
sub BaseLineTrainSys() {
    my $HeaderH = shift;        # {HeaNO}{LineNO}{...} training-line records
    my $FeatureDictH = shift;   # feature dictionary: {fea}{ID}, {fea}{max}, ...

    my %InitialHash = ();
    $InitialHash{FeatureCounter} = 0;

    my $PuncAuthorDictH = \%InitialHash;
    my $SpaceAuthorDictH;
    #this is the place to generate feature dictionrauy and name pattern dictionary
    ($HeaderH, $FeatureDictH, $SpaceAuthorDictH) = &FormFeaDict($HeaderH, $FeatureDictH);
    #Prune features in Dictionary with DF < 3
    $FeatureDictH = &PruneDict($FeatureDictH);

    #prune features not in the pruned dict from the feature vector
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            # remove any feature that did not survive dictionary pruning
            foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {
                if (! $$FeatureDictH{$fea}{ID}) {
                    delete ($$HeaderH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            # render the surviving vector in ascending feature-ID order
            if ($$HeaderH{$HeaNO}{$line}{FeaVec} ne "") {
                my $tmpFeaVec = "";
                foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{FeaVec}}) {

                    if ($norm) {
                        #normalization: divide by the feature's max over the corpus
                        $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }

                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{FeaVec}{$fea} ";
                }
                $$HeaderH{$HeaNO}{$line}{SVMFeaVec} = "$tmpFeaVec";
            }

        }
    }

    my %NameSpaceTrainVecH = (); #a separate hash for later printing
    my $Lcount = 0;
    #Prune acordingly features
    foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $line(sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
            if (exists $$HeaderH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}}) {
                    # drop name-pattern features absent from the space-author dictionary
                    foreach my $fea(keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        if (! $$SpaceAuthorDictH{$fea}{ID}) {
                            delete($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }
                    }

                    #normalization
                    if ($$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec} ne "") {
                        $Lcount++;
                        # both vectors start with the pattern's tag label
                        my $tmpFeaVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";
                        my $tmpTextVec = "$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{tag} ";

                        foreach my $fea(sort{$$SpaceAuthorDictH{$a}{ID} <=> $$SpaceAuthorDictH{$b}{ID}} keys %{$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                            $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = sprintf("%.8f", $$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea}/$$SpaceAuthorDictH{$fea}{max});
                            $tmpFeaVec .= "$$SpaceAuthorDictH{$fea}{ID}\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                            $tmpTextVec .= "$fea\:$$HeaderH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} ";
                        }
                        $NameSpaceTrainVecH{$Lcount}{SpaceNameVec}=$tmpFeaVec;
                        $NameSpaceTrainVecH{$Lcount}{SpaceTextNameVec}=$tmpTextVec; #for debugging
                    }
                }
            }
        }
    }

    return($HeaderH, $FeatureDictH, $PuncAuthorDictH, $SpaceAuthorDictH, \%NameSpaceTrainVecH);
}
|
|
347
|
+
|
|
348
|
+
# Context-feature training pass: run every header through the neighbour
# line tagger (&TrainAssignLineTag, defined elsewhere in this package) in
# ascending header order.  Each call may update both the feature
# dictionary and that header's record.  Returns the (possibly updated)
# dictionary and header hash references.
sub ContextTrainSys() {
    my $FeatureDictH = shift;
    my $HeaderH      = shift;

    for my $headerNo (sort { $a <=> $b } keys %{$HeaderH}) {
        #assign neighour line's tag
        ($FeatureDictH, $HeaderH->{$headerNo})
            = &TrainAssignLineTag($FeatureDictH, $HeaderH->{$headerNo});
    }
    return ($FeatureDictH, $HeaderH);
}
|
|
358
|
+
|
|
359
|
+
#this is to write all the testing lines into one file to speed up
# Classify every line of the test header with the 15 per-class SVMs.
#   step1: build (baseline) or contextual feature vectors per line and
#          remember each line's position in %memoryH;
#   step2: write one SVM-light test file per class (1..15);
#   step3: run the external $Classifier binary once per class model;
#   step4: read the 15 score files back and assign each line either a
#          single predicted class ("s") or multiple classes ("m").  A
#          line rejected by every classifier is given the class whose
#          negative score is smallest relative to that class's mean
#          negative score.
# Package-level globals used (defined elsewhere): $norm, $debug,
# $offlineD (model directory), $Classifier (SVM binary path).
# NOTE(review): $testHea below is NOT declared in this sub — its
# enclosing loop was commented out — so the %memoryH {HeaNO} entries and
# the null-vector debug message interpolate undef; likewise the step-4
# debug print uses $result where $myresult was intended.  Under
# 'use strict' neither would compile.  Left byte-identical here; fix
# deliberately in a separate change.
# NOTE(review): the opens use 2-arg form with bareword handles — same
# caveat, left untouched in this documentation-only pass.
sub LineClassify() {
    my ($testp, $nowLoop, $baseline, $FeatureDictH,
        $HeaderH, $tmpCacheVec, $SVMTmpResult) = @_;
    my %memoryH = ();        # global line no -> {HeaNO, LocalLineNO}
    my $GlobalLineNO = 0;

    #step1: collect all test data and write into one file
    # keep a hash to record the global lineNO and the header no its local line no
    # here is the file for all the testing data

    # foreach my $testHea(sort {$a <=> $b} keys %{$HeaderH}) {
    if ($baseline) {
        #Filter feature vector by Feature Dictionary
        ### $$HeaderH{$testHea} = &FormTestFeaVec($FeatureDictH, $$HeaderH{$testHea});
        $HeaderH = &FormTestFeaVec($FeatureDictH, $HeaderH);
    }else {
        $HeaderH = &TestAssignLineTag($FeatureDictH, $HeaderH);
    }

    foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
        if (! $baseline) {
            #To make the iteration correct, we should initialize $$HeaderH{$testHea} by removing all the single and multiple classes in the hash
            delete($$HeaderH{$LN}{PClass});
            delete($$HeaderH{$LN}{PSClsName});
            delete($$HeaderH{$LN}{PClsName});
        }elsif ($baseline && ($$HeaderH{$LN}{FeaVec} ne "")) {
            #modify the feature vector(normalization)
            my $tmpFeaVec = "";
            foreach my $fea(sort{$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$LN}{FeaVec}}) {
                if (exists ($$FeatureDictH{$fea}{ID})) {

                    if ($norm) {
                        if ($debug) {
                            if ($$FeatureDictH{$fea}{max} == 0) {
                                print STDERR "fea $fea has max value 0! \n";
                            }
                        }
                        $$HeaderH{$LN}{FeaVec}{$fea} = sprintf("%.8f", $$HeaderH{$LN}{FeaVec}{$fea}/$$FeatureDictH{$fea}{max});
                    }

                    $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$LN}{FeaVec}{$fea} ";
                }
            }

            $$HeaderH{$LN}{SVMFeaVec} = "$tmpFeaVec";

            #be carefull here!!
            # lines with an empty vector are skipped and never recorded in %memoryH
            if ($$HeaderH{$LN}{SVMFeaVec} eq "") {
                if ($debug) {
                    print STDERR "header($testHea) -- Line($LN) has a null feature vector ($$HeaderH{$testHea}{$LN}{RawContent}) \n";
                }
                next;
            }
        }

        $GlobalLineNO++;
        $memoryH{$GlobalLineNO}{HeaNO} = $testHea;
        $memoryH{$GlobalLineNO}{LocalLineNO} = $LN;
    }
    # }

    #step2:we print 15 files with labelled feature vectors
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        open(testFH, ">$testF") || die "SVMHeaderParse: could not open $testF to write: $!";
        # foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
        foreach my $LN(sort {$a <=> $b} keys %{$HeaderH}) {
            my $tag = 1; # just to conform to the format
            if ($baseline) {
                print testFH "$tag $$HeaderH{$LN}{SVMFeaVec}\n";
            }else {
                print testFH "$tag $$HeaderH{$LN}{ContextSVMFeaVec}\n";
                #print "context feature vec is $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
            }
        }
        # } # end of collecting all the testing data into a file
        close(testFH);
    }

    #step3: SVM classify
    for my $clsNO(1 .. 15) {
        my $testF = "$tmpCacheVec"."test"."$clsNO";
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $SVMModelF;
        my $printstr = "";
        if ($baseline) {
            $printstr = "baseline";
            $SVMModelF = "$offlineD"."$clsNO"."Model"."fold"."$testp";
        }else {
            $printstr = "context"."$nowLoop";
            $SVMModelF = "$offlineD"."$clsNO"."ContextModel"."fold"."$testp";
        }
        # print "$Classifier -v 0 $testF $SVMModelF $mySVMResult\n";
        # print "classification result from fold($testp)-class($clsNO)-$printstr\:\n";
        system("$Classifier -v 0 $testF $SVMModelF $mySVMResult");
    }

    #step4:Read all the result into a hash
    my %SVMResultHash = ();
    my %OrphanTagAssignHash = (); #This records the accuracy of assigned tags
    my %NegMeanH = (); #record the mean of the negative value each classifier made
    my %PosMinH = ();  # smallest positive score each classifier produced

    for my $clsNO(1 .. 15) {
        my $mySVMResult = "$SVMTmpResult"."$clsNO";
        my $myLineNO = 0;

        #initialize %PosMinH 's value
        $PosMinH{$clsNO} = 100;

        open(mySVMResultFH, "$mySVMResult") || die "SVMHeaderParse: could not open $mySVMResult to read: $!";
        while (my $myline = <mySVMResultFH>) {
            $myline =~ s/^\s+//g;
            $myline =~ s/\s+$//g;
            if ($myline !~ /^\s*$/) {
                $myLineNO++;
                if ($debug) {
                    print STDERR " current lineNo is $myLineNO and score for class $clsNO is $myline \n";
                }
                $SVMResultHash{$myLineNO}{$clsNO} = $myline;
                if ($myline < 0) {
                    $NegMeanH{$clsNO} += $myline;
                }else {
                    if ($PosMinH{$clsNO} > $myline) {
                        $PosMinH{$clsNO} = $myline;
                    }
                }
            }
        }

        if ($myLineNO < 1) {
            if ($debug) {
                print STDERR "yahoo: $mySVMResult has myLineNO 0 \n";
            }
        }else {
            $NegMeanH{$clsNO} = sprintf("%.8f", $NegMeanH{$clsNO}/$myLineNO);
        }

        close(mySVMResultFH);
    }

    my $PredTagbyMinNeg = 0;
    my $PredValbyMinNeg = 100;
    my $PredTagbyMinPos = 0;
    my $PredValbyMinPos = 100;

    #analyze the results from the hash and fill the Test Hash(HeaderH)
    for my $myline(1 .. $GlobalLineNO) {
        my @PredictTags = ();
        my $minVal = 100;
        my $CandidateTag = -1;
        my $myHeaNO = $memoryH{$myline}{HeaNO};
        my $myLineNO = $memoryH{$myline}{LocalLineNO};

        for my $clsNO(1 .. 15) {
            my $myresult = $SVMResultHash{$myline}{$clsNO};
            #keep the classification results for multi-class line
            $$HeaderH{$myLineNO}{ClassifyResult}{$clsNO} = $myresult;
            if ($debug) {
                print STDERR "\t\t result by class $clsNO -- $result \n";
            }
            my $myRelDiv = 10;

            if ($myresult > 0) {
                push @PredictTags, $clsNO;
            }else {
                # relative distance: score divided by the class's mean negative score
                $myRelDiv = sprintf("%.8f", $myresult/$NegMeanH{$clsNO});
                if ($myRelDiv < $minVal) {
                    $minVal = $myRelDiv;
                    $CandidateTag = $clsNO;
                }
                if ( (0 - $myresult) < $PredValbyMinNeg) {
                    $PredValbyMinNeg = -$myresult;
                    $PredTagbyMinNeg = $clsNO;
                }
                if (($PosMinH{$clsNO}- $myresult) < $PredValbyMinPos) {
                    $PredValbyMinPos = $PosMinH{$clsNO}- $myresult;
                    $PredTagbyMinPos = $clsNO;
                }
            }
        }
        #Assign ONLY class nearest to the hyperplane to the orphan point
        if ($#PredictTags < 0) {
            push @PredictTags, $CandidateTag;
            $OrphanTagAssignHash{TotalLineNum}++;
        }

        #Fill the hash with the classification result
        if ($#PredictTags eq 0) {
            $$HeaderH{$myLineNO}{PClass} = "s";
            $$HeaderH{$myLineNO}{PSClsName} = $PredictTags[0];
        }elsif ($#PredictTags > 0) {
            $$HeaderH{$myLineNO}{PClass} = "m";
            # the multi tags predicted in one line has no sense of the order
            for my $i(0 .. $#PredictTags) {
                $$HeaderH{$myLineNO}{PClsName}{$PredictTags[$i]} = 1;
                if ($debug) {
                    print STDERR "hea($myHeaNO)-- line($myLineNO) is classified as multi-class $PredictTags[$i] \n";
                }
            }
        }else { #impossible
            if ($debug) {
                print STDERR "hea($myHeaNO)-- line($myLineNO) is orphan\n";
            }
        }
    }
    return($HeaderH);
}
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
#this is to
#(1) populate the predicted items(done in the LineClassify)
#(2) Extract related information from multi-author line and multi-classline
#all information to be extracted comes from {Pchunk}
#all word distribution information comes from {Pline} word dist.;
# Per-line chunk extraction after classification:
#   * single-class non-author lines become one chunk verbatim;
#   * single-class author lines are split into individual names, using
#     punctuation/"and" heuristics or, for ambiguous space-separated
#     names, the SVM name-pattern classifier (&PredictBestNamePattern);
#   * multi-class lines are segmented: email (class 6) and URL (class 12)
#     chunks are peeled off first, then the remaining text is divided
#     among the remaining classes via &Cont2ClassChunking /
#     &Disc2ClassChunking_2chunk, with classes 3 and 4 (affiliation /
#     address, per the tag map elsewhere — unverified here) treated as
#     one region first and split afterwards.
# Sibling subs used (defined elsewhere in this package/file):
#   &GetSeparatorIndex, &LocateEmailFromComponent,
#   &LocateURLFromComponent, &LocateUnIdentifiedChunk, &FillChunkH,
#   &Cont2ClassChunking, &Disc2ClassChunking_2chunk,
#   &PredictBestNamePattern,
#   &HeaderParse::API::NamePatternMatch::NamePatternMatch.
# NOTE(review): $FeatureDictH is passed to the chunking subs below but is
# NOT a parameter of this sub — it resolves to a package global (or undef
# without 'use strict'); verify which was intended.
# NOTE(review): class '2' is treated as the author class throughout —
# inferred from the comments, confirm against the tag map.
sub InfoExtract() {
    my $testp = shift;
    my $TestH = shift;
    my $PuncAuthorDictH = shift;
    my $SpaceAuthorDictH = shift;
    my $SVMNameSpaceModel = shift;
    my $tmpCacheVec = shift;
    my $SVMTmpResult = shift;

    # foreach my $testHea(sort {$a <=> $b} keys %{$TestH}) {
    foreach my $LN(sort {$a <=> $b} keys %{$TestH}) {
        if ($$TestH{$LN}{'PClass'} eq "s") { # single class
            if ($$TestH{$LN}{PSClsName} ne '2') { #non-author single class
                # whole line becomes one chunk of the predicted class
                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = $$TestH{$LN}{PSClsName};
                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
            }else {
                if ($$TestH{$LN}{SClsWordCount} < 4) { #obvious single name
                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                }else { #multi-authors
                    my $Tline = $$TestH{$LN}{RawContent};
                    $Tline =~ s/<(\/)*author>//g;
                    if ($debug) {
                        print STDERR "predicted Multi-Author line -- $Tline \n";
                    }
                    my $NamePunc = 0;
                    #judge this is punctuated line or pure text-space
                    # any char that is not letter/space/hyphen/dot/digit, or the word "and"
                    if (($$TestH{$LN}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])/) || ($$TestH{$LN}{PureText} =~ /\band\b/i)) {
                        #multi-class needs while ... $punc++;
                        $NamePunc = 1;
                    }else {
                        $NamePunc = 0;
                    }

                    if ($NamePunc) {
                        #Heuristics bases separation based on features learned.
                        if (($$TestH{$LN}{PureText} =~ /Jr|Dr/) && ($$TestH{$LN}{SClsWordCount} <5)) {
                            #this is only one name
                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$TestH{$LN}{PureText};
                        }else {
                            my $nameStr = $$TestH{$LN}{PureText};
                            $nameStr =~ s/^\s+//g;
                            $nameStr =~ s/\s+$//g;
                            # split candidate names on comma, ampersand or "and"
                            my @GuessedNames = split(/\,|\&|and/, $nameStr);
                            for my $i(0 .. $#GuessedNames) {
                                #chunk starts from 1
                                $GuessedNames[$i] =~ s/^\s+//g;
                                $GuessedNames[$i] =~ s/\s+$//g;
                                if ($GuessedNames[$i] !~ /^\s*$/) {
                                    my @Nameparts = split(/\s+/, $GuessedNames[$i]);
                                    if ($#Nameparts < 3) {
                                        # up to 3 words: accept as one author name
                                        $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                        my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                        $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                    }else {
                                        #space separated names [name1 name2 name3 and name4]
                                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($GuessedNames[$i]);
                                        if ($#$PredictedNames < 1){
                                            #only 1/0 reasonable name pattern, take it
                                            $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                            my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                            $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $GuessedNames[$i];
                                        }else { #classify to predict
                                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                                            # names in the winning pattern are "<>"-separated
                                            my @names = split(/<>/, $BestNamePattern);
                                            for my $i(0 .. $#names) {
                                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }else {
                        #name Space
                        my $nameStr = $$TestH{$LN}{PureText};
                        my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                        if ($#$PredictedNames < 1){
                            #only 1/0 reasonable name pattern, take the parser-decided chunks
                            my $tmp_name_container = $$PredictedNames[0];
                            if ($#$tmp_name_container > 0) {
                                for my $kk(0 .. $#$tmp_name_container) {
                                    $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                    my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                    $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $$tmp_name_container[$kk];
                                }
                            }else {
                                #this branch is original
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $nameStr;
                            }
                        }else {
                            #classify to predict
                            my $BestNamePattern = &PredictBestNamePattern($PredictedNames, $SVMNameSpaceModel, $SpaceAuthorDictH, $tmpCacheVec, $SVMTmpResult);
                            my @names = split(/<>/, $BestNamePattern);
                            for my $i(0 .. $#names) {
                                $$TestH{$LN}{Pchunk}{ChunkCounter}++;
                                my $ChunkPos = $$TestH{$LN}{Pchunk}{ChunkCounter};
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{cls} = 2;
                                $$TestH{$LN}{Pchunk}{$ChunkPos}{content} = $names[$i];
                            }
                        }
                    }
                }
            }
        #multiple class
        }elsif ($$TestH{$LN}{PClass} eq "m"){
            my (%TagH, $emailChunkH, $URLChunkH, @ArrayofHash);
            #get a hash of all tags
            foreach my $tag(keys %{$$TestH{$LN}{PClsName}}) {
                $TagH{counter}++;
                $TagH{$tag}++;
            }
            my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($$TestH{$LN}{PureText});
            #Preprocess -- extract email and URL out
            if ($$TestH{$LN}{PClsName}{6}) {
                #component has holes of "-1", after extracting emailchunk out
                ($emailChunkH, $component) = &LocateEmailFromComponent($component);
                delete($TagH{6});
                $TagH{counter}--;
                push @ArrayofHash, $emailChunkH;
            }
            if ($$TestH{$LN}{PClsName}{12}) {
                ($URLChunkH, $component) = &LocateURLFromComponent($component);
                delete($TagH{12});
                $TagH{counter}--;
                push @ArrayofHash, $URLChunkH;
            }

            if($TagH{counter} <1){ #no additional class
                #exception: what if still text left ???????
                $$TestH{$LN} = &FillChunkH($$TestH{$LN},$component, \@ArrayofHash);
                #tag each word
            }elsif ($TagH{counter} == 1){
                #only one class left ..
                my $lastTag = "";
                foreach my $tag(keys %TagH) {
                    if ($tag ne "counter") {
                        $lastTag = $tag;
                    }
                }
                #Get the rest possible chunks separated by the email and URL
                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                #Tag all the test chunk as the only left class
                foreach my $chunkNO(sort{$a<=>$b} keys %{$UnIdentifiedChunk}) {
                    $$UnIdentifiedChunk{$chunkNO}{cls} = $lastTag;
                }
                push @ArrayofHash, $UnIdentifiedChunk; #or\%myHash--must be pointer
                #fill in the TestH chunk in a ordered way and tag each word
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            # two class module
            }elsif ($TagH{counter} == 2) {
                #needs maping!
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if ($mytag ne "counter") {
                        push @TagsArray, $mytag;
                    }
                }

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};
                my $IdentifiedChunk;
                #continuous
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;
                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        $offset = $chunk1start;
                        #adjust $component and $SepH
                        # rebase the chunk's components/separators to index 0
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    # choose punctuation- or space-based boundary search
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }
                    #adjust back $chunk
                    if ($offset > 0) {
                        foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} == 2) { #discrete
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    push @ArrayofHash, $IdentifiedChunk;
                    $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
                }elsif ($$UnIdentifiedChunk{counter} > 2) { #disc
                    # unhandled: 2 classes spread over 3+ discrete chunks
                    if ($debug) {
                        print STDERR "2 classes with 3+ chunks\n";
                    }
                }
            #see 3 and 4 as one class
            }elsif (($TagH{counter} == 3) && $TagH{3} && $TagH{4}) {
                #tag array includes only 4 and the other tag
                my @TagsArray = ();
                foreach my $mytag(sort keys %TagH) {
                    if (($mytag ne "3") && ($mytag ne "4") && ($mytag ne "counter")) {
                        push @TagsArray, $mytag;
                    }
                }
                push @TagsArray, 4;

                my $UnIdentifiedChunk = &LocateUnIdentifiedChunk($component);
                my $chunk1start = $$UnIdentifiedChunk{1}{startPos};
                my $chunk1end = $$UnIdentifiedChunk{1}{endPos};

                my $IdentifiedChunk;
                my $startPos34 = 0;
                my $endPos34 = 0;
                #continuous
                if ($$UnIdentifiedChunk{counter} == 1) {
                    my $offset;
                    my $newComponent = $component;
                    my $newSepH = $SepH;

                    if (($chunk1start == 0) && ($chunk1end == $#$component)) {
                        $offset = 0;
                    }else {
                        $offset = $chunk1start;
                        #adjust $component and $SepH
                        $newComponent = ();
                        for my $tmpi($chunk1start .. $chunk1end) {
                            $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                        }

                        foreach my $tmpSep(sort keys %{$newSepH}) {
                            if (($tmpSep >= $chunk1start) && ($tmpSep <= $chunk1end)) {
                                my $newSep = $tmpSep - $offset;
                                $$newSepH{$newSep} = $$newSepH{$tmpSep};
                            }
                            delete($$newSepH{$tmpSep});
                        }
                    }

                    #find the boundary between 34 and the other tag
                    if ($PuncNum > 1) {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }else {
                        $IdentifiedChunk = &Cont2ClassChunking($testp, \@TagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    }

                    #adjust back $chunk
                    #get the position of the 3 4
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($offset > 0) {
                            $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                            $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                        }
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos}; #absolute pos
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;

                }else { #if 2 discrete chunks
                    $IdentifiedChunk = &Disc2ClassChunking_2chunk($testp, \@TagsArray, $UnIdentifiedChunk, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        if ($$IdentifiedChunk{$tmpi}{cls} eq 4) {
                            $startPos34 = $$IdentifiedChunk{$tmpi}{startPos};
                            $endPos34 = $$IdentifiedChunk{$tmpi}{endPos};
                            delete($$IdentifiedChunk{$tmpi});
                        }
                    }
                    push @ArrayofHash, $IdentifiedChunk;
                }

                #find the boundary between 3 and 4
                my $newComponent = (); #modified by Hui 03/19
                my $newSepH = $SepH;
                my $newPuncNum = 0;
                my $offset = $startPos34;
                for (my $tmpi=$startPos34; $tmpi<=$endPos34; $tmpi++) {
                    #modified by Hui 03/19/03 -$offset
                    $$newComponent[$tmpi-$offset] = $$component[$tmpi];
                    if ($$newComponent[$tmpi-$offset] =~ /^\W+$/) {
                        $newPuncNum++;
                    }
                }

                if ($newPuncNum > 1) {
                    foreach my $tmpSep(sort keys %{$$newSepH{punc}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{punc}{$newSep} = $$newSepH{punc}{$tmpSep};
                        }
                        delete($$newSepH{punc}{$tmpSep});
                    }
                }else {
                    foreach my $tmpSep(sort keys %{$$newSepH{space}}) {
                        if (($tmpSep >= $startPos34) && ($tmpSep <= $endPos34)) {
                            my $newSep = $tmpSep - $offset;
                            $$newSepH{space}{$newSep} = $$newSepH{space}{$tmpSep};
                        }
                        delete($$newSepH{space}{$tmpSep});
                    }
                }

                my @NewTagsArray = ();
                push @NewTagsArray, 3;
                push @NewTagsArray, 4;
                if ($newPuncNum > 1) {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "punc", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }else {
                    $IdentifiedChunk = &Cont2ClassChunking($testp, \@NewTagsArray, "space", $newSepH, $newComponent, $FeatureDictH, $tmpCacheVec, $SVMTmpResult);
                }

                #adjust back $chunk
                if ($offset > 0) {
                    foreach my $tmpi(sort keys %{$IdentifiedChunk}) {
                        $$IdentifiedChunk{$tmpi}{startPos} += $offset;
                        $$IdentifiedChunk{$tmpi}{endPos} += $offset;
                    }
                }
                push @ArrayofHash, $IdentifiedChunk;
                $$TestH{$LN} = &FillChunkH($$TestH{$LN}, $component, \@ArrayofHash);
            }elsif ($TagH{counter} > 2) { #3+ cases.
                #consider about 3 discrete chunks for 3 tags????
                # unhandled: 3+ classes remaining after email/URL extraction
                if ($debug) {
                    print STDERR "do not care yet -- here is the case for 3+ classes after preprocessing \n";
                    #find the most likely position and expand to arround some(like 3) words
                }
            }
        }
    }
    # }

    return($TestH);
}
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
# Write a human-readable dump of the predicted chunks (Pchunk) for every
# line of one parsed header to "output.txt" in the current directory.
#
# $TestH: hashref {lineNo}{Pchunk}{chunkNo}{cls|content}, plus the
#         bookkeeping key {Pchunk}{ChunkCounter} which is skipped.
#
# Fixes vs. original: lexical filehandle + three-argument open; removed the
# empty prototype "()" (the sub takes an argument); the undeclared $testHea
# interpolation is replaced by the empty string it always produced, since the
# per-header loop it belonged to is commented out.
sub ExportInfo {
    my $TestH = shift;
    my $outF = "output.txt";
    open(my $writer, '>', $outF) || die "SVMHeaderParse: could not open $outF to write: $!";
#    foreach my $testHea(sort {$a <=> $b} keys %{$TestH}) {
    # The per-header loop above is disabled, so the header number printed
    # here is always empty (the original interpolated an undeclared $testHea).
    print $writer "headerno() -- ";
    foreach my $LN (sort {$a <=> $b} keys %{$TestH}) {
        print $writer "lineno($LN)\: \n ";
        foreach my $chunk (sort {$a <=> $b} keys %{$$TestH{$LN}{Pchunk}}) {
            if ($chunk ne "ChunkCounter") {   # bookkeeping key, not a real chunk
                print $writer "\t chunk($chunk) -- class($$TestH{$LN}{Pchunk}{$chunk}{cls} <> content($$TestH{$LN}{Pchunk}{$chunk}{content} \n";
            }
        }
    }
#    }
    close($writer);
}
|
|
960
|
+
|
|
961
|
+
|
|
962
|
+
# Wrap every predicted chunk as <tag>content</tag> (one per line) and hand the
# concatenated string to AssembleXMLMetadata for final XML assembly.
#
# $TestH: hashref {lineNo}{Pchunk}{chunkNo}{cls|content}.
# Returns whatever HeaderParse::API::AssembleXMLMetadata::assemble returns.
#
# Fixes vs. original: $rXML was an undeclared package global — now lexical;
# removed the unused $tempStr (only referenced by commented-out code) and the
# empty prototype "()".
sub ExportRDF {
    my $TestH = shift;
    my $str = '';
    foreach my $LN (sort {$a <=> $b} keys %{$TestH}) {
        foreach my $chunk (sort {$a <=> $b} keys %{$$TestH{$LN}{Pchunk}}) {
            # %InverseTagMap (package global) maps numeric class -> tag name.
            my $tag = $InverseTagMap{$$TestH{$LN}{Pchunk}{$chunk}{cls}};
            my $content = $$TestH{$LN}{Pchunk}{$chunk}{content};
            if ($content =~ /\w+/) {   # skip chunks with no word characters
                $str .= "<$tag>$content</$tag>\n";
            }
        }
    }
    my $rXML = &HeaderParse::API::AssembleXMLMetadata::assemble(\$str);
    return $rXML;
}
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
#Basic function: popuate information from line -- feature vector and class assignment and name patterns.
|
|
987
|
+
#no dictionary would be formed here
|
|
988
|
+
#Basic function: populate information from line -- feature vector and class assignment and name patterns.
#no dictionary would be formed here
#
# $HeaderH: {lineNo}{RawContent} holds a tagged training line, e.g.
#           "<author>J. Smith</author>". This routine adds, per line:
#   PureText / FeaVec / TextFeaVec  - detagged text and its feature vector
#   TClass ("s" single / "m" multi), SClsName or MClsName{tag}=order
#   Tchunk/Tline                    - truth chunk and per-word class tags
#   NamePattern{...}                - candidate author-name segmentations
# Relies on package globals %tagMap and $debug and on sibling subs
# (SeparatePunc, LineFeatureRepre, GetSeparatorIndex, ...).
sub PopulateLineInfo4Header_unit() {
    my $HeaderH = shift;
    # Carries the tag state of the most recent explicitly tagged line so that
    # untagged continuation lines inherit its last tag (see final else branch).
    my %curState = ();

    foreach my $line(sort {$a <=> $b} keys %{$HeaderH}) {
        my $PureTextLine = $$HeaderH{$line}{RawContent};
        $PureTextLine =~ s/(\<)*\<(\/)*(\w+)\>(\>)*/ /g; # remove the tags
        $PureTextLine =~ s/\+L\+//g;                     # remove line-break markers
        $PureTextLine =~ s/^\s+//g;
        $PureTextLine =~ s/\s+$//g;
        #should make punctuation separate!
        $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);
        $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
        #add the position of the line here!!!!
        $$HeaderH{$line}{FeaVec}{Clinepos} = $line;
        my $textFeaVec = "";
        foreach my $fea(keys %{$$HeaderH{$line}{FeaVec}}) {
            # Drop zero-valued features to keep the vector sparse.
            if($$HeaderH{$line}{FeaVec}{$fea} == 0) {
                delete ($$HeaderH{$line}{FeaVec}{$fea});
            }else {
                $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
            }
        }
        $$HeaderH{$line}{TextFeaVec} = $textFeaVec; # for read and debug

        #assign class tag to each line -- not separator <<sep>><</sep>> here
        # Matches only "real" tags that have adjacent text (or line start),
        # i.e. the line carries at least one explicit class tag.
        if ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/) {
            %curState = ();
            my $tmpIndex = 0; # the order of this tag showed up last time
            my $preTag = -1;
            my $mul = 0;      # becomes 1 when two different tags occur on the line
            while ($$HeaderH{$line}{RawContent} =~ /([^\<]+|(^\s*))\<(\/)*(\w+)\>($|[^\<]+)/g) {
                $tmpIndex++;
                my $tmptag = $4;
                # NOTE(review): %tagMap is a package global mapping tag name ->
                # numeric class id; assumed defined elsewhere in this file.
                if (($preTag > 0) && ($preTag ne $tagMap{$tmptag})) {
                    $mul = 1;
                }
                $curState{$tagMap{$tmptag}} = $tmpIndex;
                $preTag = $tagMap{$tmptag};
            }

            if ($mul) {
                # Multi-class line: record the order in which each class appears.
                $$HeaderH{$line}{TClass} = "m";
                my $order = 1;
                foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                    $$HeaderH{$line}{MClsName}{$tag} = $order;
                    $order++;
                }

                #represent the class distribution only for this multi-class case.
                my $Tline = $$HeaderH{$line}{RawContent};
                #main purpose is to combine </phone><email> as one <s>
                $Tline =~ s/\<(\/)*(\w+)\>/<s>/g; #replace the tags with <s>
                $Tline =~ s/^\s*<s>\s*//g;
                $Tline =~ s/\s*<s>\s*$//g;
                $Tline =~ s/<s>\s*<s>/<s>/g;
                $Tline =~ s/\s+/ /g;

                $Tline = &SeparatePunc($Tline);

                # Fold punctuation adjacent to a boundary marker <s> into an
                # explicit <<sep>>punc<</sep>> separator. Because $whole is
                # interpolated into s///, a literal "|" must be escaped away
                # first ("!!!" placeholder) and restored afterwards.
                while ($Tline =~ /(\s+(\W+)\s+<s>)/g) {
                    my $whole = $1;
                    my $punc = $2;
                    $punc =~ s/^\s+//g;
                    $punc =~ s/\s+$//g;

                    if ($punc eq "\|") {
                        $Tline =~ s/\|/\!\!\!/g;
                        $whole =~ s/\|/\!\!\!/g;
                    }
                    $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
                    if ($punc eq "\|") {
                        # FIXME(review): this branch deletes the "!!!"
                        # placeholders instead of restoring "|" as the second
                        # loop below does — looks like a latent bug; confirm.
                        $Tline =~ s/\!\!\!//g;
                        $whole =~ s/\!\!\!//g;
                    }
                }
                # Same folding for punctuation that follows the <s> marker.
                while ($Tline =~ /(<s>\s+(\W+)\s+)/g) {
                    my $whole = $1;
                    my $punc = $2;
                    $punc =~ s/^\s+//g;
                    $punc =~ s/\s+$//g;
                    if ($punc eq "\|") {
                        $Tline =~ s/\|/\!\!\!/g;
                        $whole =~ s/\|/\!\!\!/g;
                    }
                    $Tline =~ s/$whole/<<sep>>$punc<<\/sep>>/g; #only once no "g"
                    if ($punc eq "\|") {
                        $Tline =~ s/\!\!\!/\|/g;
                        $whole =~ s/\!\!\!/\|/g;
                    }
                }
                # Remaining bare boundaries become empty separators.
                $Tline =~ s/<s>/<<sep>><<\/sep>>/g;
                my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
                #Populate Truth Hash by the chunk and word-class distribution
                $$HeaderH{$line} = &AssignWordTagFromChunk($$HeaderH{$line}, $SepH, $component);
            }else {
                # Single-class line.
                $$HeaderH{$line}{TClass} = "s";
                my @Tarr = split(/\s+/, $PureTextLine);
                $$HeaderH{$line}{SClsWordCount} = $#Tarr +1;
                # %curState holds exactly one tag here; the loop just picks it.
                foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                    $$HeaderH{$line}{SClsName} = $tag;
                }

                #Fill in the word-class distribution for single class line
                my $lineContent = &SeparatePunc($$HeaderH{$line}{PureText});
                my @wordArray = split(/\s+/, $lineContent);
                undef $lineContent;

                $$HeaderH{$line} = &AssignWordTag4SingleClassLine("truth", $$HeaderH{$line}{SClsName}, $$HeaderH{$line}, \@wordArray);

                #but only multi-author has multiple chunks
                #all reasonable name patterns for space separated names
                #feature vec for each space namepatterns and puncutation separators
                #Test/prediction will base on the predicted line tag in another module

                #single author
                # Class "2" is the author class (see %tagMap elsewhere in file).
                if ($$HeaderH{$line}{SClsName} eq "2") {
                    #From Truth
                    if ($$HeaderH{$line}{RawContent} !~ /<<sep>>/) {
                        #could we save space by indicating the pure text directly
                        # FIXME(review): $i is never declared/assigned in this
                        # scope — the chunk lands under key "" (undef). Likely
                        # intended to be chunk 1; confirm before changing.
                        $$HeaderH{$line}{Tchunk}{$i}{cls} = 2;
                        $$HeaderH{$line}{Tchunk}{$i}{content} = $$HeaderH{$line}{PureText};
                    #multiple authors
                    }else {
                        my $Tline = $$HeaderH{$line}{RawContent};
                        $Tline =~ s/<(\/)*author>//g;

                        my ($PuncNum, $SepH, $component) = &GetSeparatorIndex($Tline);
                        my $nameStr = join(" ", @$component);

                        #judge this is punctuated line or pure text-space
                        if ($$HeaderH{$line}{PureText} =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+])|(\W+and\W+)/ig) {
                            #multi-class needs while ... $punc++;
                            $$HeaderH{$line}{NamePunc} = 1;
                        }else {
                            $$HeaderH{$line}{NameSpace} = 1;
                        }

                        #{NamePuncFeaVec} and {NameSpaceFeaVec} based on number of puncs (>2)
                        #{MulClsPuncFeaVec}

                        ######common to both name space and name punc ######
                        my $TrueNames = &HeaderParse::API::NamePatternMatch::GetTrueName($nameStr);
                        for my $i(0 .. $#$TrueNames) {
                            my $j = $i+1; #chunk should start from 1
                            $$HeaderH{$line}{Tchunk}{$j}{cls} = 2;
                            $$HeaderH{$line}{Tchunk}{$j}{content} = "$$TrueNames[$i]";
                        }
                        ################################################

                        # Name-pattern candidates are only generated for
                        # space-separated (non-punctuated) author lines.
                        if ($$HeaderH{$line}{NamePunc}) {
                        }else {
                            my $PredictedNames = &HeaderParse::API::NamePatternMatch::NamePatternMatch($nameStr);
                            if ($#$PredictedNames < 1) {
                                #only one pattern -- do not fill name pattern
                            }else {
                                my $TrueIndex = &HeaderParse::API::NamePatternMatch::Duplicate($TrueNames, $PredictedNames);
                                #must solve the problem
                                if ($TrueIndex eq "-1") {
                                    if ($debug) {
                                        # NOTE(review): $content is not declared
                                        # in this scope; this debug line prints
                                        # an empty value under no-strict.
                                        print STDERR "here the true name($TrueNames) is null from the line $content \n";
                                    }
                                }else {
                                    #populate all reasonable name patterns
                                    for my $i(0 .. $#$PredictedNames) {
                                        my $candidateName = "";
                                        for my $j(0 .. $#{$$PredictedNames[$i]}) {
                                            if ($$PredictedNames[$i][$j]) {
                                                $candidateName .= "$$PredictedNames[$i][$j]<>";
                                            }
                                        }
                                        # print "candidate name\: $candidateName ";
                                        $$HeaderH{$line}{NamePattern}{$candidateName}{content} = $candidateName;
                                        ($$HeaderH{$line}{NamePattern}{$candidateName}{SpaceNameVec}) = &SpaceNameLnFeaRepre_unit($candidateName);
                                        # tag = 1 marks the candidate matching
                                        # the true segmentation, -1 otherwise.
                                        if ($i eq $TrueIndex) {
                                            $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = 1;
                                        }else {
                                            $$HeaderH{$line}{NamePattern}{$candidateName}{tag} = -1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }else { #if there is no explicit tag for this line, this line only belongs to the last class of the previous line
            my $tmpI = 0;
            # %curState still holds the previous tagged line's tags, sorted by
            # descending appearance order; keep only the last-seen tag.
            foreach my $state (sort {$curState{$b} <=> $curState{$a}} keys %curState) {
                if ($tmpI > 0) {
                    delete ($curState{$state});
                } #only keep the last tag
                $tmpI++;
            }
            $$HeaderH{$line}{TClass} = "s";
            foreach my $tag(sort {$curState{$a} <=> $curState{$b}} keys %curState) {
                $$HeaderH{$line}{SClsName} = $tag;
            }
        }
    }

    return($HeaderH);
}
|
|
1191
|
+
|
|
1192
|
+
|
|
1193
|
+
# Vectorize each raw header line of an UNKNOWN (to-be-classified) header:
# trim surrounding whitespace, separate punctuation, build the numeric
# feature vector plus a readable text rendition of it.
#
# $HeaderH: {lineNo}{RawContent}; gains PureText, SClsWordCount, FeaVec
#           (sparse: zero-valued features deleted) and TextFeaVec per line.
# Returns $HeaderH.
#
# Fixes vs. original: removed the unused %curState lexical and the empty
# prototype "()" (the sub takes an argument).
sub VectorizeUnknownHeaderLine {
    my $HeaderH = shift;

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        my $PureTextLine = $$HeaderH{$line}{RawContent};
        $PureTextLine =~ s/^\s+//g;
        $PureTextLine =~ s/\s+$//g;
        # Punctuation must be its own token before feature extraction.
        $$HeaderH{$line}{PureText} = &SeparatePunc($PureTextLine);

        my @Tarr = split(/\s+/, $PureTextLine);
        $$HeaderH{$line}{SClsWordCount} = $#Tarr + 1;
        $$HeaderH{$line}{FeaVec} = &LineFeatureRepre($$HeaderH{$line}{PureText});
        # The line's position in the header is itself a feature.
        $$HeaderH{$line}{FeaVec}{Clinepos} = $line;

        my $textFeaVec = "";
        foreach my $fea (keys %{$$HeaderH{$line}{FeaVec}}) {
            if ($$HeaderH{$line}{FeaVec}{$fea} == 0) {
                delete($$HeaderH{$line}{FeaVec}{$fea});   # keep the vector sparse
            } else {
                $textFeaVec .= "$fea($$HeaderH{$line}{FeaVec}{$fea}) ";
            }
        }
        $$HeaderH{$line}{TextFeaVec} = $textFeaVec;   # human-readable, for debugging
    }

    return($HeaderH);
}
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
#training data are assigned the true neighbour lines' tag
|
|
1231
|
+
# Register one context feature for a line during TRAINING: allocate a
# dictionary ID on first sight (with max fixed at 0.5), set the feature on
# the line's context vector and bump its document frequency.
sub _RegisterContextFeature {
    my ($FeatureDictH, $LineH, $ContextFea) = @_;
    if (! $$FeatureDictH{$ContextFea}{ID}) {
        $$FeatureDictH{FeatureCounter}++;
        $$FeatureDictH{$ContextFea}{ID} = $$FeatureDictH{FeatureCounter};
        $$FeatureDictH{$ContextFea}{max} = 0.5;
    }
    if ($$FeatureDictH{$ContextFea}{ID}) {
        $$LineH{ContextFeaVec}{$ContextFea} = 0.5;
        $$FeatureDictH{$ContextFea}{DF}++;
    }
}

#training data are assigned the true neighbour lines' tag
# For every line, add context features "P<dist><tag>" / "N<dist><tag>" for the
# TRUE tags of up to four previous/next lines, then append them (by dictionary
# ID order) to the line's SVM feature string as ContextSVMFeaVec.
#
# Fixes vs. original: removed the empty prototype "()" and the unused
# %curState lexical; the four identical dictionary-registration stanzas were
# folded into _RegisterContextFeature (behavior unchanged).
sub TrainAssignLineTag {
    my $FeatureDictH = shift;
    my $HeaderH = shift;

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        # Previous lines, up to 4 back; stop at the first line without a
        # truth class (mirrors the original's "last").
        my $PC = 1;   # 0 would be the tag for the current line itself
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) {
            if (exists $$HeaderH{$Pline}{TClass}) {
                if ($$HeaderH{$Pline}{TClass} eq "s") {
                    _RegisterContextFeature($FeatureDictH, $$HeaderH{$line},
                                            "P" . "$PC" . "$$HeaderH{$Pline}{SClsName}");
                } else { # multi-class line: one feature per tag, in appearance order
                    foreach my $tag (sort {$$HeaderH{$Pline}{MClsName}{$a} <=> $$HeaderH{$Pline}{MClsName}{$b}} keys %{$$HeaderH{$Pline}{MClsName}}) {
                        _RegisterContextFeature($FeatureDictH, $$HeaderH{$line}, "P" . "$PC" . "$tag");
                    }
                }
                $PC++;
                $Pline = $line - $PC;
            } else {
                last;
            }
        }

        # Next lines, up to 4 ahead.
        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) {
            if ($$HeaderH{$Nline}{TClass} eq "s") {
                _RegisterContextFeature($FeatureDictH, $$HeaderH{$line},
                                        "N" . "$NC" . "$$HeaderH{$Nline}{SClsName}");
            } else { # consider the order of the tags
                foreach my $tag (sort {$$HeaderH{$Nline}{MClsName}{$a} <=> $$HeaderH{$Nline}{MClsName}{$b}} keys %{$$HeaderH{$Nline}{MClsName}}) {
                    _RegisterContextFeature($FeatureDictH, $$HeaderH{$line}, "N" . "$NC" . "$tag");
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        # Assemble features and their weights into a string, without
        # normalization (note: the training variant keeps a trailing space).
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};
        foreach my $fea (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
|
|
1317
|
+
|
|
1318
|
+
# Test-time analogue of TrainAssignLineTag: add context features
# "P<dist><tag>" / "N<dist><tag>" from the PREDICTED tags (Pretag) of up to
# four previous/next lines, but only for features already present in the
# training dictionary (no new IDs, no DF updates), then append them to the
# line's SVM feature string as ContextSVMFeaVec (trailing space stripped).
#
# Fixes vs. original: the loop variable was an undeclared package global
# ("foreach $line") — now lexical; empty prototype "()" and unused
# %curState removed.
sub TestAssignLineTag {
    my $FeatureDictH = shift;
    my $HeaderH = shift;

    foreach my $line (sort {$a <=> $b} keys %{$HeaderH}) {
        # Start clean: drop any context vector left from a previous pass.
        if (exists($$HeaderH{$line}{ContextFeaVec})) {
            delete($$HeaderH{$line}{ContextFeaVec});
        }

        # Previous lines, up to 4 back.
        my $PC = 1;   # 0 would be the tag for the current line itself
        my $Pline = $line - $PC;
        while (($PC < 5) && ($Pline > 0)) {
            if (exists $$HeaderH{$Pline}{Pretag}) {
                foreach my $tag (sort keys %{$$HeaderH{$Pline}{Pretag}}) {
                    my $ContextFea = "P" . "$PC" . "$tag";
                    if ($$FeatureDictH{$ContextFea}{ID}) {   # known features only
                        $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                    }
                }
            }
            $PC++;
            $Pline = $line - $PC;
        }

        # Next lines, up to 4 ahead.
        my $NC = 1;
        my $Nline = $line + $NC;
        while (($NC < 5) && (exists $$HeaderH{$Nline})) {
            foreach my $tag (sort keys %{$$HeaderH{$Nline}{Pretag}}) {
                my $ContextFea = "N" . "$NC" . "$tag";
                if ($$FeatureDictH{$ContextFea}{ID}) {
                    $$HeaderH{$line}{ContextFeaVec}{$ContextFea} = 0.5;
                }
            }
            $NC++;
            $Nline = $line + $NC;
        }

        # Assemble features and weights into a string, without normalization.
        my $tmpFeaVec = $$HeaderH{$line}{SVMFeaVec};
        foreach my $fea (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$HeaderH{$line}{ContextFeaVec}}) {
            if (exists $$FeatureDictH{$fea}{ID}) {
                $tmpFeaVec .= "$$FeatureDictH{$fea}{ID}\:$$HeaderH{$line}{ContextFeaVec}{$fea} ";
            }
        }
        $tmpFeaVec =~ s/\s+$//g;
        $$HeaderH{$line}{ContextSVMFeaVec} = "$tmpFeaVec";
    }
    return($FeatureDictH, $HeaderH);
}
|
|
1370
|
+
|
|
1371
|
+
#given a line, check the number and the position of punctuation/space it contains
|
|
1372
|
+
#given a line, check the number and the position of punctuation/space it contains
# Returns (punctuation count,
#          { punc => {pos => 1|2}, space => {pos => 1|2} },
#          \@tokens)
# where positions index the whitespace-split token list and a value of 2 marks
# an explicit <<sep>>...<</sep>> separator versus an implicit one (1).
sub GetSeparatorIndex() {
    my $line = shift;

    my %SeparatorH = ();
    my $PuncNum    = 0;

    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;

    # FillSpace marks every original blank and keeps punctuation as its own
    # token; punctuation separators are a subset of space separators.
    my ($punc, $spaceLine) = &FillSpace($line);
    my @component = split(/\s+/, $spaceLine);

    for my $idx (0 .. $#component) {
        my $token = $component[$idx];
        if ($token =~ /<<sep>>(\W+|\s*)<<\/sep>>/) {
            # Explicit separator marker: strip the wrapper, keeping the
            # punctuation it carried (if any).
            $component[$idx] = $1;
            if ($component[$idx] eq "") {
                $component[$idx] = "<<sep>><<\/sep>>";
                $SeparatorH{space}{$idx} = 2;
            } else {
                $SeparatorH{punc}{$idx} = 2;
                $PuncNum++;
                $SeparatorH{space}{$idx} = 2;
            }
        } elsif ($token =~ /<space>/) {
            $SeparatorH{space}{$idx} = 1;
        } elsif ($token =~ /^[^\p{IsLower}\p{IsUpper}\s+\-\d+]+$/) {
            # A token made purely of punctuation counts as both kinds.
            $SeparatorH{punc}{$idx} = 1; # position (not which punctuation)
            $PuncNum++;
            $SeparatorH{space}{$idx} = 1;
        }
    }
    return($PuncNum, \%SeparatorH, \@component);
}
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
#multi-Authors line still has only one class, although 1+ authors
|
|
1410
|
+
#multi-Authors line still has only one class, although 1+ authors
# Split a multi-class line into per-class chunks at hard separator positions
# and tag every content word with its chunk's class.
#
# $LineH:     line hash carrying MClsName{tag} = appearance order; gains
#             Tchunk{n}{cls|content} and Tline{pos}{cls|OriginalWord}.
# $SepH:      separator index from GetSeparatorIndex ({space}{pos} > 1 marks
#             a chunk boundary).
# $component: token listref; <space> markers, pure punctuation and
#             <<...>> markers are skipped as non-words.
#
# Fixes vs. original: "$tags[$tagP]" used the undeclared $tagP (working only
# because undef numifies to 0) — now an explicit $tags[0]; stray "};" and the
# empty prototype "()" removed.
sub AssignWordTagFromChunk {
    my ($LineH, $SepH, $component) = @_;

    # Class tags ordered by their appearance order in the line.
    my @tags = ();
    foreach my $tag (sort {$$LineH{MClsName}{$a} <=> $$LineH{MClsName}{$b}} keys %{$$LineH{MClsName}}) {
        push @tags, $tag;
    }

    my $ChunkNO = 1;
    my $curTag  = $tags[0];
    my $WordPos = 1;
    my $chunk   = "";
    for my $i (0 .. $#$component) {
        # We do not assign a class to separators themselves.
        if ($$SepH{space}{$i} > 1) {
            # Hard separator: close the current chunk, advance to the next tag.
            if ($chunk ne "") {
                $$LineH{Tchunk}{$ChunkNO}{cls}     = $curTag;
                $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;
                $chunk  = "";
                $curTag = $tags[$ChunkNO];
                $ChunkNO++;
            }
        } elsif ($$component[$i] !~ /<space>|^\W+$|\<\<.*\>\>/) { # e.g. <<sep>> <</sep>>
            $chunk .= "$$component[$i] ";
            $$LineH{Tline}{$WordPos}{cls}          = $curTag;
            $$LineH{Tline}{$WordPos}{OriginalWord} = $$component[$i];
            $WordPos++;
        }
    }

    # Fill in the last chunk.
    $$LineH{Tchunk}{$ChunkNO}{cls}     = $curTag;
    $$LineH{Tchunk}{$ChunkNO}{content} = $chunk;

    return ($LineH);
}
|
|
1445
|
+
|
|
1446
|
+
|
|
1447
|
+
# Tag every content word of a single-class line with the line's class.
# $type selects the destination slot: "truth" -> Tline, "predict" -> Pline;
# each entry records {cls} and {OriginalWord} keyed by 1-based word position.
# Separator markers, <space> placeholders and pure-punctuation tokens are
# skipped (they still do not consume a word position).
sub AssignWordTag4SingleClassLine() {
    my ($type, $curTag, $LineH, $component) = @_;

    my $WordPos = 1;
    for my $idx (0 .. $#$component) {
        my $word = $$component[$idx];
        # Skip non-word tokens such as <<sep>>, <</sep>>, <space>, ",".
        next if $word =~ /<space>|^\W+$|\<\<.*\>\>/;

        if ($type eq "truth") {
            $$LineH{Tline}{$WordPos}{cls} = $curTag;
            # Added 01/08: keep the original word at each position.
            $$LineH{Tline}{$WordPos}{OriginalWord} = $word;
        } elsif ($type eq "predict") {
            $$LineH{Pline}{$WordPos}{cls} = $curTag;
            $$LineH{Pline}{$WordPos}{OriginalWord} = $word;
        }
        $WordPos++;
    }

    return ($LineH);
}
|
|
1467
|
+
|
|
1468
|
+
|
|
1469
|
+
# Read the first line of an SVM result file and return it with trailing
# whitespace (including the newline) stripped.
#
# Fixes vs. original: lexical filehandle + three-argument open; empty
# prototype "()" removed (the sub takes an argument).
sub Analyze {
    my $resultF = shift;
    open(my $resultFH, '<', $resultF) || die "SVMHeaderParse: could not open $resultF to read: $!";
    my $result = <$resultFH>;   # only the first line carries the result
    close($resultFH);
    $result =~ s/\s+$//g;
    return($result);
}
|
|
1477
|
+
|
|
1478
|
+
|
|
1479
|
+
# Load a feature dictionary file with "<>"-separated records:
#   ID<>feature<>max<>DF
# The line whose feature field contains "FeatureCounter" instead stores the
# total feature count under {FeatureCounter...}{num}.
# Returns a hashref {feature}{ID|max|DF}.
#
# Fixes vs. original: "$Df =~ s/\s+$//g;" was a typo (undeclared $Df), so the
# DF field kept its trailing newline — corrected to $DF; lexical filehandle +
# three-argument open; empty prototype "()" removed.
sub ReadFeatureDict {
    my $Fname = shift;
    my %FeatureDictH;

    open(my $fh, '<', $Fname) || die "SVMHeaderParse: could not open $Fname to read: $!";
    while (my $line = <$fh>) {
        my ($ID, $fea, $max, $DF) = split(/<>/, $line);
        $ID =~ s/^\s+//g;
        $ID =~ s/\s+$//g;

        if ($fea =~ /FeatureCounter/) {
            # The counter record stores the feature count, not a feature.
            $FeatureDictH{$fea}{num} = $ID;
            next;
        }

        $fea =~ s/^\s+//g;
        $fea =~ s/\s+$//g;
        $max =~ s/^\s+//g;
        $max =~ s/\s+$//g;
        $DF  =~ s/^\s+//g;
        $DF  =~ s/\s+$//g;   # was "$Df" — the typo left DF with its newline
        $FeatureDictH{$fea}{ID}  = $ID;
        $FeatureDictH{$fea}{max} = $max;
        $FeatureDictH{$fea}{DF}  = $DF;
    }
    close($fh);
    return(\%FeatureDictH);
}
|
|
1507
|
+
|
|
1508
|
+
|
|
1509
|
+
# Dump one-vs-rest SVM training files "<class>.<affix>" for classes 1..15.
# $affix is "train" (line feature vectors) or "context" (context vectors);
# a line is labelled +1 for class $clsNO when its single class matches or the
# class appears among its multi-class tags, -1 otherwise.
#
# Lines with an empty SVMFeaVec are skipped in BOTH variants so the "train"
# and "context" files stay aligned line-for-line (sometimes ContextSVMFeaVec
# is non-empty while SVMFeaVec is empty).
#
# NOTE(review): $offlineD is a package-level output directory set elsewhere
# in this file.
#
# Fixes vs. original: lexical filehandle + three-argument open; empty
# prototype "()" removed; "weired" diagnostic typo corrected.
sub printTrainData {
    my $affix = shift;
    my $HeaderH = shift;   # {headerNo}{lineNo}{SVMFeaVec|ContextSVMFeaVec|SClsName|MClsName}

    for my $clsNO (1 .. 15) {
        my $F = "$offlineD" . "$clsNO" . "\." . "$affix";
        open(my $fh, '>', $F) || die "SVMHeaderParse: could not open $F to write: $!";
        foreach my $HeaNO (sort {$a <=> $b} keys %{$HeaderH}) {
            foreach my $LN (sort {$a <=> $b} keys %{$$HeaderH{$HeaNO}}) {
                next if $$HeaderH{$HeaNO}{$LN}{SVMFeaVec} eq "";
                if ($affix eq "train") {
                    if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                        print $fh "1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                    } else {
                        print $fh "-1 $$HeaderH{$HeaNO}{$LN}{SVMFeaVec}\n";
                    }
                } elsif ($affix eq "context") {
                    if (($$HeaderH{$HeaNO}{$LN}{SClsName} eq "$clsNO") || exists($$HeaderH{$HeaNO}{$LN}{MClsName}{$clsNO})) {
                        print $fh "1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                    } else {
                        print $fh "-1 $$HeaderH{$HeaNO}{$LN}{ContextSVMFeaVec}\n";
                    }
                } else {
                    print "weird -- $affix is not context nor train \n";
                }
            }
        }
        close($fh);
    }
}
|
|
1545
|
+
|
|
1546
|
+
|
|
1547
|
+
# Write one SpaceNameVec per line (in ascending numeric key order) to the
# given output file.
#
# $printF:              output path.
# $NameSpaceTrainVecH:  {count}{SpaceNameVec} — preassembled SVM lines.
#
# Fixes vs. original: lexical filehandle + three-argument open; empty
# prototype "()" removed.
sub printNameSpaceTrainData {
    my $printF = shift;
    my $NameSpaceTrainVecH = shift;

    open(my $fh, '>', $printF) || die "SVMHeaderParse: could not open $printF to write: $!";
    foreach my $Lcount (sort {$a <=> $b} keys %{$NameSpaceTrainVecH}) {
        print $fh "$$NameSpaceTrainVecH{$Lcount}{SpaceNameVec}\n";
    }
    close($fh);
}
|
|
1557
|
+
|
|
1558
|
+
|
|
1559
|
+
# Build the feature representation for one candidate space-separated name
# pattern ("First Last<>First Last<>...").
#
# $type: "train" — mutates $NameDictH (allocates feature/value IDs, tracks
#        per-feature max) and returns (\%FeatureH, $NameDictH);
#        anything else — test mode: read-only dictionary lookups; returns
#        ($SpaceNameFeaVec, $SpaceNameTextFeaVec) as SVM-format strings with
#        values normalized by the training max.
# Uses package globals %firstnameH, %lastnameH, %dictH for name dictionaries.
sub SpaceNameLnFeaRepre() {
    my $type = shift;
    my $NamePatternStr = shift;
    my $NameDictH = shift;

    #feature generation and representation
    #It is good to make each of the apple's feature(color, shape..) separate.
    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; #remove the last <>
    my @Names = split(/<>/, $NamePatternStr);

    #try making features binary
    for my $i(0 .. $#Names) {
        my @NameComponent = split(/\s+/, $Names[$i]);
        for my $j(0 .. $#NameComponent){

            #feature generation($i = 0 is the first one)
            # Surface form of this name part (capitalization/initial pattern).
            $FeatureH{"Name"."$i"."part"."$j"."form"} = &HeaderParse::API::NamePatternMatch::RichNameType($NameComponent[$j]);
            # Position feature: last and second-to-last parts are special.
            if ($j eq $#NameComponent) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "Last";
            }elsif ($j eq $#NameComponent -1) {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = "SecLast";
            }else {
                $FeatureH{"Name"."$i"."part"."$j"."pos"} = $j;
            }

            #firstname, lastname information
            # A part that is unambiguously a first (or last) name gets a
            # binary FN (or LN) feature; unknown words get NonDict.
            if (($firstnameH{lc($NameComponent[$j])}) && (!$lastnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."FN"} = 1;
            }elsif (($lastnameH{lc($NameComponent[$j])}) && (!$firstnameH{lc($NameComponent[$j])})) {
                $FeatureH{"Name"."$i"."part"."$j"."LN"} = 1;
            }elsif (! $dictH{lc($NameComponent[$j])}) {
                $FeatureH{"Name"."$i"."part"."$j"."NonDict"} = 1;
            }

            #space for more features
        }
    }

    #Build up FeatureVec
    #code for the attribute ID separately so that the ID for features would be continuous
    if ($type eq "train") {
        foreach my $fea(sort {$a <=> $b} keys %FeatureH) {
            # Allocate a dictionary ID for a newly seen feature name.
            if (! $$NameDictH{$fea}{ID}) {
                $$NameDictH{FeatureCounter}++;
                $$NameDictH{$fea}{ID} = $$NameDictH{FeatureCounter};
            }

            # Non-numeric feature VALUES (e.g. "Last") are themselves mapped
            # to dictionary IDs so every feature becomes numeric.
            # NOTE(review): called without "&" — safe only because IsNumber's
            # (faulty) empty prototype is defined later in this file.
            if (! IsNumber($FeatureH{$fea})) {
                if (! exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $$NameDictH{FeatureCounter}++;
                    $$NameDictH{$FeatureH{$fea}}{ID} = $$NameDictH{FeatureCounter};
                }
                $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
            }

            # Drop zero-valued features; otherwise track the per-feature max
            # used for normalization at test time.
            if ($FeatureH{$fea} == 0) {
                delete($FeatureH{$fea});
            }else {
                if ((! exists $$NameDictH{$fea}{max}) || ($$NameDictH{$fea}{max} < $FeatureH{$fea})) {
                    $$NameDictH{$fea}{max} = $FeatureH{$fea};
                }
            }
        }
        return(\%FeatureH, $NameDictH);
    #test
    }else {
        my $SpaceNameFeaVec = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea(sort {$$NameDictH{$a}{ID} <=> $$NameDictH{$b}{ID}} keys %FeatureH) {
            # Map non-numeric values through the dictionary; unseen values
            # (never observed in training) drop the feature entirely.
            if (! &IsNumber($FeatureH{$fea})) {
                if (exists $$NameDictH{$FeatureH{$fea}}{ID}) {
                    $FeatureH{$fea} = $$NameDictH{$FeatureH{$fea}}{ID};
                }else {
                    delete($FeatureH{$fea});
                }
            }

            # Keep only features known to the training dictionary with a
            # non-zero value; normalize by the training-time max.
            if (! ($FeatureH{$fea} && $$NameDictH{$fea}{ID})) {
                delete($FeatureH{$fea});
            }else {
                $FeatureH{$fea} = sprintf("%.8f", $FeatureH{$fea}/$$NameDictH{$fea}{max});
                $SpaceNameFeaVec .= "$$NameDictH{$fea}{ID}\:$FeatureH{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$FeatureH{$fea} ";
            }
        }
        return($SpaceNameFeaVec, $SpaceNameTextFeaVec);
    }
}
|
|
1649
|
+
|
|
1650
|
+
|
|
1651
|
+
# Generate the raw (un-encoded) feature hash for one candidate name pattern
# "First Last<>First Last<>...": per name part, its surface form, its
# position within the name, and first/last-name dictionary membership.
# Returns \%FeatureH. Uses package globals %firstnameH, %lastnameH, %dictH.
sub SpaceNameLnFeaRepre_unit() {
    my $NamePatternStr = shift;

    my %FeatureH = ();
    $NamePatternStr =~ s/\<\>$//g; # remove the trailing <>
    my @Names = split(/<>/, $NamePatternStr);

    for my $nameIdx (0 .. $#Names) {
        my @parts = split(/\s+/, $Names[$nameIdx]);
        my $lastIdx = $#parts;
        for my $partIdx (0 .. $lastIdx) {
            my $part = $parts[$partIdx];
            my $key  = "Name" . "$nameIdx" . "part" . "$partIdx";

            # Surface form (capitalization/initial pattern) of this part;
            # nameIdx 0 is the first name in the pattern.
            $FeatureH{$key . "form"} = &HeaderParse::API::NamePatternMatch::RichNameType($part);

            # Position feature: the last and second-to-last parts are special.
            if ($partIdx eq $lastIdx) {
                $FeatureH{$key . "pos"} = "Last";
            } elsif ($partIdx eq $lastIdx - 1) {
                $FeatureH{$key . "pos"} = "SecLast";
            } else {
                $FeatureH{$key . "pos"} = $partIdx;
            }

            # First/last-name dictionary membership (binary features).
            my $lcPart = lc($part);
            if ($firstnameH{$lcPart} && (!$lastnameH{$lcPart})) {
                $FeatureH{$key . "FN"} = 1;
            } elsif ($lastnameH{$lcPart} && (!$firstnameH{$lcPart})) {
                $FeatureH{$key . "LN"} = 1;
            } elsif (! $dictH{$lcPart}) {
                $FeatureH{$key . "NonDict"} = 1;
            }

            # room for more features
        }
    }
    return(\%FeatureH);
}
|
|
1693
|
+
|
|
1694
|
+
|
|
1695
|
+
# IsNumber
#
# Return 1 when the argument is an unsigned decimal literal, 0 otherwise.
# Used below to decide whether a feature weight is numeric or a symbolic
# feature-value string that must be mapped through the dictionary.
#
# NOTE: the original pattern (\.\d+)* also accepts dotted forms such as
# "1.2.3"; kept as-is for backward compatibility with existing callers.
# Signs and exponents are deliberately NOT accepted ("-1" returns 0).
#
# (Fix vs. original: dropped the empty prototype "()" on a sub that takes
# an argument; added an undef guard so the regex match does not warn.)
sub IsNumber {
    my $in = shift;
    return 0 unless defined $in;
    if ($in =~ m/^(\d+)(\.\d+)*$/) {
        return 1;
    }
    else {
        return 0;
    }
}
|
|
1704
|
+
|
|
1705
|
+
|
|
1706
|
+
# FormFeaDict
#
# Build/extend the global feature dictionary from training headers and,
# in parallel, build the name-space feature dictionary used by the
# name-pattern SVM.
#
# Inputs:
#   $DataH        - hash ref: {header#}{line#}{FeaVec}{feature} = weight,
#                   optionally {header#}{line#}{NamePattern}{pattern}{SpaceNameVec}{feature} = weight
#   $FeatureDictH - hash ref holding {feature}{ID,max,DF} plus the scalar
#                   bookkeeping entry {FeatureCounter}
#
# Side effects (this sub mutates both inputs in place):
#   - zero-weight or dictionary-less features are deleted from each FeaVec
#   - every surviving feature gets an ID, its max weight, and a document
#     frequency (DF) count in $FeatureDictH
#   - symbolic (non-numeric) name-pattern weights are replaced by their
#     dictionary IDs in %NameSpaceFeaDictH
#
# Returns: ($DataH, $FeatureDictH, \%NameSpaceFeaDictH)
#
# (Fix vs. original: dropped the misleading empty prototype "()" on a sub
# that takes two arguments. Logic is unchanged.)
sub FormFeaDict {
    my $DataH        = shift;
    my $FeatureDictH = shift;
    my %NameSpaceFeaDictH = ();

    foreach my $HeaNO (sort {$a <=> $b} keys %{$DataH}) {
        foreach my $line (sort {$a <=> $b} keys %{$$DataH{$HeaNO}{$line}} ? () : (), sort {$a <=> $b} keys %{$$DataH{$HeaNO}}) {
            foreach my $fea (keys %{$$DataH{$HeaNO}{$line}{FeaVec}}) {
                if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0) {
                    # Zero weight carries no information: drop the feature.
                    delete($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                    next;
                }
                else {
                    # First sighting of this feature: assign the next ID.
                    if (!$$FeatureDictH{$fea}{ID}) {
                        $$FeatureDictH{FeatureCounter}++;
                        $$FeatureDictH{$fea}{ID} = $$FeatureDictH{FeatureCounter};
                    }
                    # Track the maximum observed weight (used later for
                    # normalization). NOTE: an unset {max} is treated as 0
                    # by the numeric comparison, with a warning.
                    if ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} > $$FeatureDictH{$fea}{max}) {
                        $$FeatureDictH{$fea}{max} = $$DataH{$HeaNO}{$line}{FeaVec}{$fea};
                    }
                    $$FeatureDictH{$fea}{DF}++;
                }
                # Test needs this line! Some basic features defined at
                # initialization (such as pubnumber) could still be 0 here.
                if ((!$$FeatureDictH{$fea}{ID}) || ($$DataH{$HeaNO}{$line}{FeaVec}{$fea} == 0)) {
                    delete($$DataH{$HeaNO}{$line}{FeaVec}{$fea});
                }
            }

            # Form the name-space feature dictionary.
            if (exists $$DataH{$HeaNO}{$line}{NamePattern}) {
                foreach my $CandidateNamePattern (keys %{$$DataH{$HeaNO}{$line}{NamePattern}}) {
                    foreach my $fea (keys %{$$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}}) {
                        my $wt = $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea};
                        if (!$NameSpaceFeaDictH{$fea}{ID}) {
                            $NameSpaceFeaDictH{FeatureCounter}++;
                            $NameSpaceFeaDictH{$fea}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                        }
                        # Symbolic weights (e.g. a token-shape label) are
                        # themselves interned into the dictionary and the
                        # weight is replaced by that ID.
                        if (!&IsNumber($wt)) {
                            if (!exists $NameSpaceFeaDictH{$wt}{ID}) {
                                $NameSpaceFeaDictH{FeatureCounter}++;
                                $NameSpaceFeaDictH{$wt}{ID} = $NameSpaceFeaDictH{FeatureCounter};
                            }
                            $$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea} = $NameSpaceFeaDictH{$wt}{ID};
                        }

                        if ($wt == 0) {
                            delete($$DataH{$HeaNO}{$line}{NamePattern}{$CandidateNamePattern}{SpaceNameVec}{$fea});
                        }
                        else {
                            if ((!exists $NameSpaceFeaDictH{$fea}{max}) || ($NameSpaceFeaDictH{$fea}{max} < $wt)) {
                                $NameSpaceFeaDictH{$fea}{max} = $wt;
                            }
                        }
                    }
                }
            }
            # End of forming the dictionary for the name.
        }
    }
    return ($DataH, $FeatureDictH, \%NameSpaceFeaDictH);
}
|
|
1765
|
+
|
|
1766
|
+
|
|
1767
|
+
# FormTestFeaVec
#
# Filter a test header's feature vectors against the trained feature
# dictionary: any feature that has no dictionary ID, or whose weight is
# 0, is removed in place.
#
# Inputs:
#   $FeatureDictH - hash ref of {feature}{ID} built during training
#   $TestHeaderH  - hash ref of {line#}{FeaVec}{feature} = weight
#
# Returns: $TestHeaderH (the same reference, mutated in place).
#
# NOTE: probing $$FeatureDictH{$fea}{ID} autovivifies an empty entry for
# unknown features in the dictionary — preserved from the original.
#
# (Fix vs. original: dropped the misleading empty prototype "()" on a sub
# that takes two arguments. Logic is unchanged.)
sub FormTestFeaVec {
    my $FeatureDictH = shift;
    my $TestHeaderH  = shift;

    foreach my $line (sort {$a <=> $b} keys %{$TestHeaderH}) {
        foreach my $fea (keys %{$$TestHeaderH{$line}{FeaVec}}) {
            if ((!$$FeatureDictH{$fea}{ID}) || ($$TestHeaderH{$line}{FeaVec}{$fea} == 0)) {
                delete($$TestHeaderH{$line}{FeaVec}{$fea});
            }
        }
    }
    return ($TestHeaderH);
}
|
|
1780
|
+
|
|
1781
|
+
|
|
1782
|
+
# PruneDict
#
# Prune the feature dictionary in place: drop any feature whose maximum
# observed weight is 0 or whose document frequency is below 2, and
# renumber the surviving features with consecutive IDs starting at 1.
# {FeatureCounter} is reset to the number of surviving features.
#
# Input : $FeatureDictH - hash ref of {feature}{ID,max,DF} plus the
#                         scalar bookkeeping entry {FeatureCounter}
# Returns: $FeatureDictH (same reference, mutated in place).
#
# (Fixes vs. original: dropped the empty prototype "()"; the bookkeeping
# key "FeatureCounter" is now excluded BEFORE the sort/loop. The original
# dereferenced its plain-integer value as a hash ref inside both the sort
# comparator and the DF test — a symbolic reference that dies under
# "use strict 'refs'". The final hash is identical: the original deleted
# the entry via the DF<2 branch and re-created it at the end, exactly as
# the last assignment below does.)
sub PruneDict {
    my $FeatureDictH = shift;
    my $Recount = 1;

    foreach my $DictFea (sort { $$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID} }
                         grep { $_ ne "FeatureCounter" } keys %{$FeatureDictH}) {
        if (($$FeatureDictH{$DictFea}{max} == 0) || ($$FeatureDictH{$DictFea}{DF} < 2)) {
            # Useless (all-zero) or too rare: remove it.
            delete($$FeatureDictH{$DictFea});
        }
        else {
            # Re-assign compact, consecutive IDs in original ID order.
            $$FeatureDictH{$DictFea}{ID} = $Recount;
            $Recount++;
        }
    }

    $$FeatureDictH{FeatureCounter} = $Recount - 1;

    return ($FeatureDictH);
}
|
|
1799
|
+
|
|
1800
|
+
#input is an array of name patterns
|
|
1801
|
+
#return a string of the best name pattern
|
|
1802
|
+
# PredictBestNamePattern
#
# Given an array of candidate name patterns, score each one with the
# external SVM classifier and return the best-scoring pattern as a
# string of names joined by "<>" (trailing "<>" stripped).
#
# Inputs:
#   $PredictedNames    - array ref of array refs of name strings
#   $SVMNameSpaceModel - path to the trained SVM model file
#   $SpaceNameDictH    - name-space feature dictionary ({fea}{ID,max})
#   $tmpCacheVec       - scratch file path for the feature vector
#   $SVMTmpResult      - scratch file path for the classifier output
#
# Returns: the best name-pattern string ("" when nothing beats the
# initial score of -10). Both scratch files are unlinked before return.
#
# NOTE(review): $Classifier is a package/global path to the external SVM
# binary set elsewhere in this module; &Analyze parses its output file.
#
# (Fixes vs. original: dropped the empty prototype "()"; replaced the
# 2-arg bareword open with a 3-arg lexical-filehandle open — same file,
# same mode, same die message.)
sub PredictBestNamePattern {
    my $PredictedNames    = shift;
    my $SVMNameSpaceModel = shift;
    my $SpaceNameDictH    = shift;
    my $tmpCacheVec       = shift;
    my $SVMTmpResult      = shift;

    my $MaxVal          = -10;
    my $BestNamePattern = "";

    for my $i (0 .. $#$PredictedNames) {
        # Re-join the candidate's tokens into the "<>"-separated form.
        my $candidateName = "";
        for my $j (0 .. $#{$$PredictedNames[$i]}) {
            if ($$PredictedNames[$i][$j]) {
                $candidateName .= "$$PredictedNames[$i][$j]<>";
            }
        }

        my ($RawNameFeaVec) = &SpaceNameLnFeaRepre_unit($candidateName);

        # Filter out the non-dictionary features and build the SVM input
        # line ("id:weight ..."), normalizing each weight by its max.
        my $SpaceNameVec        = "";
        my $SpaceNameTextFeaVec = "";
        foreach my $fea (sort { $$SpaceNameDictH{$a}{ID} <=> $$SpaceNameDictH{$b}{ID} } keys %{$RawNameFeaVec}) {
            my $wt = $$RawNameFeaVec{$fea};
            # Symbolic weights are mapped to their dictionary IDs; unknown
            # symbols drop the feature entirely.
            if (!&IsNumber($wt)) {
                if (exists $$SpaceNameDictH{$wt}{ID}) {
                    $$RawNameFeaVec{$fea} = $$SpaceNameDictH{$wt}{ID};
                }
                else {
                    delete($$RawNameFeaVec{$fea});
                }
            }

            if (!(($$RawNameFeaVec{$fea} > 0) && $$SpaceNameDictH{$fea}{ID})) {
                delete($$RawNameFeaVec{$fea});
            }
            else {
                $$RawNameFeaVec{$fea} = sprintf("%.8f", $$RawNameFeaVec{$fea} / $$SpaceNameDictH{$fea}{max});
                $SpaceNameVec        .= "$$SpaceNameDictH{$fea}{ID}\:$$RawNameFeaVec{$fea} ";
                $SpaceNameTextFeaVec .= "$fea\:$$RawNameFeaVec{$fea} ";
            }
        }

        open(my $vec_fh, '>', $tmpCacheVec) || die "SVMHeaderParse: could not open $tmpCacheVec to write: $!";
        # print "NamePattern FeatureVec is\: $SpaceNameTextVec\n";
        print {$vec_fh} "$SpaceNameVec";
        close($vec_fh);

        # NOTE(review): backticks interpolate file paths into a shell
        # command; paths are assumed to be trusted, locally generated
        # temp-file names — do not pass user-controlled paths here.
        `$Classifier -v 0 $tmpCacheVec $SVMNameSpaceModel $SVMTmpResult`;
        my $result = &Analyze($SVMTmpResult);
        if ($result > $MaxVal) {
            $MaxVal          = $result;
            $BestNamePattern = $candidateName;
        }
    }

    unlink $tmpCacheVec;
    unlink $SVMTmpResult;

    # Split the multiple names in order.
    $BestNamePattern =~ s/\<\>$//g;    # remove the trailing <>

    return ($BestNamePattern);
}
|
|
1863
|
+
|
|
1864
|
+
|
|
1865
|
+
# WordCount
#
# Count the whitespace-separated tokens in a string, skipping tokens
# that consist entirely of punctuation (non-word characters).
#
# Input : a scalar string.
# Returns: the number of non-punctuation tokens (0 for an empty or
#          all-whitespace string).
#
# (Fixes vs. original — which was marked "didn't try", borrowed from
# AddrMatch in Function.pm: the loop tested $words[0] on EVERY iteration
# instead of $words[$i], so the result was either 0 or the total token
# count depending only on the first token. Now each token is tested.
# Also dropped the empty prototype "()" on a sub that takes an argument.)
sub WordCount {
    my $inStr = shift;
    $inStr =~ s/^\s+//g;
    $inStr =~ s/\s+$//g;

    my $senLen = 0;
    my @words = split(/\s+/, $inStr);
    for my $i (0 .. $#words) {
        if ($words[$i] !~ /^\W+\s*$/) {    # skip pure-punctuation tokens
            $senLen++;
        }
    }
    return ($senLen);
}
|
|
1879
|
+
|
|
1880
|
+
1;
|