biblicit 1.0 → 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitmodules +3 -0
- data/Gemfile +1 -1
- data/README.md +125 -30
- data/Rakefile +22 -0
- data/biblicit.gemspec +9 -7
- data/lib/biblicit/cb2bib.rb +10 -11
- data/lib/biblicit/citeseer.rb +14 -26
- data/lib/biblicit/extractor.rb +40 -19
- data/lib/biblicit/parscit.rb +38 -0
- data/parscit/.gitignore +8 -0
- data/parscit/CHANGELOG +125 -0
- data/parscit/COPYING +674 -0
- data/parscit/COPYING.LESSER +165 -0
- data/parscit/INSTALL +105 -0
- data/parscit/README +97 -0
- data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
- data/parscit/bin/archtest.pl +31 -0
- data/parscit/bin/citeExtract.pl +562 -0
- data/parscit/bin/conlleval.pl +315 -0
- data/parscit/bin/headExtract.pl +40 -0
- data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
- data/parscit/bin/parsHed/keywordGen.pl +308 -0
- data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
- data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
- data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
- data/parscit/bin/parseRefStrings.pl +102 -0
- data/parscit/bin/phOutput2xml.pl +223 -0
- data/parscit/bin/redo.parsCit.pl +105 -0
- data/parscit/bin/sectExtract.pl +149 -0
- data/parscit/bin/sectLabel/README +110 -0
- data/parscit/bin/sectLabel/README.txt +110 -0
- data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
- data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
- data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
- data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
- data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
- data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
- data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
- data/parscit/bin/sectLabel/single2multi.pl +190 -0
- data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
- data/parscit/bin/tr2crfpp.pl +260 -0
- data/parscit/bin/xml2train.pl +193 -0
- data/parscit/lib/CSXUtil/SafeText.pm +130 -0
- data/parscit/lib/Omni/Config.pm +93 -0
- data/parscit/lib/Omni/Omnicell.pm +263 -0
- data/parscit/lib/Omni/Omnicol.pm +292 -0
- data/parscit/lib/Omni/Omnidd.pm +328 -0
- data/parscit/lib/Omni/Omnidoc.pm +153 -0
- data/parscit/lib/Omni/Omniframe.pm +223 -0
- data/parscit/lib/Omni/Omniline.pm +423 -0
- data/parscit/lib/Omni/Omnipage.pm +282 -0
- data/parscit/lib/Omni/Omnipara.pm +232 -0
- data/parscit/lib/Omni/Omnirun.pm +303 -0
- data/parscit/lib/Omni/Omnitable.pm +336 -0
- data/parscit/lib/Omni/Omniword.pm +162 -0
- data/parscit/lib/Omni/Traversal.pm +313 -0
- data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
- data/parscit/lib/ParsCit/Citation.pm +737 -0
- data/parscit/lib/ParsCit/CitationContext.pm +220 -0
- data/parscit/lib/ParsCit/Config.pm +35 -0
- data/parscit/lib/ParsCit/Controller.pm +653 -0
- data/parscit/lib/ParsCit/PostProcess.pm +505 -0
- data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
- data/parscit/lib/ParsHed/Config.pm +49 -0
- data/parscit/lib/ParsHed/Controller.pm +143 -0
- data/parscit/lib/ParsHed/PostProcess.pm +322 -0
- data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
- data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
- data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
- data/parscit/lib/SectLabel/Config.pm +88 -0
- data/parscit/lib/SectLabel/Controller.pm +332 -0
- data/parscit/lib/SectLabel/PostProcess.pm +425 -0
- data/parscit/lib/SectLabel/PreProcess.pm +116 -0
- data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
- data/parscit/resources/parsCit.model +0 -0
- data/parscit/resources/parsCit.split.model +0 -0
- data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
- data/parscit/resources/parsHed/bigram +10 -0
- data/parscit/resources/parsHed/keywords +10 -0
- data/parscit/resources/parsHed/parsHed.model +0 -0
- data/parscit/resources/parsHed/parsHed.template +178 -0
- data/parscit/resources/sectLabel/affiliation.model +0 -0
- data/parscit/resources/sectLabel/author.model +0 -0
- data/parscit/resources/sectLabel/funcWord +320 -0
- data/parscit/resources/sectLabel/genericSect.model +0 -0
- data/parscit/resources/sectLabel/sectLabel.config +42 -0
- data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
- data/parscit/resources/sectLabel/sectLabel.model +0 -0
- data/sh/convert_to_text.sh +20 -0
- data/spec/biblicit/extractor_spec.rb +121 -0
- data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
- data/spec/fixtures/critical-infrastructures.ps +63951 -0
- data/spec/fixtures/txt/E06-1050.txt +867 -0
- data/spec/fixtures/txt/sample1.txt +902 -0
- data/spec/fixtures/txt/sample2.txt +394 -0
- data/spec/spec_helper.rb +3 -0
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
- data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
- data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
- data/svm-header-parse/extract.pl +75 -0
- metadata +351 -317
- data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
- data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
- data/perl/FileConversionService/README.TXT +0 -11
- data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
- data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
- data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
- data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
- data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
- data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
- data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
- data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
- data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
- data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
- data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
- data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
- data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
- data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
- data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
- data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
- data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
- data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
- data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
- data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
- data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
- data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
- data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
- data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
- data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
- data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
- data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
- data/perl/ParsCit/resources/parsCit.model +0 -0
- data/perl/extract.pl +0 -199
- data/spec/biblicit/cb2bib_spec.rb +0 -48
- data/spec/biblicit/citeseer_spec.rb +0 -40
- /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
- /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
- /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,964 @@
|
|
1
|
+
#!/usr/bin/perl
|
2
|
+
|
3
|
+
# Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
|
4
|
+
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
5
|
+
|
6
|
+
require 5.0;
|
7
|
+
use strict;
|
8
|
+
|
9
|
+
# Dependencies
|
10
|
+
use FindBin;
|
11
|
+
use Getopt::Long;
|
12
|
+
use HTML::Entities;
|
13
|
+
|
14
|
+
# I do not know a better solution to find a lib path in -T mode.
|
15
|
+
# So if you know a better solution, I'd be glad to hear.
|
16
|
+
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
17
|
+
|
18
|
+
# To get correct path in case 2 scripts in different directories use FindBin
|
19
|
+
FindBin::again();
|
20
|
+
my $path = undef;
|
21
|
+
BEGIN
|
22
|
+
{
|
23
|
+
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
24
|
+
}
|
25
|
+
use lib "$path/../../lib";
|
26
|
+
|
27
|
+
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
|
28
|
+
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
|
29
|
+
|
30
|
+
# Local libraries
|
31
|
+
use Omni::Config;
|
32
|
+
use Omni::Omnidoc;
|
33
|
+
use SectLabel::PreProcess;
|
34
|
+
|
35
|
+
# Omnilib configuration: the object names that get_name() results are
# compared against further down in this script
my $obj_list = $Omni::Config::obj_list;

### USER customizable section
# Program name: the last path component of the script path
my ($progname) = ($0 =~ /([^\/]+)$/);
my $version = "1.0";
### END user customizable section
|
42
|
+
|
43
|
+
# Print the copyright banner to STDERR.
sub License
{
    my $notice = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $notice;
}
|
47
|
+
|
48
|
+
# Print the usage message to STDERR.
sub Help
{
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-decode] [-log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n";
}
|
57
|
+
|
58
|
+
# Command line options and switches
my $quite     = 0;      # -q: do not print the license banner
my $help      = 0;      # -h: print the usage message and exit
my $out_file  = undef;  # -out: output file (required)
my $in_file   = undef;  # -in: input XML file (required)
my $is_decode = 0;      # -decode: decode HTML entities before writing
my $is_debug  = 0;      # -log
my $address   = 1;      # when 1, line addresses are saved to "<out>.address"

my $options_ok = GetOptions( 'in=s'   => \$in_file,
                             'out=s'  => \$out_file,
                             'decode' => \$is_decode,
                             'log'    => \$is_debug,
                             'h'      => \$help,
                             'q'      => \$quite );

if ($help || ! $options_ok || ! defined $in_file || ! defined $out_file)
{
    Help();
    # Asking for help is a success; an invalid or incomplete command line
    # is reported to the caller through a non-zero exit status
    exit(($help && $options_ok) ? 0 : 1);
}

if (!$quite)
{
    License();
}

### Untaint ###
# Both paths come from the command line, UntaintPath dies on unexpected characters
$in_file = UntaintPath($in_file);
$out_file = UntaintPath($out_file);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
|
89
|
+
|
90
|
+
# Mark page, para, line, word
my %g_page_hash;

# All the arrays below are filled line by line by the Process* subs,
# one entry per element of @lines

# "yes" for a line that opens a paragraph (or a table), "no" otherwise
my @g_para;

# XML features
# Location feature: vertical mid-point recorded for each line,
# together with the smallest and largest value seen so far
my @g_pos_hash;
my $g_maxpos = 0;
my $g_minpos = 1000000;
# Align feature
my @g_align;
# Bold feature ("yes" / "no")
my @g_bold;
# Italic feature ("yes" / "no")
my @g_italic;
# Pic feature ("yes" / "no")
my @g_pic;
# Table feature ("yes" / "no")
my @g_table;
# Bullet feature ("yes" / "no")
my @g_bullet;
# Font size feature: per-line size (-1 when unknown) and the number of
# lines counted for each size
my %g_font_size_hash;
my @g_font_size;
# Font face feature: per-line face ("none" when unknown) and the number
# of lines counted for each face
my %g_font_face_hash;
my @g_font_face;

# All lines
my @lines;
# and their address (hashes with the keys L1, L2, L3 and L4)
my @lines_addr;
|
124
|
+
|
125
|
+
# BEGIN
# Collect every line of the document together with its XML features
ProcessFile($in_file);
# Find header part
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
# Write the lines and their features
Output(\@lines, $out_file);

if ($address == 1)
{
    # Save the line address for further use, one "L1 L2 L3 L4" record per line
    my $address_file = $out_file . ".address";
    open(my $address_handle, ">:utf8", $address_file) || die "#Can't open file \"$address_file\": $!\n";
    foreach my $addr (@lines_addr)
    {
        print $address_handle join(" ", @{ $addr }{ qw(L1 L2 L3 L4) }), "\n";
    }
    # Buffered write errors only show up when the handle is closed
    close($address_handle) || die "#Can't close file \"$address_file\": $!\n";
}
# END
|
146
|
+
|
147
|
+
# Read the Omnipage XML file, repair it into a single well-formed document and
# walk its object tree, handing every paragraph, table and frame to the
# matching Process* sub.
#
# $in_file: path of the XML file to read. Dies if it cannot be opened.
sub ProcessFile
{
    my ($in_file) = @_;

    open(my $input_handle, "<:utf8", $in_file) || die "Could not open xml file " . $in_file . ": $!";
    # Slurp the whole file
    my $xml = do { local $/; <$input_handle> };
    close $input_handle;

    ###
    # Huydhn
    # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
    # This merged xml needs to be fixed first before passing it to xml processing libraries, e.g. xml::twig
    ###
    # Convert to Unix format
    $xml =~ s/\r//g;
    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $xml =~ s/<\?xml.+?>\n//g;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $xml =~ s/<\!\-\-XML.+?>\n//g;
    # Declaration and root
    $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

    # New document
    my $doc = Omni::Omnidoc->new();
    $doc->set_raw($xml);

    # Current position in the tree: L1, L2 and L3 are set here, L4 (the line)
    # is filled in by the Process* subs
    my %current = ();

    # Tree traveling is 'not' fun. Seriously.
    # This is like a dungeon siege.
    my $pages = $doc->get_objs_ref();
    for my $x (0 .. $#{ $pages })
    {
        $current{ 'L1' } = $x;

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();
        for my $y (0 .. $#{ $level_2 })
        {
            # Thang's code: a <dd> tag is considered as an image area
            my $is_pic = ($level_2->[ $y ]->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

            $current{ 'L2' } = $y;

            # Table, paragraph or frame
            my $level_3 = $level_2->[ $y ]->get_objs_ref();
            for my $z (0 .. $#{ $level_3 })
            {
                $current{ 'L3' } = $z;

                my $obj  = $level_3->[ $z ];
                my $name = $obj->get_name();

                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    ProcessPara($obj, $is_pic, \%current);
                }
                elsif ($name eq $obj_list->{ 'OMNITABLE' })
                {
                    # A table at this level starts its line count at 0
                    ProcessTable($obj, $is_pic, \%current, 0);
                }
                elsif ($name eq $obj_list->{ 'OMNIFRAME' })
                {
                    # Frame contains multiple paragraph ?
                    ProcessFrame($obj, $is_pic, \%current);
                }
            }
        }
    }
}
|
250
|
+
|
251
|
+
# Write every line followed by its XML features to the output file.
#
# $lines:    reference to the array of lines; the lines are trimmed in place
# $out_file: path of the file to write. Dies if it cannot be opened or closed.
#
# The per-line features are read from the global @g_* arrays, indexed by the
# position of the line in $lines.
sub Output
{
    my ($lines, $out_file) = @_;

    # This is the output
    open(my $output_handle, ">:utf8", $out_file) || die "#Can't open file \"$out_file\": $!\n";

    # XML feature label
    my %g_font_size_labels = ();
    GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

    # Text of the paragraph being assembled
    my $output = "";

    # Write the pending paragraph, if any, and start a new one
    my $flush = sub
    {
        return if ($output eq "");
        if ($is_decode) { $output = decode_entities($output); }
        print $output_handle $output;
        $output = "";
    };

    # This is the index of the line
    my $id = 0;
    # For each line in the whole document
    foreach my $line (@{ $lines })
    {
        # Trim the line
        $line =~ s/^\s+|\s+$//g;

        # New paragraph
        if ($g_para[ $id ] eq "yes") { $flush->(); }

        $output .= $line;

        # XML location feature: the position scaled by 8 over the observed range
        my $loc_feature = "";
        if ($g_pos_hash[ $id ] != (-1)) { $loc_feature = "xmlLoc_".int(($g_pos_hash[$id] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1)); }

        # Font_size feature
        my $font_size_feature = undef;
        if (($g_font_size[$id] eq "") || ($g_font_size[$id] == -1))
        {
            $font_size_feature = "xmlFontSize_none";
        }
        else
        {
            $font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
        }

        # Bold feature
        my $bold_feature = "xmlBold_" . $g_bold[ $id ];
        # Italic feature
        my $italic_feature = "xmlItalic_" . $g_italic[ $id ];
        # Image feature
        my $pic_feature = "xmlPic_" . $g_pic[ $id ];
        # Table feature
        my $table_feature = "xmlTable_" . $g_table[ $id ];
        # Bullet feature
        my $bullet_feature = "xmlBullet_" . $g_bullet[ $id ];
        # Differential features: only the last two of the seven are written out
        my ($font_sfbia_diff, $para_diff) = (GetDifferentialFeatures($id))[ 5, 6 ];

        # Each line and its XML features
        $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";

        # Update line index
        $id++;
    }

    # Last paragraph
    $flush->();

    # Done; buffered write errors only show up when the handle is closed
    close($output_handle) || die "#Can't close file \"$out_file\": $!\n";
}
|
338
|
+
|
339
|
+
# Compare line $id with the line before it and describe what changed.
#
# Returns the list ($align_diff, $font_size_diff, $font_face_diff,
# $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff). The values
# end with "continue" when the property is unchanged; otherwise the alignment
# feature ends with the new alignment and the font features end with "new".
# The first line of the document never continues anything.
sub GetDifferentialFeatures
{
    my ($id) = @_;

    # The very first line has no predecessor to compare with
    my $has_prev = ($id != 0);

    # Property by property comparison with the previous line
    my $same_align  = $has_prev && ($g_align[ $id ] eq $g_align[ $id - 1 ]);
    my $same_face   = $has_prev && ($g_font_face[ $id ] eq $g_font_face[ $id - 1 ]);
    my $same_size   = $has_prev && ($g_font_size[ $id ] == $g_font_size[ $id - 1 ]);
    my $same_bold   = $has_prev && ($g_bold[ $id ] eq $g_bold[ $id - 1 ]);
    my $same_italic = $has_prev && ($g_italic[ $id ] eq $g_italic[ $id - 1 ]);

    # Combinations: Size+Face, +Bold+Italic, +Align
    my $same_sf    = $same_size && $same_face;
    my $same_sfbi  = $same_sf && $same_bold && $same_italic;
    my $same_sfbia = $same_sfbi && $same_align;

    # AlignChange feature, the only one that carries the new value
    my $align_diff = "bi_xmlA_" . ($same_align ? "continue" : $g_align[ $id ]);
    # FontFaceChange feature
    my $font_face_diff = "bi_xmlF_" . ($same_face ? "continue" : "new");
    # FontSizeChange feature
    my $font_size_diff = "bi_xmlS_" . ($same_size ? "continue" : "new");
    # FontSFChange feature
    my $font_sf_diff = "bi_xmlSF_" . ($same_sf ? "continue" : "new");
    # FontSFBIChange feature
    my $font_sfbi_diff = "bi_xmlSFBI_" . ($same_sfbi ? "continue" : "new");
    # FontSFBIAChange feature
    my $font_sfbia_diff = "bi_xmlSFBIA_" . ($same_sfbia ? "continue" : "new");

    # ParaChange feature
    my $para_diff = "bi_xmlPara_";
    if ($id < $body_start_id)
    {
        # Header part, consider each line as a separate paragraph
        $para_diff .= "header";
    }
    elsif ($g_para[ $id ] eq "yes")
    {
        $para_diff .= "new";
    }
    else
    {
        $para_diff .= "continue";
    }

    return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
|
454
|
+
|
455
|
+
# Give every font size a label relative to the most frequent ("common") size.
#
# $g_font_size_hash:   hash reference, font size => number of occurrences
# $g_font_size_labels: hash reference filled by this sub, font size => label
#
# Sizes below the common one are labelled "smaller". Sizes above it are
# labelled "larger", except the three largest sizes of the document which get
# "largest-2", "largest-1" and "largest0" (the biggest one). When several
# sizes are equally frequent, the largest of them is the common one, so the
# result does not depend on the hash ordering.
sub GetFontSizeLabels
{
    my ($g_font_size_hash, $g_font_size_labels) = @_;

    # Nothing to label when no font size has been recorded
    return if (! %{ $g_font_size_hash });

    # Sort by key in ascending order
    my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };

    # Most frequent size first, the larger size wins a tie
    my ($common_size) = sort { $g_font_size_hash->{ $b } <=> $g_font_size_hash->{ $a } || $b <=> $a } @sorted_fonts;

    # Index of common font size
    my $common_index = 0;
    foreach my $size (@sorted_fonts)
    {
        last if ($common_size == $size);
        $common_index++;
    }

    # Small fonts
    for my $i (0 .. $common_index - 1)
    {
        $g_font_size_labels->{ $sorted_fonts[ $i ] } = "smaller";
    }

    # Common fonts
    $g_font_size_labels->{ $common_size } = "common";

    # Large fonts
    my $total = scalar(@sorted_fonts);
    for my $i ($common_index + 1 .. $total - 1)
    {
        if (($total - $i) <= 3)
        {
            # The offset from the end of the list gives 0, -1 or -2
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "largest" . ($i + 1 - $total);
        }
        else
        {
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "larger";
        }
    }

    return;
}
|
498
|
+
|
499
|
+
# Record the lines of every paragraph and table found in a frame.
#
# $omniframe: the frame object
# $is_pic:    true when the frame lies in an image area
# $line_addr: reference to the current address hash; its L4 entry is set to
#             the index of the line within the whole frame and a copy of the
#             hash is saved for each line
sub ProcessFrame
{
    my ($omniframe, $is_pic, $line_addr) = @_;

    # Line index in the whole frame, shared by its paragraphs and tables
    my $lindex = 0;

    # For each paragraph or table in the frame
    foreach my $obj (@{ $omniframe->get_objs_ref() })
    {
        if ($obj->get_name() eq $obj_list->{ 'OMNIPARA' })
        {
            # Paragraph attributes
            my $align = $obj->get_alignment();
            my $space = $obj->get_space_before();

            # For each line in the paragraph
            my $para_lines = $obj->get_objs_ref();
            for my $t (0 .. $#{ $para_lines })
            {
                my $omniline = $para_lines->[ $t ];

                # Save the line and its address, then point to the next line
                push @lines, $omniline->get_content();
                $line_addr->{ 'L4' } = $lindex;
                push @lines_addr, { %{ $line_addr } };
                $lindex++;

                # Line attributes
                my $left   = $omniline->get_left_pos();
                my $right  = $omniline->get_right_pos();
                my $top    = $omniline->get_top_pos();
                my $bottom = $omniline->get_bottom_pos();

                # Per line counters, all of them weighted by the number of words
                my $words_count  = 0;
                my $bold_count   = 0;
                my $italic_count = 0;
                my %font_size_hash = ();
                my %font_face_hash = ();

                foreach my $run (@{ $omniline->get_objs_ref() })
                {
                    # Thang's compatible code (instead of using get_objs_ref):
                    # trim the run content and split it into words
                    my $rcontent = $run->get_content();
                    $rcontent =~ s/^\s+|\s+$//g;
                    my @words = split(/\s+/, $rcontent);
                    my $run_words = scalar(@words);

                    $words_count += $run_words;

                    # XML format
                    my $font_size = $run->get_font_size();
                    $font_size_hash{ $font_size } += $run_words;
                    my $font_face = $run->get_font_face();
                    $font_face_hash{ $font_face } += $run_words;
                    if ($run->get_bold() eq "true") { $bold_count += $run_words; }
                    if ($run->get_italic() eq "true") { $italic_count += $run_words; }
                }

                # Line attributes - the first line of the paragraph opens it
                push @g_para, (($t == 0) ? "yes" : "no");

                # Line attributes - line position, compared to the global range
                my $pos = ($top + $bottom) / 2.0;
                if ($pos < $g_minpos) { $g_minpos = $pos; }
                if ($pos > $g_maxpos) { $g_maxpos = $pos; }
                push @g_pos_hash, $pos;

                # Alignment and table features
                push @g_align, $align;
                push @g_table, "no";

                if (! $is_pic)
                {
                    push @g_pic, "no";
                    UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
                    UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
                }
                else
                {
                    # Not assign value if line is in image area
                    push @g_pic, "yes";
                    push @g_bold, "no";
                    push @g_italic, "no";
                    push @g_bullet, "no";
                    push @g_font_size, -1;
                    push @g_font_face, "none";
                }
            }
        }
        elsif ($obj->get_name() eq $obj_list->{ 'OMNITABLE' })
        {
            # The table continues the line count of the frame
            $lindex = ProcessTable($obj, $is_pic, $line_addr, $lindex);
        }
    }
}
|
628
|
+
|
629
|
+
# Record every line of a table. No font information is kept for table lines.
#
# $omnitable: the table object
# $is_pic:    true when the table lies in an image area
# $line_addr: reference to the current address hash; its L4 entry is set for
#             each line and a copy of the hash is saved
# $lindex:    index given to the first line of the table
#
# Returns the index that follows the last line of the table.
sub ProcessTable
{
    my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

    # Table attributes
    my $left   = $omnitable->get_left_pos();
    my $right  = $omnitable->get_right_pos();
    my $top    = $omnitable->get_top_pos();
    my $bottom = $omnitable->get_bottom_pos();
    my $align  = $omnitable->get_alignment();

    # Thang's code: every line of the table shares the position of the table
    my $pos = ($top + $bottom) / 2.0;
    # Set new min and max position
    if ($pos < $g_minpos) { $g_minpos = $pos; }
    if ($pos > $g_maxpos) { $g_maxpos = $pos; }

    # Same value for every line of the table
    my $pic_flag = $is_pic ? "yes" : "no";

    # For each row in the table
    my $rows = $omnitable->get_row_content();
    for my $i (0 .. $#{ $rows })
    {
        # For each line in the row
        my @row_lines = split(/\n/, $rows->[ $i ]);
        for my $j (0 .. $#row_lines)
        {
            # Save the line and its address, then point to the next line
            push @lines, $row_lines[ $j ];
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Only the first line of the first row opens a paragraph
            push @g_para, ((($i == 0) && ($j == 0)) ? "yes" : "no");

            # Table and pic features
            push @g_table, "yes";
            push @g_pic, $pic_flag;

            # Position and alignment of the table
            push @g_pos_hash, $pos;
            push @g_align, $align;

            # Font size, font face, bold, italic and bullet are not set
            push @g_font_size, -1;
            push @g_font_face, "none";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
        }
    }

    # The caller may continue its line count from here
    return $lindex;
}
|
709
|
+
|
710
|
+
# Record the non-blank lines of a paragraph together with their XML features.
#
# $paragraph: the paragraph object
# $is_pic:    true when the paragraph lies in an image area
# $line_addr: reference to the current address hash; its L4 entry is set to
#             the index of the line inside the paragraph (blank lines are
#             counted) and a copy of the hash is saved for each recorded line
#
# The first recorded line opens the paragraph, even when blank lines
# precede it and have been skipped.
sub ProcessPara
{
    my ($paragraph, $is_pic, $line_addr) = @_;

    # Paragraph attributes
    my $align = $paragraph->get_alignment();
    my $space = $paragraph->get_space_before();

    # True until the first non-blank line has been recorded
    my $is_first_line = 1;

    # Lines
    my $omnilines = $paragraph->get_objs_ref();
    for my $t (0 .. $#{ $omnilines })
    {
        my $omniline = $omnilines->[ $t ];

        # Skip blank line; the check is made on a trimmed copy because the
        # line is saved as it is
        my $content = $omniline->get_content();
        my $trimmed = $content;
        $trimmed =~ s/^\s+|\s+$//g;
        if ($trimmed eq "") { next; }

        # Save the line
        push @lines, $content;
        # Save the line's address
        $line_addr->{ 'L4' } = $t;
        push @lines_addr, { %{ $line_addr } };

        # Line attributes
        my $top    = $omniline->get_top_pos();
        my $bottom = $omniline->get_bottom_pos();

        # Per line counters, all of them weighted by the number of words
        my $words_count  = 0;
        my $bold_count   = 0;
        my $italic_count = 0;
        my %font_size_hash = ();
        my %font_face_hash = ();

        # Runs
        foreach my $run (@{ $omniline->get_objs_ref() })
        {
            # Thang's compatible code (instead of using get_objs_ref):
            # trim the run content and split it into words
            my $rcontent = $run->get_content();
            $rcontent =~ s/^\s+|\s+$//g;
            my @words = split(/\s+/, $rcontent);
            my $run_words = scalar(@words);

            # Update the number of words
            $words_count += $run_words;

            # XML format
            my $font_size = $run->get_font_size();
            $font_size_hash{ $font_size } += $run_words;
            my $font_face = $run->get_font_face();
            $font_face_hash{ $font_face } += $run_words;
            if ($run->get_bold() eq "true") { $bold_count += $run_words; }
            if ($run->get_italic() eq "true") { $italic_count += $run_words; }
        }

        # Line attributes - relative position in paragraph
        push @g_para, ($is_first_line ? "yes" : "no");
        $is_first_line = 0;

        # Line attributes - line position
        my $pos = ($top + $bottom) / 2.0;
        # Compare to global min and max position
        if ($pos < $g_minpos) { $g_minpos = $pos; }
        if ($pos > $g_maxpos) { $g_maxpos = $pos; }
        # Pos feature
        push @g_pos_hash, $pos;
        # Alignment feature
        push @g_align, $align;
        # Table feature
        push @g_table, "no";

        if ($is_pic)
        {
            push @g_pic, "yes";
            # Not assign value if line is in image area
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
            push @g_font_size, -1;
            push @g_font_face, "none";
        }
        else
        {
            push @g_pic, "no";
            UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
            UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
        }
    }
}
|
832
|
+
|
833
|
+
# Record the dominant font size and font face of one line.
#
# $font_size_hash: hash reference, font size => number of words in the line
# $font_face_hash: hash reference, font face => number of words in the line
#
# One entry is appended to @g_font_size and to @g_font_face (-1 and "none"
# when the matching hash is empty) and the document wide counters
# %g_font_size_hash and %g_font_face_hash are updated. Ties are broken the
# same way on every run: the largest size wins, and so does the font face
# that sorts first.
sub UpdateXMLFontFeature
{
    my ($font_size_hash, $font_face_hash) = @_;

    # Font size feature
    if (! %{ $font_size_hash })
    {
        push @g_font_size, -1;
    }
    else
    {
        # Most used size first; if several font sizes are equal in number,
        # get the largest one
        my ($font_size) = sort { $font_size_hash->{ $b } <=> $font_size_hash->{ $a } || $b <=> $a } keys %{ $font_size_hash };

        # A missing size is counted as size 0
        if ($font_size eq "") { $font_size = 0; }

        push @g_font_size, $font_size;
        $g_font_size_hash{ $font_size }++;
    }

    # Font face feature
    if (! %{ $font_face_hash })
    {
        push @g_font_face, "none";
    }
    else
    {
        # Most used face first, the name decides between equally used faces
        my ($font_face) = sort { $font_face_hash->{ $b } <=> $font_face_hash->{ $a } || $a cmp $b } keys %{ $font_face_hash };

        push @g_font_face, $font_face;
        $g_font_face_hash{ $font_face }++;
    }
}
|
878
|
+
|
879
|
+
# Append the bold, italic and bullet features of one line to @g_bold,
# @g_italic and @g_bullet.
#
# $bold_count, $italic_count: number of bold and italic words in the line
# $words_count:               total number of words in the line
# $is_bullet:                 "true" when the line is a bullet, may be undefined
# $space:                     accepted but not used
sub UpdateXMLFeatures
{
    my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

    # A line without any word is neither bold nor italic
    my $has_words = ($words_count != 0);

    # Bold feature: set when the share of bold words reaches 0.667
    my $mostly_bold = $has_words && ($bold_count / $words_count >= 0.667);
    push @g_bold, ($mostly_bold ? "yes" : "no");

    # Italic feature: same rule with the italic words
    my $mostly_italic = $has_words && ($italic_count / $words_count >= 0.667);
    push @g_italic, ($mostly_italic ? "yes" : "no");

    # Bullet feature
    my $bulleted = (defined $is_bullet) && ($is_bullet eq "true");
    push @g_bullet, ($bulleted ? "yes" : "no");
}
|
917
|
+
|
918
|
+
# Check a path against the characters allowed in it and return the checked
# copy. Only word characters, "-", "_", "." and "/" are accepted; the sub
# dies with a "Bad path" message for anything else.
sub UntaintPath
{
    my ($path) = @_;

    # The value is taken from the capture group of the match
    my ($clean) = ($path =~ /^([-_\/\w\.]*)$/);
    if (! defined $clean)
    {
        die "Bad path \"$path\"\n";
    }

    return $clean;
}
|
933
|
+
|
934
|
+
# Check a string against the characters allowed in it and return the checked
# copy. Word characters, spaces and - @ ( ) , . / are accepted; the sub dies
# with a "Bad data" message for anything else, the empty string included.
sub Untaint
{
    my ($s) = @_;

    # The value is taken from the capture group of the match
    my ($clean) = ($s =~ /^([\w \-\@\(\),\.\/]+)$/);
    if (! defined $clean)
    {
        die "Bad data in $s";       # log this somewhere
    }

    return $clean;
}
|
948
|
+
|
949
|
+
# Run a command with system() and return its status.
# The command is checked by Untaint first, which dies on unexpected characters.
sub Execute
{
    my ($cmd) = @_;

    return system(Untaint($cmd));
}
|
955
|
+
|
956
|
+
# Build a name for a temporary file from the local date and time and the
# process id, e.g. "20110318-142501-4242". Only the name is returned, no file
# is created.
sub NewTmpFile
{
    # The core POSIX module formats the time stamp, so no external "date"
    # command has to be run
    require POSIX;

    return POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
}
|
962
|
+
|
963
|
+
|
964
|
+
|