biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,964 @@
1
+ #!/usr/bin/perl
2
+
3
+ # Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+
9
+ # Dependencies
10
+ use FindBin;
11
+ use Getopt::Long;
12
+ use HTML::Entities;
13
+
14
+ # I do not know a better solution to find a lib path in -T mode.
15
+ # So if you know a better solution, I'd be glad to hear.
16
+ # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
17
+
18
+ # To get correct path in case 2 scripts in different directories use FindBin
19
+ FindBin::again();
20
+ my $path = undef;
21
+ BEGIN
22
+ {
23
+ if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
24
+ }
25
+ use lib "$path/../../lib";
26
+
27
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
+
30
+ # Local libraries
31
+ use Omni::Config;
32
+ use Omni::Omnidoc;
33
+ use SectLabel::PreProcess;
34
+
35
# Omnilib configuration: the value exported by Omni::Config. The subs in this
# file index it with keys such as 'OMNIPARA', 'OMNITABLE', 'OMNIFRAME', 'OMNIDD'.
my $obj_list = $Omni::Config::obj_list;

### USER customizable section
# Program name = last path component of $0 (used in the usage message)
my ($progname) = ($0 =~ /([^\/]+)$/);
my $version = "1.0";
### END user customizable section
42
+
43
# Print the one-line copyright banner on STDERR.
sub License
{
    # "\251" is the octal escape for byte 0xA9 (the copyright sign in Latin-1)
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
47
+
48
# Print the usage message on STDERR.
# Reads the file-level $progname; takes no arguments.
sub Help
{
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-decode] [-log]\n";
    print STDERR "Options:\n";
    # -in / -out are mandatory: the script prints this help and exits without them
    print STDERR "\t-in xmlfile \tInput Omnipage XML file (required)\n";
    print STDERR "\t-out outfile \tOutput file (required); line addresses are written to outfile.address\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n";
    # -log only sets a flag; nothing in this script reads it yet
    print STDERR "\t-log \tSet the debug flag (currently has no effect)\n";
}
57
+
58
# Command-line state, filled in by GetOptions below
my $quite = 0;          # -q: do not print the license banner
my $help = 0;           # -h: print usage and exit
my $out_file = undef;   # -out: output file (mandatory)
my $in_file = undef;    # -in: input XML file (mandatory)
my $is_decode = 0;      # -decode: decode HTML entities before writing
my $is_debug = 0;       # -log: flag only, not read anywhere in this script
my $address = 1;        # when 1, "<out_file>.address" is written as well

my $options_ok = GetOptions( 'in=s'   => \$in_file,
                             'out=s'  => \$out_file,
                             'decode' => \$is_decode,
                             'log'    => \$is_debug,
                             'h'      => \$help,
                             'q'      => \$quite );

# Explicit help request: show usage and report success
if ($help)
{
    Help();
    exit(0);
}

# Unknown option or missing mandatory argument: show usage and report failure,
# so that calling scripts can tell a usage error from a successful run
if (!$options_ok || ! defined $in_file || ! defined $out_file)
{
    Help();
    exit(1);
}

if (!$quite)
{
    License();
}

### Untaint ###
$in_file = UntaintPath($in_file);
$out_file = UntaintPath($out_file);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
89
+
90
# ---------------------------------------------------------------------------
# Per-line feature tables. Every Process* sub pushes exactly one entry per
# emitted text line into each of the @g_* arrays, so index N in any of them
# describes line N of @lines.
# ---------------------------------------------------------------------------

# Mark page, para, line, word (declared but not used by the code in this file)
my %g_page_hash = ();

# "yes" for the first line of a paragraph/table, "no" otherwise
my @g_para = ();

# XML features
# Location feature: vertical mid-point of each line, plus global extremes
my @g_pos_hash = ();
my $g_maxpos = 0;
my $g_minpos = 1000000;
# Align feature
my @g_align = ();
# Bold feature
my @g_bold = ();
# Italic feature
my @g_italic = ();
# Pic feature
my @g_pic = ();
# Table feature
my @g_table = ();
# Bullet feature
my @g_bullet = ();
# Font size feature: histogram (size => number of lines) and per-line value
my %g_font_size_hash = ();
my @g_font_size = ();
# Font face feature: histogram (face => number of lines) and per-line value
my %g_font_face_hash = ();
my @g_font_face = ();

# All lines
my @lines = ();
# and their address (hash refs with keys L1..L4)
my @lines_addr = ();

# BEGIN
ProcessFile($in_file);
# Find header part; $body_start_id is read later by GetDifferentialFeatures
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
# Done
Output(\@lines, $out_file);

if ($address == 1)
{
    # Save the line address for further use: one "L1 L2 L3 L4" record per line
    open(my $address_handle, ">:utf8", $out_file . ".address") || die"#Can't open file \"$out_file.address\"\n";
    foreach my $addr (@lines_addr)
    {
        print $address_handle join(" ", @{ $addr }{ 'L1', 'L2', 'L3', 'L4' }), "\n";
    }
    # Buffered write errors only surface at close time, so check it
    close($address_handle) || die "#Can't close file \"$out_file.address\": $!\n";
}
# END
146
+
147
# Read the concatenated Omnipage XML file, repair it into a single XML
# document, and walk page -> column/dd -> paragraph/table/frame, delegating
# each leaf container to ProcessPara, ProcessTable or ProcessFrame.
#
# Parameters:
#   $in_file - path of the XML file to read
# Dies if the file cannot be opened.
sub ProcessFile
{
    my ($in_file) = @_;

    open(my $input_handle, "<:utf8", $in_file) or die "Could not open xml file " . $in_file . ": $!";
    my $xml = do { local $/; <$input_handle> };
    close $input_handle;

    ###
    # Huydhn
    # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
    # The merged xml has to be fixed before it is passed to the XML processing library.
    ###
    # Convert to Unix format
    $xml =~ s/\r//g;
    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $xml =~ s/<\?xml.+?>\n//g;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $xml =~ s/<\!\-\-XML.+?>\n//g;
    # Single declaration and a synthetic root around all pages
    $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

    # New document (direct method call instead of indirect object syntax)
    my $doc = Omni::Omnidoc->new();
    $doc->set_raw($xml);

    # Current position: L1 = page index, L2 = column/dd index, L3 = object index.
    # The Process* subs add L4 and snapshot the hash for every emitted line.
    my %current = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    for my $x (0 .. scalar(@{ $pages }) - 1)
    {
        $current{ 'L1' } = $x;

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();

        for my $y (0 .. scalar(@{ $level_2 }) - 1)
        {
            my $container = $level_2->[ $y ];

            # Thang's convention: everything inside a <dd> tag is treated as image area
            my $is_pic = ($container->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

            $current{ 'L2' } = $y;

            # Table, paragraph or frame
            my $level_3 = $container->get_objs_ref();

            for my $z (0 .. scalar(@{ $level_3 }) - 1)
            {
                $current{ 'L3' } = $z;

                my $obj = $level_3->[ $z ];
                my $name = $obj->get_name();

                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    ProcessPara($obj, $is_pic, \%current);
                }
                elsif ($name eq $obj_list->{ 'OMNITABLE' })
                {
                    # Line numbering inside a stand-alone table starts at 0
                    ProcessTable($obj, $is_pic, \%current, 0);
                }
                elsif ($name eq $obj_list->{ 'OMNIFRAME' })
                {
                    # A frame may hold several paragraphs and tables
                    ProcessFrame($obj, $is_pic, \%current);
                }
                # Any other object type is ignored
            }
        }
    }

    return;
}
250
+
251
# Write every collected line followed by its XML feature string.
#
# Each output record has the form
#   <trimmed line> |XML| <loc> <bold> <italic> <fontsize> <pic> <table> <bullet> <SFBIA diff> <para diff>
# Records are buffered per paragraph (a new buffer starts at every line whose
# @g_para entry is "yes") and, when -decode was given, HTML entities in the
# whole buffer are decoded before it is written.
#
# Parameters:
#   $lines    - array ref of raw text lines (left unmodified)
#   $out_file - path of the file to create
# Dies if the output file cannot be opened or closed.
sub Output
{
    my ($lines, $out_file) = @_;

    open(my $output_handle, ">:utf8", $out_file) || die"#Can't open file \"$out_file\"\n";

    # Map each font size seen in the document to its label
    my %g_font_size_labels = ();
    GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

    # Buffer holding the records of the paragraph currently being assembled
    my $output = "";

    # Write out and reset the paragraph buffer (no-op when it is empty)
    my $flush = sub
    {
        return if $output eq "";
        if ($is_decode) { $output = decode_entities($output); }
        print $output_handle $output;
        $output = "";
    };

    # Index of the current line in the per-line feature arrays
    my $id = 0;
    foreach my $raw_line (@{ $lines })
    {
        # Trim a copy: foreach aliases the caller's array elements, and the
        # caller's data must not be changed as a side effect of writing it out
        my $line = $raw_line;
        $line =~ s/^\s+|\s+$//g;

        # A new paragraph starts here: emit the previous one first
        if ($g_para[ $id ] eq "yes") { $flush->(); }

        $output .= $line;

        # XML location feature: position scaled into 8 buckets over the
        # observed min..max range; left empty for the sentinel position -1
        my $loc_feature = "";
        if ($g_pos_hash[ $id ] != (-1))
        {
            $loc_feature = "xmlLoc_" . int(($g_pos_hash[ $id ] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1));
        }

        # Font size feature ("" and -1 both mean "no font size recorded")
        my $font_size_feature = undef;
        if (($g_font_size[ $id ] eq "") || ($g_font_size[ $id ] == -1))
        {
            $font_size_feature = "xmlFontSize_none";
        }
        else
        {
            $font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
        }

        my $bold_feature   = "xmlBold_"   . $g_bold[ $id ];
        my $italic_feature = "xmlItalic_" . $g_italic[ $id ];
        my $pic_feature    = "xmlPic_"    . $g_pic[ $id ];
        my $table_feature  = "xmlTable_"  . $g_table[ $id ];
        my $bullet_feature = "xmlBullet_" . $g_bullet[ $id ];

        # Only the last two differential features (SFBIA and paragraph) are emitted
        my ($font_sfbia_diff, $para_diff) = (GetDifferentialFeatures($id))[ 5, 6 ];

        # Each line and its XML features
        $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";

        $id++;
    }

    # Last paragraph
    $flush->();

    # Buffered write errors only surface at close time, so check it
    close($output_handle) || die "#Can't close file \"$out_file\": $!\n";
}
338
+
339
# Compare line $id with the line before it and describe what changed.
#
# Parameter:
#   $id - index of the line in the per-line feature arrays
# Returns a 7-element list, in this order:
#   alignment, font size, font face, size+face, size+face+bold+italic,
#   size+face+bold+italic+alignment, paragraph
# Each element is a prefix ("bi_xmlA_", "bi_xmlS_", ...) followed by
# "continue" when the compared attributes equal those of the previous line,
# and otherwise "new" (the alignment element carries the alignment value
# itself instead of "new"). Line 0 has no predecessor and is never "continue".
# The paragraph element is "header" for lines before $body_start_id.
sub GetDifferentialFeatures
{
    my ($id) = @_;

    # Attribute-by-attribute comparison with the previous line; every flag is
    # false for the very first line
    my $has_prev   = ($id != 0);
    my $same_align = $has_prev && ($g_align[ $id ] eq $g_align[ $id - 1 ]);
    my $same_face  = $has_prev && ($g_font_face[ $id ] eq $g_font_face[ $id - 1 ]);
    my $same_size  = $has_prev && ($g_font_size[ $id ] == $g_font_size[ $id - 1 ]);
    my $same_bi    = $has_prev && ($g_bold[ $id ] eq $g_bold[ $id - 1 ]) && ($g_italic[ $id ] eq $g_italic[ $id - 1 ]);

    # AlignChange feature
    my $align_diff = "bi_xmlA_" . ($same_align ? "continue" : $g_align[ $id ]);

    # FontFaceChange feature
    my $font_face_diff = "bi_xmlF_" . ($same_face ? "continue" : "new");

    # FontSizeChange feature
    my $font_size_diff = "bi_xmlS_" . ($same_size ? "continue" : "new");

    # FontSFChange feature
    my $font_sf_diff = "bi_xmlSF_" . (($same_size && $same_face) ? "continue" : "new");

    # FontSFBIChange feature
    my $font_sfbi_diff = "bi_xmlSFBI_" . (($same_size && $same_face && $same_bi) ? "continue" : "new");

    # FontSFBIAChange feature
    my $font_sfbia_diff = "bi_xmlSFBIA_" . (($same_size && $same_face && $same_bi && $same_align) ? "continue" : "new");

    # ParaChange feature; in the header part every line counts as its own paragraph
    my $para_state;
    if ($id < $body_start_id)
    {
        $para_state = "header";
    }
    elsif ($g_para[ $id ] eq "yes")
    {
        $para_state = "new";
    }
    else
    {
        $para_state = "continue";
    }
    my $para_diff = "bi_xmlPara_" . $para_state;

    return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
454
+
455
# Turn a font-size histogram into a label for every size.
#
# Parameters:
#   $g_font_size_hash   - hash ref: font size => number of occurrences
#   $g_font_size_labels - hash ref that receives: font size => label
#
# The most frequent size is labelled "common" (on a frequency tie the smaller
# size wins, so the result does not depend on hash ordering). Sizes below it
# are "smaller". Sizes above it are "larger", except the three biggest sizes
# of the whole document, which get "largest-2", "largest-1" and "largest0"
# (ascending). An empty histogram leaves the label hash untouched.
sub GetFontSizeLabels
{
    my ($g_font_size_hash, $g_font_size_labels) = @_;

    # Sizes in ascending numeric order
    my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };

    # Nothing to label; avoids creating a bogus "" => "common" entry
    return if scalar(@sorted_fonts) == 0;

    # Position of the most frequent size; the strict ">" keeps the smaller
    # size when two sizes are equally frequent
    my $common_index = 0;
    for my $i (1 .. $#sorted_fonts)
    {
        if ($g_font_size_hash->{ $sorted_fonts[ $i ] } > $g_font_size_hash->{ $sorted_fonts[ $common_index ] })
        {
            $common_index = $i;
        }
    }

    my $total = scalar(@sorted_fonts);
    for my $i (0 .. $#sorted_fonts)
    {
        my $label;
        if ($i < $common_index)
        {
            $label = "smaller";
        }
        elsif ($i == $common_index)
        {
            $label = "common";
        }
        elsif (($total - $i) <= 3)
        {
            # One of the three biggest sizes: suffix is 0 for the biggest,
            # -1 and -2 for the next ones down
            $label = "largest" . ($i + 1 - $total);
        }
        else
        {
            $label = "larger";
        }
        $g_font_size_labels->{ $sorted_fonts[ $i ] } = $label;
    }

    return;
}
498
+
499
# Collect the text lines (and their XML features) of one frame.
#
# A frame is walked object by object: paragraphs are handled inline, tables
# are delegated to ProcessTable, anything else is ignored. For every line one
# entry is appended to @lines, @lines_addr and each per-line feature array.
#
# Parameters:
#   $omniframe - frame object
#   $is_pic    - true when the frame lies in an image area
#   $line_addr - hash ref with the current L1..L3 position; its 'L4' entry is
#                overwritten with the line counter of the frame
sub ProcessFrame
{
    my ($omniframe, $is_pic, $line_addr) = @_;

    # Line counter for the whole frame (continues across paragraphs and tables)
    my $lindex = 0;

    # All paragraphs and tables in the frame
    my $objs = $omniframe->get_objs_ref();
    my $last_obj = scalar(@{ $objs }) - 1;

    foreach my $i (0 .. $last_obj)
    {
        my $obj = $objs->[ $i ];

        # Not a paragraph: either a table (delegated) or something we skip
        if ($obj->get_name() ne $obj_list->{ 'OMNIPARA' })
        {
            if ($obj->get_name() eq $obj_list->{ 'OMNITABLE' })
            {
                $lindex = ProcessTable($obj, $is_pic, $line_addr, $lindex);
            }
            next;
        }

        # Paragraph attributes
        my $align = $obj->get_alignment();
        my $space = $obj->get_space_before();
        # Line attributes
        my ($left, $top, $right, $bottom);

        my $omnilines = $obj->get_objs_ref();
        my $last_line = scalar(@{ $omnilines }) - 1;

        # For each line in the paragraph
        foreach my $t (0 .. $last_line)
        {
            my $omniline = $omnilines->[ $t ];

            # Save the line and a snapshot of its address
            push @lines, $omniline->get_content();
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Line attributes
            $left   = $omniline->get_left_pos();
            $right  = $omniline->get_right_pos();
            $top    = $omniline->get_top_pos();
            $bottom = $omniline->get_bottom_pos();

            # Per-line tallies, all counted in words
            my $words_count    = 0;
            my $bold_count     = 0;
            my $italic_count   = 0;
            my %font_size_hash = ();
            my %font_face_hash = ();

            my $runs = $omniline->get_objs_ref();
            my $last_run = scalar(@{ $runs }) - 1;

            foreach my $u (0 .. $last_run)
            {
                my $run = $runs->[ $u ];

                # Thang's compatible code: count words from the trimmed run content
                my $rcontent = $run->get_content();
                $rcontent =~ s/^\s+|\s+$//g;
                my @words = split(/\s+/, $rcontent);
                my $n_words = scalar(@words);

                $words_count += $n_words;

                # Weight font size / face / bold / italic by the run's word count
                my $font_size = $run->get_font_size();
                $font_size_hash{ $font_size } += $n_words;
                my $font_face = $run->get_font_face();
                $font_face_hash{ $font_face } += $n_words;
                if ($run->get_bold() eq "true")   { $bold_count   += $n_words; }
                if ($run->get_italic() eq "true") { $italic_count += $n_words; }
            }

            # First line of the paragraph?
            push @g_para, (($t == 0) ? "yes" : "no");

            # Vertical mid-point of the line, tracked against the global range
            my $pos = ($top + $bottom) / 2.0;
            if ($pos < $g_minpos) { $g_minpos = $pos; }
            if ($pos > $g_maxpos) { $g_maxpos = $pos; }

            push @g_pos_hash, $pos;
            push @g_align, $align;
            push @g_table, "no";

            if ($is_pic)
            {
                # Lines in an image area get neutral format values
                push @g_pic, "yes";
                push @g_bold, "no";
                push @g_italic, "no";
                push @g_bullet, "no";
                push @g_font_size, -1;
                push @g_font_face, "none";
            }
            else
            {
                push @g_pic, "no";
                UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
                UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
            }
        }
    }
}
628
+
629
# Collect the text lines of one table.
#
# Every row's content is split on newlines; each resulting line gets one
# entry in @lines, @lines_addr and the per-line feature arrays. All lines
# share the table's position and alignment and carry neutral font values.
#
# Parameters:
#   $omnitable - table object
#   $is_pic    - true when the table lies in an image area
#   $line_addr - hash ref with the current position; 'L4' is overwritten
#   $lindex    - line counter to start from
# Returns the line counter after the last line of the table.
sub ProcessTable
{
    my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

    # Table attributes
    my $left   = $omnitable->get_left_pos();
    my $right  = $omnitable->get_right_pos();
    my $top    = $omnitable->get_top_pos();
    my $bottom = $omnitable->get_bottom_pos();
    my $align  = $omnitable->get_alignment();

    # Thang's code: one vertical mid-point for the whole table
    my $pos = ($top + $bottom) / 2.0;
    $g_minpos = $pos if $pos < $g_minpos;
    $g_maxpos = $pos if $pos > $g_maxpos;

    my $pic_value = $is_pic ? "yes" : "no";

    # All rows in the table
    my $rows = $omnitable->get_row_content();
    my $last_row = scalar(@{ $rows }) - 1;

    foreach my $i (0 .. $last_row)
    {
        my @row_lines = split(/\n/, $rows->[ $i ]);

        foreach my $j (0 .. scalar(@row_lines) - 1)
        {
            # Save the line and a snapshot of its address
            push @lines, $row_lines[ $j ];
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Only the first line of the first row opens a paragraph
            push @g_para, ((($j == 0) && ($i == 0)) ? "yes" : "no");

            push @g_table, "yes";
            push @g_pic, $pic_value;
            push @g_pos_hash, $pos;
            push @g_align, $align;

            # No format information is recorded for table lines
            push @g_font_size, -1;
            push @g_font_face, "none";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
        }
    }

    return $lindex;
}
709
+
710
# Collect the non-blank text lines (and their XML features) of one paragraph.
#
# For every kept line one entry is appended to @lines, @lines_addr and each
# per-line feature array. Blank lines are skipped entirely.
#
# Parameters:
#   $paragraph - paragraph object
#   $is_pic    - true when the paragraph lies in an image area
#   $line_addr - hash ref with the current L1..L3 position; its 'L4' entry is
#                overwritten with the index of the line inside the paragraph
sub ProcessPara
{
    my ($paragraph, $is_pic, $line_addr) = @_;

    # Paragraph attributes
    my $align = $paragraph->get_alignment();
    my $space = $paragraph->get_space_before();

    my $omnilines = $paragraph->get_objs_ref();
    my $last_line = scalar(@{ $omnilines }) - 1;

    # The paragraph-start marker belongs to the first line that is actually
    # kept. Tying it to index 0 would lose the marker whenever the paragraph
    # begins with a blank (skipped) line.
    my $is_first_kept = 1;

    foreach my $t (0 .. $last_line)
    {
        my $omniline = $omnilines->[ $t ];

        # Skip blank line (decided on a trimmed copy; the raw content is stored)
        my $lcontent = $omniline->get_content();
        my $trimmed = $lcontent;
        $trimmed =~ s/^\s+|\s+$//g;
        if ($trimmed eq "") { next; }

        # Save the line and a snapshot of its address; L4 keeps the original
        # index, i.e. skipped blank lines still count
        push @lines, $lcontent;
        $line_addr->{ 'L4' } = $t;
        push @lines_addr, { %{ $line_addr } };

        # Line attributes (only the vertical extent is needed)
        my $top    = $omniline->get_top_pos();
        my $bottom = $omniline->get_bottom_pos();

        # Per-line tallies, all counted in words
        my $words_count    = 0;
        my $bold_count     = 0;
        my $italic_count   = 0;
        my %font_size_hash = ();
        my %font_face_hash = ();

        my $runs = $omniline->get_objs_ref();
        my $last_run = scalar(@{ $runs }) - 1;

        foreach my $u (0 .. $last_run)
        {
            my $run = $runs->[ $u ];

            # Thang's compatible code: count words from the trimmed run content
            my $rcontent = $run->get_content();
            $rcontent =~ s/^\s+|\s+$//g;
            my @words = split(/\s+/, $rcontent);
            my $n_words = scalar(@words);

            $words_count += $n_words;

            # Weight font size / face / bold / italic by the run's word count
            my $font_size = $run->get_font_size();
            $font_size_hash{ $font_size } += $n_words;
            my $font_face = $run->get_font_face();
            $font_face_hash{ $font_face } += $n_words;
            if ($run->get_bold() eq "true")   { $bold_count   += $n_words; }
            if ($run->get_italic() eq "true") { $italic_count += $n_words; }
        }

        # Line attributes - relative position in paragraph
        push @g_para, ($is_first_kept ? "yes" : "no");
        $is_first_kept = 0;

        # Vertical mid-point of the line, tracked against the global range
        my $pos = ($top + $bottom) / 2.0;
        if ($pos < $g_minpos) { $g_minpos = $pos; }
        if ($pos > $g_maxpos) { $g_maxpos = $pos; }

        push @g_pos_hash, $pos;
        push @g_align, $align;
        push @g_table, "no";

        if ($is_pic)
        {
            # Lines in an image area get neutral format values
            push @g_pic, "yes";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
            push @g_font_size, -1;
            push @g_font_face, "none";
        }
        else
        {
            push @g_pic, "no";
            UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
            UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
        }
    }

    return;
}
832
+
833
# Record the dominant font size and font face of one line.
#
# Parameters:
#   $font_size_hash - hash ref: font size => number of words in that size
#   $font_face_hash - hash ref: font face => number of words in that face
#
# Appends one entry to @g_font_size and one to @g_font_face, and bumps the
# document-wide histograms %g_font_size_hash / %g_font_face_hash. An empty
# input hash yields -1 (size) or "none" (face) and leaves the histogram alone.
sub UpdateXMLFontFeature
{
    my ($font_size_hash, $font_face_hash) = @_;

    # Font size feature
    if (scalar(keys %{ $font_size_hash }) == 0)
    {
        push @g_font_size, -1;
    }
    else
    {
        # Most words first; among equally frequent sizes the largest wins.
        # Sorting on both keys makes the choice independent of hash ordering,
        # also when more than two sizes tie.
        my ($font_size) = sort {
            $font_size_hash->{ $b } <=> $font_size_hash->{ $a }
                or
            $b <=> $a
        } keys %{ $font_size_hash };

        # A run without a font size is stored under the key ""
        if ($font_size eq "") { $font_size = 0; }

        push @g_font_size, $font_size;
        $g_font_size_hash{ $font_size }++;
    }

    # Font face feature
    if (scalar(keys %{ $font_face_hash }) == 0)
    {
        push @g_font_face, "none";
    }
    else
    {
        # Most words first; ties are broken alphabetically so the result is
        # reproducible from run to run
        my ($font_face) = sort {
            $font_face_hash->{ $b } <=> $font_face_hash->{ $a }
                or
            $a cmp $b
        } keys %{ $font_face_hash };

        push @g_font_face, $font_face;
        $g_font_face_hash{ $font_face }++;
    }

    return;
}
878
+
879
# Record the bold, italic and bullet features of one line.
#
# Parameters:
#   $bold_count   - number of bold words in the line
#   $italic_count - number of italic words in the line
#   $words_count  - total number of words in the line
#   $is_bullet    - "true" when the line is a bullet item (may be undef)
#   $space        - accepted but not used here
#
# Appends "yes"/"no" to @g_bold, @g_italic and @g_bullet. A line counts as
# bold (italic) when at least 0.667 of its words are bold (italic); a line
# without words is never bold or italic.
sub UpdateXMLFeatures
{
    my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

    my $has_words = ($words_count != 0);

    # Bold feature
    my $is_bold = $has_words && ($bold_count / $words_count >= 0.667);
    push @g_bold, ($is_bold ? "yes" : "no");

    # Italic feature
    my $is_italic = $has_words && ($italic_count / $words_count >= 0.667);
    push @g_italic, ($is_italic ? "yes" : "no");

    # Bullet feature
    my $bullet_on = (defined $is_bullet) && ($is_bullet eq "true");
    push @g_bullet, ($bullet_on ? "yes" : "no");
}
917
+
918
# Validate a file path and return the untainted copy.
#
# Parameter:
#   $path - candidate path
# Returns the part captured by the pattern, which only admits word
# characters, '-', '_', '/' and '.'.
# Dies with 'Bad path "..."' when the path contains anything else.
sub UntaintPath
{
    my ($path) = @_;

    # The capture (not the original scalar) is what Perl regards as untainted
    my ($clean) = ($path =~ /^([-_\/\w\.]*)$/);

    die "Bad path \"$path\"\n" unless defined $clean;

    return $clean;
}
933
+
934
# Validate a general string (e.g. a command line) and return the untainted copy.
#
# Parameter:
#   $s - candidate string
# Returns the part captured by the pattern, which admits word characters,
# spaces and the characters - @ ( ) , . /
# Dies with "Bad data in ..." on an empty string or any other character.
sub Untaint
{
    my ($s) = @_;

    # The capture (not the original scalar) is what Perl regards as untainted
    my ($clean) = ($s =~ /^([\w \-\@\(\),\.\/]+)$/);

    # log this somewhere
    defined $clean or die "Bad data in $s";

    return $clean;
}
948
+
949
# Run a shell command after validating it with Untaint.
#
# Parameter:
#   $cmd - command line; Untaint dies if it contains disallowed characters
# Returns whatever system() returns for the command.
sub Execute
{
    my ($cmd) = @_;

    my $safe_cmd = Untaint($cmd);

    return system($safe_cmd);
}
955
+
956
# Build a name for a temporary file: "<YYYYmmdd>-<HHMMSS>-<pid>".
#
# Takes no arguments. Returns the name as a string without a trailing newline.
# The timestamp is local time and is formatted in-process, so the result does
# not depend on an external `date` program being reachable through PATH.
sub NewTmpFile
{
    # Loaded on demand so the sub carries its own dependency
    require POSIX;

    my $stamp = POSIX::strftime("%Y%m%d-%H%M%S", localtime());

    # $$ is the id of the current process
    return $stamp . "-" . $$;
}
962
+
963
+
964
+