biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,964 @@
1
+ #!/usr/bin/perl
2
+
3
+ # Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+
9
+ # Dependencies
10
+ use FindBin;
11
+ use Getopt::Long;
12
+ use HTML::Entities;
13
+
14
+ # I do not know a better solution to find a lib path in -T mode.
15
+ # So if you know a better solution, I'd be glad to hear.
16
+ # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
17
+
18
+ # To get correct path in case 2 scripts in different directories use FindBin
19
+ FindBin::again();
20
+ my $path = undef;
21
+ BEGIN
22
+ {
23
+ if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
24
+ }
25
+ use lib "$path/../../lib";
26
+
27
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
+
30
+ # Local libraries
31
+ use Omni::Config;
32
+ use Omni::Omnidoc;
33
+ use SectLabel::PreProcess;
34
+
35
+ # Omnilib configuration: object name
36
+ my $obj_list = $Omni::Config::obj_list;
37
+
38
+ ### USER customizable section
39
+ $0 =~ /([^\/]+)$/; my $progname = $1;
40
+ my $version = "1.0";
41
+ ### END user customizable section
42
+
43
# Print the one-line copyright banner on STDERR.
sub License
{
    # \251 is the octal escape for the copyright sign
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
47
+
48
# Print the usage message on STDERR.
#
# The program name shown in the usage lines comes from the file-level
# $progname. Nothing is returned that callers rely on.
sub Help
{
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-decode] [-log]\n";
    print STDERR "Options:\n";
    # -in and -out are mandatory: the script prints this help and exits without them
    print STDERR "\t-in xmlfile \tOmnipage XML file to read (required)\n";
    print STDERR "\t-out outfile \tFile to write the text lines and XML features to (required)\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n";
}
57
+
58
# Command-line state. $address has no command-line switch here; it stays at 1,
# which makes the main section write the ".address" side file.
my ($quite, $help, $is_decode, $is_debug) = (0, 0, 0, 0);
my ($in_file, $out_file) = (undef, undef);
my $address = 1;

my $options_ok = GetOptions(
    'in=s'   => \$in_file,
    'out=s'  => \$out_file,
    'decode' => \$is_decode,
    'log'    => \$is_debug,
    'h'      => \$help,
    'q'      => \$quite,
);
# A failed parse is handled exactly like an explicit -h
if (! $options_ok) { $help = 1; }

# Both -in and -out are required
my $missing_argument = !(defined $in_file && defined $out_file);
if ($help || $missing_argument)
{
    Help();
    exit(0);
}

# Banner, unless -q was given
License() unless $quite;

### Untaint ###
$in_file = UntaintPath($in_file);
$out_file = UntaintPath($out_file);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
89
+
90
# ---------------------------------------------------------------------------
# File-level state shared by the Process* / Update* subs and by Output.
# The per-line arrays below are parallel: element N of each array describes
# line N of @lines.
# ---------------------------------------------------------------------------

# Mark page, para, line, word
my %g_page_hash = ();

# Mark paragraph: "yes" on the line that opens a paragraph, "no" otherwise
my @g_para = ();

# XML features
# Location feature: vertical midpoint of each line, plus the running min / max
my @g_pos_hash = ();
my $g_maxpos = 0;
my $g_minpos = 1000000;
# Align feature
my @g_align = ();
# Bold feature
my @g_bold = ();
# Italic feature
my @g_italic = ();
# Pic feature
my @g_pic = ();
# Table feature
my @g_table = ();
# Bullet feature
my @g_bullet = ();
# Font size feature: histogram over lines, and the per-line value
my %g_font_size_hash = ();
my @g_font_size = ();
# Font face feature: histogram over lines, and the per-line value
my %g_font_face_hash = ();
my @g_font_face = ();

# All lines
my @lines = ();
# and their address (hash refs with keys L1..L4)
my @lines_addr = ();

# BEGIN
ProcessFile($in_file);
# Find header part. $body_start_id is read later by GetDifferentialFeatures.
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
# Done
Output(\@lines, $out_file);

if ($address == 1)
{
    # Save the line address for further use
    my $address_file = $out_file . ".address";
    open(my $address_handle, ">:utf8", $address_file) or die "#Can't open file \"$address_file\": $!\n";
    foreach my $addr (@lines_addr)
    {
        # One line per text line: "L1 L2 L3 L4"
        print $address_handle join(" ", @{ $addr }{ 'L1', 'L2', 'L3', 'L4' }), "\n";
    }
    # Buffered write errors only surface at close, so check it
    close($address_handle) or die "#Can't close file \"$address_file\": $!\n";
}
# END
146
+
147
# Load the Omnipage XML file, repair it into one well-formed document and walk
# its object tree.
#
# Parameters:
#   $in_file - path of the (concatenated) Omnipage XML file
#
# The walk goes page (L1) -> column or dd (L2) -> paragraph / table / frame (L3)
# and hands each level-3 object to ProcessPara, ProcessTable or ProcessFrame,
# together with the picture flag and the current address hash.
# Dies if the file cannot be opened.
sub ProcessFile
{
    my ($in_file) = @_;

    # Slurp the whole file
    open(my $input_handle, "<:utf8", $in_file) or die "Could not open xml file " . $in_file;
    my $xml;
    {
        local $/ = undef;
        $xml = <$input_handle>;
    }
    close $input_handle;

    ###
    # Huydhn
    # NOTE: the omnipage xml is not well constructed (several xml files concatenated).
    # The merged xml has to be fixed before it is passed to the xml processing libraries
    ###
    # Convert to Unix format
    $xml =~ s/\r//g;
    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $xml =~ s/<\?xml.+?>\n//g;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $xml =~ s/<\!\-\-XML.+?>\n//g;
    # Single declaration and a single root around everything
    $xml = join("\n", "<?xml version=\"1.0\"?>", "<root>", $xml, "</root>");

    # New document
    my $doc = Omni::Omnidoc->new();
    $doc->set_raw($xml);

    # Current position, shared with (and extended by) the Process* subs
    my %current = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    foreach my $x (0 .. $#{ $pages })
    {
        $current{ 'L1' } = $x;

        # Columns or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();

        foreach my $y (0 .. $#{ $level_2 })
        {
            my $container = $level_2->[ $y ];

            # Thang's code: a <dd> tag is considered an image area
            my $is_pic = ($container->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

            $current{ 'L2' } = $y;

            # Tables, paragraphs or frames
            my $level_3 = $container->get_objs_ref();

            foreach my $z (0 .. $#{ $level_3 })
            {
                $current{ 'L3' } = $z;

                my $object = $level_3->[ $z ];
                my $object_name = $object->get_name();

                if ($object_name eq $obj_list->{ 'OMNIPARA' })
                {
                    ProcessPara($object, $is_pic, \%current);
                }
                elsif ($object_name eq $obj_list->{ 'OMNITABLE' })
                {
                    # Line numbering inside a stand-alone table starts at 0
                    ProcessTable($object, $is_pic, \%current, 0);
                }
                elsif ($object_name eq $obj_list->{ 'OMNIFRAME' })
                {
                    # A frame may hold several paragraphs and tables
                    ProcessFrame($object, $is_pic, \%current);
                }
            }
        }
    }
}
250
+
251
# Write every collected line, followed by its XML features, to $out_file.
#
# Parameters:
#   $lines    - array ref of text lines; each element is trimmed IN PLACE
#               (the loop variable aliases the caller's array elements)
#   $out_file - path of the output file, opened as UTF-8
#
# Output is buffered per paragraph: the buffer is written whenever a line
# flagged "yes" in @g_para starts a new paragraph, and once more at the end.
# When the file-level $is_decode is set, each paragraph buffer goes through
# decode_entities before it is written.
# Reads the file-level feature arrays (@g_para, @g_pos_hash, @g_font_size, ...).
# Dies if the output file cannot be opened or closed.
sub Output
{
    my ($lines, $out_file) = @_;

    open(my $output_handle, ">:utf8", $out_file) or die "#Can't open file \"$out_file\": $!\n";

    # XML feature label for every font size seen in the document
    my %g_font_size_labels = ();
    GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

    # Text of the paragraph currently being assembled
    my $output = "";
    # Write the pending paragraph, if any, and start a fresh one
    my $flush = sub
    {
        return if ($output eq "");
        if ($is_decode) { $output = decode_entities($output); }
        print $output_handle $output;
        $output = "";
    };

    # This is the index of the line
    my $id = 0;
    # For each line in the whole document
    foreach my $line (@{ $lines })
    {
        # Trim leading and trailing whitespace
        $line =~ s/^\s+|\s+$//g;

        # New paragraph
        if ($g_para[ $id ] eq "yes") { $flush->(); }

        $output .= $line;

        # XML location feature: position scaled into buckets relative to the
        # document-wide min / max; left empty when the position is -1
        my $loc_feature = "";
        if ($g_pos_hash[ $id ] != (-1))
        {
            $loc_feature = "xmlLoc_" . int(($g_pos_hash[ $id ] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1));
        }

        # Font size feature
        my $font_size_feature = undef;
        if (($g_font_size[ $id ] eq "") || ($g_font_size[ $id ] == -1))
        {
            $font_size_feature = "xmlFontSize_none";
        }
        else
        {
            $font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
        }

        my $bold_feature   = "xmlBold_"   . $g_bold[ $id ];
        my $italic_feature = "xmlItalic_" . $g_italic[ $id ];
        my $pic_feature    = "xmlPic_"    . $g_pic[ $id ];
        my $table_feature  = "xmlTable_"  . $g_table[ $id ];
        my $bullet_feature = "xmlBullet_" . $g_bullet[ $id ];

        # Differential features: only the last two of the seven returned values are written out
        my ($font_sfbia_diff, $para_diff) = (GetDifferentialFeatures($id))[ 5, 6 ];

        # Each line and its XML features
        $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";

        # Update line index
        $id++;
    }

    # Last paragraph
    $flush->();

    # Buffered write errors only surface at close, so check it
    close($output_handle) or die "#Can't close file \"$out_file\": $!\n";
}
338
+
339
# Build the "change" features of line $id relative to line $id - 1.
#
# Parameters:
#   $id - index of the line in the file-level feature arrays
#
# Returns the list
#   ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff,
#    $font_sfbi_diff, $font_sfbia_diff, $para_diff)
# The font features end in "continue" when the listed attributes are all
# unchanged from the previous line and in "new" otherwise (always "new" for
# line 0). The align feature carries the line's own alignment instead of "new".
sub GetDifferentialFeatures
{
    my ($id) = @_;

    my $prev = $id - 1;
    # Line 0 has nothing to be compared with
    my $has_prev = ($id == 0) ? 0 : 1;

    # Attribute-by-attribute comparison with the previous line
    # (font sizes are compared numerically, everything else as strings)
    my $same_align  = $has_prev && ($g_align[ $id ] eq $g_align[ $prev ]);
    my $same_face   = $has_prev && ($g_font_face[ $id ] eq $g_font_face[ $prev ]);
    my $same_size   = $has_prev && ($g_font_size[ $id ] == $g_font_size[ $prev ]);
    my $same_bold   = $has_prev && ($g_bold[ $id ] eq $g_bold[ $prev ]);
    my $same_italic = $has_prev && ($g_italic[ $id ] eq $g_italic[ $prev ]);

    # AlignChange feature
    my $align_diff = "bi_xmlA_" . ($same_align ? "continue" : $g_align[ $id ]);
    # FontFaceChange feature
    my $font_face_diff = "bi_xmlF_" . ($same_face ? "continue" : "new");
    # FontSizeChange feature
    my $font_size_diff = "bi_xmlS_" . ($same_size ? "continue" : "new");
    # FontSFChange feature: size and face
    my $font_sf_diff = "bi_xmlSF_" . (($same_size && $same_face) ? "continue" : "new");
    # FontSFBIChange feature: size, face, bold and italic
    my $same_sfbi = $same_size && $same_face && $same_bold && $same_italic;
    my $font_sfbi_diff = "bi_xmlSFBI_" . ($same_sfbi ? "continue" : "new");
    # FontSFBIAChange feature: all of the above plus alignment
    my $font_sfbia_diff = "bi_xmlSFBIA_" . (($same_sfbi && $same_align) ? "continue" : "new");

    # ParaChange feature
    my $para_diff = "bi_xmlPara_";
    if ($id < $body_start_id)
    {
        # Header part, consider each line as a separate paragraph
        $para_diff .= "header";
    }
    elsif ($g_para[ $id ] eq "yes")
    {
        $para_diff .= "new";
    }
    else
    {
        $para_diff .= "continue";
    }

    return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
454
+
455
# Turn a font-size histogram into a label per font size.
#
# Parameters:
#   $g_font_size_hash   - hash ref: font size => number of lines using it
#   $g_font_size_labels - hash ref filled in by this sub: font size => label
#
# Labels:
#   "common"   - the most frequent size; on a tie the larger size wins, the
#                same rule UpdateXMLFontFeature applies within a line
#   "smaller"  - every size below the common one
#   "largest0", "largest-1", "largest-2"
#              - the top three sizes (largest first) when above the common one
#   "larger"   - any other size above the common one
#
# An empty histogram leaves $g_font_size_labels untouched.
sub GetFontSizeLabels
{
    my ($g_font_size_hash, $g_font_size_labels) = @_;

    # Nothing to label
    return if (scalar(keys %{ $g_font_size_hash }) == 0);

    # Sort by key in ascending order
    my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };
    my $num_fonts = scalar(@sorted_fonts);

    # Index of the common font size. Scanning upwards with ">=" makes the
    # result independent of hash order and sends ties to the larger size.
    my $common_index = 0;
    for (my $i = 1; $i < $num_fonts; $i++)
    {
        if ($g_font_size_hash->{ $sorted_fonts[ $i ] } >= $g_font_size_hash->{ $sorted_fonts[ $common_index ] })
        {
            $common_index = $i;
        }
    }

    # Small fonts
    for (my $i = 0; $i < $common_index; $i++)
    {
        $g_font_size_labels->{ $sorted_fonts[ $i ] } = "smaller";
    }

    # Common font
    $g_font_size_labels->{ $sorted_fonts[ $common_index ] } = "common";

    # Large fonts
    for (my $i = ($common_index + 1); $i < $num_fonts; $i++)
    {
        if (($num_fonts - $i) <= 3)
        {
            # Offset from the largest size: 0, -1 or -2
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "largest" . ($i + 1 - $num_fonts);
        }
        else
        {
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "larger";
        }
    }

    return;
}
498
+
499
# Collect the lines of a frame, which may contain paragraphs and tables.
#
# Parameters:
#   $omniframe - frame object; its children are read with get_objs_ref()
#   $is_pic    - true when the frame sits in an image area
#   $line_addr - hash ref with the current address; key L4 is set here to the
#                running line index within the whole frame
#
# For every line of every paragraph one entry is appended to @lines,
# @lines_addr and each per-line feature array. Tables are delegated to
# ProcessTable, which continues the frame-wide line numbering.
sub ProcessFrame
{
    my ($omniframe, $is_pic, $line_addr) = @_;

    # Line index in the whole frame
    my $lindex = 0;

    # For each paragraph or table in the frame
    foreach my $obj (@{ $omniframe->get_objs_ref() })
    {
        my $obj_name = $obj->get_name();

        if ($obj_name eq $obj_list->{ 'OMNITABLE' })
        {
            $lindex = ProcessTable($obj, $is_pic, $line_addr, $lindex);
            next;
        }
        # Anything that is neither a table nor a paragraph is ignored
        next if ($obj_name ne $obj_list->{ 'OMNIPARA' });

        # Paragraph attributes
        my $align = $obj->get_alignment();
        my $space = $obj->get_space_before();

        my $omnilines = $obj->get_objs_ref();
        # For each line in the paragraph
        foreach my $t (0 .. $#{ $omnilines })
        {
            my $omniline = $omnilines->[ $t ];

            # Save the line and its address, then point to the next line in the frame
            push @lines, $omniline->get_content();
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Line attributes
            my $left   = $omniline->get_left_pos();
            my $right  = $omniline->get_right_pos();
            my $top    = $omniline->get_top_pos();
            my $bottom = $omniline->get_bottom_pos();

            # Per-line tallies, all counted in words
            my ($words_count, $bold_count, $italic_count) = (0, 0, 0);
            my %font_size_hash = ();
            my %font_face_hash = ();

            foreach my $run (@{ $omniline->get_objs_ref() })
            {
                # Thang's compatible code: count whitespace-separated words of the trimmed run
                my $rcontent = $run->get_content();
                $rcontent =~ s/^\s+|\s+$//g;
                my @words = split(/\s+/, $rcontent);
                my $num_words = scalar(@words);

                $words_count += $num_words;

                # Weight every run attribute by the run's word count
                my $font_size = $run->get_font_size();
                $font_size_hash{ $font_size } += $num_words;
                my $font_face = $run->get_font_face();
                $font_face_hash{ $font_face } += $num_words;
                if ($run->get_bold() eq "true") { $bold_count += $num_words; }
                if ($run->get_italic() eq "true") { $italic_count += $num_words; }
            }

            # Relative position in paragraph: only the first line opens it
            push @g_para, (($t == 0) ? "yes" : "no");

            # Line position: vertical midpoint, tracked against the global min and max
            my $pos = ($top + $bottom) / 2.0;
            if ($pos < $g_minpos) { $g_minpos = $pos; }
            if ($pos > $g_maxpos) { $g_maxpos = $pos; }
            push @g_pos_hash, $pos;
            # Alignment feature
            push @g_align, $align;
            # Table feature
            push @g_table, "no";

            if ($is_pic)
            {
                push @g_pic, "yes";
                # No run-level values are assigned inside an image area
                push @g_bold, "no";
                push @g_italic, "no";
                push @g_bullet, "no";
                push @g_font_size, -1;
                push @g_font_face, "none";
            }
            else
            {
                push @g_pic, "no";
                UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
                UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
            }
        }
    }
}
628
+
629
# Collect the lines of a table.
#
# Parameters:
#   $omnitable - table object
#   $is_pic    - true when the table sits in an image area
#   $line_addr - hash ref with the current address; key L4 is set per line
#   $lindex    - line index to start numbering from
#
# Every row returned by get_row_content() is split on newlines; each resulting
# line gets one entry in @lines, @lines_addr and the per-line feature arrays.
# All lines share the table's own position and alignment and carry no font,
# bold, italic or bullet information.
#
# Returns the line index following the last line of the table.
sub ProcessTable
{
    my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

    # Table attributes
    my $left   = $omnitable->get_left_pos();
    my $right  = $omnitable->get_right_pos();
    my $top    = $omnitable->get_top_pos();
    my $bottom = $omnitable->get_bottom_pos();
    my $align  = $omnitable->get_alignment();

    # Thang's code: one position for the whole table, tracked against the global min and max
    my $pos = ($top + $bottom) / 2.0;
    if ($pos < $g_minpos) { $g_minpos = $pos; }
    if ($pos > $g_maxpos) { $g_maxpos = $pos; }

    # Same for every line of the table
    my $pic_flag = $is_pic ? "yes" : "no";

    # All rows in the table
    my $rows = $omnitable->get_row_content();
    foreach my $i (0 .. $#{ $rows })
    {
        my @row_lines = split(/\n/, $rows->[ $i ]);

        foreach my $j (0 .. $#row_lines)
        {
            # Save the line and its address, then point to the next line
            push @lines, $row_lines[ $j ];
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Only the first line of the first row opens a paragraph
            my $opens_paragraph = (($j == 0) && ($i == 0));
            push @g_para, ($opens_paragraph ? "yes" : "no");

            push @g_table, "yes";
            push @g_pic, $pic_flag;
            push @g_pos_hash, $pos;
            push @g_align, $align;

            # No run-level information for table lines
            push @g_font_size, -1;
            push @g_font_face, "none";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
        }
    }

    return $lindex;
}
709
+
710
# Collect the non-blank lines of a paragraph.
#
# Parameters:
#   $paragraph - paragraph object; its lines are read with get_objs_ref()
#   $is_pic    - true when the paragraph sits in an image area
#   $line_addr - hash ref with the current address; key L4 is set here to the
#                line's index within the paragraph (blank lines included)
#
# Lines that are empty after trimming are skipped. For every other line one
# entry is appended to @lines, @lines_addr and each per-line feature array.
#
# The paragraph-start marker ("yes" in @g_para) goes to the first line that is
# actually kept. Tying it to index 0 would leave a paragraph with a blank
# first line without any start marker, so Output would merge it into the
# preceding paragraph.
sub ProcessPara
{
    my ($paragraph, $is_pic, $line_addr) = @_;

    # Paragraph attributes
    my $align = $paragraph->get_alignment();
    my $space = $paragraph->get_space_before();

    my $omnilines = $paragraph->get_objs_ref();
    # True until the first non-blank line has been recorded
    my $awaiting_first_line = 1;

    for (my $t = 0; $t < scalar(@{ $omnilines }); $t++)
    {
        my $omniline = $omnilines->[ $t ];

        # Skip blank line
        my $raw_content = $omniline->get_content();
        (my $trimmed = $raw_content) =~ s/^\s+|\s+$//g;
        if ($trimmed eq "") { next; }

        # Save the line (untrimmed; Output trims it) and its address
        push @lines, $raw_content;
        $line_addr->{ 'L4' } = $t;
        push @lines_addr, { %{ $line_addr } };

        # Line attributes
        my $top    = $omniline->get_top_pos();
        my $bottom = $omniline->get_bottom_pos();

        # Per-line tallies, all counted in words
        my ($words_count, $bold_count, $italic_count) = (0, 0, 0);
        my %font_size_hash = ();
        my %font_face_hash = ();

        foreach my $run (@{ $omniline->get_objs_ref() })
        {
            # Thang's compatible code: count whitespace-separated words of the trimmed run
            my $rcontent = $run->get_content();
            $rcontent =~ s/^\s+|\s+$//g;
            my @words = split(/\s+/, $rcontent);
            my $num_words = scalar(@words);

            $words_count += $num_words;

            # Weight every run attribute by the run's word count
            my $font_size = $run->get_font_size();
            $font_size_hash{ $font_size } += $num_words;
            my $font_face = $run->get_font_face();
            $font_face_hash{ $font_face } += $num_words;
            if ($run->get_bold() eq "true") { $bold_count += $num_words; }
            if ($run->get_italic() eq "true") { $italic_count += $num_words; }
        }

        # Relative position in paragraph
        push @g_para, ($awaiting_first_line ? "yes" : "no");
        $awaiting_first_line = 0;

        # Line position: vertical midpoint, tracked against the global min and max
        my $pos = ($top + $bottom) / 2.0;
        if ($pos < $g_minpos) { $g_minpos = $pos; }
        if ($pos > $g_maxpos) { $g_maxpos = $pos; }
        push @g_pos_hash, $pos;
        # Alignment feature
        push @g_align, $align;
        # Table feature
        push @g_table, "no";

        if ($is_pic)
        {
            push @g_pic, "yes";
            # No run-level values are assigned inside an image area
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
            push @g_font_size, -1;
            push @g_font_face, "none";
        }
        else
        {
            push @g_pic, "no";
            UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
            UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
        }
    }
}
832
+
833
# Record the dominant font size and font face of one line.
#
# Parameters:
#   $font_size_hash - hash ref: font size => number of words in that size
#   $font_face_hash - hash ref: font face => number of words in that face
#
# Appends one entry to @g_font_size and one to @g_font_face, and counts the
# chosen values in %g_font_size_hash / %g_font_face_hash. An empty histogram
# yields -1 (size) or "none" (face) and is not counted.
#
# Ties are resolved deterministically: among equally frequent sizes the
# largest wins, however many are tied; among equally frequent faces the
# alphabetically first wins.
sub UpdateXMLFontFeature
{
    my ($font_size_hash, $font_face_hash) = @_;

    # Font size feature
    if (scalar(keys %{ $font_size_hash }) == 0)
    {
        push @g_font_size, -1;
    }
    else
    {
        # Most words first; if several font sizes are equal in number, the larger one first
        my @sorted_fonts = sort
        {
            ($font_size_hash->{ $b } <=> $font_size_hash->{ $a }) || ($b <=> $a)
        } keys %{ $font_size_hash };

        my $font_size = $sorted_fonts[ 0 ];
        # A run without a font size shows up as the empty key; record it as 0
        if ($font_size eq "") { $font_size = 0; }

        push @g_font_size, $font_size;
        $g_font_size_hash{ $font_size }++;
    }

    # Font face feature
    if (scalar(keys %{ $font_face_hash }) == 0)
    {
        push @g_font_face, "none";
    }
    else
    {
        # Most words first; the name breaks ties so the result does not depend on hash order
        my @sorted_fonts = sort
        {
            ($font_face_hash->{ $b } <=> $font_face_hash->{ $a }) || ($a cmp $b)
        } keys %{ $font_face_hash };

        my $font_face = $sorted_fonts[ 0 ];
        push @g_font_face, $font_face;
        $g_font_face_hash{ $font_face }++;
    }
}
878
+
879
# Record the bold, italic and bullet features of one line.
#
# Parameters:
#   $bold_count   - number of bold words in the line
#   $italic_count - number of italic words in the line
#   $words_count  - total number of words in the line
#   $is_bullet    - "true" when the line is a bullet; may be undef
#   $space        - accepted but not used by this sub
#
# Appends "yes" or "no" to each of @g_bold, @g_italic and @g_bullet. A line
# counts as bold (italic) when at least 0.667 of its words are bold (italic);
# a line without words is neither.
sub UpdateXMLFeatures
{
    my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

    # Guard against dividing by zero on a line without words
    my $has_words = ($words_count != 0);

    # Bold feature
    my $is_bold = $has_words && ($bold_count / $words_count >= 0.667);
    push @g_bold, ($is_bold ? "yes" : "no");

    # Italic feature
    my $is_italic = $has_words && ($italic_count / $words_count >= 0.667);
    push @g_italic, ($is_italic ? "yes" : "no");

    # Bullet feature
    my $bulleted = (defined $is_bullet) && ($is_bullet eq "true");
    push @g_bullet, ($bulleted ? "yes" : "no");
}
917
+
918
# Validate a path against a whitelist and return the untainted copy.
#
# Parameters:
#   $path - path to check; only word characters, "-", "_", "/" and "." are
#           allowed (the empty string passes as well)
#
# Returns the captured, and therefore untainted, path.
# Dies with "Bad path ..." on anything else.
#
# The pattern is anchored with \A and \z rather than ^ and $: "$" also matches
# just before a trailing newline, which would accept "file\n" and silently
# return "file".
sub UntaintPath
{
    my ($path) = @_;

    if ($path =~ /\A([-_\/\w\.]*)\z/)
    {
        return $1;
    }

    die "Bad path \"$path\"\n";
}
933
+
934
# Validate a string against a whitelist and return the untainted copy.
#
# Parameters:
#   $s - string to check; must be non-empty and consist only of word
#        characters, spaces and the characters - @ ( ) , . /
#
# Returns the captured, and therefore untainted, string.
# Dies with "Bad data in ..." on anything else.
#
# The pattern is anchored with \A and \z rather than ^ and $: "$" also matches
# just before a trailing newline, which would accept "cmd\n" and silently
# return "cmd".
sub Untaint
{
    my ($s) = @_;

    if ($s =~ /\A([\w \-\@\(\),\.\/]+)\z/)
    {
        # The capture is untainted; the original $s is not
        return $1;
    }

    die "Bad data in $s"; # log this somewhere
}
948
+
949
# Run a command after checking it with Untaint.
#
# Parameters:
#   $cmd - command line; Untaint dies if it contains a character outside its whitelist
#
# Returns whatever system() returns for the command.
sub Execute
{
    my ($cmd) = @_;

    my $checked_cmd = Untaint($cmd);
    return system($checked_cmd);
}
955
+
956
# Build a name for a temporary file from the local time and the process id.
#
# Returns a string of the form "YYYYMMDD-HHMMSS-PID". No file is created.
#
# The timestamp is formatted in-process with POSIX::strftime instead of
# running the external "date" command, so the result does not depend on PATH
# or on a child process succeeding.
sub NewTmpFile
{
    # Loaded here so the sub does not depend on a file-level "use POSIX"
    require POSIX;

    my $tmp_file = POSIX::strftime("%Y%m%d-%H%M%S", localtime()) . "-$$";
    return $tmp_file;
}
962
+
963
+
964
+