biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,1025 @@
1
+ #!/usr/bin/perl -wT
2
+ # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
+
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+ use Getopt::Long;
9
+ use HTML::Entities;
10
+
11
+ # I do not know a better solution to find a lib path in -T mode.
12
+ # So if you know a better solution, I'd be glad to hear.
13
+ # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
+ use FindBin;
15
+ FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
+ my $path;
17
+ BEGIN {
18
+ if ($FindBin::Bin =~ /(.*)/) {
19
+ $path = $1;
20
+ }
21
+ }
22
+ use lib "$path/../../lib";
23
+ use SectLabel::PreProcess;
24
+
25
+ ### USER customizable section
26
+ $0 =~ /([^\/]+)$/; my $progname = $1;
27
+ my $outputVersion = "1.0";
28
+ ### END user customizable section
29
+
30
# Print the one-line copyright banner to STDERR.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
33
+
34
### HELP Sub-procedure
# Print the usage message to STDERR.
# Every switch accepted by the GetOptions call of this script is listed,
# including -allowEmptyLine and -log, which the usage line mentions.
sub Help {
    print STDERR
        "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n",
        "usage: $progname -h\t[invokes help]\n",
        " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n",
        "Options:\n",
        "\t-q\tQuiet Mode (don't echo license)\n",
        "\t-xmlFeature: append XML feature together with text extracted\n",
        "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n",
        "\t-para: marking in the output each paragraph with # Para lineId numLines\n",
        "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n",
        "\t-tag tagFile: count XML tags/values for statistics purpose\n",
        "\t-allowEmptyLine: keep empty lines in the output instead of dropping them\n",
        "\t-log: print debugging information to STDERR\n";
}
48
### Command-line state. These file-scoped lexicals are read by the subs below.
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isXmlFeature = 0;
my $isDecode = 0;

my $isMarkup = 0;
my $isParaDelimiter = 0;

my $tagFile = "";
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'xmlFeature' => \$isXmlFeature,

                            'tag=s' => \$tagFile,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'markup' => \$isMarkup,

                            'para' => \$isParaDelimiter,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# Both -in and -out are mandatory; anything else prints the usage text.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$tagFile = untaintPath($tagFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

### Mark page, para, line, word
my %gPageHash = ();

### Mark paragraph: one "yes"/"no" entry per output line, pushed by processPara
my @gPara = ();

### XML features (parallel arrays, one entry per output line) ###
# locFeature: vertical midpoint of each line plus the running min/max
my @gPosHash = (); my $gMinPos = 1000000; my $gMaxPos = 0;
my @gAlign = (); # alignFeature
my @gBold = (); # bold feature
my @gItalic = (); # italic feature

# font size feature
my %gFontSizeHash = (); my @gFontSize = ();
# font face feature
my %gFontFaceHash = (); my @gFontFace = ();

my @gPic = (); # pic feature
my @gTable = (); # table feature
my @gBullet = (); # bullet feature

# space feature
#my %gSpaceHash = (); my @gSpace = ();
### End XML features ###

my %tags = ();

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

my $markupOutput = "";
# processFile declares ($inFile, $tags). The output path must not be passed,
# otherwise it ends up in the tags slot and -tag dies on a string dereference.
my $allText = processFile($inFile, \%tags);

# Find header part
my @lines = split(/\n/, $allText);
my $numLines = scalar(@lines);
my ($headerLength, $bodyLength, $bodyStartId) =
    SectLabel::PreProcess::findHeaderText(\@lines, 0, $numLines);

# Output
if ($isMarkup) {
    # -markup: dump the word-level markup collected while parsing
    open(my $markupFh, ">:utf8", $outFile) or die "#Can't open file \"$outFile\"\n";
    print {$markupFh} $markupOutput;
    # buffered write errors only surface at close
    close($markupFh) or die "#Can't close file \"$outFile\": $!\n";
} else {
    output(\@lines, $outFile);
}

if ($tagFile ne "") {
    printTagInfo(\%tags, $tagFile);
}
145
+
146
# Read the Omnipage XML file line by line and return the extracted text.
#
# Arguments:
#   $inFile - path of the XML file to read
#   $tags   - hash ref filled by processTagInfo when -tag is given. It is
#             taken from the LAST argument, so both processFile($in, \%tags)
#             and the older processFile($in, $out, \%tags) form work.
# Returns: the concatenated text of all paragraphs, as built by processPara.
# Side effects: appends to $markupOutput when -markup is set; processPara
# fills the global per-line feature arrays.
# Dies if the file does not exist or cannot be opened.
sub processFile {
    my ($inFile, @rest) = @_;

    my $tags = @rest ? $rest[-1] : undef;
    if (ref($tags) ne 'HASH') {
        $tags = {}; # no usable tag hash supplied: collect into a throw-away one
    }

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $in, "<:utf8", $inFile) or die "# $progname crash\t\tCan't open \"$inFile\"";

    my $isPara = 0;           # inside <para> ... </para>
    my $isTable = 0;          # inside <table> ... </table>
    my $isPic = 0;            # inside <dd> ... </dd>
    my $isFirstTableCell = 0; # set when a table opens, cleared by processPara
    my $allText = "";
    my $text = "";            # raw XML lines of the paragraph being collected

    while (my $line = <$in>) {
        next if $line =~ /^\#/; # skip comments
        chomp $line;
        $line =~ s/\cM$//; # remove ^M character at the end of the line if any

        if ($tagFile ne "") {
            processTagInfo($line, $tags);
        }

        if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "### Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $isPic = 1;
            if ($isMarkup) {
                $markupOutput .= "### Figure $1\n";
            }
        }
        elsif ($line =~ /^<\/dd>$/) {
            $isPic = 0;
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $isFirstTableCell = 1;
            if ($isMarkup) {
                $markupOutput .= "### Table $1\n";
            }
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing should have higher priority than paragraph,
        # i.e. the order of these branches matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line."\n"; # processPara needs the <para ...> header
            $isPara = 1;

            if ($isMarkup) {
                $markupOutput .= "## Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            # Only the text is used here; the remaining return values
            # (l, t, r, b, isSpace) are not needed by this caller.
            my ($paraText) = processPara($text, $isTable, $isPic, \$isFirstTableCell);
            $allText .= $paraText;

            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line."\n";
        }
    }
    close $in;

    return $allText;
}
230
+
231
# Write the extracted lines to $outFile, one text line per output line.
#
# Arguments:
#   $lines   - array ref of text lines (a trailing ^M is stripped in place)
#   $outFile - path of the file to write (UTF-8)
#
# Lines are buffered per paragraph. A paragraph starts at each index whose
# $gPara entry is "yes". With -para each paragraph is preceded by
# "# Para <firstLineId> <numLines>"; otherwise the paragraph is written as is,
# after HTML-entity decoding when -decode is set. With -xmlFeature each line
# gets a " |XML| ..." suffix built from the global per-line feature arrays.
# Dies if the file cannot be opened or closed.
sub output {
    my ($lines, $outFile) = @_;

    open(my $out, ">:utf8", $outFile) or die "#Can't open file \"$outFile\"\n";

    ####### Final output ############
    # xml feature label
    my %gFontSizeLabels = ();
    if ($isXmlFeature) {
        getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
    }

    my $id = -1;
    my $output = "";        # text of the paragraph currently being buffered
    my $paraLineId = -1;    # index of the first line of that paragraph
    my $paraLineCount = 0;

    # Emit the buffered paragraph (if any) and clear the buffer.
    my $flush = sub {
        return if $output eq "";

        if ($isParaDelimiter) {
            print {$out} "# Para $paraLineId $paraLineCount\n$output";
            $paraLineCount = 0;
        } else {
            if ($isDecode) {
                $output = decode_entities($output);
            }
            print {$out} $output;
        }
        $output = "";
    };

    foreach my $line (@{$lines}) {
        $id++;

        $line =~ s/\cM$//; # remove ^M character at the end of each line if any

        if ($line =~ /^\s*$/) { # empty lines
            next if !$isAllowEmpty;
            if ($isDebug) {
                print STDERR "#! Line $id empty!\n";
            }
        }

        if ($gPara[$id] eq "yes") { # a new paragraph starts here
            $flush->();
            $paraLineId = $id;
        }

        $output .= $line;
        $paraLineCount++;

        ## Output XML features ###
        if ($isXmlFeature) {
            # loc feature: vertical position scaled into buckets of the
            # observed [gMinPos, gMaxPos] range
            my $locFeature = "";
            if ($gPosHash[$id] != -1) {
                $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos)*8.0/($gMaxPos - $gMinPos + 1));
            }

            # fontSize feature
            my $fontSizeFeature;
            if ($gFontSize[$id] == -1) {
                $fontSizeFeature = "xmlFontSize_none";
            } else {
                $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
            }

            my $boldFeature = "xmlBold_".$gBold[$id];       # bold feature
            my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
            my $picFeature = "xmlPic_".$gPic[$id];          # pic feature
            my $tableFeature = "xmlTable_".$gTable[$id];    # table feature
            my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature

            ## Differential features: only the last two are emitted ##
            my ($fontSFBIADiff, $paraDiff) = (getDifferentialFeatures($id))[5, 6];

            $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
        } else {
            $output .= "\n";
        }
    }

    $flush->(); # last paragraph

    # buffered write errors only surface at close
    close($out) or die "#Can't close file \"$outFile\": $!\n";
}
342
+
343
# Build the "bi_xml*" features that compare line $id with line $id-1.
#
# Returns, in this order:
#   ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff,
#    $fontSFBIDiff, $fontSFBIADiff, $paraDiff)
# Each font/style feature ends in "continue" when every attribute it covers is
# unchanged from the previous line, else "new" (the first line is always "new").
# The alignment feature carries the alignment value itself instead of "new".
sub getDifferentialFeatures {
    my ($id) = @_;

    # Per-attribute "same as previous line" flags; all false for line 0.
    my ($sameAlign, $sameFace, $sameSize, $sameBold, $sameItalic) = (0, 0, 0, 0, 0);
    if ($id != 0) {
        my $prev = $id - 1;
        $sameAlign  = ($gAlign[$id] eq $gAlign[$prev]);
        $sameFace   = ($gFontFace[$id] eq $gFontFace[$prev]);
        $sameSize   = ($gFontSize[$id] == $gFontSize[$prev]);
        $sameBold   = ($gBold[$id] eq $gBold[$prev]);
        $sameItalic = ($gItalic[$id] eq $gItalic[$prev]);
    }

    my $label = sub { return $_[0] ? "continue" : "new"; };

    my $sameSF   = $sameSize && $sameFace;
    my $sameSFBI = $sameSF && $sameBold && $sameItalic;

    my $alignDiff     = "bi_xmlA_" . ($sameAlign ? "continue" : $gAlign[$id]);
    my $fontFaceDiff  = "bi_xmlF_" . $label->($sameFace);
    my $fontSizeDiff  = "bi_xmlS_" . $label->($sameSize);
    my $fontSFDiff    = "bi_xmlSF_" . $label->($sameSF);
    my $fontSFBIDiff  = "bi_xmlSFBI_" . $label->($sameSFBI);
    my $fontSFBIADiff = "bi_xmlSFBIA_" . $label->($sameSFBI && $sameAlign);

    # para change feature: lines before the body are all tagged "header"
    my $paraDiff = "bi_xmlPara_";
    if ($id < $bodyStartId) {
        $paraDiff .= "header";
    } elsif ($gPara[$id] eq "yes") {
        $paraDiff .= "new";
    } else {
        $paraDiff .= "continue";
    }

    return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
}
420
+
421
# Map every observed font size to a relative label.
#
# Arguments:
#   $gFontSizeHash   - hash ref: font size => number of lines using it
#   $gFontSizeLabels - hash ref filled with: font size => label
#
# The most frequent size is labelled "common", smaller sizes "smaller", and
# larger sizes "larger", except the three largest, which get "largest-2",
# "largest-1" and "largest0" (the largest). When several sizes share the top
# frequency the smallest is taken as common, so the result does not depend on
# hash ordering. An empty histogram leaves $gFontSizeLabels untouched.
sub getFontSizeLabels {
    my ($gFontSizeHash, $gFontSizeLabels) = @_;

    if ($isDebug) { print STDERR "# Map fonts\n"; }

    my @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash}; # ascending sizes
    return if !@sortedFonts; # no font size seen: nothing to label

    # highest frequency first; ties broken by size for a deterministic choice
    my ($commonSize) = sort {
        $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} || $a <=> $b
    } @sortedFonts;

    # index of the common font size in the ascending list
    my $commonIndex = 0;
    foreach my $size (@sortedFonts) {
        last if $commonSize == $size;
        $commonIndex++;
    }

    my $numFonts = scalar(@sortedFonts);
    for my $i (0 .. $numFonts - 1) {
        my $size = $sortedFonts[$i];

        if ($i < $commonIndex) {             # small fonts
            $gFontSizeLabels->{$size} = "smaller";
        } elsif ($i == $commonIndex) {       # common font
            $gFontSizeLabels->{$commonSize} = "common";
        } elsif (($numFonts - $i) <= 3) {    # three largest fonts
            $gFontSizeLabels->{$size} = "largest".($i + 1 - $numFonts);
        } else {                             # other large fonts
            $gFontSizeLabels->{$size} = "larger";
        }

        if ($isDebug) {
            print STDERR "$size --> $gFontSizeLabels->{$size}, freq = $gFontSizeHash->{$size}\n";
        }
    }

    return;
}
465
+
466
# Label each observed paragraph spacing as "yes" (larger than the common
# spacing) or "no".
#
# Arguments:
#   $gSpaceHash   - hash ref: spacing value => frequency
#   $gSpaceLabels - hash ref filled with: spacing value => "yes" / "no"
#
# The common spacing starts as the most frequent value. It is then raised to
# the largest value among the leading run of spacings whose frequency exceeds
# 80% of that top frequency.
sub getSpaceLabels {
    my ($gSpaceHash, $gSpaceLabels) = @_;

    print STDERR "\n# Map space\n" if $isDebug;

    # sort by freqs, obtain spacing values
    my @byFreq = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash};

    my $commonSpace = $byFreq[0];
    my $commonFreq = $gSpaceHash->{$commonSpace};

    # find similar common freq with larger spaces
    foreach my $candidate (@byFreq) {
        last unless $gSpaceHash->{$candidate} / $commonFreq > 0.8;
        $commonSpace = $candidate if $candidate > $commonSpace;
    }

    foreach my $value (@byFreq) {
        $gSpaceLabels->{$value} = ($value > $commonSpace) ? "yes" : "no";

        if ($isDebug) {
            print STDERR "$value --> $gSpaceLabels->{$value}, freq = $gSpaceHash->{$value}\n";
        }
    }
}
500
+
501
# Return the value of attribute $attr inside the attribute string $attrText,
# or "none" when the attribute is absent or has an empty value.
#
# The name is matched as a whole attribute name: it must not be preceded by a
# name character, so asking for "l" does not match inside spell="...". It is
# also quoted with \Q..\E so regex metacharacters in $attr are literal.
sub getAttrValue {
    my ($attrText, $attr) = @_;

    return "none" unless defined $attrText && defined $attr;

    if ($attrText =~ /(?<![\w:.-])\Q$attr\E="([^"]+)"/) {
        return $1;
    }

    return "none";
}
511
+
512
# If attribute $attr is present (with a non-empty value) in $attrText, add
# $count to the tally kept for that value in $attrHash.
#
# The name is matched as a whole attribute name and quoted with \Q..\E, so
# "fontSize" does not match inside a longer name. An undefined $attrText is
# ignored.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    return unless defined $attrText;

    if ($attrText =~ /(?<![\w:.-])\Q$attr\E="([^"]+)"/) {
        $attrHash->{$1} += $count;
    }

    return;
}
521
+
522
# Parse the XML lines of one <para> ... </para> element.
#
# Arguments:
#   $inputText        - the paragraph's XML lines joined with "\n", starting
#                       with the <para ...> line
#   $isTable, $isPic  - true when the paragraph sits inside <table> / <dd>
#   $isFirstTableCell - scalar ref; inside a table only the first line seen
#                       while it is true is marked as a paragraph start, and
#                       the flag is then cleared
# Returns: ($allText, $l, $t, $r, $bottom, $isSpace), i.e. the extracted text,
# the box of the last <ln> whose coordinates were parsed, and whether the last
# relevant token was a <space/>.
# Side effects: pushes one entry per emitted line onto @gPara and, with
# -xmlFeature, onto the global feature arrays; appends to $markupOutput with
# -markup.
sub processPara {
    my ($inputText, $isTable, $isPic, $isFirstTableCell) = @_;

    my $isSpace = 0;
    my $isSpecialSpace = 0;
    my $isBullet = 0;

    # 3 signals for end of L: forcedEOF="true" in attribute of <ln>, or
    # <nl orig="true"/>, or end of </para> without encountering any of the
    # above signals in the para plus $isSpace = 0
    my $isForcedEOF = "none";

    # xml feature
    my $align = "none";
    my ($l, $t, $r, $bottom);
    my %fontSizeHash = ();
    my %fontFaceHash = ();
    my $space = "none";

    my $lnAttr;  my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
    my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
    my $wdAttr;  my $wdText = ""; my $isWd = 0;

    # word index in a line; at </ln> it is the number of words in the line
    my $wdIndex = 0;
    my $lnBoldCount = 0;
    my $lnItalicCount = 0;

    my $allText = "";
    my $text = ""; # text of the line in progress; moved into $allText when the line ends

    binmode(STDERR, ":utf8");

    # Append one word record to the markup output: text, the word attributes,
    # and a bold/italic marker taken from the enclosing run (if both are set,
    # only the first one found is used).
    my $appendMarkupWord = sub {
        my ($wordText) = @_;
        $markupOutput .= "$wordText $wdAttr";
        if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) {
            $markupOutput .= " $1=\"true\"";
        }
        $markupOutput .= "\n";
    };

    my $isFirstLinePara = 1;
    my @lines = split(/\n/, $inputText);
    for (my $i = 0; $i < scalar(@lines); $i++) {
        my $line = $lines[$i];

        ## new para
        if ($line =~ /^<para (.+?)>$/) {
            my $attr = $1;
            $align = getAttrValue($attr, "alignment");
            $space = getAttrValue($attr, "spaceBefore");
        }

        ## new ln
        elsif ($line =~ /^<ln (.+)>$/) {
            $lnAttr = $1;
            $isLn = 1;

            if ($isMarkup) {
                $markupOutput .= "# Line $lnAttr\n";
            }

            if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/) {
                ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
            }
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");

            if ($isXmlFeature) { # Bold & Italic
                $lnBold = getAttrValue($lnAttr, "bold");
                $lnItalic = getAttrValue($lnAttr, "italic");
            }
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            $runAttr = $1;

            $isSpace = 0;
            $isRun = 1;

            if ($line =~ /^<wd (.*?)>/) { # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }

            if ($isXmlFeature) { # Bold & Italic
                $runBold = getAttrValue($runAttr, "bold");
                $runItalic = getAttrValue($runAttr, "italic");
            }
        }

        ## wd: a complete word on one line
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;
            $isSpace = 0;

            if ($isMarkup) {
                $appendMarkupWord->($word);
            }

            if ($isXmlFeature) {
                # FontSize & FontFace
                checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
                checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);

                # Bold & Italic: a word inherits the style of its run or line
                my $wdBold = getAttrValue($wdAttr, "bold");
                my $wdItalic = getAttrValue($wdAttr, "italic");

                if ($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true") {
                    $lnBoldCount++;
                }
                if ($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true") {
                    $lnItalicCount++;
                }
            }

            ## add text
            $text .= "$word";

            if ($isRun) {
                $runText .= "$word ";
            }
            $wdIndex++;
        }

        ## end wd (a word made of several runs)
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            if ($isMarkup) {
                $appendMarkupWord->($wdText);
                $wdAttr = "";
            }

            # start the next multi-run word from scratch, otherwise its markup
            # record would also contain the text of every earlier word
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                if ($isXmlFeature) { # Bold & Italic
                    if ($runBold eq "true" || $lnBold eq "true") {
                        $lnBoldCount++;
                    }
                    if ($runItalic eq "true" || $lnItalic eq "true") {
                        $lnItalicCount++;
                    }
                }

                # appear in the final result
                if ($isLn) { $text .= "$word"; }

                # for internal record
                if ($isRun) { $runText .= "$word "; }
                if ($isWd) { $wdText .= "$word"; }

                $wdIndex++;
            }

            # xml feature
            if ($isXmlFeature && $runText ne "") { # not a space, tab or new-line run
                my @words = split(/\s+/, $runText);
                my $numWords = scalar(@words);
                checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
                checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
            }

            ## reset run
            if (!$isLn) { # <run> not enclosed within <ln>
                $wdIndex = 0;
            }
            $runText = "";
            $isRun = 0;
            $isSpecialSpace = 0;

            if ($isXmlFeature) { # Bold & Italic
                $runBold = "none";
                $runItalic = "none";

                if (!$isLn) { # <run> not enclosed within <ln>
                    $lnBoldCount = 0;
                    $lnItalicCount = 0;
                }
            }
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            if ((!$isAllowEmpty && $text !~ /^\s*$/)
                || ($isAllowEmpty && $text ne "")) {
                if ($isForcedEOF eq "true" || # there's a forced EOL?
                    !$isSpecialSpace          # not an empty line with space character
                   ) {
                    $text .= "\n";

                    # update allText
                    $allText .= $text;
                    $text = "";
                }

                my $numWords = $wdIndex;

                # paragraph-start marker for this line
                if (!$isTable) {
                    if ($isFirstLinePara) {
                        push(@gPara, "yes");
                        $isFirstLinePara = 0;
                    } else {
                        push(@gPara, "no");
                    }
                } else {
                    if ($$isFirstTableCell) {
                        push(@gPara, "yes");
                        $$isFirstTableCell = 0;
                    } else {
                        push(@gPara, "no");
                    }
                }

                if ($isXmlFeature && $numWords >= 1) {
                    # assumption: fontSize occurs either in <ln>, or within
                    # multiple <run> under <ln>, but not both
                    checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
                    checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
                }

                if ($isXmlFeature && !$isSpecialSpace) {
                    my $pos = ($t + $bottom)/2.0; # vertical midpoint of the line
                    if ($pos < $gMinPos) { $gMinPos = $pos; }
                    if ($pos > $gMaxPos) { $gMaxPos = $pos; }
                    push(@gPosHash, $pos); # pos feature
                    push(@gAlign, $align); # alignment feature

                    push(@gPic, $isPic ? "yes" : "no");
                    push(@gTable, $isTable ? "yes" : "no");

                    if ($isPic || $isTable) {
                        ### Not assign value ###
                        push(@gFontSize, -1);     # font size feature
                        push(@gFontFace, "none"); # font face feature
                        push(@gBold, "no");       # bold feature
                        push(@gItalic, "no");     # italic feature
                        push(@gBullet, "no");     # bullet feature
                    } else {
                        updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
                        %fontSizeHash = (); %fontFaceHash = ();

                        updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
                    } # end if pic
                } # end if($isXmlFeature && !$isSpecialSpace)
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
            $wdIndex = 0;

            if ($isXmlFeature) { # Bold & Italic
                $lnBold = "none";
                $lnItalic = "none";

                $lnBoldCount = 0;
                $lnItalicCount = 0;
            }
        } # end else </ln>

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if ($isLn) {
                $isSpace = 0;
            } else {
                if ($isDebug) {
                    print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
                }
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            # A space is "special" when it is the only content of an element:
            # the previous line opens tag X and the next line closes tag X.
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## addText
            $text .= " ";
            $isSpace = 1;
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add Text
            $text .= "\t";
        }

        ## bullet
        elsif ($line =~ /^<bullet .*>$/) {
            $isBullet = 1;
        }
    }

    $allText .= $text;
    return ($allText, $l, $t, $r, $bottom, $isSpace);
}
857
+
858
# Record the dominant font size and font face of one line.
#
# Arguments:
#   $fontSizeHash - hash ref: font size => word count within the line
#   $fontFaceHash - hash ref: font face => word count within the line
#
# Pushes the most frequent size / face onto @gFontSize / @gFontFace (-1 /
# "none" when nothing was seen) and counts it in %gFontSizeHash /
# %gFontFaceHash. Frequency ties are broken by key (numerically for sizes,
# as strings for faces), so the result does not depend on hash ordering.
sub updateXMLFontFeature {
    my ($fontSizeHash, $fontFaceHash) = @_;

    # font size feature
    if (scalar(keys %{$fontSizeHash}) == 0) {
        push(@gFontSize, -1);
    } else {
        my ($fontSize) = sort {
            $fontSizeHash->{$b} <=> $fontSizeHash->{$a} || $a <=> $b
        } keys %{$fontSizeHash};

        push(@gFontSize, $fontSize);
        $gFontSizeHash{$fontSize}++;
    }

    # font face feature
    if (scalar(keys %{$fontFaceHash}) == 0) {
        push(@gFontFace, "none");
    } else {
        my ($fontFace) = sort {
            $fontFaceHash->{$b} <=> $fontFaceHash->{$a} || $a cmp $b
        } keys %{$fontFaceHash};

        push(@gFontFace, $fontFace);
        $gFontFaceHash{$fontFace}++;
    }

    return;
}
884
+
885
# Push the bold, italic and bullet features of one line onto the global
# feature arrays (@gBold, @gItalic, @gBullet).
#
# Arguments:
#   $lnBoldCount, $lnItalicCount - number of bold / italic words in the line
#   $numWords                    - number of words in the line
#   $isBullet                    - true when a <bullet> was seen
#   $space                       - accepted for the disabled space feature; unused
#
# A line counts as bold (italic) when at least 0.667 of its words are. A line
# with zero words is treated as neither, instead of dividing by zero, and one
# entry is still pushed so the arrays stay aligned.
sub updateXMLFeatures {
    my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;

    my $boldRatio   = $numWords ? $lnBoldCount / $numWords : 0;
    my $italicRatio = $numWords ? $lnItalicCount / $numWords : 0;

    push(@gBold, ($boldRatio >= 0.667) ? "yes" : "no");     # bold feature
    push(@gItalic, ($italicRatio >= 0.667) ? "yes" : "no"); # italic feature
    push(@gBullet, $isBullet ? "yes" : "no");               # bullet feature

    # space feature
    # push(@gSpace, $space);
    return;
}
915
+
916
## Find the positions of header, body, and citation
# Returns ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId).
# The citation part is located first; the header search then runs only over
# the lines before it. Dies (after echoing the message to STDOUT) when the
# three lengths do not add up to $numLines.
sub getStructureInfo {
    my ($lines, $numLines) = @_;

    my ($preCitationLength, $citationLength, $bodyEndId) =
        SectLabel::PreProcess::findCitationText($lines, 0, $numLines);

    my ($headerLength, $bodyLength, $bodyStartId) =
        SectLabel::PreProcess::findHeaderText($lines, 0, $preCitationLength);

    # sanity check
    my $totalLength = $headerLength + $bodyLength + $citationLength;
    unless ($numLines == $totalLength) {
        my $message = "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
        print STDOUT $message; # to display in Web
        die $message;
    }

    return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
}
935
+
936
## Count XML tags/values for statistics purpose
# For a line that starts with "<tag ...", count every name=value token of the
# tag in $tags->{tag}->{name}->{value}. Values keep their surrounding quotes.
# Lines that do not start with "<" are ignored.
sub processTagInfo {
    my ($line, $tags) = @_;

    return unless $line =~ /^<(.+?)\b(.*)/;
    my ($tag, $attr) = ($1, $2);

    # make sure the tag is listed even when it carries no attributes
    $tags->{$tag} ||= undef;

    # keep only what sits between the tag name and the closing ">" or "/>"
    $attr = $1 if $attr =~ /^\s*(.+?)\s*\/?>/;

    foreach my $token (split(/\s+/, $attr)) {
        next unless $token =~ /^(.+)=(.+)$/;
        my ($attrName, $value) = ($1, $2);
        $tags->{$tag}->{$attrName}->{$value}++;
    }
}
968
+
969
## Print tag info to file
# Write the statistics gathered by processTagInfo to $tagFile (UTF-8): one
# "# Tag = name" header per tag, then one "attr: value-count ..." line per
# attribute. Tags, attributes and values are each sorted as strings.
sub printTagInfo {
    my ($tags, $tagFile) = @_;

    open(my $tagFh, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";

    foreach my $tagName (sort { $a cmp $b } keys %{$tags}) {
        my $attrsOfTag = $tags->{$tagName};
        my @attrNames = sort { $a cmp $b } keys %{$tags->{$tagName}};

        print {$tagFh} "# Tag = $tagName\n";
        foreach my $attr (@attrNames) {
            print {$tagFh} "$attr:";
            foreach my $value (sort { $a cmp $b } keys %{$attrsOfTag->{$attr}}) {
                print {$tagFh} " $value-$attrsOfTag->{$attr}->{$value}";
            }
            print {$tagFh} "\n";
        }
    }

    close $tagFh;
}
989
+
990
# Validate a file path for taint mode and return the regex-captured copy.
# Only word characters, "-", "_", "/" and "." are accepted (the empty string
# passes too); anything else dies with a "Bad path" message.
sub untaintPath {
    my ($path) = @_;

    die "Bad path \"$path\"\n" unless $path =~ /^([-_\/\w\.]*)$/;

    return $1;
}
1001
+
1002
# Validate a general string (such as a command line) for taint mode and
# return the regex-captured copy. Allowed characters: word characters, space,
# and - @ ( ) , . /  -- anything else, or an empty string, dies.
sub untaint {
    my ($s) = @_;

    unless ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
        die "Bad data in $s"; # log this somewhere
    }

    return $1; # the capture is the untainted copy
}
1011
+
1012
# Run a shell command after checking it with untaint(); with -log the command
# is echoed to STDERR first. Returns the value of system().
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    return system(untaint($cmd));
}
1020
+
1021
# Build a temporary file name from the output of the external `date` command
# (format YYYYmmdd-HHMMSS) followed by "-" and this process's id.
sub newTmpFile {
    chomp(my $stamp = `date '+%Y%m%d-%H%M%S-$$'`);
    return $stamp;
}