biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,1427 @@
1
+ #!/usr/bin/perl -wT
2
+
3
+ # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+ use Getopt::Long;
9
+ use HTML::Entities;
10
+
11
+ # I do not know a better solution to find a lib path in -T mode.
12
+ # So if you know a better solution, I'd be glad to hear.
13
+ # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
+ use FindBin;
15
+ FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
+
17
+ my $path;
18
+ BEGIN
19
+ {
20
+ if ($FindBin::Bin =~ /(.*)/)
21
+ {
22
+ $path = $1;
23
+ }
24
+ }
25
+ use lib "$path/../../lib";
26
+
27
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
+ use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
+
30
+ use SectLabel::PreProcess;
31
+
32
+ # USER customizable section
33
+ $0 =~ /([^\/]+)$/; my $progname = $1;
34
+ my $outputVersion = "1.0";
35
+ # END user customizable section
36
+
37
+ sub License
38
+ {
39
+ print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
40
+ }
41
+
42
+ # HELP sub-procedure
43
+ sub Help
44
+ {
45
+ print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
46
+ print STDERR "usage: $progname -h\t[invokes help]\n";
47
+ print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
48
+ print STDERR "Options:\n";
49
+ print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
50
+ print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
51
+ print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
52
+ print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
53
+ print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
54
+ print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
55
+ }
56
+
57
+ my $QUIET = 0;
58
+ my $HELP = 0;
59
+ my $outFile = undef;
60
+ my $inFile = undef;
61
+
62
+ my $isXmlFeature = 0;
63
+ my $isDecode = 0;
64
+
65
+ my $isMarkup = 0;
66
+ my $isParaDelimiter = 0;
67
+
68
+ my $tagFile = "";
69
+ my $isAllowEmpty = 0;
70
+ my $isDebug = 1;
71
+
72
+ $HELP = 1 unless GetOptions('in=s' => \$inFile,
73
+ 'out=s' => \$outFile,
74
+ 'decode' => \$isDecode,
75
+ 'xmlFeature' => \$isXmlFeature,
76
+
77
+ 'tag=s' => \$tagFile,
78
+ 'allowEmptyLine' => \$isAllowEmpty,
79
+ 'markup' => \$isMarkup,
80
+
81
+ 'para' => \$isParaDelimiter,
82
+ 'log' => \$isDebug,
83
+ 'h' => \$HELP,
84
+ 'q' => \$QUIET);
85
+
86
+ if ($HELP || !defined $inFile || !defined $outFile)
87
+ {
88
+ Help();
89
+ exit(0);
90
+ }
91
+
92
+ if (!$QUIET)
93
+ {
94
+ License();
95
+ }
96
+
97
+ # Untaint
98
+ $inFile = untaintPath($inFile);
99
+ $outFile = untaintPath($outFile);
100
+ $tagFile = untaintPath($tagFile);
101
+ $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
102
+ # End untaint
103
+
104
+ # Mark page, para, line, word
105
+ my %gPageHash = ();
106
+
107
+ # Mark paragraph
108
+ my @gPara = ();
109
+
110
+ # XML features
111
+ # locFeature
112
+ my @gPosHash = ();
113
+ my $gMinPos = 1000000;
114
+ my $gMaxPos = 0;
115
+
116
+ # alignFeature
117
+ my @gAlign = ();
118
+
119
+ # boldFeature
120
+ my @gBold = ();
121
+
122
+ # italicFeature
123
+ my @gItalic = ();
124
+
125
+ # fontSizeFeature
126
+ my %gFontSizeHash = ();
127
+ my @gFontSize = ();
128
+
129
+ # fontFaceFeature
130
+ my %gFontFaceHash = ();
131
+ my @gFontFace = ();
132
+
133
+ # picFeature
134
+ my @gPic = ();
135
+
136
+ # tableFeature
137
+ my @gTable = ();
138
+
139
+ # bulletFeature
140
+ my @gBullet = ();
141
+
142
+ # spaceFeature
143
+ #my %gSpaceHash = ();
144
+ #my @gSpace = ();
145
+ # End XML features
146
+
147
+ my %tags = ();
148
+
149
+ if ($isDebug)
150
+ {
151
+ print STDERR "\n# Processing file $inFile & output to $outFile\n";
152
+ }
153
+
154
+ my $markupOutput = "";
155
+ my $allText = processFile($inFile, $outFile, \%tags);
156
+
157
+ # Find header part
158
+ my @lines = split(/\n/, $allText);
159
+ my $numLines = scalar(@lines);
160
+ my ($headerLength, $bodyLength, $bodyStartId) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $numLines);
161
+
162
+ # Output
163
+ if ($isMarkup)
164
+ {
165
+ open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
166
+ print OF "$markupOutput";
167
+ close OF;
168
+ }
169
+ else
170
+ {
171
+ output(\@lines, $outFile);
172
+ }
173
+
174
+ if ($tagFile ne "")
175
+ {
176
+ printTagInfo(\%tags, $tagFile);
177
+ }
178
+
179
+ sub processFile
180
+ {
181
+ my ($inFile, $tags) = @_;
182
+
183
+ if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
184
+ open (IF, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
185
+
186
+ my $isPic = 0;
187
+ my $isPara = 0;
188
+ my $isTable = 0;
189
+ my $isSpace = 0;
190
+
191
+ my $text = undef;
192
+ my $allText = undef;
193
+
194
+ my $lineId = 0;
195
+
196
+ # Each line contains a header
197
+ while (<IF>)
198
+ {
199
+ if (/^\#/) { next; } # skip comments
200
+
201
+ chomp;
202
+ s/\cM$//; # remove ^M character at the end of the file if any
203
+
204
+ my $line = $_;
205
+ if($tagFile ne "")
206
+ {
207
+ processTagInfo($line, $tags);
208
+ }
209
+
210
+ # if ($line =~ /<\?xml version.+>/) { } ### Xml ###
211
+ # if ($line =~ /^<\/column>$/) { } ### Column ###
212
+ if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/ && $isMarkup)
213
+ {
214
+ $markupOutput .= "### Page $1\n";
215
+ }
216
+
217
+ ### pic ###
218
+ if ($line =~ /^<dd (.*)>$/)
219
+ {
220
+ $isPic = 1;
221
+ if($isMarkup)
222
+ {
223
+ $markupOutput .= "### Dd $1\n";
224
+ }
225
+ }
226
+ elsif ($line =~ /^<\/dd>$/)
227
+ {
228
+ $isPic = 0;
229
+ }
230
+
231
+ ### Table ###
232
+ if ($line =~ /^<table .*>$/)
233
+ {
234
+ $text .= $line."\n"; # we need the header
235
+ $isTable = 1;
236
+ }
237
+ elsif ($line =~ /^<\/table>$/)
238
+ {
239
+ my $tableText = processTable($text, $isPic);
240
+ $allText .= $tableText;
241
+
242
+ my @tmpLines = split(/\n/, $tableText);
243
+ $lineId += scalar(@tmpLines);
244
+
245
+ $isTable = 0;
246
+ $text = "";
247
+ }
248
+ elsif($isTable)
249
+ {
250
+ $text .= $line."\n";
251
+ next;
252
+ }
253
+ ### Paragraph ###
254
+ # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
255
+ elsif ($line =~ /^<para (.*)>$/)
256
+ {
257
+ $text .= $line."\n"; # we need the header
258
+ $isPara = 1;
259
+
260
+ if($isMarkup)
261
+ {
262
+ $markupOutput .= "## Para $1\n";
263
+ }
264
+ }
265
+ elsif ($line =~ /^<\/para>$/)
266
+ {
267
+ my ($paraText, $l, $t, $r, $b);
268
+ ($paraText, $l, $t, $r, $b, $isSpace) = processPara($text, 0, $isPic);
269
+ $allText .= $paraText;
270
+
271
+ my @tmpLines = split(/\n/, $paraText);
272
+ $lineId += scalar(@tmpLines);
273
+ $isPara = 0;
274
+ $text = "";
275
+ }
276
+ elsif($isPara)
277
+ {
278
+ $text .= $line."\n";
279
+ next;
280
+ }
281
+ }
282
+
283
+ close IF;
284
+ return $allText;
285
+ }
286
+
287
+ sub output
288
+ {
289
+ my ($lines, $outFile) = @_;
290
+
291
+ open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
292
+
293
+ ####### Final output ############
294
+ # xml feature label
295
+ my %gFontSizeLabels = ();
296
+ # my %gSpaceLabels = (); # yes, no
297
+
298
+ if($isXmlFeature)
299
+ {
300
+ getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
301
+ # getSpaceLabels(\%gSpaceHash, \%gSpaceLabels);
302
+ }
303
+
304
+ my $id = -1;
305
+ my $output = "";
306
+ my $paraLineId = -1;
307
+ my $paraLineCount = 0;
308
+ foreach my $line (@{$lines})
309
+ {
310
+ $id++;
311
+
312
+ $line =~ s/\cM$//; # remove ^M character at the end of each line if any
313
+
314
+ # empty lines
315
+ if ($line =~ /^\s*$/)
316
+ {
317
+ if (!$isAllowEmpty)
318
+ {
319
+ next;
320
+ }
321
+ else
322
+ {
323
+ if ($isDebug)
324
+ {
325
+ print STDERR "#! Line $id empty!\n";
326
+ }
327
+ }
328
+ }
329
+
330
+ if ($gPara[$id] eq "yes")
331
+ {
332
+ # mark para
333
+ if ($output ne "")
334
+ {
335
+ if ($isParaDelimiter)
336
+ {
337
+ print OF "# Para $paraLineId $paraLineCount\n$output";
338
+ $paraLineCount = 0;
339
+ }
340
+ else
341
+ {
342
+ if($isDecode)
343
+ {
344
+ $output = decode_entities($output);
345
+ }
346
+ print OF $output;
347
+ }
348
+
349
+ $output = "";
350
+ }
351
+ $paraLineId = $id;
352
+ }
353
+
354
+ $output .= $line;
355
+ $paraLineCount++;
356
+
357
+ ## Output XML features
358
+ if ($isXmlFeature)
359
+ {
360
+ # loc feature
361
+ my $locFeature;
362
+
363
+ if($gPosHash[$id] != -1)
364
+ {
365
+ $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos) * 8.0 / ($gMaxPos - $gMinPos + 1));
366
+ }
367
+
368
+ # align feature
369
+ my $alignFeature = "xmlAlign_".$gAlign[$id];
370
+
371
+ # fontSize feature
372
+ my $fontSizeFeature;
373
+ if($gFontSize[$id] == -1)
374
+ {
375
+ $fontSizeFeature = "xmlFontSize_none";
376
+ }
377
+ else
378
+ {
379
+ $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
380
+ }
381
+
382
+ my $boldFeature = "xmlBold_".$gBold[$id]; # bold feature
383
+ my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
384
+ my $picFeature = "xmlPic_".$gPic[$id]; # pic feature
385
+ my $tableFeature = "xmlTable_".$gTable[$id]; # table feature
386
+ my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature
387
+
388
+ # space feature
389
+ #my $spaceFeature;
390
+ #if($gSpace[$id] eq "none")
391
+ #{
392
+ # $spaceFeature = "xmlSpace_none";
393
+ #}
394
+ #else
395
+ #{
396
+ # $spaceFeature = "xmlSpace_".$gSpaceLabels{$gSpace[$id]};
397
+ #}
398
+
399
+ # differential features
400
+ my ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff) = getDifferentialFeatures($id);
401
+
402
+ $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
403
+ # $alignFeature $alignDiff $fontSizeDiff $fontFaceDiff $fontSFDiff $fontSFBIDiff
404
+ }
405
+ else
406
+ {
407
+ $output .= "\n";
408
+ }
409
+ }
410
+
411
+ # mark para
412
+ if ($output ne "")
413
+ {
414
+ if ($isParaDelimiter)
415
+ {
416
+ print OF "# Para $paraLineId $paraLineCount\n$output";
417
+ $paraLineCount = 0;
418
+ }
419
+ else
420
+ {
421
+ if ($isDecode)
422
+ {
423
+ $output = decode_entities($output);
424
+ }
425
+
426
+ print OF $output;
427
+ }
428
+ $output = ""
429
+ }
430
+ close OF;
431
+ }
432
+
433
+ sub getDifferentialFeatures
434
+ {
435
+ my ($id) = @_;
436
+
437
+ # alignChange feature
438
+ my $alignDiff = "bi_xmlA_";
439
+ if ($id == 0)
440
+ {
441
+ $alignDiff .= $gAlign[$id];
442
+ }
443
+ elsif ($gAlign[$id] eq $gAlign[$id-1])
444
+ {
445
+ $alignDiff .= "continue";
446
+ }
447
+ else
448
+ {
449
+ $alignDiff .= $gAlign[$id];
450
+ }
451
+
452
+ # fontFaceChange feature
453
+ my $fontFaceDiff = "bi_xmlF_";
454
+ if ($id == 0)
455
+ {
456
+ $fontFaceDiff .= "new";
457
+ }
458
+ elsif ($gFontFace[$id] eq $gFontFace[$id-1])
459
+ {
460
+ $fontFaceDiff .= "continue";
461
+ }
462
+ else
463
+ {
464
+ $fontFaceDiff .= "new";
465
+ }
466
+
467
+ # fontSizeChange feature
468
+ my $fontSizeDiff = "bi_xmlS_";
469
+ if ($id == 0)
470
+ {
471
+ $fontSizeDiff .= "new";
472
+ }
473
+ elsif ($gFontSize[$id] == $gFontSize[$id-1])
474
+ {
475
+ $fontSizeDiff .= "continue";
476
+ }
477
+ else
478
+ {
479
+ $fontSizeDiff .= "new";
480
+ }
481
+
482
+ # fontSFChange feature
483
+ my $fontSFDiff = "bi_xmlSF_";
484
+ if($id == 0){
485
+ $fontSFDiff .= "new";
486
+ } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1]){
487
+ $fontSFDiff .= "continue";
488
+ } else {
489
+ $fontSFDiff .= "new";
490
+ }
491
+
492
+ # fontSFBIChange feature
493
+ my $fontSFBIDiff = "bi_xmlSFBI_";
494
+ if($id == 0){
495
+ $fontSFBIDiff .= "new";
496
+ } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1]){
497
+ $fontSFBIDiff .= "continue";
498
+ } else {
499
+ $fontSFBIDiff .= "new";
500
+ }
501
+
502
+ # fontSFBIAChange feature
503
+ my $fontSFBIADiff = "bi_xmlSFBIA_";
504
+ if($id == 0){
505
+ $fontSFBIADiff .= "new";
506
+ } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1] && $gAlign[$id] eq $gAlign[$id-1]){
507
+ $fontSFBIADiff .= "continue";
508
+ } else {
509
+ $fontSFBIADiff .= "new";
510
+ }
511
+
512
+ # para change feature
513
+ my $paraDiff = "bi_xmlPara_";
514
+ if($id < $bodyStartId){ # header part, consider each line as a separate paragraph
515
+ $paraDiff .= "header";
516
+ } else {
517
+ if($gPara[$id] eq "yes"){
518
+ $paraDiff .= "new";
519
+ } else {
520
+ $paraDiff .= "continue";
521
+ }
522
+ }
523
+
524
+ return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
525
+ }
526
+
527
+ sub getFontSizeLabels {
528
+ my ($gFontSizeHash, $gFontSizeLabels) = @_;
529
+
530
+ if($isDebug){ print STDERR "# Map fonts\n"; }
531
+ my @sortedFonts = sort { $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} } keys %{$gFontSizeHash}; # sort by values, obtain keys
532
+
533
+ my $commonSize = $sortedFonts[0];
534
+ @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash}; # sort by keys, obtain keys
535
+ my $commonIndex = 0; # index of common font size
536
+ foreach(@sortedFonts){
537
+ if($commonSize == $_) { # found
538
+ last;
539
+ }
540
+ $commonIndex++;
541
+ }
542
+
543
+ # small fonts
544
+ for(my $i = 0; $i<$commonIndex; $i++){ # smallIndex $largeIndex
545
+ $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";
546
+
547
+ if($isDebug){
548
+ print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
549
+ }
550
+ }
551
+
552
+ # common fonts
553
+ $gFontSizeLabels->{$commonSize} = "common";
554
+ if($isDebug){
555
+ print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
556
+ }
557
+
558
+ # large fonts
559
+ for(my $i = ($commonIndex+1); $i<scalar(@sortedFonts); $i++){ # ($largeIndex+1) (scalar(@sortedFonts)-1)
560
+ if((scalar(@sortedFonts)-$i) <= 3){
561
+ $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i+1-scalar(@sortedFonts));
562
+ } else {
563
+ $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
564
+ }
565
+
566
+ if($isDebug){
567
+ print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
568
+ }
569
+ }
570
+ }
571
+
572
+ sub getSpaceLabels {
573
+ my ($gSpaceHash, $gSpaceLabels) = @_;
574
+
575
+ if($isDebug){
576
+ print STDERR "\n# Map space\n";
577
+ }
578
+ my @sortedSpaces = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash}; # sort by freqs, obtain space faces
579
+
580
+ my $commonSpace = $sortedSpaces[0];
581
+ my $commonFreq = $gSpaceHash->{$commonSpace};
582
+ # find similar common freq with larger spaces
583
+ for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
584
+ my $freq = $gSpaceHash->{$sortedSpaces[$i]};
585
+ if($freq/$commonFreq > 0.8){
586
+ if($sortedSpaces[$i] > $commonSpace){
587
+ $commonSpace = $sortedSpaces[$i];
588
+ }
589
+ } else {
590
+ last;
591
+ }
592
+ }
593
+
594
+ for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
595
+ if($sortedSpaces[$i] > $commonSpace){
596
+ $gSpaceLabels->{$sortedSpaces[$i]} = "yes";
597
+ } else {
598
+ $gSpaceLabels->{$sortedSpaces[$i]} = "no";
599
+ }
600
+
601
+ if($isDebug){
602
+ print STDERR "$sortedSpaces[$i] --> $gSpaceLabels->{$sortedSpaces[$i]}, freq = $gSpaceHash->{$sortedSpaces[$i]}\n";
603
+ }
604
+ }
605
+ }
606
+
607
+ sub processTable
608
+ {
609
+ my ($inputText, $isPic) = @_;
610
+
611
+ my $isCell = 0; # for table cell object
612
+
613
+ my $allText = "";
614
+ my $text = "";
615
+
616
+ my @lines = split(/\n/, $inputText);
617
+ my %tablePos = (); # $tablePos{$cellText} = "$l-$t-$r-$bottom"
618
+ my %table = (); # $table{$row}->{$col} = \@paraTexts
619
+ my $rowFrom;
620
+ my $colFrom;
621
+ my $rowTill;
622
+ my $colTill;
623
+
624
+ # xml feature
625
+ my $align = "none";
626
+ my $pos = -1;
627
+ foreach my $line (@lines)
628
+ {
629
+ if ($line =~ /^<table (.+?)>$/)
630
+ {
631
+ my $attr = $1;
632
+
633
+ if($isMarkup) { $markupOutput .= "### Table $attr\n"; }
634
+
635
+ # Fix: wrong regex sequence, huydhn
636
+ #if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
637
+ #{
638
+ # my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
639
+ # $align = $5;
640
+
641
+ # # pos feature
642
+ # $pos = ($t+$bottom)/2.0;
643
+
644
+ # if($pos < $gMinPos) { $gMinPos = $pos; }
645
+ # if($pos > $gMaxPos) { $gMaxPos = $pos; }
646
+ #}
647
+ #else
648
+ #{
649
+ # print STDERR "# no table alignment or location \"$line\"\n";
650
+ # $align = "";
651
+ #}
652
+
653
+ my ($l, $t, $r, $bottom) = undef;
654
+ if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
655
+ if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
656
+ if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
657
+ if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
658
+
659
+ if ($t && $bottom)
660
+ {
661
+ # pos feature
662
+ $pos = ($t + $bottom) / 2.0;
663
+
664
+ if($pos < $gMinPos) { $gMinPos = $pos; }
665
+ if($pos > $gMaxPos) { $gMaxPos = $pos; }
666
+ }
667
+ else
668
+ {
669
+ die "# Undefined table location \"$line\"\n";
670
+ }
671
+
672
+ if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
673
+ {
674
+ $align = $1;
675
+ }
676
+ else
677
+ {
678
+ print STDERR "# no table alignment \"$line\"\n";
679
+ $align = "";
680
+ }
681
+ # End.
682
+ }
683
+ elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
684
+ {
685
+ $colFrom = $1;
686
+ $colTill = $2;
687
+ $rowFrom = $3;
688
+ $rowTill = $4;
689
+ #print STDERR "$rowFrom $rowTill $colFrom $colTill\n";
690
+ $isCell = 1;
691
+ }
692
+ elsif ($line =~ /^<\/cell>$/) # end cell
693
+ {
694
+ my @paraTexts = ();
695
+ processCell($text, \@paraTexts, \%tablePos, $isPic);
696
+
697
+ for(my $i = $rowFrom; $i<=$rowTill; $i++)
698
+ {
699
+ for(my $j = $colFrom; $j<=$colTill; $j++)
700
+ {
701
+ if(!$table{$i}) { $table{$i} = (); }
702
+ if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
703
+
704
+ if($i == $rowFrom && $j == $colFrom)
705
+ {
706
+ push(@{$table{$i}->{$j}}, @paraTexts);
707
+ if(scalar(@paraTexts) > 1) { last; }
708
+ }
709
+ else
710
+ {
711
+ push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
712
+ }
713
+ }
714
+ }
715
+
716
+ $isCell = 0;
717
+ $text = "";
718
+ }
719
+ elsif($isCell)
720
+ {
721
+ $text .= $line."\n";
722
+ next;
723
+ }
724
+ }
725
+
726
+ # note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
727
+ # E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
728
+ # It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
729
+ my @sortedRows = sort {$a <=> $b} keys %table;
730
+ my $isFirstLinePara = 1;
731
+ foreach my $row (@sortedRows)
732
+ {
733
+ my %tableR = %{$table{$row}};
734
+ my @sortedCols = sort {$a <=> $b} keys %tableR;
735
+ while(1)
736
+ {
737
+ my $isStop = 1;
738
+ my $rowText = "";
739
+
740
+ foreach my $col (@sortedCols)
741
+ {
742
+ # there's still some thing to process
743
+ if(scalar(@{$tableR{$col}}) > 0)
744
+ {
745
+ $isStop = 0;
746
+ $rowText .= shift(@{$tableR{$col}});
747
+ }
748
+ $rowText .= "\t";
749
+ }
750
+
751
+ if ((!$isAllowEmpty && $rowText =~ /^\s*$/) || ($isAllowEmpty && $rowText eq ""))
752
+ {
753
+ $isStop = 1;
754
+ }
755
+
756
+ if($isStop)
757
+ {
758
+ last;
759
+ }
760
+ else
761
+ {
762
+ $rowText =~ s/\t$/\n/;
763
+ $allText .= $rowText;
764
+ # print STDERR "$rowText";
765
+
766
+ # para
767
+ if($isFirstLinePara)
768
+ {
769
+ push(@gPara, "yes");
770
+ $isFirstLinePara = 0;
771
+ }
772
+ else
773
+ {
774
+ push(@gPara, "no");
775
+ }
776
+
777
+ if($isXmlFeature)
778
+ {
779
+ # table feature
780
+ push(@gTable, "yes");
781
+
782
+ # pic feature
783
+ if($isPic)
784
+ {
785
+ push(@gPic, "yes");
786
+ }
787
+ else
788
+ {
789
+ push(@gPic, "no");
790
+ }
791
+
792
+ push(@gPosHash, $pos); # update xml pos value
793
+ push(@gAlign, $align); # update xml alignment value
794
+
795
+ ### Not assign value ###
796
+ push(@gFontSize, -1); # fontSize feature
797
+ push(@gFontFace, "none"); # fontFace feature
798
+ push(@gBold, "no"); # bold feature
799
+ push(@gItalic, "no"); # italic feature
800
+ push(@gBullet, "no"); # bullet feature
801
+ # push(@gSpace, "none"); # space feature
802
+ } # end if xml feature
803
+ }
804
+ }
805
+ }
806
+
807
+ return $allText;
808
+ }
809
+
810
+ sub processCell
811
+ {
812
+ my ($inputText, $paraTexts, $tablePos, $isPic) = @_;
813
+
814
+ my $text = "";
815
+ my @lines = split(/\n/, $inputText);
816
+ my $isPara = 0;
817
+ my $flag = 0;
818
+ foreach my $line (@lines)
819
+ {
820
+ if ($line =~ /^<para (.*)>$/)
821
+ {
822
+ $text .= $line."\n"; # we need the header
823
+ $isPara = 1;
824
+
825
+ if($isMarkup)
826
+ {
827
+ $markupOutput .= "## ParaTable $1\n";
828
+ }
829
+ }
830
+ elsif ($line =~ /^<\/para>$/)
831
+ {
832
+ my ($paraText, $l, $t, $r, $b) = processPara($text, 1, $isPic);
833
+ my @tokens = split(/\n/, $paraText);
834
+
835
+ foreach my $token (@tokens)
836
+ {
837
+ if($token ne "")
838
+ {
839
+ push(@{$paraTexts}, $token);
840
+ $flag = 1;
841
+ }
842
+ }
843
+
844
+ if(!$tablePos->{$paraText})
845
+ {
846
+ $tablePos->{$paraText} = "$l-$t-$r-$b";
847
+ }
848
+ else
849
+ {
850
+ #print STDERR "#! Warning: in method processCell, encounter the same paraText $paraText\n";
851
+ }
852
+
853
+ $isPara = 0;
854
+ $text = "";
855
+ }
856
+ elsif ($isPara)
857
+ {
858
+ $text .= $line."\n";
859
+ next;
860
+ }
861
+ }
862
+
863
+ # at least one value should be added for cell which is ""
864
+ if($flag == 0)
865
+ {
866
+ push(@{$paraTexts}, "");
867
+ }
868
+ }
869
+
870
+ sub getAttrValue {
871
+ my ($attrText, $attr) = @_;
872
+
873
+ my $value = "none";
874
+ if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
875
+ $value = $1;
876
+ }
877
+
878
+ return $value;
879
+ }
880
+
881
+ sub checkFontAttr {
882
+ my ($attrText, $attr, $attrHash, $count) = @_;
883
+
884
+ if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
885
+ my $attrValue = $1;
886
+
887
+ $attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue}+$count) : $count;
888
+ }
889
+ }
890
+
891
+ sub processPara
892
+ {
893
+ my ($inputText, $isCell, $isPic) = @_;
894
+
895
+ my $isSpace = 0;
896
+ my $isSpecialSpace = 0;
897
+ my $isTab = 0;
898
+ my $isBullet = 0;
899
+
900
+ my $isForcedEOF = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
901
+ # xml feature
902
+ my $align = "none";
903
+ my ($l, $t, $r, $bottom);
904
+ my %fontSizeHash = ();
905
+ my %fontFaceHash = ();
906
+ my @boldArray = ();
907
+ my @italicArray = ();
908
+ my $space = "none";
909
+
910
+ my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
911
+ my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
912
+ my $wdAttr; my $wdText = ""; my $isWd = 0;
913
+
914
+ my $wdIndex = 0; # word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
915
+ my $lnBoldCount = 0;
916
+ my $lnItalicCount = 0;
917
+
918
+ my $allText = "";
919
+ my $text = ""; #invariant: when never enter a new line, $text will be copied into $allText, and $text is cleared
920
+
921
+ binmode(STDERR, ":utf8");
922
+
923
+ my $isFirstLinePara = 1;
924
+ my @lines = split(/\n/, $inputText);
925
+ for (my $i=0; $i<scalar(@lines); $i++)
926
+ {
927
+ my $line = $lines[$i];
928
+
929
+ ## new para
930
+ if ($line =~ /^<para (.+?)>$/)
931
+ {
932
+ my $attr = $1;
933
+ $align = getAttrValue($attr, "alignment");
934
+ # $indent = getAttrValue($attr, "li");
935
+ $space = getAttrValue($attr, "spaceBefore");
936
+ }
937
+ ## new ln
938
+ elsif ($line =~ /^<ln (.+)>$/)
939
+ {
940
+ $lnAttr = $1;
941
+ $isLn = 1;
942
+
943
+ if ($isMarkup)
944
+ {
945
+ $markupOutput .= "# Line $lnAttr\n";
946
+ }
947
+
948
+ # Fix: wrong regex sequence, huydhn
949
+ #if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
950
+ #{
951
+ # ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
952
+ #}
953
+
954
+ if ($lnAttr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
955
+ if ($lnAttr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
956
+ if ($lnAttr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
957
+ if ($lnAttr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
958
+ # End.
959
+
960
+ $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
961
+
962
+ # Bold & Italic
963
+ if($isXmlFeature)
964
+ {
965
+ $lnBold = getAttrValue($lnAttr, "bold");
966
+ $lnItalic = getAttrValue($lnAttr, "italic");
967
+ }
968
+ }
969
+ ## new run
970
+ elsif ($line =~ /<run (.*)>$/)
971
+ {
972
+ $runAttr = $1;
973
+
974
+ $isSpace = 0;
975
+ $isTab = 0;
976
+ $isRun = 1;
977
+
978
+ # new wd, that consists of many runs
979
+ if($line =~ /^<wd (.*?)>/)
980
+ {
981
+ $isWd = 1;
982
+ $wdAttr = $1;
983
+ }
984
+
985
+ # Bold & Italic
986
+ if ($isXmlFeature)
987
+ {
988
+ $runBold = getAttrValue($runAttr, "bold");
989
+ $runItalic = getAttrValue($runAttr, "italic");
990
+ }
991
+ }
992
+ ## wd
993
+ elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
994
+ {
995
+ $wdAttr = $1;
996
+ my $word = $2;
997
+ $isSpace = 0;
998
+ $isTab = 0;
999
+
1000
+ if ($isMarkup)
1001
+ {
1002
+ $markupOutput .= "$word $wdAttr\n";
1003
+ }
1004
+
1005
+ # FontSize & FontFace
1006
+ if($isXmlFeature)
1007
+ {
1008
+ checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
1009
+ checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);
1010
+ }
1011
+
1012
+ # Bold & Italic
1013
+ if($isXmlFeature)
1014
+ {
1015
+ my $wdBold = getAttrValue($wdAttr, "bold");
1016
+ my $wdItalic = getAttrValue($wdAttr, "italic");
1017
+
1018
+ if($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true")
1019
+ {
1020
+ $boldArray[$wdIndex] = 1;
1021
+ $lnBoldCount++;
1022
+ }
1023
+
1024
+ if($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true")
1025
+ {
1026
+ $italicArray[$wdIndex] = 1;
1027
+ $lnItalicCount++;
1028
+ }
1029
+ } # if($isXmlFeature)
1030
+
1031
+ ## add text
1032
+ $text .= "$word";
1033
+
1034
+ if($isRun) { $runText .= "$word "; }
1035
+ $wdIndex++;
1036
+ }
1037
+ ## end wd
1038
+ elsif ($line =~ /^<\/wd>$/)
1039
+ {
1040
+ $isWd = 0;
1041
+
1042
+ if($isMarkup)
1043
+ {
1044
+ $markupOutput .= "$wdText $wdAttr\n";
1045
+ $wdAttr = "";
1046
+ }
1047
+ }
1048
+ ## end run
1049
+ elsif ($line =~ /^(.*)<\/run>$/)
1050
+ {
1051
+ my $word = $1;
1052
+
1053
+ ## add text
1054
+ if($word ne "")
1055
+ {
1056
+ # Bold & Italic
1057
+ if($isXmlFeature)
1058
+ {
1059
+ if($runBold eq "true" || $lnBold eq "true")
1060
+ {
1061
+ $boldArray[$wdIndex] = 1;
1062
+ $lnBoldCount++;
1063
+ }
1064
+
1065
+ if($runItalic eq "true" || $lnItalic eq "true")
1066
+ {
1067
+ $italicArray[$wdIndex] = 1;
1068
+ $lnItalicCount++;
1069
+ }
1070
+ }
1071
+
1072
+ # appear in the final result
1073
+ if($isLn) { $text .= "$word"; }
1074
+
1075
+ # for internal record
1076
+ if($isRun) { $runText .= "$word "; }
1077
+ if($isWd) { $wdText .= "$word"; }
1078
+
1079
+ $wdIndex++;
1080
+ }
1081
+
1082
+ # xml feature
1083
+ # not a space, tab or new-line run
1084
+ if($isXmlFeature && $runText ne "")
1085
+ {
1086
+ my @words = split(/\s+/, $runText);
1087
+ my $numWords = scalar(@words);
1088
+ checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
1089
+ checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
1090
+ }
1091
+
1092
+ # reset run
1093
+ # <run> not enclosed within <ln>
1094
+ if (!$isLn) { $wdIndex = 0; }
1095
+ $runText = "";
1096
+ $isRun = 0;
1097
+ $isSpecialSpace = 0;
1098
+
1099
+ # Bold & Italic
1100
+ if($isXmlFeature)
1101
+ {
1102
+ $runBold = "none";
1103
+ $runItalic = "none";
1104
+
1105
+ # <run> not enclosed within <ln>
1106
+ if (!$isLn)
1107
+ {
1108
+ $lnBoldCount = 0;
1109
+ $lnItalicCount = 0;
1110
+ }
1111
+ }
1112
+ }
1113
+ ## end ln
1114
+ elsif ($line =~ /^<\/ln>$/)
1115
+ {
1116
+ if((!$isAllowEmpty && $text !~ /^\s*$/) || ($isAllowEmpty && $text ne ""))
1117
+ {
1118
+ # there's a forced EOL?
1119
+ # not an emply line with space character
1120
+ if($isForcedEOF eq "true" || (!$isCell && !$isSpecialSpace) )
1121
+ {
1122
+ $text .= "\n";
1123
+
1124
+ # update allText
1125
+ $allText .= $text;
1126
+ $text = "";
1127
+ }
1128
+
1129
+ my $numWords = $wdIndex;
1130
+
1131
+ if (!$isCell)
1132
+ {
1133
+ if ($isFirstLinePara)
1134
+ {
1135
+ push(@gPara, "yes");
1136
+ $isFirstLinePara = 0;
1137
+ }
1138
+ else
1139
+ {
1140
+ push(@gPara, "no");
1141
+ }
1142
+ }
1143
+
1144
+ if ($isXmlFeature && $numWords >= 1)
1145
+ {
1146
+ # xml feature
1147
+ # assumtion that: fontSize is either occur in <ln>, or within multiple <run> under <ln>, but not both
1148
+ checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
1149
+ checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
1150
+ }
1151
+
1152
+ if ($isXmlFeature && !$isCell && !$isSpecialSpace)
1153
+ {
1154
+ my $pos = ($t+$bottom)/2.0;
1155
+ if ($pos < $gMinPos) { $gMinPos = $pos; }
1156
+ if ($pos > $gMaxPos) { $gMaxPos = $pos; }
1157
+
1158
+ push(@gPosHash, $pos); # pos feature
1159
+ push(@gAlign, $align); # alignment feature
1160
+ push(@gTable, "no"); # table feature
1161
+
1162
+ # pic feature
1163
+ if($isPic)
1164
+ {
1165
+ push(@gPic, "yes");
1166
+
1167
+ ### Not assign value ###
1168
+ push(@gFontSize, -1); # bold feature
1169
+ push(@gFontFace, "none"); # bold feature
1170
+ push(@gBold, "no"); # bold feature
1171
+ push(@gItalic, "no"); # italic feature
1172
+ push(@gBullet, "no"); # bullet feature
1173
+ }
1174
+ else
1175
+ {
1176
+ push(@gPic, "no");
1177
+
1178
+ updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
1179
+ %fontSizeHash = (); %fontFaceHash = ();
1180
+
1181
+ updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
1182
+ } # end if pic
1183
+ } # end if($isXmlFeature && !$isCell && !$isSpecialSpace)
1184
+ }
1185
+
1186
+ ## reset ln
1187
+ $isLn = 0;
1188
+ $isForcedEOF = "none";
1189
+ $isSpecialSpace = 0;
1190
+ $wdIndex = 0;
1191
+
1192
+ # Bold & Italic
1193
+ if ($isXmlFeature)
1194
+ {
1195
+ $lnBold = "none";
1196
+ $lnItalic = "none";
1197
+
1198
+ $lnBoldCount = 0;
1199
+ $lnItalicCount = 0;
1200
+ }
1201
+ } # end else </ln>
1202
+ ## nl newline signal
1203
+ elsif ($line =~ /^<nl orig=\"true\"\/>$/)
1204
+ {
1205
+ if($isLn)
1206
+ {
1207
+ $isSpace = 0;
1208
+ }
1209
+ else
1210
+ {
1211
+ if($isDebug)
1212
+ {
1213
+ print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
1214
+ }
1215
+ }
1216
+ }
1217
+ ## space
1218
+ elsif ($line =~ /^<space\/>$/)
1219
+ {
1220
+ my $startTag = "";
1221
+ my $endTag = "";
1222
+ if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/)
1223
+ {
1224
+ $startTag = $1;
1225
+ }
1226
+
1227
+ if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/)
1228
+ {
1229
+ $endTag = $1;
1230
+ }
1231
+
1232
+ if($startTag eq $endTag && $startTag ne "")
1233
+ {
1234
+ # print STDERR "# Special space after \"$text\"\n";
1235
+ $isSpecialSpace = 1;
1236
+ }
1237
+
1238
+ ## addText
1239
+ $text .= " ";
1240
+ $isSpace = 1;
1241
+ }
1242
+ ## tab
1243
+ elsif ($line =~ /^<tab .*\/>$/)
1244
+ {
1245
+ ## add Text
1246
+ $text .= "\t";
1247
+ $isTab = 1;
1248
+ }
1249
+ ## bullet
1250
+ elsif ($line =~ /^<bullet .*>$/)
1251
+ {
1252
+ $isBullet = 1;
1253
+ }
1254
+ }
1255
+
1256
+ $allText .= $text;
1257
+ return ($allText, $l, $t, $r, $bottom, $isSpace);
1258
+ }
1259
+
1260
+ sub updateXMLFontFeature {
1261
+ my ($fontSizeHash, $fontFaceHash) = @_;
1262
+
1263
+ # font size feature
1264
+ if(scalar(keys %{$fontSizeHash}) == 0){
1265
+ push(@gFontSize, -1);
1266
+ } else {
1267
+ my @sortedFonts = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
1268
+
1269
+ my $fontSize = $sortedFonts[0];
1270
+ push(@gFontSize, $fontSize);
1271
+
1272
+ $gFontSizeHash{$fontSize} = $gFontSizeHash{$fontSize} ? ($gFontSizeHash{$fontSize}+1) : 1;
1273
+ }
1274
+
1275
+ # font face feature
1276
+ if(scalar(keys %{$fontFaceHash}) == 0){
1277
+ push(@gFontFace, "none");
1278
+ } else {
1279
+ my @sortedFonts = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
1280
+ my $fontFace = $sortedFonts[0];
1281
+ push(@gFontFace, $fontFace);
1282
+
1283
+ $gFontFaceHash{$fontFace} = $gFontFaceHash{$fontFace} ? ($gFontFaceHash{$fontFace}+1) : 1;
1284
+ }
1285
+ }
1286
+
1287
+ sub updateXMLFeatures {
1288
+ my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;
1289
+ # bold feature
1290
+ my $boldFeature;
1291
+ if ($lnBoldCount/$numWords >= 0.667){
1292
+ $boldFeature = "yes";
1293
+ } else {
1294
+ $boldFeature = "no";
1295
+ }
1296
+ push(@gBold, $boldFeature);
1297
+
1298
+ # italic feature
1299
+ my $italicFeature;
1300
+ if ($lnItalicCount/$numWords >= 0.667){
1301
+ $italicFeature = "yes";
1302
+ } else {
1303
+ $italicFeature = "no";
1304
+ }
1305
+ push(@gItalic, $italicFeature);
1306
+
1307
+ # bullet feature
1308
+ if($isBullet){
1309
+ push(@gBullet, "yes");
1310
+ } else {
1311
+ push(@gBullet, "no");
1312
+ }
1313
+
1314
+ # space feature
1315
+ # push(@gSpace, $space);
1316
+ }
1317
+
1318
+ ## Find the positions of header, body, and citation
1319
+ sub getStructureInfo {
1320
+ my ($lines, $numLines) = @_;
1321
+
1322
+ my ($bodyLength, $citationLength, $bodyEndId) =
1323
+ SectLabel::PreProcess::findCitationText($lines, 0, $numLines);
1324
+
1325
+ my ($headerLength, $bodyStartId);
1326
+ ($headerLength, $bodyLength, $bodyStartId) =
1327
+ SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);
1328
+
1329
+ # sanity check
1330
+ my $totalLength = $headerLength + $bodyLength + $citationLength;
1331
+ if($numLines != $totalLength){
1332
+ print STDOUT "Die in getStructureInfo(): different num lines $numLines != $totalLength\n"; # to display in Web
1333
+ die "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
1334
+ }
1335
+ return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
1336
+ }
1337
+
1338
+ ## Count XML tags/values for statistics purpose
1339
+ sub processTagInfo {
1340
+ my ($line, $tags) = @_;
1341
+
1342
+ my $tag;
1343
+ my $attr;
1344
+ if($line =~ /^<(.+?)\b(.*)/){
1345
+ $tag = $1;
1346
+ $attr = $2;
1347
+ if(!$tags->{$tag}){
1348
+ $tags->{$tag} = ();
1349
+ }
1350
+ if($attr =~ /^\s*(.+?)\s*\/?>/){
1351
+ $attr = $1;
1352
+ }
1353
+
1354
+ my @tokens = split(/\s+/, $attr);
1355
+ foreach my $token (@tokens){
1356
+ if($token =~ /^(.+)=(.+)$/){
1357
+ my $attrName = $1;
1358
+ my $value = $2;
1359
+ if(!$tags->{$tag}->{$attrName}){
1360
+ $tags->{$tag}->{$attrName} = ();
1361
+ }
1362
+ if(!$tags->{$tag}->{$attrName}->{$value}){
1363
+ $tags->{$tag}->{$attrName}->{$value} = 0;
1364
+ }
1365
+ $tags->{$tag}->{$attrName}->{$value}++;
1366
+ }
1367
+ }
1368
+ }
1369
+ }
1370
+
1371
+ ## Print tag info to file
1372
+ sub printTagInfo {
1373
+ my ($tags, $tagFile) = @_;
1374
+
1375
+ open(TAG, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";
1376
+ my @sortedTags = sort {$a cmp $b} keys %{$tags};
1377
+ foreach(@sortedTags){
1378
+ my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
1379
+ print TAG "# Tag = $_\n";
1380
+ foreach my $attr (@attrs) {
1381
+ print TAG "$attr:";
1382
+ my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
1383
+ foreach my $value (@values){
1384
+ print TAG " $value-$tags->{$_}->{$attr}->{$value}";
1385
+ }
1386
+ print TAG "\n";
1387
+ }
1388
+ }
1389
+ close TAG;
1390
+ }
1391
+
1392
+ sub untaintPath {
1393
+ my ($path) = @_;
1394
+
1395
+ if ( $path =~ /^([-_\/\w\.]*)$/ ) {
1396
+ $path = $1;
1397
+ } else {
1398
+ die "Bad path \"$path\"\n";
1399
+ }
1400
+
1401
+ return $path;
1402
+ }
1403
+
1404
+ sub untaint {
1405
+ my ($s) = @_;
1406
+ if ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
1407
+ $s = $1; # $data now untainted
1408
+ } else {
1409
+ die "Bad data in $s"; # log this somewhere
1410
+ }
1411
+ return $s;
1412
+ }
1413
+
1414
+ sub execute {
1415
+ my ($cmd) = @_;
1416
+ if($isDebug){
1417
+ print STDERR "Executing: $cmd\n";
1418
+ }
1419
+ $cmd = untaint($cmd);
1420
+ system($cmd);
1421
+ }
1422
+
1423
+ sub newTmpFile {
1424
+ my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
1425
+ chomp($tmpFile);
1426
+ return $tmpFile;
1427
+ }