biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,425 @@
1
+ package SectLabel::PostProcess;
2
+
3
+ ###
4
+ # Utilities for normalizing the output of CRF++ into standard
5
+ # representations.
6
+ #
7
+ # Luong Minh Thang 25 May, 09. Adapted from Isaac Councill, 07/20/07
8
+ ###
9
+
10
+ use strict;
11
+ use utf8;
12
+
13
+ use CSXUtil::SafeText qw(cleanXML);
14
+ use ParsCit::Config;
15
+ use ParsCit::PostProcess; # qw(normalizeAuthorNames stripPunctuation);
16
+
17
###
# Main method for processing document data. Reads line-level CRF++ output,
# groups consecutive lines that share a label into fields, normalizes each
# field and wraps every document into an <algorithm>/<variant> XML fragment.
#
# Input:  $in_file         - path of the CRF++ output. Each non-blank line is
#                            tab-separated: the first column holds the tokens
#                            of the line joined by "|||", the last column holds
#                            "label/probability". A line "# <number>" gives the
#                            overall confidence. A blank line ends a document.
#         $section_headers - hash reference; the text and the per-document line
#                            id of every line labelled "sectionHeader" are
#                            pushed onto its "header" and "lineId" arrays.
# Output: the XML string on success, or the list
#         (undef, undef, 0, message) when $in_file cannot be opened.
# Dies when the last column of a line is not of the form "label/probability".
#
# Note: a document is only emitted when its terminating blank line is read.
###
sub WrapDocumentXml
{
    my ($in_file, $section_headers) = @_;

    my $overall_confidence = "1.0";

    # Output XML file for display
    my $xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";

    # Array of hashes: each element describes one field, accessible through
    # $fields[$i]->{"tag"}, $fields[$i]->{"content"} and $fields[$i]->{"confidence"}
    my @fields = ();

    # State of the run of same-label lines currently being accumulated
    my $last_tag       = "";
    my $cur_content    = "";
    my $cur_confidence = 0; # Sum of the per-line confidences of the run
    my $count          = 0; # Number of lines in the run

    # Lexical handle: does not clobber a global IN handle and is released on
    # every exit path, including the die below
    open(my $in, "<:utf8", $in_file) or return (undef, undef, 0, "couldn't open in_file: $!");

    my $line_id = -1;
    while (my $row = <$in>)
    {
        # Overall confidence info
        if ($row =~ /^\# ([\.\d]+)/)
        {
            $overall_confidence = $1;
            next;
        }

        # End of a document, output (useful to handle multiple document classification)
        if ($row =~ /^\s*$/)
        {
            # Add the last field, if any line was read since the last flush
            if ($count > 0)
            {
                AddFieldInfo(\@fields, $last_tag, $cur_content, $cur_confidence, $count);
            }

            # Generate XML output
            my $output = GenerateOutput(\@fields);
            my $l_algName = $SectLabel::Config::algorithmName;
            my $l_algVersion = $SectLabel::Config::algorithmVersion;
            $xml .= "<algorithm name=\"$l_algName\" version=\"$l_algVersion\">\n". "<variant no=\"0\" confidence=\"$overall_confidence\">\n". $output . "</variant>\n</algorithm>\n";

            # Reset ALL per-document state. The accumulators must be cleared
            # too, otherwise the last field of this document would be
            # prepended to the first field of the next one.
            @fields         = ();
            $last_tag       = "";
            $cur_content    = "";
            $cur_confidence = 0;
            $count          = 0;
            $line_id        = -1;
            next;
        }

        # In a middle of a document.
        # chomp (not chop): a final line without a newline keeps its last character
        chomp($row);
        $line_id++;

        my @columns = split(/\t/, $row);
        my $line = $columns[0];
        my $sys  = $columns[-1];

        # Train at line level, get the original line
        $line = join(" ", split(/\|\|\|/, $line));

        # Process confidence info in the format e.g, sectionHeader/0.989046
        my $confidence = 0;
        if ($sys =~ /^(.+)\/([\d\.]+)$/)
        {
            ($sys, $confidence) = ($1, 0 + $2);
        }
        else
        {
            die "Die in SectLabel:PostProcess::wrapDocumentXml : incorrect format \"tag/prob\" $sys\n";
        }

        # Start a new tag, not an initial value, output
        if ($sys ne $last_tag && $last_tag ne "")
        {
            AddFieldInfo(\@fields, $last_tag, $cur_content, $cur_confidence, $count);

            # Reset the value
            $cur_content    = "";
            $cur_confidence = 0;
            $count          = 0;
        }

        # Store section headers to classify generic sections later
        if ($sys eq "sectionHeader")
        {
            push(@{$section_headers->{"header"}}, $line);
            push(@{$section_headers->{"lineId"}}, $line_id);
        }

        $cur_content    .= "$line\n";
        $cur_confidence += $confidence;
        $count++;

        # Update last_tag
        $last_tag = $sys;
    }

    close($in);
    return $xml;
}
139
+
140
# Append one field record to the list of fields.
#
# Input:  $fields         - reference to the array of field records
#         $last_tag       - label of the field
#         $cur_content    - text of the field
#         $cur_confidence - sum of the per-line confidences of the field
#         $count          - number of lines that make up the field
#
# The record always carries "tag" and "content"; "confidence" (the average
# per-line confidence) is only set when $count is positive.
sub AddFieldInfo
{
    my ($fields, $last_tag, $cur_content, $cur_confidence, $count) = @_;

    my $record = {
        "tag"     => $last_tag,
        "content" => $cur_content,
    };

    # Confidence info: average over the lines of the field
    $record->{"confidence"} = $cur_confidence / $count if ($count > 0);

    push(@{$fields}, $record);
}
163
+
164
# Wrap all field infos into XML form.
#
# Input:  $fields - reference to an array of records with the keys "tag",
#                   "content" and "confidence"
# Output: the concatenation of one "<tag confidence="...">content</tag>"
#         element per field; fields whose content is empty or whitespace-only
#         are left out.
sub GenerateOutput
{
    my ($fields) = @_;

    my @elements = ();
    for my $field (@{$fields})
    {
        my ($tag, $content) = ($field->{"tag"}, $field->{"content"});

        # Nothing to print for a blank field
        next if ($content =~ /^\s*$/);

        my $conf_str = " confidence=\"" . $field->{"confidence"} . "\"";

        # Third argument 1: ask the normalizer to escape XML characters
        ($tag, $content) = NormalizeDocumentField($tag, $content, 1);
        push(@elements, "<$tag$conf_str>\n$content\n</$tag>\n");
    }

    return join("", @elements);
}
184
+
185
# Wrap document into non-XML form: one "label text" pair per output line.
#
# Input:  $in_file        - path of the CRF++ output (same format as read by
#                           WrapDocumentXml: tab-separated columns, first
#                           column "tok|||tok|||...", last column "label/prob")
#         $blank_lines    - hash reference keyed by line id; for every id with
#                           a true value a line "none " is inserted into the
#                           output and a notice is printed on STDERR
#         $is_token_level - accepted for interface compatibility, not used
# Output: the labelled text on success, or the list
#         (undef, undef, 0, message) when $in_file cannot be opened.
# Dies when the last column of a line is not of the form "label/probability".
sub WrapDocument
{
    my ($in_file, $blank_lines, $is_token_level) = @_;

    my $xml = "";

    # Lexical handle: does not clobber a global IN handle and is released on
    # every exit path
    open(my $in, "<:utf8", $in_file) or return (undef, undef, 0, "couldn't open in_file: $!");

    my $line_id = -1;
    while (my $row = <$in>)
    {
        # Overall confidence info, not needed here
        next if ($row =~ /^\# ([\.\d]+)/);

        $line_id++;

        # Re-insert the lines that were recorded as blank at these ids
        while ($blank_lines->{$line_id})
        {
            print STDERR "#! Insert none label for line id $line_id\n";
            $xml .= "none \n";
            $line_id++;
        }

        # End of a document (useful to handle multiple document classification):
        # line ids restart for the next document
        if ($row =~ /^\s*$/)
        {
            $line_id = -1;
            next;
        }

        # In a middle of a document.
        # chomp (not chop): a final line without a newline keeps its last character
        chomp($row);

        my @columns = split(/\t/, $row);
        my $line = $columns[0];
        my $sys  = $columns[-1];

        # Train at line level, get the original line
        $line = join(" ", split(/\|\|\|/, $line));

        # Process confidence info in the format e.g, sectionHeader/0.989046
        if ($sys =~ /^(.+)\/[\d\.]+$/)
        {
            $sys = $1;
        }
        else
        {
            die "Die in SectLabel:PostProcess::wrapDocument : incorrect format \"tag/prob\" $sys\n";
        }

        # Third argument 0: no XML escaping for the plain-text form
        ($sys, $line) = NormalizeDocumentField($sys, $line, 0);
        $xml .= "$sys $line\n";
    }

    close($in);
    return $xml;
}
256
+
257
# Make the output "prettier".
#
# Input:  $tag     - label of the field
#         $content - text of the field
# Output: the list ($tag, $content) where $content has
#           - a leading occurrence of the tag name (case-insensitive, followed
#             by whitespace) removed together with leading spaces,
#           - trailing whitespace removed,
#           - "- " followed by a lower-case letter joined (unhyphenation),
#           - been passed to cleanXML (imported from CSXUtil::SafeText) by
#             reference to escape XML characters.
sub SimpleNormalize
{
    my ($tag, $content) = @_;

    # Remove keyword at the beginning and strip leading spaces.
    # \Q...\E makes the tag match literally even if it contains characters
    # that are special in a regular expression.
    $content =~ s/^\s*\Q$tag\E\s+//i;

    # Remove trailing spaces
    $content =~ s/\s+$//g;

    # Unhyphenation
    $content =~ s/\- ([a-z])/$1/g;

    # Escape XML characters
    cleanXML(\$content);

    return ($tag, $content);
}
277
+
278
###
# Document normalization subroutine. Reads in a tag and its content and
# returns the list ($tag, $content) after normalizing the content.
#
# Input:  $tag      - label of the field, returned unchanged
#         $content  - text of the field
#         $isEscape - when true, the content is passed to cleanXML to escape
#                     XML characters
#
# Only trailing whitespace is removed here; the leading keyword is kept and
# no unhyphenation is performed.
###
sub NormalizeDocumentField
{
    my $tag      = shift;
    my $content  = shift;
    my $isEscape = shift;

    # Remove trailing spaces
    $content =~ s/\s+$//g;

    # Escape XML characters
    cleanXML(\$content) if ($isEscape);

    return ($tag, $content);
}
303
+
304
###
# Huydhn: provide input for parscit
#
# Input:  $in_file - path of the CRF++ output for sectlabel. Each non-blank
#                    line is tab-separated: the first column holds the tokens
#                    of the line joined by "|||", the last column holds
#                    "label/probability".
# Output: the list ($all_text, \@cit_lines) where $all_text is the original
#         text (one line per non-blank input line) and @cit_lines holds the
#         0-based indices, within $all_text, of the lines labelled "reference";
#         or the list (undef, undef, 0, message) when $in_file cannot be opened.
# Dies when the last column of a line is not of the form "label/probability".
###
sub GenerateParscitInput
{
    my ($in_file) = @_;

    my @cit_lines  = ();
    my $line_index = 0;
    my $all_text   = "";

    # This file is the output from CRF++ for sectlabel.
    # Lexical handle: does not clobber a global IN handle and is released on
    # every exit path
    open(my $in, "<:utf8", $in_file) or return (undef, undef, 0, "couldn't open in_file: $!");

    while (my $line = <$in>)
    {
        # Overall confidence line, do not care about this
        next if ($line =~ /^\# ([\.\d]+)/);

        # Trim both ends. This also removes the line terminator, so no chop
        # is needed (chop would eat a real character from a final line that
        # has no newline).
        $line =~ s/^\s+|\s+$//g;

        # Remove blank line
        next if ($line eq "");

        # Split the line, the last token is the category provided by sectlabel
        my @tokens = split(/\t/, $line);

        # A line's category
        my $sys = $tokens[-1];

        # Process confidence info in the format e.g, sectionHeader/0.989046
        if ($sys =~ /^(.+)\/([\d\.]+)$/)
        {
            $sys = $1;
        }
        else
        {
            die "Die in SectLabel:PostProcess::GenerateParscitInput : incorrect format \"tag/prob\" $sys\n";
        }

        # Only keep lines in the reference for parscit
        if ($sys eq "reference") { push(@cit_lines, $line_index); }

        # Train at line level, get the original line
        my $content = join(" ", split(/\|\|\|/, $tokens[0]));

        # Save the line
        $all_text .= $content . "\n";

        # Point to the next line
        $line_index++;
    }

    close($in);

    # Done
    return ($all_text, \@cit_lines);
}
364
+
365
###
# Huydhn: provide author and affiliation for the new matching model
#
# Input:  $in_file - path of the CRF++ output for sectlabel. Each non-blank
#                    line is tab-separated and its last column holds
#                    "label/probability".
# Output: the list (\@aut_lines, \@aff_lines) holding the 0-based indices
#         (counted over the non-blank, non-confidence lines) of the lines
#         labelled "author" and "affiliation" respectively; or the list
#         (undef, undef, 0, message) when $in_file cannot be opened.
# Dies when the last column of a line is not of the form "label/probability".
###
sub GenerateAuthorAffiliation
{
    my ($in_file) = @_;

    my @aut_lines  = ();
    my @aff_lines  = ();
    my $line_index = 0;

    # This file is the output from CRF++ for sectlabel.
    # Lexical handle: does not clobber a global IN handle and is released on
    # every exit path
    open(my $in, "<:utf8", $in_file) or return (undef, undef, 0, "couldn't open in_file: $!");

    while (my $line = <$in>)
    {
        # Overall confidence line, do not care about this
        next if ($line =~ /^\# ([\.\d]+)/);

        # Trim both ends. This also removes the line terminator, so no chop
        # is needed (chop would eat a real character from a final line that
        # has no newline).
        $line =~ s/^\s+|\s+$//g;

        # Remove blank line
        next if ($line eq "");

        # Split the line, the last token is the category provided by sectlabel
        my @tokens = split(/\t/, $line);

        # A line's category
        my $sys = $tokens[-1];

        # Process confidence info in the format e.g, sectionHeader/0.989046
        if ($sys =~ /^(.+)\/([\d\.]+)$/)
        {
            $sys = $1;
        }
        else
        {
            die "Die in SectLabel:PostProcess::GenerateAuthorAffiliation : incorrect format \"tag/prob\" $sys\n";
        }

        # Record the index of author and affiliation lines
        if ($sys eq "author")
        {
            push(@aut_lines, $line_index);
        }
        elsif ($sys eq "affiliation")
        {
            push(@aff_lines, $line_index);
        }

        # Point to the next line
        $line_index++;
    }

    close($in);

    # Done
    return (\@aut_lines, \@aff_lines);
}
424
+
425
+ 1;
@@ -0,0 +1,116 @@
1
+ package SectLabel::PreProcess;
2
+
3
+ ###
4
+ # Utilities for finding header, body, and reference.
5
+ # Avoid normalization to maintain consistent number of lines in a document
6
+ # Simplified from ParsCit::PreProcess
7
+ #
8
+ # Minh-Thang Luong, v100401
9
+ ###
10
+
11
+ use utf8;
12
+ use strict;
13
+
14
###
# Looks for header section markers ("Abstract" / "Introduction") in the
# supplied lines and splits the document into header and body at the first
# marker line. A marker line followed by extra words only counts when those
# words contain "background" (e.g. INTRODUCTION AND BACKGROUND). If the
# header would be at least 80% as long as the body, the marker is ignored
# and the header length is reported as 0.
# Input:  reference to an array of lines, line id to start process, number of lines (start_id < num_lines)
# Output: (header length, body length, body start id)
# Dies when start_id >= num_lines.
###
sub FindHeaderText
{
    my ($lines, $start_id, $num_lines) = @_;

    die "Die in SectLabel::PreProcess::findHeaderText: start id $start_id >= num lines $num_lines\n" if ($start_id >= $num_lines);

    # Walk forward until the first acceptable marker line (or the end)
    my $body_start_id = $start_id;
    while ($body_start_id < $num_lines)
    {
        if ($lines->[$body_start_id] =~ /^(.*?)\b(Abstract|ABSTRACT|Introductions?|INTRODUCTIONS?)\b(.*?):?\s*$/)
        {
            # Text that trails the marker word on the same line
            my $trailing = $3;

            # Bare marker: accept
            last if (CountTokens($trailing) == 0);

            # INTRODUCTION AND BACKGROUND
            last if ($trailing =~ /background/i);
        }

        $body_start_id++;
    }

    my $header_length = $body_start_id - $start_id;
    my $body_length   = $num_lines - $body_start_id;

    if ($header_length >= 0.8*$body_length)
    {
        print STDERR "Header text $header_length longer than 80% article body length $body_length: ignoring\n";

        # Treat everything from start_id on as body
        $body_start_id = $start_id;
        $header_length = 0;
        $body_length   = $num_lines - $body_start_id;
    }

    print STDERR "warning: no header text found\n" if ($header_length == 0);

    return ($header_length, $body_length, $body_start_id);
}
62
+
63
###
# Looks for reference section markers (References, Bibliography, ...) in the
# supplied lines, scanning backwards from the last line, and splits the
# document into body and citation part at the first marker found that way.
# The marker line itself is counted as part of the body. If the citation
# part would be at least 80% as long as the body, the marker is ignored and
# the citation length is reported as 0.
## Input: reference to an array of lines, line id to start process, number of lines (start_id < num_lines)
## Output: (body length, citation length, body end id)
## Dies when start_id >= num_lines.
###
sub FindCitationText
{
    my ($lines, $start_id, $num_lines) = @_;

    die "Die in SectLabel::PreProcess::findCitationText: start id $start_id >= num lines $num_lines\n" if ($start_id >= $num_lines);

    my $last_id = $num_lines - 1;

    # Walk backwards until a marker line is found (or start_id is passed)
    my $body_end_id = $last_id;
    while ($body_end_id >= $start_id)
    {
        last if ($lines->[$body_end_id] =~ /(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*$/);
        $body_end_id--;
    }

    my $body_length     = $body_end_id - $start_id + 1;
    my $citation_length = $last_id - $body_end_id;

    if ($citation_length >= 0.8*$body_length)
    {
        print STDERR "Citation text $citation_length longer than 80% article body length $body_length: ignoring\n";

        # Treat everything up to the last line as body
        $body_end_id     = $last_id;
        $citation_length = 0;
        $body_length     = $body_end_id - $start_id + 1;
    }

    print STDERR "warning: no citation text found\n" if ($citation_length == 0);

    return ($body_length, $citation_length, $body_end_id);
}
104
+
105
# Count the whitespace-separated tokens of a string.
# Input:  a text string
# Output: number of tokens (0 for an empty or whitespace-only string)
sub CountTokens
{
    my ($text) = @_;

    # Strip leading and trailing spaces so that split yields no empty field
    for ($text)
    {
        s/^\s+//;
        s/\s+$//;
    }

    # A list assignment in scalar context gives the number of elements
    return scalar(my @tokens = split(/\s+/, $text));
}
115
+
116
+ 1;