biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env perl
2
+ # -*- cperl -*-
3
+ =head1 NAME
4
+
5
+ phOutput2xml.pl
6
+
7
+ =head1 SYNOPSIS
8
+
9
+ RCS:$Id$
10
+
11
+ =head1 DESCRIPTION
12
+
13
+ =head1 HISTORY
14
+
15
+ ORIGIN: created from templateApp.pl version 3.4 by Min-Yen Kan <kanmy@comp.nus.edu.sg>
16
+
17
+ modified from output2xml.pl for ParsCit.
18
+
19
+ RCS:$Log$
20
+
21
+ =cut
22
+
23
+ require 5.0;
24
+ use Getopt::Std;
25
+ use strict 'vars';
26
+ # use diagnostics;
27
+
28
+ ### USER customizable section
29
+ my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
30
+ $tmpfile .= $$ . time;
31
+ if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
32
+ $tmpfile = "/tmp/" . $tmpfile;
33
+ $0 =~ /([^\/]+)$/; my $progname = $1;
34
+ my $outputVersion = "1.0";
35
+ ### END user customizable section
36
+
37
+ ### Ctrl-C handler
38
+ sub quitHandler {
39
+ print STDERR "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
40
+ exit;
41
+ }
42
+
43
+ ### HELP Sub-procedure
44
+ sub Help {
45
+ print STDERR "usage: $progname -h\t\t\t\t[invokes help]\n";
46
+ print STDERR " $progname -v\t\t\t\t[invokes version]\n";
47
+ print STDERR " $progname [-qEl] [-r <rankfile> -n <num>] filename(s)...\n";
48
+ print STDERR "Options:\n";
49
+ print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
50
+ print STDERR "\t-E\tTurn OFF error checking\n";
51
+ print STDERR "\t-l\tEliminate newline tags\n";
52
+ print STDERR "\t-r <file>\tSVM Ranking output file\n";
53
+ print STDERR "\t-n <num>\tNumber of choices in both ranking file and input file\n";
54
+ print STDERR "\n";
55
+ print STDERR "Will accept input on STDIN as a single file.\n";
56
+ print STDERR "\n";
57
+ }
58
+
59
+ ### VERSION Sub-procedure
60
+ sub Version {
61
+ if (system ("perldoc $0")) {
62
+ die "Need \"perldoc\" in PATH to print version information";
63
+ }
64
+ exit;
65
+ }
66
+
67
+ sub License {
68
+ print STDERR "# Copyright 2009 \251 by Min-Yen Kan\n";
69
+ }
70
+
71
+ ###
72
+ ### MAIN program
73
+ ###
74
+
75
+ my $cmdLine = $0 . " " . join (" ", @ARGV);
76
+ if ($#ARGV == -1) { # invoked with no arguments, possible error in execution?
77
+ print STDERR "# $progname info\t\tNo arguments detected, waiting for input on command line.\n";
78
+ print STDERR "# $progname info\t\tIf you need help, stop this program and reinvoke with \"-h\".\n";
79
+ }
80
+
81
+ $SIG{'INT'} = 'quitHandler';
82
+ getopts ('Ehlqr:n:v');
83
+
84
+ our ($opt_q, $opt_v, $opt_h, $opt_r, $opt_n, $opt_E, $opt_l);
85
+ # use (!defined $opt_X) for options with arguments
86
+ if (!$opt_q) { License(); } # call License, if asked for
87
+ if ($opt_v) { Version(); exit(0); } # call Version, if asked for
88
+ if ($opt_h) { Help(); exit (0); } # call help, if asked for
89
+ my $errorChecking = (defined $opt_E) ? 0 : 1;
90
+ my $ignoreNewlines = (defined $opt_l) ? 1 : 0;
91
+ my $svmRankFile = (defined $opt_r) ? $opt_r : undef;
92
+ my $rankChoices = (defined $opt_n) ? $opt_n : undef;
93
+ if ((defined $rankChoices && !defined $svmRankFile) ||
94
+ (!defined $rankChoices && defined $svmRankFile)) {
95
+ die "# $progname fatal\t\t-n and -r are mutually necessary switches";
96
+ }
97
+
98
+ ## standardize input stream (either STDIN or first arg on command line)
99
+ my $fh;
100
+ my $filename;
101
+ if ($filename = shift) {
102
+ NEWFILE:
103
+ if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
104
+ open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
105
+ $fh = "IF";
106
+ } else {
107
+ $filename = "<STDIN>";
108
+ $fh = "STDIN";
109
+ }
110
+
111
+ # open rank file info, if applicable
112
+ my $rfh;
113
+ my @max = ();
114
+ if (defined $rankChoices && defined $svmRankFile) {
115
+ open (*RFH, $svmRankFile) || die "# $progname crash\t\tCan't open rankfile \"$svmRankFile\"!";
116
+ $rfh = "RFH";
117
+ my $line = 0;
118
+ my $curLine = 0;
119
+ my $max = 0;
120
+ my $maxLine = 0;
121
+ while (<$rfh>) {
122
+ chop;
123
+ $line++;
124
+ $curLine++;
125
+ if ($_ > $max) { # advance max if applicable
126
+ $max = $_;
127
+ $maxLine = $curLine-1;
128
+ }
129
+
130
+ if ($line % $rankChoices == 0) { # save data at fencepost
131
+ $max[int($line/$rankChoices)-1] = $maxLine;
132
+ # print "$line $max $maxLine\n";
133
+
134
+ $curLine = 0; # reset values
135
+ $max = 0;
136
+ $maxLine = 0;
137
+ }
138
+ }
139
+ close ($rfh);
140
+ }
141
+
142
+ ## output XML file for display
143
+ my $line = 0;
144
+ my $buf = "";
145
+ my $buf2 = "";
146
+ my $lastTag = "";
147
+ my $variant = "";
148
+ my $confidence = "1.0";
149
+ print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
150
+ print "<?xml-stylesheet href=\"bibxml.xsl\" type=\"text/xsl\" ?>\n";
151
+ print "<file>\n";
152
+ while (<$fh>) {
153
+ if (/^\# (\d+) ([\.\d]+)/) {
154
+ $variant = $1;
155
+ $confidence = $2;
156
+ next;
157
+ }
158
+ elsif (/^\#/) { next; } # skip comments
159
+
160
+ if (/^\s*$/) {
161
+ $buf =~ s/&/&amp;/g;
162
+
163
+ if ($variant eq "") {
164
+ print "<entry no=\"$line\">\n";
165
+ if ($ignoreNewlines) {
166
+ $buf =~ s/\- ([a-z])/$1/g;
167
+ $buf =~ s/>\s+/>/g;
168
+ $buf =~ s/\s+</</g;
169
+ $buf =~ s/\s+$//g;
170
+ $buf =~ s/^\s+/</g;
171
+ # $buf =~ s/PARSHED</\n </g; # replace with newline and spaces for formatting
172
+ $buf =~ s/PARSHED</\n</g; # replace with newline and spaces for formatting
173
+ }
174
+ print "<variant no=\"0\" confidence=\"$confidence\">" . $buf . "</$lastTag>\n</variant>\n";
175
+ print "</entry>\n";
176
+ $line++;
177
+ } else {
178
+ if ($variant eq "0" && $buf2 ne "") {
179
+ print "<entry no=\"$line svmRank: $max[$line]\">\n" . $buf2 . " </entry>\n";
180
+ $buf2 = "";
181
+ $line++;
182
+ }
183
+ $buf2 .= "<variant no=\"$variant\" confidence=\"$confidence\">\n" . $buf . "</$lastTag>\n</variant>\n";
184
+ }
185
+
186
+ $lastTag = "";
187
+ $buf = "";
188
+ } else {
189
+ chop;
190
+
191
+ my @tokens = split (/\t/);
192
+
193
+ my $token = $tokens[0];
194
+ my $sys = $tokens[-1];
195
+ my $gold = $tokens[-2];
196
+ if ($sys ne $lastTag) {
197
+ if ($lastTag ne "") { $buf .= "</$lastTag>\n"; }
198
+ $buf .= "PARSHED<$sys>";
199
+ # $buf .= "<$sys>";
200
+ }
201
+ if ($token eq "+L+" && $ignoreNewlines) {
202
+ next;
203
+ }
204
+ if ($gold ne $sys && $errorChecking) {
205
+ $buf .= "<error correct=\"$gold\" taggedAs=\"$sys\">$token </error>";
206
+ } else {
207
+ $buf .= "$token ";
208
+ }
209
+ $lastTag = $sys;
210
+ }
211
+ }
212
+ # print " <entry no=\"$line\">\n" . $buf2 . " </entry>\n";
213
+ print "</file>\n";
214
+
215
+ close ($fh);
216
+
217
+ if ($filename = shift) {
218
+ goto NEWFILE;
219
+ }
220
+
221
+ ###
222
+ ### END of main program
223
+ ###
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env perl
2
+ # -*- cperl -*-
3
+
4
+ ### USER customizable section
5
+ my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
6
+ $tmpfile .= $$ . time;
7
+ if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; } # untaint tmpfile variable
8
+ $tmpfile = "/tmp/" . $tmpfile;
9
+ $0 =~ /([^\/]+)$/; my $progname = $1;
10
+ my $outputVersion = "1.0";
11
+ my $parscitHome = "/home/wing.nus/services/parscit/tools/";
12
+ my $tr2crfppLoc = "$parscitHome/bin/tr2crfpp.pl";
13
+ my $crf_learnLoc = "$ENV{'CRFPP_HOME'}/bin/crf_learn";
14
+ my $crf_testLoc = "$ENV{'CRFPP_HOME'}/bin/crf_test";
15
+ my $conllevalLoc = "$parscitHome/bin/conlleval.pl";
16
+ my $crfTemplateLoc = "$parscitHome/crfpp/traindata/parsCit.template";
17
+ ### END user customizable section
18
+
19
+ my $trainingFile = $ARGV[0];
20
+ my $folds = $ARGV[1];
21
+
22
+ # construct test data
23
+ open (IF, $trainingFile) || die "# $progname fatal\tTraining file cannot be opened \"$trainingFile\"!";
24
+ my $i = 0;
25
+ while (<IF>) {
26
+ open (OF, ">>$tmpfile.$i.test.src") || die "$progname fatal\tCan't append to file \"$tmpfile.$i.test.src\"!";
27
+ print OF $_;
28
+ $i++;
29
+ $i = $i % $folds;
30
+ }
31
+ close (IF);
32
+ for (my $i = 0; $i < $folds; $i++) {
33
+ `$tr2crfppLoc $tmpfile.$i.test.src> $tmpfile.$i.test`;
34
+ }
35
+
36
+ # construct training data
37
+ for (my $i = 0; $i < $folds; $i++) {
38
+ for (my $j = 0; $j < $folds; $j++) {
39
+ if ($j == $i) {next; }
40
+ else {
41
+ `cat $tmpfile.$j.test >> $tmpfile.$i.train`;
42
+ }
43
+ }
44
+ }
45
+
46
+ # train
47
+ for (my $i = 0; $i < $folds; $i++) {
48
+ my $cmd = "$crf_learnLoc -f 2 -c 3 $crfTemplateLoc $tmpfile.$i.train $tmpfile.$i.model ";
49
+ print "$cmd\n";
50
+ system ($cmd);
51
+ }
52
+
53
+ # test
54
+ for (my $i = 0; $i < $folds; $i++) {
55
+ my $cmd = "$crf_testLoc -m $tmpfile.$i.model $tmpfile.$i.test > $tmpfile.$i.out";
56
+ print "$cmd\n";
57
+ system ($cmd);
58
+ my $cmd = "cat $tmpfile.$i.out >> $tmpfile.all.out ";
59
+ print "$cmd\n";
60
+ system ($cmd);
61
+ }
62
+
63
+ # eval
64
+ #for (my $i = 0; $i < $folds; $i++) {
65
+ # my $cmd = "$conllevalLoc -r -d \" \" < $tmpfile.$i.out";
66
+ # print "$cmd\n";
67
+ # system ($cmd);
68
+ #}
69
+ my $cmd = "$conllevalLoc -r -d \" \" < $tmpfile.all.out";
70
+ print "$cmd\n";
71
+ system ($cmd);
72
+
73
+ # clean up
74
+ `rm -f $tmpfile*`;
75
+
76
+ ######################################################################
77
+ # .51
78
+ # on head (first 500 lines of tagged.txt)
79
+ # f=2, c=3 2fold: 92.86
80
+ # f=2, c=3 2fold (more unigram): 93.23
81
+ # 93.19 (with B features)
82
+ # 93.35 without B features
83
+ #
84
+ # on tagged.txt
85
+ # f=2, c=3 2fold cv: 95.24 / 93.99 => 94.61
86
+ # f=2, c=5 2fold cv: => 94.55
87
+ # f=2, c=3 2fold cv = 94.77
88
+ #
89
+ # .48
90
+ # on tagged.txt (cat of all *tagged.txt):
91
+ # normal, 2fold cv: 95.12 / 93.33
92
+ # c=1.5, 2fold cv: 95.14 / 93.38
93
+ # f=2, 2fold cv: 95.29 / 93.93
94
+ # f=2, c=1.5 2fold cv: 95.31 / 93.82
95
+ # f=2, c=3 2fold cv: 95.31 / 93.82
96
+ # f=3, 2fold cv: 95.25 / 93.69
97
+ #
98
+ #
99
+ # a=CRF-L1, f=2 2fold cv: 88.25 / 91.29
100
+ # a=CRF-L1 2fold cv: 80.63 / -- didn't complete
101
+ # a=MIRA 2fold cv: 94.48 / 92.69
102
+ # a=MIRA, f=2 2fold cv: 94.31 / 93.60
103
+
104
+ # 100326 .51 normal, 2fold cv, over all data (including iconip)
105
+ # accuracy: 94.83%; precision: 94.83%; recall: 94.83%; FB1: 94.83
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/perl -wT
2
+
3
+ # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Wed, 03 Mar 2010 00:36:36
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+ use Getopt::Long;
9
+
10
+ # I do not know a better solution to find a lib path in -T mode.
11
+ # So if you know a better solution, I'd be glad to hear.
12
+ # See this http://www.perlmonks.org/?node_id=585299 for why I
13
+ # used the below code
14
+ use FindBin;
15
+
16
+ my $path;
17
+ BEGIN
18
+ {
19
+ if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
20
+ }
21
+
22
+ use lib "$path/../lib";
23
+
24
+ use SectLabel::Config;
25
+ use SectLabel::Controller;
26
+
27
+ ### USER customizable section
28
+ $0 =~ /([^\/]+)$/; my $progname = $1;
29
+ my $outputVersion = "1.0";
30
+ ### END user customizable section
31
+
32
+ sub License {
33
+ print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
34
+ }
35
+
36
+ ### HELP Sub-procedure
37
+ sub Help {
38
+ print STDERR "usage: $progname -h\t[invokes help]\n";
39
+ print STDERR " $progname -in inFile [-out outFile -no-xmlInput -no-xmlOutput -log -new]\n";
40
+ print STDERR "Options:\n";
41
+ print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
42
+ print STDERR "\t-out: indicate output file (if not specified output to STDOUT)\n";
43
+ print STDERR "\t-no-xmlInput: indicate that input is normal text file (default: assume XML file from Omnipage-multiple pages concatenated)\n";
44
+ print STDERR "\t-no-xmlOutput: do not wrap results in XML format (default: xmlOutput)\n";
45
+ print STDERR "\t-log: output debugging messages\n";
46
+ }
47
+ my $QUIET = 0;
48
+ my $HELP = 0;
49
+ my $inFile = undef;
50
+ my $outFile = undef;
51
+ my $isXmlInput = 1;
52
+ my $isXmlOutput = 1;
53
+ my $isDebug = 0;
54
+ my $isNew = 0; # if = 1, use processOmniXMLv2.pl
55
+ $HELP = 1 unless GetOptions('in=s' => \$inFile,
56
+ 'out=s' => \$outFile,
57
+ 'xmlInput!' => \$isXmlInput,
58
+ 'xmlOutput!' => \$isXmlOutput,
59
+ 'log' => \$isDebug,
60
+ 'new' => \$isNew,
61
+ 'h' => \$HELP,
62
+ 'q' => \$QUIET);
63
+
64
+ if ($HELP || !defined $inFile) {
65
+ Help();
66
+ exit(0);
67
+ }
68
+
69
+ if (!$QUIET) {
70
+ License();
71
+ }
72
+
73
+ ### Untaint ###
74
+ $inFile = untaintPath($inFile);
75
+ my $envPath = $ENV{'PATH'};
76
+ $envPath = untaintPath($envPath);
77
+ $ENV{'PATH'} = $envPath;
78
+ ### End untaint ###
79
+
80
+ my $modelFile = $isXmlInput? $SectLabel::Config::modelXmlFile : $SectLabel::Config::modelFile;
81
+ $modelFile = "$path/../$modelFile";
82
+ my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
83
+ $configFile = "$path/../$configFile";
84
+
85
+ if($isXmlInput){
86
+ my $xmlInFile = newTmpFile();
87
+ $xmlInFile = untaintPath($xmlInFile);
88
+ my $cmd = "$path/sectLabel/";
89
+ $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
90
+ $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
91
+ execute($cmd);
92
+ $inFile = $xmlInFile;
93
+ }
94
+
95
+ my $dictFile = $SectLabel::Config::dictFile;
96
+ $dictFile = "$path/../$dictFile";
97
+
98
+ my $funcFile = $SectLabel::Config::funcFile;
99
+ $funcFile = "$path/../$funcFile";
100
+ my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
101
+
102
+ if($isXmlInput){
103
+ unlink($inFile);
104
+ }
105
+
106
+ if (defined $outFile) {
107
+ $outFile = untaintPath($outFile);
108
+
109
+ open (OUT, ">:utf8", $outFile) or die "Could not open $outFile for writing: $!";
110
+ print OUT $$rXML;
111
+ close OUT;
112
+ } else {
113
+ print "$$rXML";
114
+ }
115
+
116
+ sub untaintPath {
117
+ my ($path) = @_;
118
+
119
+ if ( $path =~ /^([-_\/\w\.\d: ]+)$/ ) {
120
+ $path = $1;
121
+ } else {
122
+ die "Bad path $path\n";
123
+ }
124
+
125
+ return $path;
126
+ }
127
+
128
+ sub untaint {
129
+ my ($s) = @_;
130
+ if ($s =~ /^([\w \-\@\(\),\.\/<>]+)$/) {
131
+ $s = $1; # $data now untainted
132
+ } else {
133
+ die "Bad data in $s"; # log this somewhere
134
+ }
135
+ return $s;
136
+ }
137
+
138
+ sub execute {
139
+ my ($cmd) = @_;
140
+ print STDERR "Executing: $cmd\n";
141
+ $cmd = untaint($cmd);
142
+ system($cmd);
143
+ }
144
+
145
+ sub newTmpFile {
146
+ my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
147
+ chomp($tmpFile);
148
+ return $tmpFile;
149
+ }
@@ -0,0 +1,110 @@
1
+ README for sectLabel module (v100401)
2
+
3
+ CONTENTS
4
+ [0] Directory structure
5
+ [1] Command line Usage
6
+ [1.1] SectLabel
7
+ [1.2] GenericSect
8
+ [3] Known issues
9
+
10
+ ------------------------------------------------------------
11
+ [0] DIRECTORY STRUCTURE
12
+
13
+ * processOmniXML.pl: Process Omnipage XML output (concatenated results
14
+ from all pages of a PDF file), and extract text lines together with
15
+ other XML infos
16
+ Note: the current script is complicated because it mixes two tasks: processing Omnipage XML and extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into an internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
17
+
18
+ * redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
19
+ * tr2crfpp.pl: Generate SectLabel features for CRF++
20
+ * single2multi.pl: Convert SectLabel training file
21
+ (e.g. doc/sectLabel.tagged.txt) from single- to multi-line
22
+ format. This script is called by tr2crfpp.pl
23
+ * genericSectExtract.rb: given a list of section headers of a
24
+ scientific document in an input file, assign generic headers for the
25
+ section headers.
26
+ * genericSect/
27
+
28
+ ------------------------------------------------------------
29
+ [1] COMMAND LINE USAGE
30
+
31
+ ------------------------------
32
+ [1.1] SectLabel
33
+ * Process Omnipage XML output
34
+
35
+ ** Usage: processOmniXML.pl -h [invokes help]
36
+ processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
37
+ Options:
38
+ -q Quiet Mode (don't echo license)
39
+ -xmlFeature: append XML feature together with text extracted
40
+ -decode: decode HTML entities and then output, to avoid double
41
+ entity encoding later
42
+ -tag tagFile: count XML tags/values for statistics
43
+ -markup: add factor infos (bold, italic etc) per word using
44
+ the format "word|||(b|nb)|||(i|ni)", useful in extracting
45
+ bold/italic phrases
46
+
47
+ * Perform stratified cross-validation
48
+
49
+ ** Usage: redo.sectLabel.pl -h [invokes help]
50
+ redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
51
+
52
+ Options:
53
+
54
+ -in: training file in the format as in
55
+ doc/sectLabel.tagged.txt
56
+ -dir: output directory, containing all intermediate
57
+ files and outputs
58
+ -n: num of cross validation folds
59
+ -c: config file to extract features and automatically
60
+ generate CRF++ template
61
+
62
+ -p: CRF++ num of CPUs (default = 6)
63
+ -iter: CRF++ max iteration (default = 100)
64
+ -f: CRF++ frequency cut-off (default = 3)
65
+
66
+ ** E.g.:
67
+ ./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
68
+ -dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
69
+
70
+ * Extract features
71
+
72
+ ** Usage: tr2crfpp.pl -h [invokes help]
73
+ tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
74
+
75
+ Options:
76
+ -q Quiet Mode (don't echo license)
77
+ -in inFile: labeled input file
78
+ -c configFile: to specify which feature set to use.
79
+ -out outFile: output file for CRF++ training.
80
+ -template: to output a template used by CRF++ according to the
81
+ config file.
82
+ -single: indicate that each input document is in single-line
83
+ format (e.g., ./doc/sectLabel.tagged.txt)
84
+
85
+ ------------------------------
86
+ [1.2] GenericSect
87
+ * Create feature file
88
+
89
+ ** Usage: ruby extractFeature.rb filePath
90
+ filePath: path to the labeled data file which lists the actual
91
+ section headers and their corresponding manually assigned generic
92
+ section headers (if it exists)
93
+ syntax: generic_header ||| actual_header
94
+
95
+ * Generate generic section headers for a document
96
+
97
+ ** Usage: ruby genericSectExtract.rb filePath
98
+
99
+ where filePath is a file which lists the actual headers of a
100
+ document (automatically extracted by another module of SectLabel)
101
+
102
+ * Perform stratified cross-validation
103
+
104
+ ** Usage: ruby crossValidation.rb dataFile numFold
105
+
106
+ Note that data file has the format as in doc/genericSect.tagged.txt
107
+
108
+ ------------------------------------------------------------
109
+ [3] KNOWN ISSUES
110
+
@@ -0,0 +1,110 @@
1
+ README for sectLabel module (v100401)
2
+
3
+ CONTENTS
4
+ [0] Directory structure
5
+ [1] Command line Usage
6
+ [1.1] SectLabel
7
+ [1.2] GenericSect
8
+ [3] Known issues
9
+
10
+ ------------------------------------------------------------
11
+ [0] DIRECTORY STRUCTURE
12
+
13
+ * processOmniXML.pl: Process Omnipage XML output (concatenated results
14
+ from all pages of a PDF file), and extract text lines together with
15
+ other XML infos
16
+ Note: the current script is complicated because it mixes two tasks: processing Omnipage XML and extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into an internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
17
+
18
+ * redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
19
+ * tr2crfpp.pl: Generate SectLabel features for CRF++
20
+ * single2multi.pl: Convert SectLabel training file
21
+ (e.g. doc/sectLabel.tagged.txt) from single- to multi-line
22
+ format. This script is called by tr2crfpp.pl
23
+ * genericSectExtract.rb: given a list of section headers of a
24
+ scientific document in an input file, assign generic headers for the
25
+ section headers.
26
+ * genericSect/
27
+
28
+ ------------------------------------------------------------
29
+ [1] COMMAND LINE USAGE
30
+
31
+ ------------------------------
32
+ [1.1] SectLabel
33
+ * Process Omnipage XML output
34
+
35
+ ** Usage: processOmniXML.pl -h [invokes help]
36
+ processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
37
+ Options:
38
+ -q Quiet Mode (don't echo license)
39
+ -xmlFeature: append XML feature together with text extracted
40
+ -decode: decode HTML entities and then output, to avoid double
41
+ entity encoding later
42
+ -tag tagFile: count XML tags/values for statistics
43
+ -markup: add factor infos (bold, italic etc) per word using
44
+ the format "word|||(b|nb)|||(i|ni)", useful in extracting
45
+ bold/italic phrases
46
+
47
+ * Perform stratified cross-validation
48
+
49
+ ** Usage: redo.sectLabel.pl -h [invokes help]
50
+ redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
51
+
52
+ Options:
53
+
54
+ -in: training file in the format as in
55
+ doc/sectLabel.tagged.txt
56
+ -dir: output directory, containing all intermediate
57
+ files and outputs
58
+ -n: num of cross validation folds
59
+ -c: config file to extract features and automatically
60
+ generate CRF++ template
61
+
62
+ -p: CRF++ num of CPUs (default = 6)
63
+ -iter: CRF++ max iteration (default = 100)
64
+ -f: CRF++ frequency cut-off (default = 3)
65
+
66
+ ** E.g.:
67
+ ./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
68
+ -dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
69
+
70
+ * Extract features
71
+
72
+ ** Usage: tr2crfpp.pl -h [invokes help]
73
+ tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
74
+
75
+ Options:
76
+ -q Quiet Mode (don't echo license)
77
+ -in inFile: labeled input file
78
+ -c configFile: to specify which feature set to use.
79
+ -out outFile: output file for CRF++ training.
80
+ -template: to output a template used by CRF++ according to the
81
+ config file.
82
+ -single: indicate that each input document is in single-line
83
+ format (e.g., ./doc/sectLabel.tagged.txt)
84
+
85
+ ------------------------------
86
+ [1.2] GenericSect
87
+ * Create feature file
88
+
89
+ ** Usage: ruby extractFeature.rb filePath
90
+ filePath: path to the labeled data file which lists the actual
91
+ section headers and their corresponding manually assigned generic
92
+ section headers (if it exists)
93
+ syntax: generic_header ||| actual_header
94
+
95
+ * Generate generic section headers for a document
96
+
97
+ ** Usage: ruby genericSectExtract.rb filePath
98
+
99
+ where filePath is a file which lists the actual headers of a
100
+ document (automatically extracted by another module of SectLabel)
101
+
102
+ * Perform stratified cross-validation
103
+
104
+ ** Usage: ruby crossValidation.rb dataFile numFold
105
+
106
+ Note that data file has the format as in doc/genericSect.tagged.txt
107
+
108
+ ------------------------------------------------------------
109
+ [3] KNOWN ISSUES
110
+