biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,308 @@
1
+ #!/usr/bin/perl -wT
2
+ # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
+
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+
9
+ # I do not know a better solution to find a lib path in -T mode.
10
+ # So if you know a better solution, I'd be glad to hear.
11
+ # See http://www.perlmonks.org/?node_id=585299 for why the code below is used.
12
+ use FindBin;
13
+ my $path;
14
+ BEGIN {
15
+ if ($FindBin::Bin =~ /(.*)/) {
16
+ $path = $1;
17
+ }
18
+ }
19
+ use lib "$path/../../lib";
20
+
21
+ use Getopt::Long;
22
+ use ParsHed::Config;
23
+
24
+ ### USER customizable section
25
# Script name: the last path component of $0, used in usage and error messages.
my ($progname) = $0 =~ /([^\/]+)$/;
# Output version string (not referenced elsewhere in this script).
my $outputVersion = "1.0";
27
+ ### END user customizable section
28
+
29
sub License {
  # Write the one-line copyright banner to STDERR.
  my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
  print STDERR $banner;
}
32
+
33
+ ### HELP Sub-procedure
34
sub Help {
  # Print the usage summary and the option defaults to STDERR.
  print STDERR
    "Create keyword info from a tagged header file\n",
    "usage: $progname -h\t[invokes help]\n",
    " $progname -in taggedHeaderFile -out outFile [-n topN -nGram numNGram -lowercase]\n",
    "Options:\n",
    "\t-q\tQuiet Mode (don't echo license)\n",
    "\t-n: Default topN = 100.\n",
    "\t-nGram: Default numNGram = 1.\n",
    "\t-lowercase: enable lowercasing (default no lowercasign).\n";
}
44
# Command-line state; the values below are the defaults used when an option is absent.
my ($QUIET, $HELP) = (0, 0);
my ($outFile, $inFile);
my $topN = 100;
my $numNGram = 1;
my $isLowercase = 0;

# A failed parse is treated the same way as an explicit -h.
GetOptions('in=s' => \$inFile,
           'out=s' => \$outFile,
           'n=i' => \$topN,
           'nGram=i' => \$numNGram,
           'lowercase' => \$isLowercase,
           'h' => \$HELP,
           'q' => \$QUIET) or $HELP = 1;

# Both -in and -out are mandatory; otherwise show usage and stop.
unless (!$HELP && defined $inFile && defined $outFile) {
  Help();
  exit(0);
}

License() unless $QUIET;

### Untaint ###
($inFile, $outFile) = map { untaintPath($_) } ($inFile, $outFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

# keyword statistics
my %keywords = (); #hash of hash $keywords{"affiliation"}->{"Institute"} = freq of "Institute" for affiliation tag

processFile($inFile, $outFile, $numNGram);
78
+
79
+ ##
80
+ # main routine to count frequent keywords/bigrams in $inFile and output to $outFile
81
+ ##
82
sub processFile {
  # Count frequent keywords/n-grams per header tag in $inFile and write the
  # per-tag keyword lists to $outFile.
  #
  #   $inFile   - tagged header file, one header per line, fields wrapped in
  #               <tag> ... </tag> and sub-lines separated by "+L+"
  #   $outFile  - destination handed to outputKeywords()
  #   $numNGram - n-gram size handed to countKeywords()
  #
  # Uses the file-level %keywords (filled via countKeywords) and $topN.
  # Dies if the input file is missing or cannot be opened.
  my ($inFile, $outFile, $numNGram) = @_;

  if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
  open(my $in, "<:utf8", $inFile)
    or die "# $progname crash\t\tCan't open \"$inFile\": $!";

  while (my $header = <$in>) { # each line contains a header
    next if $header =~ /^\#/;    # skip comments
    next if $header =~ /^\s+$/;  # skip blank lines

    my $tag = "";
    my $line = "";
    foreach my $token (split(/ +/, $header)) {
      next if $token =~ /^\s*$/; # spaces

      if ($token =~ /^\<\/([a-z]+)/) { # end tag: count what was collected
        foreach my $sub_line (split(/\s*\+L\+\s*/, $line)) {
          countKeywords($sub_line, $tag, $numNGram);
        }
        $line = ""; # reset
      } elsif ($token =~ /^\<([a-z]+)/) { # beginning tag
        $tag = $1;
        # Make sure the tag has a (possibly empty) frequency table, without
        # planting a fake "" keyword in it.
        $keywords{$tag} ||= {};
      } else { # contents inside tag
        $line .= "$token ";
      }
    }
  }
  close($in);

  ## obtain top keyWords
  my %topKeywords = ();
  foreach my $tag (keys %keywords) {
    my $freqs = $keywords{$tag} || {};

    # Highest frequency first; ties are broken alphabetically so the cut-off
    # at $topN is reproducible from run to run.
    my @ranked = sort { $freqs->{$b} <=> $freqs->{$a} or $a cmp $b } keys %{$freqs};

    # A non-positive $topN never triggered the cut-off before; keep that.
    splice(@ranked, $topN) if $topN > 0 && @ranked > $topN;

    $topKeywords{$tag} = { map { $_ => 1 } @ranked };
  }

  ## filter duplicate keywords
  my %filteredKeywords = ();
  filterDuplicate(\%topKeywords, \%filteredKeywords);

  ## output results
  outputKeywords(\%filteredKeywords, $outFile);
}
152
+
153
+ ##
154
+ # Output keyword hash to file
155
+ ##
156
sub outputKeywords {
  # Write one "tag: kw1 kw2 ..." line per enabled tag to $outFile.
  #
  #   $hash    - hash ref: tag => { keyword => 1, ... }
  #   $outFile - path of the UTF-8 file to (over)write
  #
  # Tags come from $ParsHed::Config::tags; those mapped to 0 are skipped.
  # Tags and keywords are both written in sorted order so the file is
  # reproducible. Dies if the file cannot be opened or fully written.
  my ($hash, $outFile) = @_;

  open(my $out, ">:utf8", $outFile)
    or die "#Can't open file \"$outFile\": $!\n";

  # list of tags trained in parsHed
  # those with value 0 do not have frequent keyword features
  my $tags = $ParsHed::Config::tags; # hash

  foreach my $tag (sort keys %{$tags}) {
    next if $tags->{$tag} == 0;

    # "|| {}" keeps a tag with no keywords from being autovivified in the
    # caller's hash.
    my @keywords = sort { $a cmp $b } keys %{ $hash->{$tag} || {} };
    print {$out} join(" ", "$tag:", @keywords), "\n";
  }

  # Buffered write errors only surface at close time.
  close($out) or die "#Can't write file \"$outFile\": $!\n";
}
181
+
182
+ ##
183
+ # Remove keywords that appear in more than one field
184
+ ##
185
sub filterDuplicate {
  # Copy into %$filteredHash only the keywords that are not flagged under any
  # other tag of %$hash.
  #
  #   $hash         - hash ref: tag => { keyword => true/false, ... }
  #   $filteredHash - hash ref filled in place: tag => { keyword => 1, ... };
  #                   every tag of $hash gets an entry, possibly an empty hash
  my ($hash, $filteredHash) = @_;

  my @tags = keys %{$hash};

  # For each keyword, the number of tags that flag it with a true value.
  # Computing this once replaces the per-keyword scan over all tags.
  my %flaggedIn = ();
  foreach my $tag (@tags) {
    my $keywords = $hash->{$tag} || {};
    foreach my $keyword (keys %{$keywords}) {
      $flaggedIn{$keyword}++ if $keywords->{$keyword};
    }
  }

  foreach my $tag (@tags) {
    my $keywords = $hash->{$tag} || {};
    $filteredHash->{$tag} = {};

    foreach my $keyword (keys %{$keywords}) {
      # Exclude this tag's own flag so only *other* tags count as duplicates.
      my $own = $keywords->{$keyword} ? 1 : 0;
      my $elsewhere = ($flaggedIn{$keyword} || 0) - $own;

      if ($elsewhere == 0) {
        $filteredHash->{$tag}->{$keyword} = 1;
      }
    }
  }
}
212
+
213
+ ##
214
+ # Count keyword or nGrams
215
+ ##
216
sub countKeywords {
  # Add the n-grams of one header sub-line to the file-level %keywords table
  # under $tag. Counting stops after two n-grams have been recorded for the
  # line.
  #
  #   $line     - text of the sub-line
  #   $tag      - header tag the sub-line belongs to
  #   $numNGram - number of consecutive tokens per n-gram
  my ($line, $tag, $numNGram) = @_;

  $line = lc($line) if $isLowercase;

  # Whitespace-separated tokens, without the empty leading field that split
  # yields when the line starts with whitespace.
  my @words = grep { $_ ne "" } split(/\s+/, $line);

  my $recorded = 0;
  foreach my $start (0 .. $#words) {
    last if (@words - $start) < $numNGram; # not enough ngrams

    # Normalise each token of the window on a copy, then join with "-".
    my @parts = map {
      my $part = $_;
      $part =~ s/^\p{P}+//g; #strip out leading punctuations
      $part =~ s/\p{P}+$//g; #strip out trailing punctuations
      $part =~ s/^\s+//g; #strip out leading spaces
      $part =~ s/\s+$//g; #strip out trailing spaces
      $part =~ s/\d/0/g; #canonicalize number into "0"
      $numNGram > 1 ? lc($part) : $part; # lowercase to reduce data sparseness
    } @words[$start .. $start + $numNGram - 1];
    my $nGram = join("-", @parts);

    next if $nGram =~ /^\s*$/; #skip those with white spaces
    next if $nGram =~ /^\d*$/; #skip those with only digits

    $keywords{$tag}->{$nGram} = 0 if !$keywords{$tag}->{$nGram};
    $keywords{$tag}->{$nGram}++;

    last if ++$recorded == 2;
  }
}
274
+
275
sub untaintPath {
  # Return $path re-captured through a whitelist of word characters, "-",
  # "_", "/" and "."; die with "Bad path ..." on anything else.
  my ($path) = @_;

  my ($clean) = $path =~ /^([-_\/\w\.]+)$/
    or die "Bad path $path\n";

  return $clean;
}
286
+
287
sub untaint {
  # Return $s re-captured through a whitelist (word characters, space and
  # - @ ( ) , . /); die with "Bad data in ..." on anything else.
  my ($s) = @_;

  die "Bad data in $s" unless $s =~ /^([\w \-\@\(\),\.\/]+)$/; # log this somewhere

  return $1; # the captured copy is untainted
}
296
+
297
sub execute {
  # Echo a command line to STDERR, validate it with untaint(), then run it
  # through system(); the system() status is the return value.
  my $command = shift;
  print STDERR "Executing: $command\n";
  return system(untaint($command));
}
303
+
304
sub newTmpFile {
  # Return a name of the form YYYYMMDD-HHMMSS-<pid> built from the local time
  # and the current process id.
  #
  # Formatting is done in-process with POSIX::strftime rather than by running
  # the external "date" program, so it does not depend on PATH or a shell.
  require POSIX;
  return POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
}
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/perl -wT
2
+ # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
+
4
+ # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
+
6
+ require 5.0;
7
+ use strict;
8
+ use Getopt::Long;
9
+
10
+ ### USER customizable section
11
# Script name: the last path component of $0, used in usage and error messages.
my ($progname) = $0 =~ /([^\/]+)$/;
# Output version string (not referenced elsewhere in this script).
my $outputVersion = "1.0";
13
+ ### END user customizable section
14
+
15
sub License {
  # Write the one-line copyright banner to STDERR.
  my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
  print STDERR $banner;
}
18
+
19
+ ### HELP Sub-procedure
20
sub Help {
  # Print the usage summary to STDERR.
  my @usage = (
    "Parse tagged header file into line-level data\n",
    "usage: $progname -h\t[invokes help]\n",
    " $progname -in taggedHeaderFile -out outFile\n",
    "Options:\n",
    "\t-q\tQuiet Mode (don't echo license)\n",
  );
  print STDERR join("", @usage);
}
27
# Command-line state; the values below are the defaults used when an option is absent.
my ($QUIET, $HELP) = (0, 0);
my ($outFile, $inFile);

# A failed parse is treated the same way as an explicit -h.
GetOptions('in=s' => \$inFile,
           'out=s' => \$outFile,
           'h' => \$HELP,
           'q' => \$QUIET) or $HELP = 1;

# Both -in and -out are mandatory; otherwise show usage and stop.
unless (!$HELP && defined $inFile && defined $outFile) {
  Help();
  exit(0);
}

License() unless $QUIET;

### Untaint ###
($inFile, $outFile) = map { untaintPath($_) } ($inFile, $outFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
processFile($inFile, $outFile);
52
+
53
sub processFile {
  # Convert a tagged header file into line-level data.
  #
  #   $inFile  - tagged header file, one header per line, fields wrapped in
  #              <tag> ... </tag> and physical lines separated by "+L+"
  #   $outFile - UTF-8 output file
  #
  # For every field a "//tag" line is written, followed by one line per
  # non-blank sub-line; a sub-line that was followed by "+L+" in the input
  # keeps a trailing " +L+". Each header is terminated by an empty line.
  # Dies if either file cannot be opened or the output cannot be written.
  my ($inFile, $outFile) = @_;

  if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }

  # Decode the input as UTF-8 to match the output layer; reading raw bytes and
  # writing them through ":utf8" would double-encode non-ASCII text.
  open(my $in, "<:utf8", $inFile)
    or die "# $progname crash\t\tCan't open \"$inFile\": $!";
  open(my $out, ">:utf8", $outFile)
    or die "#Can't open file \"$outFile\": $!\n";

  while (my $header = <$in>) { # each line contains a header
    next if $header =~ /^\#/;    # skip comments
    next if $header =~ /^\s+$/;  # skip blank lines

    my $line = "";
    foreach my $token (split(/ +/, $header)) {
      next if $token =~ /^\s*$/; # spaces

      if ($token =~ /^\<\/([a-z]+)/) { # end tag
        if ($line =~ /^\s*$/) { print STDERR "Skip \"$line\"\n"; next; }

        my @sub_lines = split(/\s*\+L\+\s*/, $line);
        my $endsWithMarker = ($line =~ /\+L\+\s*$/) ? 1 : 0;
        $line = ""; # reset

        # A field made only of "+L+" markers splits to nothing; there is no
        # text to emit for it.
        next unless @sub_lines;

        my $lastSubLine = pop(@sub_lines);
        foreach my $sub_line (@sub_lines) { # go through each subline of a header field
          print {$out} "$sub_line +L+\n" if $sub_line !~ /^\s*$/;
        }

        # The final sub-line keeps its marker only if the field ended with one.
        print {$out} ($endsWithMarker ? "$lastSubLine +L+\n" : "$lastSubLine\n");
      } elsif ($token =~ /^\<([a-z]+)/) { # beginning tag
        print {$out} "//$1\n";
      } else { # contents inside tag
        $line .= "$token ";
      }
    }

    print {$out} "\n";
  }

  close($out) or die "#Can't write file \"$outFile\": $!\n";
  close($in);
}
107
+
108
sub untaintPath {
  # Return $path re-captured through a whitelist of word characters, "-",
  # "_", "/" and "."; die with "Bad path ..." on anything else.
  my ($path) = @_;

  unless ($path =~ /^([-_\/\w\.]+)$/) {
    die "Bad path $path\n";
  }

  return $1;
}
119
+
120
sub untaint {
  # Return $s re-captured through a whitelist (word characters, space and
  # - @ ( ) , . /); die with "Bad data in ..." on anything else.
  my ($s) = @_;

  my ($clean) = $s =~ /^([\w \-\@\(\),\.\/]+)$/
    or die "Bad data in $s"; # log this somewhere

  return $clean; # the captured copy is untainted
}
129
+
130
sub execute {
  # Echo a command line to STDERR, validate it with untaint(), then run it
  # through system(); the system() status is the return value.
  my ($cmd) = @_;
  print STDERR "Executing: $cmd\n";
  my $checked = untaint($cmd);
  return system($checked);
}
136
+
137
sub newTmpFile {
  # Return a name of the form YYYYMMDD-HHMMSS-<pid> built from the local time
  # and the current process id.
  #
  # Formatting is done in-process with POSIX::strftime rather than by running
  # the external "date" program, so it does not depend on PATH or a shell.
  require POSIX;
  my $stamp = POSIX::strftime('%Y%m%d-%H%M%S', localtime());
  return "$stamp-$$";
}
@@ -0,0 +1,198 @@
1
+ #!/usr/bin/env perl
2
+ # -*- cperl -*-
3
+
4
+ require 5.0;
5
+ use strict;
6
+ use Getopt::Long;
7
+ use FindBin;
8
+
9
+ ### USER customizable section
10
# Prefix for all scratch files: the script path with "." and "/" removed,
# followed by the process id and the current time, placed under /tmp.
(my $tmpfile = $0) =~ s/[\.\/]//g;
$tmpfile .= $$ . time;
$tmpfile = $1 if $tmpfile =~ /^([-\@\w.]+)$/; # untaint tmpfile variable
$tmpfile = "/tmp/" . $tmpfile;

my ($progname) = $0 =~ /([^\/]+)$/;
my $outputVersion = "1.0";

# Locations of the helper scripts (relative to this script) and of the
# CRF++ binaries (under $ENV{CRFPP_HOME}).
my $parscitHome = "$FindBin::Bin/../..";
my $tr2crfppLoc = "$parscitHome/bin/parsHed/tr2crfpp_parsHed.pl";
my $convertLoc = "$parscitHome/bin/parsHed/convert2TokenLevel.pl"; #new model
my $keywordLoc = "$parscitHome/bin/parsHed/keywordGen.pl"; #new model
my $crf_learnLoc = "$ENV{'CRFPP_HOME'}/bin/crf_learn";
my $crf_testLoc = "$ENV{'CRFPP_HOME'}/bin/crf_test";
my $conllevalLoc = "$parscitHome/bin/conlleval.pl";
### END user customizable section
24
+ ### END user customizable section
25
+
26
+ ## Thang add ##
27
sub Help {
  # Print the usage summary and option defaults to STDERR.
  print STDERR "usage: $progname -h\t[invokes help]\n";
  print STDERR " $progname -in trainFile -t templateFile -n folds [-p numCpus -oldModel]\n";
  print STDERR "Options:\n";
  print STDERR "\t\t-p: Default is 6 cpus\n";
  # This line was mislabelled "-p" although it describes -oldModel.
  print STDERR "\t\t-oldModel: Default: use the new line-level model. Specify -oldModel to run with the old one.\n";
  # There is no -tr2crfpp option and the script never reads $ENV{ParsCit};
  # report the feature-script path the script actually uses instead.
  print STDERR "\t\ttr2crfpp script: Default $tr2crfppLoc\n";
}
35
+
36
# Command-line state; -in, -t and -n are required, the rest have defaults.
my $HELP = 0;
my ($trainingFile, $crfTemplateLoc, $folds);
my $numCpus = 6;
my $isOldModel = 0;

# A failed parse is treated the same way as an explicit -h.
GetOptions('in=s' => \$trainingFile,
           't=s' => \$crfTemplateLoc,
           'n=i' => \$folds,
           'p=i' => \$numCpus,
           'oldModel' => \$isOldModel,
           'h' => \$HELP) or $HELP = 1;

my $missingRequired = grep { !defined } ($trainingFile, $folds, $crfTemplateLoc);
if ($HELP || $missingRequired) {
  Help();
  exit(0);
}
## End Thang add ##

# The old model uses the token-level feature script.
$tr2crfppLoc = "$parscitHome/bin/tr2crfpp.pl" if $isOldModel;
58
+
59
print STDERR "### Note the number of CPU for parallel crfpp is $numCpus\n";

# construct test data
# Cross-validation needs at least two folds: with fewer there is no training
# data, and zero folds would make the modulo below fail.
die "# $progname fatal\tNumber of folds must be at least 2, got \"$folds\"!" if $folds < 2;

print STDERR "### Constructing $folds-fold test files $tmpfile.*.test.src...\n"; # Thang add
open(my $trainIn, "<", $trainingFile)
  or die "# $progname fatal\tTraining file cannot be opened \"$trainingFile\": $!";

# Deal the training lines round-robin into the fold files. Each fold file is
# opened (for append) on first use and kept open until all lines are written.
my @foldOut = ();
my $fold = 0;
while (my $example = <$trainIn>) {
  if (!defined $foldOut[$fold]) {
    open($foldOut[$fold], ">>", "$tmpfile.$fold.test.src")
      or die "$progname fatal\tCan't append to file \"$tmpfile.$fold.test.src\": $!";
  }
  print { $foldOut[$fold] } $example;
  $fold = ($fold + 1) % $folds;
}
close($trainIn);

# Close every fold file now so that all buffered lines are on disk before the
# external commands below read them.
for my $index (0 .. $#foldOut) {
  next unless defined $foldOut[$index];
  close($foldOut[$index])
    or die "$progname fatal\tCan't write file \"$tmpfile.$index.test.src\": $!";
}
72
+
73
# construct crf features
if ($isOldModel) {
  foreach my $fold (0 .. $folds - 1) {
    execute("$tr2crfppLoc $tmpfile.$fold.test.src > $tmpfile.$fold.test");
  }
} else {
  foreach my $fold (0 .. $folds - 1) {
    # construct src training data first: every fold except the held-out one
    foreach my $other (grep { $_ != $fold } 0 .. $folds - 1) {
      execute("cat $tmpfile.$other.test.src >> $tmpfile.$fold.train.src");
    }

    #construct keywordFile, using topN = 100
    my $topN = 100;
    execute("$keywordLoc -in $tmpfile.$fold.train.src -out $tmpfile.$fold.keywords -n $topN"); # keyword file
    execute("$keywordLoc -in $tmpfile.$fold.train.src -out $tmpfile.$fold.bigram -n $topN -nGram 2"); # bigram file

    # create crf features
    execute("$tr2crfppLoc -in $tmpfile.$fold.test.src -out $tmpfile.$fold.test -k $tmpfile.$fold.keywords -b $tmpfile.$fold.bigram 1>/dev/null");
  }
}

# construct training data: for each fold, concatenate the features of all others
foreach my $fold (0 .. $folds - 1) {
  foreach my $other (grep { $_ != $fold } 0 .. $folds - 1) {
    execute("cat $tmpfile.$other.test >> $tmpfile.$fold.train");
  }
}

# train
print STDERR "### Training ...\n"; # Thang add
foreach my $fold (0 .. $folds - 1) {
  execute("$crf_learnLoc -m 100 -p $numCpus -f 3 -c 3 $crfTemplateLoc $tmpfile.$fold.train $tmpfile.$fold.model");
}

# test
print STDERR "### Testing ...\n"; # Thang add
foreach my $fold (0 .. $folds - 1) {
  execute("$crf_testLoc -m $tmpfile.$fold.model $tmpfile.$fold.test > $tmpfile.$fold.out");

  if ($isOldModel) {
    execute("cat $tmpfile.$fold.out >> $tmpfile.all.out");
  } else {
    # convert from line-level format to token-level format
    print STDERR "### Convert from line-level format to token-level format...\n";
    execute("$convertLoc -in1 $tmpfile.$fold.test.src -in2 $tmpfile.$fold.out -out $tmpfile.$fold.out.convert 2>$tmpfile.$fold.out.log");
    execute("cat $tmpfile.$fold.out.convert >> $tmpfile.all.out");
  }
}


# eval
#for (my $i = 0; $i < $folds; $i++) {
# my $cmd = "$conllevalLoc -r -d \" \" < $tmpfile.$i.out";
# print "$cmd\n";
# system ($cmd);
#}
print STDERR "### Evaluating ...\n"; # Thang add
execute("$conllevalLoc -r -d \" \" < $tmpfile.all.out");

# clean up
#`rm -f $tmpfile*`;
142
+
143
+ ## Thang add ##
144
sub execute {
  # Echo a command line to STDERR and run it with system().
  #
  # Returns the raw system() status (0 on success). A command that cannot be
  # started or that exits non-zero is reported on STDERR instead of being
  # silently ignored; the script still continues, as before.
  my ($cmd) = @_;
  print STDERR "Executing: $cmd\n";

  my $status = system($cmd);
  if ($status == -1) {
    warn "Failed to start: $cmd ($!)\n";
  } elsif ($status != 0) {
    warn "Command exited with status " . ($status >> 8) . ": $cmd\n";
  }

  return $status;
}
149
+ ## End Thang add ##
150
+
151
+ ######################################################################
152
+ # 2-fold -m 100 -f 3 -c 3 on svm_headerparse.txt
153
+ ### Token-level model
154
+ # accuracy: 96.76%; precision: 96.76%; recall: 96.76%; FB1: 96.76
155
+ # abstract: precision: 99.04%; recall: 99.58%; FB1: 99.31 121194
156
+ # address: precision: 93.75%; recall: 89.43%; FB1: 91.54 5342
157
+ # affiliation: precision: 92.65%; recall: 92.06%; FB1: 92.35 9823
158
+ # author: precision: 94.11%; recall: 92.57%; FB1: 93.33 7825
159
+ # date: precision: 89.94%; recall: 92.59%; FB1: 91.25 1084
160
+ # degree: precision: 80.67%; recall: 84.30%; FB1: 82.45 2163
161
+ # email: precision: 91.54%; recall: 90.30%; FB1: 90.91 1667
162
+ # intro: precision: 97.56%; recall: 96.49%; FB1: 97.03 1354
163
+ # keyword: precision: 92.73%; recall: 81.24%; FB1: 86.60 1966
164
+ # note: precision: 86.70%; recall: 87.27%; FB1: 86.99 11220
165
+ # page: precision: 100.00%; recall: 100.00%; FB1: 100.00 288
166
+ # phone: precision: 92.17%; recall: 79.37%; FB1: 85.29 434
167
+ # pubnum: precision: 91.55%; recall: 85.83%; FB1: 88.60 556
168
+ # title: precision: 94.22%; recall: 95.10%; FB1: 94.66 9043
169
+ # web: precision: 77.32%; recall: 45.45%; FB1: 57.25 97
170
+ #
171
+ ### Line-level model
172
+ #accuracy: 97.15%; precision: 97.15%; recall: 97.15%; FB1: 97.15
173
+ # abstract: precision: 98.70%; recall: 99.77%; FB1: 99.23 121842
174
+ # address: precision: 95.81%; recall: 97.01%; FB1: 96.40 5653
175
+ # affiliation: precision: 95.33%; recall: 95.94%; FB1: 95.63 9955
176
+ # author: precision: 93.31%; recall: 95.05%; FB1: 94.17 8084
177
+ # date: precision: 87.00%; recall: 94.07%; FB1: 90.40 1131
178
+ # degree: precision: 98.38%; recall: 58.55%; FB1: 73.41 1232
179
+ # email: precision: 97.37%; recall: 96.96%; FB1: 97.16 1671
180
+ # intro: precision: 90.77%; recall: 99.85%; FB1: 95.10 1506
181
+ # keyword: precision: 95.29%; recall: 80.17%; FB1: 87.08 1888
182
+ # note: precision: 89.86%; recall: 82.55%; FB1: 86.05 10236
183
+ # page: precision: 100.00%; recall: 100.00%; FB1: 100.00 288
184
+ # phone: precision: 98.02%; recall: 88.49%; FB1: 93.01 455
185
+ # pubnum: precision: 92.18%; recall: 85.93%; FB1: 88.95 550
186
+ # title: precision: 93.66%; recall: 98.05%; FB1: 95.80 9379
187
+ # web: precision: 98.40%; recall: 75.93%; FB1: 85.71 125
188
+ #
189
+ # Model Type of features FB1 # features
190
+ # Token-level model 96.76% 1091340
191
+ # Line-level model
192
+ # fullToken Parscit features 95.72% 60285
193
+ # fullToken.firstToken Parscit features 95.94% 128745
194
+ # fullToken.firstSecondLastToken Parscit features 96.59% 259050
195
+ # fullToken.firstSecondSecondLastToken Parscit features 96.68% 324255
196
+ # fullToken.firstSecondSecondLastToken.back1 linking 96.86% 392625
197
+ # fullToken.firstSecondSecondLastToken.back1.forw1 linking 96.98% 458175 further linking reduces performance
198
+ # fullToken.firstSecondSecondLastToken.back1.forw1.keyword keyword 97.15% 461085