biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,653 @@
1
+ package ParsCit::Controller;
2
+
3
+ ###
4
+ # This package is used to pull together various citation
5
+ # processing modules in the ParsCit distribution, serving
6
+ # as a script for handling the entire citation processing
7
+ # control flow. The ExtractCitations subroutine should be
8
+ # the only needed API element if XML output is desired;
9
+ # however, the ExtractCitationsImpl subroutine can be used
10
+ # to get direct access to the list of citation objects.
11
+ #
12
+ # Isaac Councill, 07/23/07
13
+ ###
14
+
15
+ require 'dumpvar.pl';
16
+
17
+ use strict;
18
+ # Local libraries
19
+ use ParsCit::Config;
20
+ use ParsCit::Tr2crfpp;
21
+ use ParsCit::PreProcess;
22
+ use ParsCit::PostProcess;
23
+ use ParsCit::CitationContext;
24
+ # Omnipage libraries
25
+ use Omni::Omnidoc;
26
+ # Dependencies
27
+ use CSXUtil::SafeText qw(cleanXML);
28
+
29
+ ###
30
+ # Main API method for generating an XML document including
31
+ # all citation data. Returns a reference to the XML document,
32
+ # or a reference to an error message string on failure.
33
+ ###
34
+
35
+ # Extract citations from text
36
# Public entry point: run the whole citation pipeline on a document and
# hand back the result as XML.
#
# Parameters:
#   $text_file - path of the plain-text version of the document
#   $org_file  - path of the original file (read when $is_xml is true)
#   $is_xml    - true when the input should be processed as XML
#
# Returns a reference to a string: the XML built by BuildXMLResponse when
# ExtractCitationsImpl reports a positive status, otherwise the text
# "Error: " followed by the message it reported.
sub ExtractCitations
{
    my ($text_file, $org_file, $is_xml) = @_;

    # All of the real work happens in the implementation routine; only the
    # first three values of its result list are needed here.
    my ($status, $msg, $citations) = ExtractCitationsImpl($text_file, $org_file, $is_xml);

    # Anything that is not a positive status is reported as an error string.
    unless ($status > 0)
    {
        my $error = "Error: " . $msg;
        return \$error;
    }

    return BuildXMLResponse($citations);
}
54
+
55
+ ###
56
+ # Huydhn
57
+ # Extract citations from text
58
+ # The reference section will be provided by sectlabel.
59
+ # Previously, ParsCit found this section itself using
60
+ # regular expressions.
61
+ ###
62
# Variant of the public entry point for callers that already know where the
# reference section is: the arguments are passed straight through to
# ExtractCitationsImpl2.
#
# Parameters:
#   $all_text, $cit_lines, $is_xml, $doc, $cit_addrs - forwarded unchanged
#
# Returns a reference to a string: the XML built by BuildXMLResponse when
# ExtractCitationsImpl2 reports a positive status, otherwise the text
# "Error: " followed by the message it reported.
sub ExtractCitations2
{
    my ($all_text, $cit_lines, $is_xml, $doc, $cit_addrs) = @_;

    # Delegate; only status, message and the citation list are used here.
    my ($status, $msg, $citations) = ExtractCitationsImpl2($all_text, $cit_lines, $is_xml, $doc, $cit_addrs);

    # Anything that is not a positive status is reported as an error string.
    unless ($status > 0)
    {
        my $error = "Error: " . $msg;
        return \$error;
    }

    return BuildXMLResponse($citations);
}
80
+
81
# Parse a file in which the citations are already segmented: consecutive
# lines form one citation and a blank line ends it.
#
# Parameters:
#   $text_file - path of the input file, read as UTF-8
#
# Returns (the shape depends on the exit path and is kept as it always was):
#   - (-1, $message)                  when the input file cannot be opened
#   - ($status, $msg, \@citations)    with an empty list when the file holds
#                                     no non-blank citation
#   - ($tstatus, $tmsg, undef, undef) when ReadAndNormalize reports a
#                                     non-positive status
#   - otherwise the value of BuildXMLResponse (a reference to an XML string);
#     if decoding fails or the decoded count does not match, the XML holds
#     an empty citation list.
# NOTE(review): the list-shaped error returns and the scalar-ref success
# return cannot both suit one caller - confirm against the call sites.
sub ExtractCitationsAlreadySegmented
{
    my ($text_file) = @_;

    my ($status, $msg) = (1, "");

    # Cannot open input file, return error message
    my $in;
    if (! open($in, "<:utf8", $text_file)) { return (-1, "Could not open file " . $text_file . ": " . $!); }

    my @raw_citations    = ();
    my $current_citation = undef;

    while (my $line = <$in>)
    {
        # Remove eol
        chomp($line);

        # A blank line closes the citation collected so far
        if ($line =~ m/^\s*$/ && defined $current_citation)
        {
            my $cite = ParsCit::Citation->new();
            $cite->setString($current_citation);
            push @raw_citations, $cite;
            $current_citation = undef;
            next;
        }

        if (! defined $current_citation)
        {
            # First line of a new citation
            $current_citation = $line;
        }
        else
        {
            # Continuation line: join with a single space
            $current_citation = $current_citation . " " . $line;
        }
    }

    # Close the input after reading
    close $in;

    # Save the last citation. The text must be attached here as well,
    # otherwise a file that does not end with a blank line loses its final
    # citation (an object without a string is filtered out below).
    if (defined $current_citation)
    {
        my $cite = ParsCit::Citation->new();
        $cite->setString($current_citation);
        push @raw_citations, $cite;
    }

    my @citations            = ();
    my @valid_citations      = ();
    my $normalized_cite_text = "";

    foreach my $citation (@raw_citations)
    {
        # Tr2cfpp needs an enclosing tag for initial class seed.
        my $cite_string = $citation->getString();

        if (defined $cite_string && $cite_string !~ m/^\s*$/)
        {
            $normalized_cite_text .= "<title> " . $citation->getString() . " </title>\n";
            push @citations, $citation;
        }
    }

    # Stop - nothing left to do.
    if ($#citations < 0) { return ($status, $msg, \@valid_citations); }

    my $tmpfile = ParsCit::Tr2crfpp::PrepData(\$normalized_cite_text, $text_file);
    my $outfile = $tmpfile . "_dec";

    if (ParsCit::Tr2crfpp::Decode($tmpfile, $outfile))
    {
        my ($raw_xml, $cite_info, $tstatus, $tmsg) = ParsCit::PostProcess::ReadAndNormalize($outfile);

        if ($tstatus <= 0)
        {
            # Do not leave the temporary files behind, and report the
            # message that came with the failing status.
            unlink($tmpfile);
            unlink($outfile);
            return ($tstatus, $tmsg, undef, undef);
        }

        my @all_cite_info = @{ $cite_info };

        # Decoded records are matched to citations by index, so the two
        # lists must have the same length. On a mismatch no citation is
        # accepted and the XML below carries an empty list.
        if ($#citations == $#all_cite_info)
        {
            for (my $i = 0; $i <= $#citations; $i++)
            {
                my $citation  = $citations[ $i ];
                my %cite_hash = %{ $all_cite_info[ $i ] };

                foreach my $key (keys %cite_hash)
                {
                    $citation->loadDataItem($key, $cite_hash{ $key });
                }

                # Fall back to a marker built by the citation itself
                my $marker = $citation->getMarker();

                if (! defined $marker)
                {
                    $marker = $citation->buildAuthYearMarker();
                    $citation->setMarker($marker);
                }

                push @valid_citations, $citation;
            }
        }
    }

    unlink($tmpfile);
    unlink($outfile);

    return BuildXMLResponse(\@valid_citations);
}
196
+
197
+ # Thang: tmp method for debugging purpose
198
# Debugging helper: write every element of an array to a file, one element
# per line, encoded as UTF-8.
#
# Parameters:
#   $filename - path of the file to create or overwrite
#   $tokens   - reference to the array of values to print
#
# Returns the result of close() on success. When the file cannot be opened
# a message is printed to STDERR and nothing is returned.
sub PrintArray
{
    my ($filename, $tokens) = @_;

    my $out;
    if (! open($out, ">:utf8", $filename))
    {
        # Previously the failure went unnoticed and the output was lost.
        print STDERR "Could not open " . $filename . " for writing: " . $! . "\n";
        return;
    }

    foreach my $token (@{ $tokens }) { print {$out} $token, "\n"; }

    return close($out);
}
205
+
206
+ ###
207
+ # Main script for actually walking through the steps of citation
208
+ # processing. Returns a status code (<= 0 for failure), an error
209
+ # message (may be blank if no error), a reference to an array of
210
+ # citation objects and a reference to the body text of the article
211
+ # being processed.
212
+ ###
213
# Walk through all steps of citation processing for one document.
#
# Parameters:
#   $textfile     - path of the plain-text document; read when $is_xml is
#                   false and always used to derive output file names
#   $orgfile      - path of the XML document, read when $is_xml is true
#   $is_xml       - true to take the input from $orgfile
#   $bwrite_split - optional; when > 0 the citation and body text are written
#                   out via WriteSplit. Defaults to
#                   $ParsCit::Config::bWriteSplit when not defined.
#
# Returns a list:
#   - (-1, $message) when the input file cannot be opened
#   - ($status, $msg, \@citations, $rnorm_body_text) with an empty citation
#     list when no non-blank citation string was found
#   - ($tstatus, $tmsg, undef, undef) when ReadAndNormalize reports a
#     non-positive status
#   - ($status, $msg, \@valid_citations, $rbody_text, $citefile, $bodyfile)
#     otherwise; $status is -1 when the decoded records do not line up with
#     the citations.
# NOTE(review): the "nothing found" return hands back the normalized body
# text while the regular return hands back $rbody_text - confirm which one
# callers expect.
#
# Dies when the number of normalized body tokens differs from the number of
# entries in the position array.
sub ExtractCitationsImpl
{
    my ($textfile, $orgfile, $is_xml, $bwrite_split) = @_;

    if (! defined $bwrite_split) { $bwrite_split = $ParsCit::Config::bWriteSplit; }

    # Status and error message initialization
    my ($status, $msg) = (1, "");

    # Names of the files written by WriteSplit (stay empty when not written)
    my ($citefile, $bodyfile) = ("", "");
    # One entry per token of the normalized body text; its values are used
    # below as the recorded word positions
    my @pos_array = ();
    # References to the reference text, the normalized body text and the body text
    my ($rcite_text, $rnorm_body_text, $rbody_text);
    # Reference to an array of single references
    my $rraw_citations = undef;

    # Find and separate references
    if ($is_xml)
    {
        # Input is XML from Omnipage
        my $xml_in;
        if (! open($xml_in, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
        my $xml = do { local $/; <$xml_in> };
        close $xml_in;

        # The Omnipage XML is several XML files concatenated, so it has to be
        # repaired before it is handed to an XML library.
        # Convert to Unix format
        $xml =~ s/\r//g;
        # Remove <?xml version="1.0" encoding="UTF-8"?>
        $xml =~ s/<\?xml.+?>\n//g;
        # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
        $xml =~ s/<\!\-\-XML.+?>\n//g;
        # Declaration and root
        $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

        # New document
        my $doc = Omni::Omnidoc->new();
        $doc->set_raw($xml);

        # Extract the reference portion from the XML
        my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);

        # Extract the reference portion from the text as well; this fills
        # @pos_array and yields the body text used for the contexts.
        # TODO (from the original author): this text-based pass is meant to go away.
        my $content = $doc->get_content();
        ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);

        my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);

        my $size  = scalar(@norm_body_tokens);
        my $size1 = scalar(@pos_array);

        if ($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }

        # Filename initialization
        if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }

        # Prepare to split unmarked reference portion
        my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);

        # Extract citations from citation text
        $rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
    }
    else
    {
        my $text_in;
        if (! open($text_in, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
        my $text = do { local $/; <$text_in> };
        close $text_in;

        # Map each position in norm_body_text to a position in body_text,
        # scalar(@pos_array) = number of tokens in norm_body_text.
        # TODO (from the original author): switch this function to the sectlabel module.
        ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$text, \@pos_array);

        my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);

        my $size  = scalar(@norm_body_tokens);
        my $size1 = scalar(@pos_array);

        if ($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }

        # Filename initialization
        if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text, $rbody_text); }

        # Extract citations from citation text
        $rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
    }

    my @citations       = ();
    my @valid_citations = ();

    # Keep only citations with a non-blank string and build the decoder input
    my $normalized_cite_text = "";
    foreach my $citation (@{ $rraw_citations })
    {
        # Tr2cfpp needs an enclosing tag for initial class seed.
        my $cite_string = $citation->getString();
        if (defined $cite_string && $cite_string !~ m/^\s*$/)
        {
            $normalized_cite_text .= "<title> " . $citation->getString() . " </title>\n";
            push @citations, $citation;
        }
    }

    # Stop - nothing left to do.
    if ($#citations < 0) { return ($status, $msg, \@valid_citations, $rnorm_body_text); }

    my $tmpfile = ParsCit::Tr2crfpp::PrepData(\$normalized_cite_text, $textfile);
    my $outfile = $tmpfile . "_dec";

    if (ParsCit::Tr2crfpp::Decode($tmpfile, $outfile))
    {
        my ($rraw_xml, $rcite_info, $tstatus, $tmsg) = ParsCit::PostProcess::ReadAndNormalize($outfile);
        if ($tstatus <= 0)
        {
            # Do not leave the temporary files behind, and report the
            # message that came with the failing status.
            unlink($tmpfile);
            unlink($outfile);
            return ($tstatus, $tmsg, undef, undef);
        }

        my @cite_info = @{ $rcite_info };

        # Decoded records are matched to citations by index
        if ($#citations == $#cite_info)
        {
            for (my $i = 0; $i <= $#citations; $i++)
            {
                my $citation  = $citations[ $i ];
                my %cite_info = %{ $cite_info[ $i ] };

                foreach my $key (keys %cite_info)
                {
                    $citation->loadDataItem($key, $cite_info{ $key });
                }

                # Fall back to a marker built by the citation itself
                my $marker = $citation->getMarker();
                if (! defined $marker)
                {
                    $marker = $citation->buildAuthYearMarker();
                    $citation->setMarker($marker);
                }

                # GetCitationContext returns five parallel lists: contexts,
                # positions, start/end word positions and the in-text
                # citation strings.
                my ($rcontexts, $rpositions, $start_word_positions, $end_word_positions, $rcit_strs) =
                    ParsCit::CitationContext::GetCitationContext($rnorm_body_text, \@pos_array, $marker);

                # Consume the parallel lists in step with the contexts
                foreach my $context (@{ $rcontexts })
                {
                    # Next citation context
                    $citation->addContext($context);

                    # Next citation position
                    my $position = shift @{ $rpositions };
                    $citation->addPosition($position);

                    # Next citation string
                    my $cit_str = shift @{ $rcit_strs };
                    $citation->addCitStr($cit_str);

                    # Next start and end of citation; the returned values
                    # index into @pos_array.
                    my $start_pos = shift @{ $start_word_positions };
                    my $end_pos   = shift @{ $end_word_positions };

                    $citation->addStartWordPosition( $pos_array[ $start_pos ] );
                    $citation->addEndWordPosition( $pos_array[ $end_pos ] );
                }

                push @valid_citations, $citation;
            }
        }
        else
        {
            $status = -1;
            $msg    = "Mismatch between expected citations and cite info";
        }
    }

    unlink($tmpfile);
    unlink($outfile);

    # Our work here is done
    return ($status, $msg, \@valid_citations, $rbody_text, $citefile, $bodyfile);
}
419
+
420
+ ###
421
+ # Huydhn
422
+ # New function for citation extraction based on the output
423
+ # of sectlabel
424
+ ###
425
# Citation extraction for callers that supply the reference lines
# themselves instead of letting this module locate them.
#
# Parameters:
#   $all_text, $cit_lines - passed to ParsCit::PreProcess::FindCitationText2
#   $is_xml               - true to segment the references with the XML-based
#                           routine, false for the plain-text one
#   $doc, $cit_addrs      - passed to ParsCit::Tr2crfpp::PrepDataUnmarked;
#                           only used when $is_xml is true
#
# Returns a list:
#   - ($status, $msg, \@citations, $rnorm_body_text) with an empty citation
#     list when no non-blank citation string was found
#   - ($tstatus, $tmsg, undef, undef) when ReadAndNormalize reports a
#     non-positive status
#   - ($status, $msg, \@valid_citations, $rbody_text) otherwise; $status is
#     -1 when the decoded records do not line up with the citations.
#
# Dies when the number of normalized body tokens differs from the number of
# entries in the position array.
sub ExtractCitationsImpl2
{
    my ($all_text, $cit_lines, $is_xml, $doc, $cit_addrs) = @_;

    # Status and error message initialization
    my ($status, $msg) = (1, "");

    # One entry per token of the normalized body text; its values are used
    # below as the recorded word positions
    my @pos_array = ();
    # Reference to an array of single references
    my $rraw_citations = undef;

    # Both input kinds share this step: split the text into reference text,
    # normalized body text and body text, filling @pos_array on the way.
    my ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText2($all_text, $cit_lines, \@pos_array);

    my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);

    my $size  = scalar(@norm_body_tokens);
    my $size1 = scalar(@pos_array);

    if ($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }

    # Only the segmentation of the reference text depends on the input kind
    if ($is_xml)
    {
        # Prepare to split unmarked reference portion
        my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $cit_addrs);

        # Extract citations from citation text
        $rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text, $tmp_file);
    }
    else
    {
        # Extract citations from citation text
        $rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
    }

    my @citations       = ();
    my @valid_citations = ();

    # Keep only citations with a non-blank string and build the decoder input
    my $normalized_cite_text = "";
    foreach my $citation (@{ $rraw_citations })
    {
        # Tr2cfpp needs an enclosing tag for initial class seed.
        my $cite_string = $citation->getString();
        if (defined $cite_string && $cite_string !~ m/^\s*$/)
        {
            $normalized_cite_text .= "<title> " . $citation->getString() . " </title>\n";
            push @citations, $citation;
        }
    }

    # Stop - nothing left to do.
    if ($#citations < 0) { return ($status, $msg, \@valid_citations, $rnorm_body_text); }

    my $tmpfile = ParsCit::Tr2crfpp::PrepData(\$normalized_cite_text, "");
    my $outfile = $tmpfile . "_dec";

    if (ParsCit::Tr2crfpp::Decode($tmpfile, $outfile))
    {
        my ($rraw_xml, $rcite_info, $tstatus, $tmsg) = ParsCit::PostProcess::ReadAndNormalize($outfile);
        if ($tstatus <= 0)
        {
            # Do not leave the temporary files behind, and report the
            # message that came with the failing status.
            unlink($tmpfile);
            unlink($outfile);
            return ($tstatus, $tmsg, undef, undef);
        }

        my @cite_info = @{ $rcite_info };

        # Decoded records are matched to citations by index
        if ($#citations == $#cite_info)
        {
            for (my $i = 0; $i <= $#citations; $i++)
            {
                my $citation  = $citations[ $i ];
                my %cite_info = %{ $cite_info[ $i ] };

                foreach my $key (keys %cite_info)
                {
                    $citation->loadDataItem($key, $cite_info{ $key });
                }

                # Fall back to a marker built by the citation itself
                my $marker = $citation->getMarker();
                if (! defined $marker)
                {
                    $marker = $citation->buildAuthYearMarker();
                    $citation->setMarker($marker);
                }

                # GetCitationContext returns five parallel lists: contexts,
                # positions, start/end word positions and the in-text
                # citation strings.
                my ($rcontexts, $rpositions, $start_word_positions, $end_word_positions, $rcit_strs) =
                    ParsCit::CitationContext::GetCitationContext($rnorm_body_text, \@pos_array, $marker);

                # Consume the parallel lists in step with the contexts
                foreach my $context (@{ $rcontexts })
                {
                    # Next citation context
                    $citation->addContext($context);

                    # Next citation position
                    my $position = shift @{ $rpositions };
                    $citation->addPosition($position);

                    # Next citation string
                    my $cit_str = shift @{ $rcit_strs };
                    $citation->addCitStr($cit_str);

                    # Next start and end of citation; the returned values
                    # index into @pos_array.
                    my $start_pos = shift @{ $start_word_positions };
                    my $end_pos   = shift @{ $end_word_positions };

                    $citation->addStartWordPosition( $pos_array[ $start_pos ] );
                    $citation->addEndWordPosition( $pos_array[ $end_pos ] );
                }

                push @valid_citations, $citation;
            }
        }
        else
        {
            $status = -1;
            $msg    = "Mismatch between expected citations and cite info";
        }
    }

    unlink($tmpfile);
    unlink($outfile);

    # Our work here is done
    return ($status, $msg, \@valid_citations, $rbody_text);
}
588
+
589
+ # Write citation list in xml format
590
# Serialize a list of citation objects as one XML document.
#
# Parameters:
#   $rcitations - reference to an array of objects providing toXML()
#
# Returns a reference to the XML string: an <algorithm> element carrying the
# configured algorithm name and version, wrapping a <citationList> that
# holds the toXML() output of every citation in order.
sub BuildXMLResponse
{
    my ($rcitations) = @_;

    # Work on copies so that cleanXML does not touch the configuration values
    my $l_alg_name    = $ParsCit::Config::algorithmName;
    my $l_alg_version = $ParsCit::Config::algorithmVersion;

    cleanXML(\$l_alg_name);
    cleanXML(\$l_alg_version);

    # Collect the document piece by piece and glue it together at the end
    my @pieces = (
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
        "<algorithm name=\"$l_alg_name\" ",
        "version=\"$l_alg_version\">\n",
        "<citationList>\n",
    );

    foreach my $entry (@{ $rcitations })
    {
        push @pieces, scalar($entry->toXML());
    }

    push @pieces, "</citationList>\n", "</algorithm>\n";

    my $xml = join("", @pieces);
    return \$xml;
}
610
+
611
+ #
612
# Write the citation text and the body text of a document to two files
# next to it, named after the document with the extensions "cite" and
# "body".
#
# Parameters:
#   $textfile   - path the output names are derived from (ChangeExtension)
#   $rcite_text - reference to the citation text
#   $rbody_text - reference to the body text
#
# Returns ($citefile, $bodyfile), the two output paths. A file that cannot
# be opened is reported on STDERR and skipped; the paths are returned either
# way.
sub WriteSplit
{
    my ($textfile, $rcite_text, $rbody_text) = @_;

    my $citefile = ChangeExtension($textfile, "cite");
    my $bodyfile = ChangeExtension($textfile, "body");

    # Same procedure for both outputs: extension label, target path, content
    foreach my $job ([ "cite", $citefile, $rcite_text ], [ "body", $bodyfile, $rbody_text ])
    {
        my ($label, $path, $rtext) = @{ $job };

        # Three-argument open: the path is never parsed for a mode or for
        # surrounding whitespace, whatever characters it contains.
        if (open(my $out, ">", $path))
        {
            binmode $out, ":utf8";
            print {$out} $$rtext;
            close $out;
        }
        else
        {
            print STDERR "Could not open .$label file for writing: $!\n";
        }
    }

    # Our work here is done
    return ($citefile, $bodyfile);
}
644
+
645
+ # Support function: change the extension of a file
646
# Support function: replace the extension of a file name.
#
# Parameters:
#   $fn  - file name or path
#   $ext - new extension, without the leading dot
#
# Returns the name with everything after the last dot of its final path
# component replaced by $ext. When the final component contains no dot,
# "." and $ext are appended instead. Dots in directory components are left
# alone, so "/tmp/run.1/paper" becomes "/tmp/run.1/paper.cite" rather than
# "/tmp/run.cite".
sub ChangeExtension
{
    my ($fn, $ext) = @_;

    # The extension is the last dot plus whatever follows it, provided no
    # further dot or "/" follows - i.e. it lies in the final component.
    unless ($fn =~ s/\.[^.\/]*$/.$ext/) { $fn .= "." . $ext; }

    return $fn;
}
652
+
653
+ 1;