biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,1041 @@
1
+ package ParsCit::PreProcess;
2
+
3
+ ###
4
+ # Utilities for finding and normalizing citations within
5
+ # text files, including separating citation text from
6
+ # body text and segmenting citations.
7
+ #
8
+ # Isaac Councill, 7/19/07
9
+ ###
10
+
11
+ use utf8;
12
+ use strict;
13
+
14
+ use Omni::Config;
15
+ use ParsCit::Citation;
16
+
17
# Regular-expression source text for every supported citation marker style.
# The values are plain strings (not qr// objects) because they are
# interpolated into larger patterns by the subs below.
#
# Alternative, stricter numeric patterns once proposed by Artemy Kolchinsky
# (v090625), kept here for reference only:
#   NAKEDNUM    => \d{1,3}
#   NAKEDNUMDOT => \d{1,3}\.
my %marker_types = (
    'SQUARE'      => q{\[.+?\]},
    'PAREN'       => q{\(.+?\)},
    'NAKEDNUM'    => q{\d+},
    'NAKEDNUMDOT' => q{\d+\.},
);

# Omnilib configuration: object name
my $obj_list = $Omni::Config::obj_list;
27
+
28
###
# Huydhn: XML counterpart of FindCitationText. Locates the reference section
# with regular expressions, but walks an Omnipage XML document object
# instead of raw text.
#
# The document is traversed as four nested levels, each obtained from its
# parent with get_objs_ref(): pages (L1), columns (L2), paragraphs (L3) and
# lines (L4). The text of a line is read with get_content().
#
# Returns the list
#   (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs)
# where
#   %start_ref - address (keys L1..L4) of the first line after the header
#   %end_ref   - address of the last line of the reference section
#   $reference_text - the collected lines, each followed by "\n"
#   @cit_addrs - one { L1, L2, L3, L4 } hash per collected line
# When no usable header is found the hashes are empty, the text is "" and
# the fourth element is NOT returned. The same three-element form is
# returned when the section is judged too long compared to the document.
###
sub FindCitationTextXML
{
    my ($doc) = @_;

    # Positions or addresses of all lines in the reference
    my @cit_addrs = ();

    # Start and end of the reference section
    my %start_ref = ();
    my %end_ref   = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # Pass 1: walk the document backwards, so that the LAST header-looking
    # line wins, and remember the address of the line that follows it.
    SEARCH_HEADER:
    foreach my $pg (reverse 0 .. $#{ $pages })
    {
        my $columns = $pages->[ $pg ]->get_objs_ref();

        foreach my $col (reverse 0 .. $#{ $columns })
        {
            my $paras = $columns->[ $col ]->get_objs_ref();

            foreach my $par (reverse 0 .. $#{ $paras })
            {
                my $lines = $paras->[ $par ]->get_objs_ref();

                foreach my $ln (reverse 0 .. $#{ $lines })
                {
                    my $ln_content = $lines->[ $ln ]->get_content();

                    # Not a reference header: keep looking
                    next if ($ln_content !~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*$/);

                    # The section starts on the line right after the header,
                    # which may be in the next paragraph, column or page.
                    if ($ln < $#{ $lines })
                    {
                        %start_ref = ('L1' => $pg, 'L2' => $col, 'L3' => $par, 'L4' => $ln + 1);
                    }
                    elsif ($par < $#{ $paras })
                    {
                        %start_ref = ('L1' => $pg, 'L2' => $col, 'L3' => $par + 1, 'L4' => 0);
                    }
                    elsif ($col < $#{ $columns })
                    {
                        %start_ref = ('L1' => $pg, 'L2' => $col + 1, 'L3' => 0, 'L4' => 0);
                    }
                    elsif ($pg < $#{ $pages })
                    {
                        %start_ref = ('L1' => $pg + 1, 'L2' => 0, 'L3' => 0, 'L4' => 0);
                    }
                    # Otherwise the header is the very last line of the
                    # document: nothing follows it, %start_ref stays empty.

                    last SEARCH_HEADER;
                }
            }
        }
    }

    # Reference length
    my $reference_length = 0;
    # Citation
    my $reference_text = "";

    # Reference not found
    if (! exists $start_ref{ 'L1' }) { return (\%start_ref, \%end_ref, \$reference_text); }

    # Pass 2: walk forward from the start address, collecting lines until a
    # line that looks like the heading of another section is met.
    COLLECT_LINES:
    foreach my $pg ($start_ref{ 'L1' } .. $#{ $pages })
    {
        my $columns = $pages->[ $pg ]->get_objs_ref();

        # Only the very first page/column/paragraph is entered part-way
        my $on_start_page = ($pg == $start_ref{ 'L1' });
        my $first_col     = $on_start_page ? $start_ref{ 'L2' } : 0;

        foreach my $col ($first_col .. $#{ $columns })
        {
            my $paras = $columns->[ $col ]->get_objs_ref();

            my $on_start_col = ($on_start_page && ($col == $start_ref{ 'L2' }));
            my $first_par    = $on_start_col ? $start_ref{ 'L3' } : 0;

            foreach my $par ($first_par .. $#{ $paras })
            {
                my $lines = $paras->[ $par ]->get_objs_ref();

                my $on_start_par = ($on_start_col && ($par == $start_ref{ 'L3' }));
                my $first_ln     = $on_start_par ? $start_ref{ 'L4' } : 0;

                foreach my $ln ($first_ln .. $#{ $lines })
                {
                    my $ln_content = $lines->[ $ln ]->get_content();

                    # Not the heading of another section, so, logically, the
                    # line belongs to the reference.
                    if ($ln_content !~ m/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)$/)
                    {
                        push @cit_addrs, { 'L1' => $pg, 'L2' => $col, 'L3' => $par, 'L4' => $ln };

                        $reference_length += length($ln_content);
                        $reference_text   .= $ln_content . "\n";
                        next;
                    }

                    # A new section starts here: the reference ends on the
                    # line just before it, which may be the last line of the
                    # previous paragraph, column or page.
                    if ($ln != 0)
                    {
                        %end_ref = ('L1' => $pg, 'L2' => $col, 'L3' => $par, 'L4' => $ln - 1);
                    }
                    elsif ($par != 0)
                    {
                        my $prev_lines = $paras->[ $par - 1 ]->get_objs_ref();

                        %end_ref = ('L1' => $pg, 'L2' => $col, 'L3' => $par - 1, 'L4' => $#{ $prev_lines });
                    }
                    elsif ($col != 0)
                    {
                        my $prev_paras = $columns->[ $col - 1 ]->get_objs_ref();
                        my $prev_lines = $prev_paras->[ -1 ]->get_objs_ref();

                        %end_ref = ('L1' => $pg, 'L2' => $col - 1, 'L3' => $#{ $prev_paras }, 'L4' => $#{ $prev_lines });
                    }
                    elsif ($pg != 0)
                    {
                        my $prev_columns = $pages->[ $pg - 1 ]->get_objs_ref();
                        my $prev_paras   = $prev_columns->[ -1 ]->get_objs_ref();
                        my $prev_lines   = $prev_paras->[ -1 ]->get_objs_ref();

                        %end_ref = ('L1' => $pg - 1, 'L2' => $#{ $prev_columns }, 'L3' => $#{ $prev_paras }, 'L4' => $#{ $prev_lines });
                    }
                    # Otherwise the end is at the very beginning of the
                    # document; %end_ref stays empty and the fallback below
                    # applies.

                    last COLLECT_LINES;
                }
            }
        }
    }

    # End of the reference not found, assume that it is the end of the document
    if (! exists $end_ref{ 'L1' })
    {
        my $last_columns = $pages->[ -1 ]->get_objs_ref();
        my $last_paras   = $last_columns->[ -1 ]->get_objs_ref();
        my $last_lines   = $last_paras->[ -1 ]->get_objs_ref();

        %end_ref = ('L1' => $#{ $pages }, 'L2' => $#{ $last_columns }, 'L3' => $#{ $last_paras }, 'L4' => $#{ $last_lines });
    }

    # Odd case: the reference section is too large compared to the whole
    # document content, so the detection is considered unreliable.
    if (1.8 * $reference_length >= 0.8 * length($doc->get_content()))
    {
        print STDERR "Citation text longer than article body: ignoring\n";

        %start_ref = (); %end_ref = (); $reference_text = "";
        return (\%start_ref, \%end_ref, \$reference_text);
    }

    # Now we have the citation text
    return (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs);
}
263
+
264
###
# Looks for reference section markers in the supplied text and separates
# the citation text from the body text. When several markers are present
# the LAST one is used: everything up to and including it is the body,
# everything after it is the citation text.
#
# Arguments: a reference to the full text, and an array reference that is
# handed to NormalizeBodyText to receive token positions.
#
# Returns a list of three scalar references: the citation text, the
# normalized body text, and the original (un-normalized) body text.
#
# NOTE(review): when the citation text is judged too long, the RAW citation
# text is returned, not an empty string as an earlier version of this
# comment stated - confirm which behaviour callers expect.
#
# HISTORY: header pattern corrected by Cheong Chi Hong (2 Feb 2010) and
# extended by Huy Do (15 Jan 2011).
###
sub FindCitationText
{
    my ($rtext, $pos_array) = @_;

    # Work on a copy of the text
    my $text     = $$rtext;
    my $bodytext = "";
    my $citetext = "";

    # Every match overwrites the previous split, so the last header wins
    while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*\n+/sg)
    {
        my $split_at = pos($text);

        $bodytext = substr($text, 0, $split_at);
        if ($split_at >= 1) { $citetext = substr($text, $split_at); }
    }

    # No citation
    if ($citetext eq "")
    {
        print STDERR "Citation text cannot be found: ignoring", "\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Odd case: the citation text is too long compared to the body
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # The citation text stops where another section starts: keep only the
    # part in front of the first such heading (if that part is not empty).
    my ($before_next_section, undef) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citetext);

    if (length($before_next_section) > 0) { $citetext = $before_next_section; }

    # No citation exists
    if ($citetext eq '0' || ! defined $citetext) { print STDERR "warning: no citation text found\n"; }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
323
+
324
###
# Huydhn: builds the citation and body text from raw text when the lines of
# the citation section are already known (provided by sectlabel).
#
# Arguments: a reference to the full text, a reference to the list of
# 0-based indices of the citation lines, and an array reference handed to
# NormalizeBodyText to receive token positions.
#
# The citation text is the concatenation of the listed lines; the body text
# is made of the lines that precede the FIRST listed index.
#
# Returns a list of three scalar references: the citation text, the
# normalized body text, and the un-normalized body text. The citation text
# is returned raw (not normalized) when it is judged too long.
###
sub FindCitationText2
{
    my ($rtext, $rcit_lines, $pos_array) = @_;

    # All lines in the document
    my @lines = split(/\n/, $$rtext);

    # Citation text: every line whose index is listed
    my $citetext = "";
    foreach my $line_index (@{ $rcit_lines })
    {
        $citetext .= $lines[ $line_index ] . "\n";
    }

    # Body text: every line in front of the first citation line
    my $bodytext = "";
    my $i        = 0;
    while ($i < $rcit_lines->[ 0 ])
    {
        $bodytext .= $lines[ $i ] . "\n";
        $i++;
    }

    # Odd case: the citation text is too long compared to the body
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
361
+
362
##
# Removes lines that appear to be junk from the citation text.
#
# Takes a reference to the citation text and returns a reference to a new
# string in which every line has been stripped of leading and trailing
# whitespace and the following lines have been dropped:
#   - lines that are empty after stripping;
#   - lines made only of digits, UNLESS the previous line ended with a
#     hyphen. The exception (Artemy Kolchinsky, v090625) keeps page ranges
#     broken across lines, e.g.
#         Smith B, "Blah Blah." Journal1, 2000, p. 23-
#         85
# "Previous line" means the previous input line, whether it was kept or not.
##
sub NormalizeCiteText
{
    my ($rcitetext) = @_;

    my @kept     = ();
    my $previous = "";

    foreach my $raw (split "\n", $$rcitetext)
    {
        my $line = $raw;
        $line =~ s/^\s*//g;    # Drop leading spaces (Thang, v090625)
        $line =~ s/\s*$//g;    # Drop trailing spaces (Thang, v090625)

        my $is_blank        = ($line =~ m/^\s*$/);
        my $is_stray_number = ($previous !~ m/\-$/ && $line =~ m/^\d*$/);

        push @kept, $line unless ($is_blank || $is_stray_number);

        $previous = $line;
    }

    my $newtext = join "\n", @kept;
    return \$newtext;
}
405
+
406
###
# Thang May 2010
# Expands bracketed citation markers that contain a numeric range, e.g.
#   "abc [1, 2-5, 11] def [1-3, 5] ghi jkl"
# becomes
#   "abc [1, 2, 3, 4, 5, 11] def [1, 2, 3, 5] ghi jkl"
# and, at the same time, records for every whitespace-separated token of
# the expanded line the index of the token it comes from in the original
# text (used later to retrieve context positions).
#
# Arguments:
#   $line        - one line of body text
#   $pos_array   - array reference; one original-token index is pushed per
#                  token of the returned line
#   $token_count - index of the first original token of this line
#
# Returns ($expanded_line, $next_token_count).
#
# Only markers containing exactly one range are handled; a marker such as
# "[1-3, 5-7]" does not match the pattern and is left untouched.
#
# FIX: the numbers before and after the range are now captured as whole
# lists. The previous pattern used quantified capture groups, which keep
# only their LAST repetition, so "[1, 2, 3-5, 7, 8]" was rewritten as
# "[2, 3, 4, 5, 8]": references were lost and the token counter drifted
# away from the original text.
###
sub ExpandBracketMarker
{
    my ($line, $pos_array, $token_count) = @_;

    my $newline    = "";
    my $space_flag = 0;    # 1 when a blank must be re-inserted before the next chunk

    # $1 = list before the range, $2/$3 = range bounds, $4 = list after it
    my $range_marker = qr/\[((?:\d+[,;] *)*)(\d+)-(\d+)((?:[,;] *\d+)*)\]/;

    # $line is consumed from the left: after each marker only the text that
    # follows it is kept, so every iteration searches from the beginning.
    while ($line =~ $range_marker)
    {
        my ($prefix, $low, $high, $suffix) = ($1, $2, $3, $4);
        my ($match_start, $match_end)      = ($-[0], $+[0]);

        my $front = substr($line, 0, $match_start);
        my $match = substr($line, $match_start, $match_end - $match_start);
        $line     = substr($line, $match_end);

        # Handle front part
        if ($space_flag == 1) { $newline .= " "; }
        $newline .= $front;

        my @front_tokens     = split(/\s+/, $front);
        my $front_ends_blank = ($front =~ / $/);

        foreach my $i (0 .. $#front_tokens)
        {
            # The last front token is counted here only when a blank
            # separates it from the marker; otherwise it is glued to the
            # marker and counted with it below.
            if ($i < $#front_tokens || $front_ends_blank)
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        # Handle match part: expand the range when it is increasing
        my $num_new_tokens = $high - $low;
        if ($num_new_tokens > 0)
        {
            $match = "[" . $prefix . TransformMarker($low, $high) . $suffix . "]";
        }
        else
        {
            $num_new_tokens = 0;
        }
        $newline .= $match;

        my @match_tokens    = split(/\s+/, $match);
        my $last_index      = $#match_tokens;
        my $followed_by_gap = ($line =~ /^ /);

        foreach my $i (0 .. $last_index)
        {
            # As above, the last marker token is counted here only when a
            # blank follows the marker.
            next unless ($i < $last_index || $followed_by_gap);

            # The tokens added by the expansion do not exist in the
            # original text, so they must not advance the counter.
            if ($i >= ($last_index - $num_new_tokens) && $i < $last_index)
            {
                push(@{ $pos_array }, $token_count);
            }
            else
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        if ($followed_by_gap)
        {
            $space_flag = 1;
            $line =~ s/^\s+//;
        }
        else
        {
            $space_flag = 0;
        }
    }

    # Whatever follows the last marker (or the whole line if there is none)
    if ($space_flag == 1) { $newline .= " "; }
    $newline .= $line;

    my @tail_tokens = split(/\s+/, $line);
    foreach my $i (0 .. $#tail_tokens)
    {
        push(@{ $pos_array }, $token_count++);
    }

    return ($newline, $token_count);
}
510
+
511
###
# Removes lines that appear to be junk from the body text, de-hyphenates
# words where a hyphen occurs at the end of a line, and normalizes strings
# of blank spaces to single blanks.
#
# Arguments: a reference to the body text, and an array reference that
# receives, for every token of the normalized text, the index of the token
# it comes from in the original text (as computed by ExpandBracketMarker).
#
# Returns a reference to the normalized text. Dies when the number of
# tokens of a line differs from the number of positions computed for it.
#
# HISTORY: Nick (v081201)
#
# In some publications markers with a range such as [1-5] or [1-12, 16]
# are used. ParsCit cannot find these markers, so they are replaced by
# markers containing every number of the range (e.g. [1-5] replaced by
# [1, 2, 3, 4, 5]). Since May 2010 (Thang) this is done by
# ExpandBracketMarker.
###
sub NormalizeBodyText
{
    my ($rtext, $pos_array) = @_;

    my $text        = "";
    my $token_count = 0;

    my @raw_lines = split "\n", $$rtext;

    foreach my $raw (@raw_lines)
    {
        my $line = $raw;
        $line =~ s/^\s+//;    # Thang May 2010: strip leading spaces

        # Expand range markers and collect the token positions of this line
        my @tmp_pos_array = ();
        ($line, $token_count) = ExpandBracketMarker($line, \@tmp_pos_array, $token_count);    # Thang May 2010

        # Sanity check: exactly one position per token
        my @tokens = split(/\s+/, $line);
        if (scalar(@tokens) != scalar(@tmp_pos_array))
        {
            die "scalar(@tokens) != scalar(@tmp_pos_array)\n$line\n";
        }

        # Blank lines are junk
        next if ($line =~ m/^\s*$/);

        # Artemy Kolchinsky (v090625): a trailing hyphen is removed only
        # when it follows a letter, so hyphens after numbers are kept.
        my $word_was_split = ($text =~ s/([A-Za-z])\-$/$1/);

        if ($word_was_split)
        {
            # The first token of this line completes the last token of the
            # text, so it does not get a position of its own.
            shift(@tmp_pos_array);
        }
        elsif ($text ne "" && $text !~ m/\-\s+$/)    # Thang May 2010: change m/\-\s*$/ -> m/\-\s+$/
        {
            $text .= " ";
        }

        $text .= $line;

        push(@{$pos_array}, @tmp_pos_array);
    }

    $text =~ s/\s{2,}/ /g;
    return \$text;
}
573
+
574
###
# Spells out a numeric range as a comma-separated list: (2, 5) gives
# "2, 3, 4, 5". The first number is always part of the result, so a range
# that is empty or decreasing yields the first number alone.
###
sub TransformMarker
{
    my ($first_number, $second_number) = @_;

    return join(", ", $first_number, ($first_number + 1) .. $second_number);
}
583
+
584
###
# Controls the process by which citations are segmented. The type of
# citation marker used in the reference section is guessed first: when one
# is recognised the text is split on that marker, otherwise the heuristic
# splitter for unmarked citations is used.
#
# Takes a reference to the citation text and returns whatever the selected
# splitter returns (a reference to a list of citation objects).
###
sub SegmentCitations
{
    my ($rcite_text) = @_;

    my $marker_type = GuessMarkerType($rcite_text);

    my $rcitations = ($marker_type eq 'UNKNOWN')
        ? SplitUnmarkedCitations($rcite_text)
        : SplitCitationsByMarker($rcite_text, $marker_type);

    return $rcitations;
}
608
+
609
###
# Segments citations that have explicit markers in the reference section.
# Whenever a line starts (after optional whitespace) with an expression
# matching the pattern of the given marker type, a new citation is started;
# every other line is appended to the citation in progress.
#
# Arguments: a reference to the citation text and a key of %marker_types.
# Returns a reference to a list of ParsCit::Citation objects.
#
# Lines found in front of the first marker are collected as well: they end
# up in a citation that has no marker set.
###
sub SplitCitationsByMarker
{
    my ($rcite_text, $marker_type) = @_;

    my @citations               = ();
    my $current_citation        = ParsCit::Citation->new();
    my $current_citation_string = undef;

    # TODO: Might want to add a check that marker number is
    # increasing as we'd expect, if the marker is numeric.

    foreach my $line (split "\n", $$rcite_text)
    {
        if ($line =~ m/^\s*($marker_types{ $marker_type })\s*(.*)$/)
        {
            my ($marker, $cite_string) = ($1, $2);

            # Close the citation in progress, if it has any text
            if (defined $current_citation_string)
            {
                $current_citation->setString($current_citation_string);
                push @citations, $current_citation;
                $current_citation_string = undef;
            }

            # Open a new citation for this marker
            $current_citation = ParsCit::Citation->new();
            $current_citation->setMarkerType($marker_type);
            $current_citation->setMarker($marker);
            $current_citation_string = $cite_string;

            next;
        }

        # Continuation line. Artemy Kolchinsky (v090625): a trailing hyphen
        # is removed only after a letter (hyphenated word); after anything
        # else, e.g. a number, it is kept and no blank is inserted.
        if ((defined $current_citation_string) && ($current_citation_string =~ m/[A-Za-z]\-$/))
        {
            $current_citation_string =~ s/\-$//;
        }
        elsif ((! defined $current_citation_string) || ($current_citation_string !~ m/\-\s*$/))
        {
            $current_citation_string .= " ";
        }

        $current_citation_string .= $line;
    }

    # Last citation
    if (defined $current_citation && defined $current_citation_string)
    {
        $current_citation->setString($current_citation_string);
        push @citations, $current_citation;
    }

    # Now, we have an array of separated citations
    return \@citations;
}
678
+
679
+
680
###
# Uses several heuristics to decide where individual citations begin and
# end when the reference section has no explicit markers. Every line that
# contains something looking like a year triggers a backward search for
# the line on which the citation holding that year starts. The search
# relies on the length of previous lines, on lines that look like author
# lists, and on punctuation.
#
# Takes a reference to the citation text and returns a reference to a list
# of ParsCit::Citation objects, one per detected citation. Lines located
# in front of the first detected start are not part of any citation.
#
# HISTORY: Modified in 081201 by Nick and J\"{o}ran: the last citation of
# the reference section used to be ignored because "<" was used instead of
# "<=" as exit condition of the final loop.
#
# HISTORY: Modified in 081201 by Min to remove superfluous print statements
#
# FIXES in this version:
#  - $last_author_line was compared numerically while still undefined, so
#    it behaved like 0 and line 1 could never start a citation until an
#    author line had been found;
#  - the upper-case test used /g in boolean context, which leaves pos() set
#    on the line and made a later test of the same line start mid-string;
#  - the separator count no longer overwrites the global $_;
#  - a start that is already the last recorded one is not recorded again
#    (repeated 0 entries used to create citations with an empty string).
###
sub SplitUnmarkedCitations
{
    my ($rcite_text) = @_;

    my @content = split "\n", $$rcite_text;

    my $cite_start  = 0;
    my @cite_starts = ();
    my @citations   = ();

    ###
    # Huydhn: when a line is an author line (the line at the start of
    # a citation with a long list of authors), the next line cannot be
    # the start of another citation. It is the continuation of the
    # current citation after the author line.
    ###
    my $last_author_line = undef;

    for (my $i = 0; $i <= $#content; $i++)
    {
        # Only lines holding something like a year (1000-2999, optionally
        # followed by a lower-case letter) trigger the search.
        next unless ($content[ $i ] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s);

        # Walk back from the year line towards the previous citation start
        for (my $k = $i; $k > $cite_start; $k--)
        {
            # Candidate start lines must contain an upper-case letter
            next unless ($content[ $k ] =~ m/\s*[\p{IsUpper}]/);

            ###
            # Huydhn: The previous line is an author line, so this line
            # cannot be the start of another citation
            ###
            if (defined $last_author_line && $last_author_line == $k - 1) { next; }

            # If length of previous line is extremely
            # small, then start a new citation here.
            if (length($content[ $k - 1 ]) < 2)
            {
                $cite_start = $k;
                last;
            }

            # Start looking backwards for lines that could
            # be author lists - these usually start the
            # citation, have several separation characters (,;),
            # and shouldn't contain any numbers.
            my $beginning_author_line = -1;

            for (my $j = $k - 1; $j > $cite_start; $j--)
            {
                if ($content[ $j ] =~ m/\d/) { last; }

                # Number of separation characters on the line
                my $n_sep = ($content[ $j ] =~ tr/,;//);

                if ($n_sep < 3) { last; }

                # An author line starts a citation only when the line in
                # front of it ends with a period. $j is always >= 1 here
                # because it is greater than $cite_start.
                if ($content[ $j - 1 ] =~ m/\.\s*$/)
                {
                    $beginning_author_line = $j;
                }
            }

            if ($beginning_author_line >= 0)
            {
                $cite_start = $beginning_author_line;

                ###
                # Huydhn: see $last_author_line
                ###
                $last_author_line = $beginning_author_line;

                last;
            }

            # Now that the backwards author search failed
            # to find any extra lines, start a new citation
            # here if the previous line ends with a ".".

            ###
            # Modified by Artemy Kolchinsky (v090625)
            # A new citation is started if the previous line ended with
            # a period, but not if it ended with a period, something else,
            # and then a period. This is to avoid assuming that
            # abbreviations, like U.S.A., indicate the end of a cite. Also,
            # a new cite is started only if the current line does not begin
            # with a series of 4 digits.
            ###
            if ($content[ $k - 1 ] =~ m/[^\.].\.\s*$/ && $content[ $k ] !~ m/^\d\d\d\d/)
            {
                $cite_start = $k;
                last;
            }
        }
        # End of for

        # Record the start, unless it does not go beyond the last one
        if (! @cite_starts || $cite_start > $cite_starts[ -1 ])
        {
            push @cite_starts, $cite_start;
        }
    }

    # Every citation runs from its start line to the line in front of the
    # next start; the last one runs to the end of the text.
    for (my $k = 0; $k <= $#cite_starts; $k++)
    {
        my $first_line = $cite_starts[ $k ];
        my $last_line  = ($k == $#cite_starts) ? $#content : ($cite_starts[ $k + 1 ] - 1);

        my $cite_string = MergeLines(join "\n", @content[ $first_line .. $last_line ]);

        my $citation = ParsCit::Citation->new();
        $citation->setString($cite_string);
        push @citations, $citation;
    }

    # And then from nothing came everything
    return \@citations;
}
816
+
817
###
# Controls the process by which citations are segmented when XML
# information is available.
#
# Arguments: a reference to the citation text extracted from the XML, and
# the name of the file handed to SplitUnmarkedCitations2.
#
# When a marker type is recognised the text is split on that marker;
# otherwise the crf++ based splitter is run on the given file.
# Returns whatever the selected splitter returns (a reference to a list of
# citation objects).
#
# Added by Huydhn, 13 Jan 2011
###
sub SegmentCitationsXML
{
    my ($rcite_text_from_xml, $tmp_file) = @_;

    # TODO: Need to be removed
    my $marker_type = GuessMarkerType($rcite_text_from_xml);

    my $rcitations = ($marker_type eq 'UNKNOWN')
        # Huydhn: split reference using crf++ model
        ? SplitUnmarkedCitations2($tmp_file)
        # TODO: Need to be removed
        : SplitCitationsByMarker($rcite_text_from_xml, $marker_type);

    return $rcitations;
}
845
+
846
###
# Replaces the heuristic rules of SplitUnmarkedCitations with a crf++ model
# based on both textual and XML features from Omnipage.
#
# Takes the name of the feature file. ParsCit::Tr2crfpp::SplitReference is
# run on it and writes its output to "<infile>_split.dec". In that output
# the first whitespace-separated field of a line is the line content, with
# "|||" standing for a blank, and the last field is the class of the line.
# A "parsCit_begin" line starts a new citation, a "parsCit_unknown" line is
# ignored, and a line of any other class continues the current citation.
#
# Returns a reference to a list of ParsCit::Citation objects (empty when
# SplitReference returns a false value). Both the input and the output file
# are deleted before returning.
#
# Dies when the output file cannot be opened.
#
# FIX: the open failure used to be reported through fatal(), a sub that is
# not defined anywhere in this file, so the real error was hidden behind an
# "Undefined subroutine" exception. The file is also read without
# overwriting the global $_.
# NOTE(review): ParsCit::Tr2crfpp is not loaded by this file - presumably a
# caller loads it; confirm.
#
# HISTORY: Added in 100111 by Huy Do
###
sub SplitUnmarkedCitations2
{
    my ($infile) = @_;

    # Citation list
    my @citations = ();

    # Run the crf++
    my $outfile = $infile . "_split.dec";
    if (ParsCit::Tr2crfpp::SplitReference($infile, $outfile))
    {
        open(my $file_handle, "<:utf8", $outfile)
            or die "Could not open file $outfile: $!";

        # Read all lines
        my @lines = ();
        while (my $row = <$file_handle>)
        {
            chomp($row);
            push @lines, $row;
        }
        close $file_handle;

        # Turns the lines collected for one citation into a citation object
        my $save_citation = sub
        {
            my ($collected) = @_;

            my $citation = ParsCit::Citation->new();

            # Clean up the citation text first, then save it
            $citation->setString(MergeLines($collected));
            push @citations, $citation;
        };

        my $cit_str = "";
        foreach my $row (@lines)
        {
            # Get the class of the line: "parsCit_begin", "parsCit_continue", or "parsCit_end"
            my @tokens = split(/\s+/, $row);
            my $class  = $tokens[ $#tokens ];

            # Line content, with the ||| sequence replaced by a blank
            my $ln_con = $tokens[ 0 ];
            $ln_con =~ s/\|\|\|/ /g;

            # Beginning of a citation
            if ($class eq "parsCit_begin")
            {
                # Save the previous citation
                if ($cit_str ne "") { $save_citation->($cit_str); }

                # Create new citation
                $cit_str = $ln_con;
            }
            # Inside a citation
            elsif ($class ne "parsCit_unknown")
            {
                $cit_str = $cit_str . "\n" . $ln_con;
            }
        }

        # Last citation
        if ($cit_str ne "") { $save_citation->($cit_str); }
    }

    unlink($infile);
    unlink($outfile);

    # Our work here is done
    return \@citations;
}
938
+
939
###
# Merges lines of text into a single line, dehyphenating where appropriate
# and using normal spacing.
#
# Every line is stripped of surrounding whitespace, then appended to the
# text built so far:
#   - directly, after removing the trailing hyphen, when the text ends with
#     a letter followed by a hyphen (a word split across two lines);
#   - directly when the text ends with any other hyphen, e.g. a page range
#     such as "12-" (Artemy Kolchinsky, v090625);
#   - after a single blank in all other cases.
# The result is stripped of surrounding whitespace before being returned.
###
sub MergeLines
{
    my ($text) = shift;

    my $merged_text = "";

    foreach my $piece (split "\n", $text)
    {
        # Same trimming as Trim(), done in place
        my $line = $piece;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;

        if ($merged_text =~ m/[A-Za-z]\-$/)
        {
            # Merge words when lines are hyphenated
            $merged_text =~ s/\-$//;
        }
        elsif ($merged_text !~ m/\-\s*$/)
        {
            $merged_text .= " ";
        }

        $merged_text .= $line;
    }

    $merged_text =~ s/^\s+//;
    $merged_text =~ s/\s+$//;
    return $merged_text;
}
976
+
977
###
# Uses the regular expressions of %marker_types to count, for each marker
# type, the lines of the citation text that start with such a marker
# followed by at least ten more characters. If a sufficient number of
# matches to a particular type are found, we can be reasonably sure of the
# type.
#
# Takes a reference to the citation text. Returns the key of %marker_types
# with the most matches, provided it matched at least once and at least
# once every six lines on average; returns 'UNKNOWN' otherwise.
#
# FIXES in this version:
#  - for a text without any line break the threshold is 0, so a type with
#    no match at all used to be returned; at least one match is now
#    required;
#  - ties are broken by type name instead of by hash order, so the result
#    no longer changes from run to run;
#  - the global $_ is no longer overwritten to count the lines.
###
sub GuessMarkerType
{
    my ($rcite_text) = @_;

    my %marker_observations = map { $_ => 0 } keys %marker_types;

    # The leading line break lets the first line be matched like the others
    my $cite_text = "\n" . $$rcite_text;
    my $n_lines   = ($cite_text =~ tr/\n//) - 1;

    $marker_observations{'SQUARE'}++
        while ($cite_text =~ m/\n\s*($marker_types{'SQUARE'}([^\n]){10})/sg);

    $marker_observations{'PAREN'}++
        while ($cite_text =~ m/\n\s*($marker_types{'PAREN'}([^\n]){10})/sg);

    ###
    # Modified by Artemy Kolchinsky (v090625): remove space after {10})
    # The blank between the number and the ten characters is part of the
    # pattern: a naked number must be followed by a blank.
    ###
    $marker_observations{'NAKEDNUM'}++
        while ($cite_text =~ m/\n\s*($marker_types{'NAKEDNUM'} [^\n]{10})/sg);

    $marker_observations{'NAKEDNUMDOT'}++
        while ($cite_text =~ m/\n\s*$marker_types{'NAKEDNUMDOT'}([^\n]){10}/sg);

    # Most frequent type first; the name decides between equal counts
    my @sorted_observations = sort {
        $marker_observations{ $b } <=> $marker_observations{ $a } or $a cmp $b
    } keys %marker_observations;

    my $best_type = $sorted_observations[0];
    if (! defined $best_type) { return 'UNKNOWN'; }

    my $best_count  = $marker_observations{ $best_type };
    my $min_markers = $n_lines / 6;

    if ($best_count > 0 && $best_count >= $min_markers) { return $best_type; }

    return 'UNKNOWN';
}
1032
+
1033
###
# Returns a copy of the given text without leading and trailing whitespace.
###
sub Trim
{
    my $text = shift;

    # $_ is aliased to $text inside the loop, so both substitutions edit it
    for ($text)
    {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
1040
+
1041
+ 1;