biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,1041 @@
1
+ package ParsCit::PreProcess;
2
+
3
+ ###
4
+ # Utilities for finding and normalizing citations within
5
+ # text files, including separating citation text from
6
+ # body text and segmenting citations.
7
+ #
8
+ # Isaac Councill, 7/19/07
9
+ ###
10
+
11
+ use utf8;
12
+ use strict;
13
+
14
+ use Omni::Config;
15
+ use ParsCit::Citation;
16
+
17
# Pattern source for each supported reference-marker style. The values are
# plain strings that get interpolated into larger regular expressions by
# GuessMarkerType and SplitCitationsByMarker.
my %marker_types = (
    'SQUARE'      => q{\[.+?\]},    # [1], [Smith00]
    'PAREN'       => q{\(.+?\)},    # (1)
    'NAKEDNUM'    => q{\d+},        # 1
    'NAKEDNUMDOT' => q{\d+\.},      # 1.
    # Narrower variants by Artemy Kolchinsky (v090625), currently disabled:
    #'NAKEDNUM'    => '\\d{1,3}',
    #'NAKEDNUMDOT' => '\\d{1,3}\\.'
);
24
+
25
+ # Omnilib configuration: object name
26
+ my $obj_list = $Omni::Config::obj_list;
27
+
28
###
# Huydhn: counterpart of FindCitationText for an Omnipage XML document object
# instead of raw text. The object is walked through get_objs_ref(), level by
# level: pages (L1), columns (L2), paragraphs (L3) and lines (L4).
#
# The document is scanned backwards for the last line ending in a reference
# heading. The reference section starts on the line after that heading and
# continues until the first line that looks like the heading of another
# section, or until the end of the document.
#
# Returns (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs). The two
# hashes carry the L1..L4 indices of the first and last reference line and
# @cit_addrs carries one such hash per reference line. When no start is found,
# or when the section is too long compared with the document, the hashes are
# empty, the text is "" and the fourth value is not returned.
###
sub FindCitationTextXML
{
    my ($doc) = @_;

    my $heading_re = qr/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*$/;
    my $stop_re    = qr/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)$/;

    my @levels    = ('L1', 'L2', 'L3', 'L4');
    my @cit_addrs = ();
    my %start_ref = ();
    my %end_ref   = ();

    # All pages in the document
    my $pages   = $doc->get_objs_ref();
    my $n_pages = scalar(@{ $pages });

    # Walk the document bottom-up so that the last heading wins
    HEADING:
    for my $x (reverse(0 .. $n_pages - 1))
    {
        my $columns = $pages->[ $x ]->get_objs_ref();

        for my $y (reverse(0 .. scalar(@{ $columns }) - 1))
        {
            my $paras = $columns->[ $y ]->get_objs_ref();

            for my $z (reverse(0 .. scalar(@{ $paras }) - 1))
            {
                my $lines = $paras->[ $z ]->get_objs_ref();

                for my $t (reverse(0 .. scalar(@{ $lines }) - 1))
                {
                    next unless ($lines->[ $t ]->get_content() =~ $heading_re);

                    # The section starts at whatever comes right after the heading
                    if (($t + 1) < scalar(@{ $lines }))
                    {
                        @start_ref{ @levels } = ($x, $y, $z, $t + 1);
                    }
                    elsif (($z + 1) < scalar(@{ $paras }))
                    {
                        @start_ref{ @levels } = ($x, $y, $z + 1, 0);
                    }
                    elsif (($y + 1) < scalar(@{ $columns }))
                    {
                        @start_ref{ @levels } = ($x, $y + 1, 0, 0);
                    }
                    elsif (($x + 1) < $n_pages)
                    {
                        @start_ref{ @levels } = ($x + 1, 0, 0, 0);
                    }
                    # Otherwise the heading is the very last line of the
                    # document and there is nothing to start from.

                    last HEADING;
                }
            }
        }
    }

    my $reference_length = 0;
    my $reference_text   = "";

    # Reference not found
    if (! exists $start_ref{ 'L1' }) { return (\%start_ref, \%end_ref, \$reference_text); }

    # Index of the last element behind an array reference
    my $last_index = sub { return scalar(@{ $_[0] }) - 1; };

    # Walk forward from the start, collecting lines until a new section begins
    SCAN:
    for my $x ($start_ref{ 'L1' } .. $n_pages - 1)
    {
        my $columns = $pages->[ $x ]->get_objs_ref();

        my $on_start_page = ($x == $start_ref{ 'L1' });
        my $first_column  = $on_start_page ? $start_ref{ 'L2' } : 0;

        for my $y ($first_column .. scalar(@{ $columns }) - 1)
        {
            my $paras = $columns->[ $y ]->get_objs_ref();

            my $in_start_column = ($on_start_page && ($y == $start_ref{ 'L2' }));
            my $first_para      = $in_start_column ? $start_ref{ 'L3' } : 0;

            for my $z ($first_para .. scalar(@{ $paras }) - 1)
            {
                my $lines = $paras->[ $z ]->get_objs_ref();

                my $in_start_para = ($in_start_column && ($z == $start_ref{ 'L3' }));
                my $first_line    = $in_start_para ? $start_ref{ 'L4' } : 0;

                for my $t ($first_line .. scalar(@{ $lines }) - 1)
                {
                    my $ln_content = $lines->[ $t ]->get_content();

                    # Not a section heading, so the line belongs to the reference
                    if ($ln_content !~ $stop_re)
                    {
                        push @cit_addrs, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };

                        $reference_length += length($ln_content);
                        $reference_text   .= $ln_content . "\n";
                        next;
                    }

                    # A new section starts here: the reference ends on the
                    # line just before this one.
                    if ($t > 0)
                    {
                        @end_ref{ @levels } = ($x, $y, $z, $t - 1);
                    }
                    elsif ($z > 0)
                    {
                        my $prev_lines = $paras->[ $z - 1 ]->get_objs_ref();
                        @end_ref{ @levels } = ($x, $y, $z - 1, $last_index->($prev_lines));
                    }
                    elsif ($y > 0)
                    {
                        my $prev_paras = $columns->[ $y - 1 ]->get_objs_ref();
                        my $prev_lines = $prev_paras->[ -1 ]->get_objs_ref();
                        @end_ref{ @levels } = ($x, $y - 1, $last_index->($prev_paras), $last_index->($prev_lines));
                    }
                    elsif ($x > 0)
                    {
                        my $prev_columns = $pages->[ $x - 1 ]->get_objs_ref();
                        my $prev_paras   = $prev_columns->[ -1 ]->get_objs_ref();
                        my $prev_lines   = $prev_paras->[ -1 ]->get_objs_ref();
                        @end_ref{ @levels } = ($x - 1, $last_index->($prev_columns), $last_index->($prev_paras), $last_index->($prev_lines));
                    }
                    # Otherwise the end is at the beginning of the document
                    # and no end position is recorded.

                    last SCAN;
                }
            }
        }
    }

    # End of the reference not found, assume that it is the end of the document
    if (! exists $end_ref{ 'L1' })
    {
        my $tail_columns = $pages->[ -1 ]->get_objs_ref();
        my $tail_paras   = $tail_columns->[ -1 ]->get_objs_ref();
        my $tail_lines   = $tail_paras->[ -1 ]->get_objs_ref();

        @end_ref{ @levels } = ($n_pages - 1, $last_index->($tail_columns), $last_index->($tail_paras), $last_index->($tail_lines));
    }

    # Odd case: the citation text is too long compared with the whole content
    if (1.8 * $reference_length >= 0.8 * length($doc->get_content()))
    {
        print STDERR "Citation text longer than article body: ignoring\n";

        %start_ref = (); %end_ref = (); $reference_text = "";
        return (\%start_ref, \%end_ref, \$reference_text);
    }

    # Now we have the citation text
    return (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs);
}
263
+
264
###
# Separates the reference section from the body of a raw-text document.
#
# Every reference heading that ends a line is tried in turn, so the split
# finally happens at the last one. The citation text is then cut where
# another section (acknowledgments, appendix, tables, ...) seems to begin.
#
# Returns three references: the citation text, the normalized body text and
# the original body text. When no heading is found, or when the citation part
# is at least 0.8 times as long as the body, a message is printed to STDERR
# and the citation text is returned as it stands, without normalization.
#
# $pos_array is handed to NormalizeBodyText, which fills it.
###
sub FindCitationText
{
    my ($rtext, $pos_array) = @_;

    # Work on a copy of the text
    my $text = $$rtext;
    my ($bodytext, $citetext) = ("", "");

    ###
    # History of this pattern:
    #   2 Feb 2010,  Cheong Chi Hong <chcheong@cse.cuhk.edu.hk>: correction
    #   15 Jan 2011, Huy Do: correction (REFERENCES? CITED)
    #   later: LITERATURE CITED added
    ###
    while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?):?\s*\n+/sg)
    {
        my $split_at = pos $text;

        $bodytext = substr($text, 0, $split_at);
        $citetext = substr($text, $split_at) unless ($split_at < 1);
    }

    # No citation
    if ($citetext eq "")
    {
        print STDERR "Citation text cannot be found: ignoring", "\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Odd case: the citation part is too long compared with the body
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Citation stops when another section starts: keep only what precedes it
    my ($before_next_section, $ignored) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citetext);

    $citetext = $before_next_section if (length($before_next_section) > 0);

    # No citation exists
    if ($citetext eq '0' || ! defined $citetext) { print STDERR "warning: no citation text found\n"; }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
323
+
324
###
# Huydhn: find the citation section in raw text when the lines of that
# section are already known (they are provided by sectlabel).
#
# $rtext is a reference to the whole text, $rcit_lines a reference to the
# list of 0-based indices of the citation lines, and $pos_array is handed to
# NormalizeBodyText.
#
# The citation text is made of the listed lines; the body text is made of
# every line located before the first listed line.
#
# Returns three references: the citation text, the normalized body text and
# the original body text. When the citation part is at least 0.8 times as
# long as the body, a message is printed to STDERR and the citation text is
# returned without normalization.
###
sub FindCitationText2
{
    my ($rtext, $rcit_lines, $pos_array) = @_;

    # All lines in the document
    my @lines = split(/\n/, $$rtext);

    # Citation text: all lines that belong to the citation
    my $citetext = "";
    foreach my $line_index (@{ $rcit_lines }) { $citetext .= $lines[ $line_index ] . "\n"; }

    # Body text: everything before the first citation line
    my $bodytext       = "";
    my $first_cit_line = $rcit_lines->[ 0 ];
    foreach my $i (0 .. $first_cit_line - 1) { $bodytext .= $lines[ $i ] . "\n"; }

    # Odd case: the citation part is too long compared with the body
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}
361
+
362
##
# Removes lines that appear to be junk from the citation text.
#
# Takes a reference to the citation text and returns a reference to a new
# string in which every line is stripped of leading and trailing blanks
# (Thang, v090625) and the following lines are dropped:
#   - blank lines;
#   - lines made of digits only, unless the previous line ends with a hyphen.
#
# The hyphen exception comes from Artemy Kolchinsky (v090625). In
#     Smith B, "Blah Blah." Journal1, 2000, p. 23-
#     85
# the line '85' completes a page range and must be kept.
##
sub NormalizeCiteText
{
    my ($rcitetext) = @_;

    my @kept     = ();
    my $previous = "";

    foreach my $raw (split "\n", $$rcitetext)
    {
        my $line = $raw;
        $line =~ s/^\s*//g;
        $line =~ s/\s*$//g;

        # Both tests look at the previous line as it was read, kept or not
        my $is_blank       = ($line =~ m/^\s*$/);
        my $is_page_number = ($previous !~ m/\-$/ && $line =~ m/^\d*$/);

        $previous = $line;
        push @kept, $line unless ($is_blank || $is_page_number);
    }

    my $newtext = join "\n", @kept;
    return \$newtext;
}
405
+
406
###
# Thang May 2010
# Address the problem Nick mentioned in method NormalizeBodyText().
#
# Expands every bracketed marker holding a numeric range, so that
# "abc [1, 2-5, 11] def [1-3, 5] ghi" becomes
# "abc [1, 2, 3, 4, 5, 11] def [1, 2, 3, 5] ghi". A range whose end is not
# greater than its start is left untouched.
#
# In addition, for each whitespace-separated token of the returned line, one
# entry is pushed on @{ $pos_array }: the index of the token of the original
# text it comes from. $token_count is the index of the first token of $line;
# tokens created by an expansion do not advance the count.
#
# NOTE(review): the tokens that share a position are assumed to be the last
# ones before the final token of the marker. The count stays right, but with
# a blank-separated suffix such as "[1-3, 5]" the shared positions are shifted
# towards the suffix - confirm whether callers need an exact mapping there.
#
# Returns ($newline, $token_count): the rewritten line and the index that the
# first token after this line will have.
###
sub ExpandBracketMarker
{
    my ($line, $pos_array, $token_count) = @_;

    # Groups 1 and 5 wrap the complete prefix and suffix lists. A capture
    # group placed directly under '*' would only remember its last repetition
    # and the other numbers of the marker would be lost.
    my $range_marker = qr/\[((?:\d+[,;] *)*)((\d+)-(\d+))((?:[,;] *\d+)*)\]/;

    my $newline    = "";
    my $space_flag = 0;

    while ($line =~ $range_marker)
    {
        # Save everything that depends on this match before running another regex
        my ($prefix, $first, $last, $suffix) = ($1, $3, $4, $5);
        my ($match_start, $match_end)        = ($-[0], $+[0]);

        my $front = substr($line, 0, $match_start);
        my $match = substr($line, $match_start, $match_end - $match_start);
        $line     = substr($line, $match_end);

        # Handle front part
        if ($space_flag == 1) { $newline .= " "; }
        $newline .= $front;

        # The last token of the front part is complete only when a blank
        # separates it from the marker; otherwise it goes on into the marker.
        my @front_tokens = split(/\s+/, $front);
        my $n_front      = scalar(@front_tokens);
        if ($n_front > 0 && $front !~ / $/) { $n_front--; }

        for (my $i = 0; $i < $n_front; $i++)
        {
            push(@{ $pos_array }, $token_count++);
        }

        # Handle match part
        my $num_new_tokens = $last - $first;
        if ($num_new_tokens > 0)
        {
            $match = "[" . $prefix . TransformMarker($first, $last) . $suffix . "]";
        }
        else
        {
            $num_new_tokens = 0;
        }
        $newline .= $match;

        my @match_tokens      = split(/\s+/, $match);
        my $n_match           = scalar(@match_tokens);
        my $followed_by_space = ($line =~ /^ /) ? 1 : 0;

        for (my $i = 0; $i < $n_match; $i++)
        {
            my $is_last = ($i == $n_match - 1);

            # The last token is complete only when a blank follows the marker
            next if ($is_last && ! $followed_by_space);

            if (! $is_last && $i >= ($n_match - $num_new_tokens - 1))
            {
                # Token created by the expansion: same original position
                push(@{ $pos_array }, $token_count);
            }
            else
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        if ($followed_by_space)
        {
            $space_flag = 1;
            $line =~ s/^\s+//;
        }
        else
        {
            $space_flag = 0;
        }
    }

    # Handle what is left after the last marker
    if ($space_flag == 1) { $newline .= " "; }
    $newline .= $line;

    my @rest_tokens = split(/\s+/, $line);
    foreach my $token (@rest_tokens)
    {
        push(@{ $pos_array }, $token_count++);
    }

    return ($newline, $token_count);
}
510
+
511
###
# Removes lines that appear to be junk from the body text, de-hyphenates
# words where a hyphen occurs at the end of a line, and normalizes strings
# of blank spaces to single blanks.
#
# Takes a reference to the body text and a reference to an array which
# receives the positions produced by ExpandBracketMarker for the tokens of
# the normalized text. Returns a reference to the normalized text. Dies when
# the number of tokens of a line and the number of positions computed for it
# differ.
#
# HISTORY: Nick (v081201)
#
# In some publications markers with a range such as [1-5] or [1-12, 16]
# are used. ParsCit cannot find these markers, so they are replaced by
# markers containing every number of the range (e.g. [1-5] replaced by
# [1, 2, 3, 4, 5]). Since Thang (May 2010), ExpandBracketMarker does it.
###
sub NormalizeBodyText
{
    my ($rtext, $pos_array) = @_;

    my $text        = "";
    my $token_count = 0;

    foreach my $raw (split "\n", $$rtext)
    {
        my $line = $raw;
        $line =~ s/^\s+//; # Thang May 2010: strip leading spaces

        my @tmp_pos_array = ();
        ($line, $token_count) = ExpandBracketMarker($line, \@tmp_pos_array, $token_count); # Thang May 2010

        # Sanity check: one position per token
        my @tokens = split(/\s+/, $line);
        if (scalar(@tokens) != scalar(@tmp_pos_array))
        {
            die "scalar(@tokens) != scalar(@tmp_pos_array)\n$line\n";
        }

        next if ($line =~ m/^\s*$/);

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # Merge without removing "-" if it is preceded by numbers
        ###
        if ($text =~ s/([A-Za-z])\-$/$1/)
        {
            # The first token of this line completes the last token of the
            # text, so it does not get a position of its own.
            shift(@tmp_pos_array);
        }
        elsif ($text ne "" && $text !~ m/\-\s+$/) # Thang May 2010: change m/\-\s*$/ -> m/\-\s+$/
        {
            $text .= " ";
        }

        $text .= $line;
        push(@{ $pos_array }, @tmp_pos_array);
    }

    $text =~ s/\s{2,}/ /g;
    return \$text;
}
573
+
574
###
# Spells out a numeric range: (2, 5) gives "2, 3, 4, 5". The first number is
# used as it was given; when the second number is not greater than the
# first, the first number alone is returned.
###
sub TransformMarker
{
    my ($first_number, $second_number) = @_;

    my @followers = ();
    my $number    = $first_number + 1;

    while ($number <= $second_number)
    {
        push @followers, $number;
        $number++;
    }

    return $first_number unless (@followers);
    return join(", ", $first_number, @followers);
}
583
+
584
###
# Entry point for citation segmentation. The kind of marker used in the
# reference section is guessed first: when it is known, citations are split
# at the markers, otherwise the heuristics for unmarked citations are used.
# Returns a reference to a list of citation objects.
###
sub SegmentCitations
{
    my ($rcite_text) = @_;

    my $marker_type = GuessMarkerType($rcite_text);

    my $rcitations = ($marker_type eq 'UNKNOWN')
        ? SplitUnmarkedCitations($rcite_text)
        : SplitCitationsByMarker($rcite_text, $marker_type);

    return $rcitations;
}
608
+
609
###
# Segments citations that have explicit markers in the reference section.
# Every line starting with an expression that matches the given marker type
# opens a new citation; any other line is appended to the citation being
# built, including lines found before the first marker. Returns a reference
# to a list of citation objects.
#
# TODO: Might want to add a check that marker number is increasing as we'd
# expect, if the marker is numeric.
###
sub SplitCitationsByMarker
{
    my ($rcite_text, $marker_type) = @_;

    my $marker_pattern = $marker_types{ $marker_type };

    my @citations = ();
    my $citation  = ParsCit::Citation->new();
    my $pending   = undef; # Text collected so far for $citation

    foreach my $line (split "\n", $$rcite_text)
    {
        # A marker: close the citation in progress and open a new one
        if ($line =~ m/^\s*($marker_pattern)\s*(.*)$/)
        {
            my ($marker, $cite_string) = ($1, $2);

            if (defined $pending)
            {
                $citation->setString($pending);
                push @citations, $citation;
            }

            $citation = ParsCit::Citation->new();
            $citation->setMarkerType($marker_type);
            $citation->setMarker($marker);
            $pending = $cite_string;
            next;
        }

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # Merge without removing "-" if it is preceded by numbers
        ###
        if ((defined $pending) && ($pending =~ m/[A-Za-z]\-$/))
        {
            # Merge words when lines are hyphenated
            $pending =~ s/\-$//;
        }
        elsif ((! defined $pending) || ($pending !~ m/\-\s*$/))
        {
            $pending .= " ";
        }

        $pending .= $line;
    }

    # Last citation
    if (defined $citation && defined $pending)
    {
        $citation->setString($pending);
        push @citations, $citation;
    }

    # Now, we have an array of separated citations
    return \@citations;
}
678
+
679
+
680
###
# Uses several heuristics to decide where individual citations begin and end
# based on the length of previous lines, strings that look like author
# lists, and punctuation. Returns a reference to a list of citation objects.
#
# Each line holding something that looks like a year triggers a backward
# search, down to the current citation start, for the line on which its
# citation begins. The starts found that way delimit the citations.
#
# HISTORY: Modified in 081201 by Nick and J\"{o}ran: the last citation of the
# reference section was ignored because the final loop stopped one index too
# early.
#
# HISTORY: Modified in 081201 by Min to remove superfluous print statements.
#
# Fixed in this version:
#   - an undefined $last_author_line was compared as 0, which stopped line 1
#     from ever starting a citation;
#   - the upper-case test used m//g in scalar context, so a line tested twice
#     could fail the second time because of the position left by the first;
#   - start 0 was recorded once per year line of the first citation, which
#     produced empty citations.
###
sub SplitUnmarkedCitations
{
    my ($rcite_text) = @_;

    my @content = split "\n", $$rcite_text;

    my $cite_start  = 0;
    my @cite_starts = ();
    my @citations   = ();

    ###
    # Huydhn: when a line is an author line (the line at the start of a
    # citation with a long list of authors), the next line cannot be the
    # start of another citation: it continues the current one.
    ###
    my $last_author_line = undef;

    for (my $i = 0; $i <= $#content; $i++)
    {
        # Only lines that contain something like a year are of interest
        next unless ($content[ $i ] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s);

        CANDIDATE:
        for (my $k = $i; $k > $cite_start; $k--)
        {
            next CANDIDATE unless ($content[ $k ] =~ m/\s*[\p{IsUpper}]/);

            # Huydhn: the previous line is an author line, so this line
            # cannot be the start of another citation
            next CANDIDATE if (defined $last_author_line && $last_author_line == $k - 1);

            # If the previous line is extremely short, start a new citation here
            if (length($content[ $k - 1 ]) < 2)
            {
                $cite_start = $k;
                last CANDIDATE;
            }

            # Look backwards for lines that could be author lists: these
            # usually start the citation, have several separation characters
            # (,;) and should not contain any numbers.
            my $beginning_author_line = -1;

            for (my $j = $k - 1; $j > $cite_start; $j--)
            {
                last if ($content[ $j ] =~ m/\d/);

                # tr counts the separators without touching $_
                my $n_sep = ($content[ $j ] =~ tr/,;//);
                last if ($n_sep < 3);

                if (($content[ $j - 1 ] =~ m/\.\s*$/) || $j == 0)
                {
                    $beginning_author_line = $j;
                }
            }

            if ($beginning_author_line >= 0)
            {
                $cite_start       = $beginning_author_line;
                $last_author_line = $beginning_author_line; # Huydhn
                last CANDIDATE;
            }

            ###
            # Modified by Artemy Kolchinsky (v090625)
            # The author search found nothing: start a new citation here if
            # the previous line ends with a period, but not with a period,
            # one character and a period, so that abbreviations like U.S.A.
            # are not taken for the end of a citation. The current line must
            # also not begin with 4 digits.
            ###
            if ($content[ $k - 1 ] =~ m/[^\.].\.\s*$/ && $content[ $k ] !~ m/^\d\d\d\d/)
            {
                $cite_start = $k;
                last CANDIDATE;
            }
        }

        # Record the start once: skip it when it does not move past the last one
        push @cite_starts, $cite_start unless (@cite_starts && $cite_start <= $cite_starts[ -1 ]);
    }

    # A citation runs from its start to the line before the next start
    for (my $k = 0; $k <= $#cite_starts; $k++)
    {
        my $first_line = $cite_starts[ $k ];
        my $last_line  = ($k == $#cite_starts) ? $#content : ($cite_starts[ $k + 1 ] - 1);

        my $cite_string = MergeLines(join "\n", @content[ $first_line .. $last_line ]);

        my $citation = ParsCit::Citation->new();
        $citation->setString($cite_string);
        push @citations, $citation;
    }

    # And then from nothing came everything
    return \@citations;
}
816
+
817
###
# Entry point for citation segmentation when XML information is available.
# $rcite_text_from_xml is a reference to the citation text and $tmp_file is
# the file handed to the crf++ based splitter. Returns a reference to a list
# of citation objects.
#
# Added by Huydhn, 13 Jan 2011
###
sub SegmentCitationsXML
{
    my ($rcite_text_from_xml, $tmp_file) = @_;

    # TODO: Need to be removed
    my $marker_type = GuessMarkerType($rcite_text_from_xml);

    my $rcitations = ($marker_type eq 'UNKNOWN')
        # Huydhn: split reference using crf++ model
        ? SplitUnmarkedCitations2($tmp_file)
        # TODO: Need to be removed
        : SplitCitationsByMarker($rcite_text_from_xml, $marker_type);

    return $rcitations;
}
845
+
846
###
# Replaces the heuristic rules of SplitUnmarkedCitations with a crf++ model
# based on both textual and XML features from Omnipage.
#
# $infile is passed to ParsCit::Tr2crfpp::SplitReference together with the
# output name "<infile>_split.dec". In the output, the first field of a line
# is the line content, with blanks written as "|||", and the last field is
# its class. "parsCit_begin" opens a new citation, "parsCit_unknown" lines
# are dropped and any other line goes on with the current citation.
#
# Both files are deleted before returning. Returns a reference to a list of
# citation objects, empty when SplitReference reports a failure. Dies when
# the output file cannot be opened.
#
# NOTE(review): ParsCit::Tr2crfpp is not loaded by this module - presumably
# the caller has loaded it; confirm.
#
# HISTORY: Added in 100111 by Huy Do
###
sub SplitUnmarkedCitations2
{
    my ($infile) = @_;

    # Citation list
    my @citations = ();

    # Stores the citation text collected so far, if any
    my $cit_str = "";
    my $save_citation = sub
    {
        return if ($cit_str eq "");

        my $citation = ParsCit::Citation->new();
        # Clean up the citation text first
        $citation->setString(MergeLines($cit_str));
        push @citations, $citation;
    };

    # Run the crf++
    my $outfile = $infile . "_split.dec";
    if (ParsCit::Tr2crfpp::SplitReference($infile, $outfile))
    {
        my $file_handle = undef;
        unless (open($file_handle, "<:utf8", $outfile))
        {
            # This used to call fatal(), which this module does not define
            my $reason = $!;
            unlink($infile);
            unlink($outfile);
            die "Could not open file: $outfile: $reason\n";
        }

        # A lexical line variable keeps the caller's $_ untouched
        while (my $line = <$file_handle>)
        {
            chomp($line);

            # Class of the line: "parsCit_begin", "parsCit_continue", or "parsCit_end"
            my @tokens = split(/\s+/, $line);
            my $class  = $tokens[ $#tokens ];

            # Line content, with the ||| sequences turned back into blanks
            my $ln_con = $tokens[ 0 ];
            $ln_con =~ s/\|\|\|/ /g;

            if ($class eq "parsCit_begin")
            {
                # Beginning of a citation: save the previous one
                $save_citation->();
                $cit_str = $ln_con;
            }
            elsif ($class ne "parsCit_unknown")
            {
                # Inside a citation
                $cit_str = $cit_str . "\n" . $ln_con;
            }
        }
        close $file_handle;

        # Last citation
        $save_citation->();
    }

    unlink($infile);
    unlink($outfile);

    # Our work here is done
    return \@citations;
}
938
+
939
###
# Merges lines of text into one line with normal spacing, dehyphenating
# where appropriate: a hyphen that ends a line right after a letter is
# removed and the next line is glued to the word, while a hyphen that
# follows anything else (e.g. a page range) is kept and the next line is
# appended without a blank. Returns the merged text without leading or
# trailing blanks.
###
sub MergeLines
{
    my ($text) = shift;

    my $merged_text = "";

    foreach my $raw (split "\n", $text)
    {
        # Strip the blanks around the line
        my $line = $raw;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # Merge without removing "-" if it is preceded by numbers
        ###
        if ($merged_text =~ s/([A-Za-z])\-$/$1/)
        {
            # Hyphenated word: the hyphen is gone, no blank is needed
        }
        elsif ($merged_text !~ m/\-\s*$/)
        {
            $merged_text .= " ";
        }

        $merged_text .= $line;
    }

    $merged_text =~ s/^\s+//;
    $merged_text =~ s/\s+$//;
    return $merged_text;
}
976
+
977
###
# Uses a list of regular expressions that match common citation markers to
# count the number of matches for each type in the text. If a sufficient
# number of matches to a particular type are found, we can be reasonably
# sure of the type.
#
# Takes a reference to the citation text. Returns the name of the most
# frequent marker type (a key of %marker_types) when it was seen at least
# once and at least once every six lines, 'UNKNOWN' otherwise. Types with
# the same number of matches are ranked by name, so the answer does not
# depend on the order of the hash keys.
###
sub GuessMarkerType
{
    my ($rcite_text) = @_;

    my $marker_type = 'UNKNOWN';

    # The leading newline lets a marker on the first line match as well
    my $cite_text = "\n" . $$rcite_text;

    # Number of newlines of the original text, counted without touching $_
    my $n_lines = ($cite_text =~ tr/\n//) - 1;

    my %marker_observations = map { $_ => 0 } keys %marker_types;

    # A marker counts only when at least 10 characters follow it on the line
    while ($cite_text =~ m/\n\s*($marker_types{'SQUARE'}([^\n]){10})/sg)
    {
        $marker_observations{'SQUARE'}++;
    }

    while ($cite_text =~ m/\n\s*($marker_types{'PAREN'}([^\n]){10})/sg)
    {
        $marker_observations{'PAREN'}++;
    }

    ###
    # Modified by Artemy Kolchinsky (v090625): remove space after {10})
    ###
    while ($cite_text =~ m/\n\s*($marker_types{'NAKEDNUM'} [^\n]{10})/sg)
    {
        $marker_observations{'NAKEDNUM'}++;
    }

    while ($cite_text =~ m/\n\s*$marker_types{'NAKEDNUMDOT'}([^\n]){10}/sg)
    {
        $marker_observations{'NAKEDNUMDOT'}++;
    }

    # Most frequent type first; the name settles ties
    my @sorted_observations = sort
    {
        $marker_observations{ $b } <=> $marker_observations{ $a } || $a cmp $b
    } keys %marker_observations;

    my $best = $sorted_observations[0];

    # The threshold is 0 for a text without newline: a type that was never
    # seen must not be reported in that case.
    my $min_markers = $n_lines / 6;
    if (defined $best && $marker_observations{ $best } > 0 && $marker_observations{ $best } >= $min_markers)
    {
        $marker_type = $best;
    }

    return $marker_type;
}
1032
+
1033
###
# Returns a copy of the given text without leading and trailing whitespace.
###
sub Trim
{
    my $text = shift;

    for ($text)
    {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
1040
+
1041
+ 1;