biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -1,367 +0,0 @@
1
- package ParsCit::PostProcess;
2
- #
3
- # Utilities for normalizing the output of CRF++ into standard
4
- # representations.
5
- #
6
- # Isaac Councill, 07/20/07
7
- #
8
-
9
- use strict;
10
- use utf8;
11
-
12
- ##
13
- # Main normalization subroutine. Reads in a CRF++ output file
14
- # and normalizes each field of individual citations. An intermediate
15
- # XML representation is used to keep track of the tags discovered by
16
- # the model. Returns a reference to the raw XML (may not be encoded
17
- # safely) and a reference to a list of hashes containing the normalized
18
- # citation subfields, keyed by tag name.
19
- ##
20
- sub readAndNormalize {
21
- my ($inFile) = @_;
22
-
23
- my $status = 1;
24
- my $msg = "";
25
-
26
- open(IN, "<:utf8", $inFile) or return (undef, undef, 0,
27
- "couldn't open infile: $!");
28
-
29
- my $currentTag;
30
- my @currentTokens = ();
31
-
32
- my $newCitation = 1;
33
-
34
- my $xml = "";
35
-
36
- while(<IN>) {
37
- if (m/^\s*$/) { # blank line separates citations
38
- if ($newCitation <= 0) {
39
- finishCitation(\$xml, \$currentTag, \@currentTokens);
40
- @currentTokens = ();
41
- $newCitation = 1;
42
- next;
43
- }
44
- }
45
- if ($newCitation > 0) {
46
- $xml .= "<citation>\n";
47
- $newCitation = 0;
48
- }
49
- my @fields = split /\s+/;
50
- my $token = $fields[0];
51
- my $tag = $fields[$#fields];
52
- if (!defined $currentTag) {
53
- $currentTag = $tag;
54
- }
55
- if ($tag eq $currentTag) {
56
- push @currentTokens, $token;
57
- } else {
58
- $xml .= makeSegment($currentTag, @currentTokens);
59
- $currentTag = $tag;
60
- @currentTokens = ();
61
- push @currentTokens, $token;
62
- }
63
- }
64
-
65
- close IN;
66
-
67
- if ($newCitation <= 0) {
68
- finishCitation(\$xml, \$currentTag, \@currentTokens);
69
- @currentTokens = ();
70
- $newCitation = 1;
71
- }
72
-
73
- my $rCiteInfo = normalizeFields(\$xml);
74
-
75
- return \$xml, $rCiteInfo, $status, $msg;
76
-
77
- } # readAndNormalize
78
-
79
-
80
- ##
81
- # Utility for adding a closing tag to a citation in the
82
- # intermediate XML, and setting the currentTag value to undef.
83
- ##
84
- sub finishCitation {
85
- my ($r_xml, $r_currentTag, $r_currentTokens) = @_;
86
- if (defined $$r_currentTag) {
87
- $$r_xml .= makeSegment($$r_currentTag, @$r_currentTokens);
88
- }
89
- $$r_xml .= "</citation>\n";
90
- $$r_currentTag = undef;
91
-
92
- } # finishCitation
93
-
94
-
95
- ##
96
- # Makes an XML segment based on the specified tag and token list.
97
- ##
98
- sub makeSegment {
99
- my ($tag, @tokens) = @_;
100
- my $segment = join " ", @tokens;
101
- return "<$tag>$segment</$tag>\n";
102
- }
103
-
104
-
105
- ##
106
- # Switching utility for reading through the intermediate XML
107
- # and passing control to an appropriate normalization routine
108
- # for each field encountered. Returns a reference to a list
109
- # of hashes containing normalized fields, keyed by tag name.
110
- ##
111
- sub normalizeFields {
112
- my ($rXML) = @_;
113
- my @citeInfos = ();
114
-
115
- $_ = $$rXML;
116
- my @citeBlocks = m/<citation>(.*?)<\/citation>/gs;
117
- foreach my $block (@citeBlocks) {
118
- my %citeInfo;
119
- while($block =~ m/<(.*?)>(.*?)<\/\1>/gs) {
120
- my ($tag, $content) = ($1, $2);
121
- if ($tag eq "author") {
122
- $tag = "authors";
123
- $content = normalizeAuthorNames($content);
124
- } elsif ($tag eq "date") {
125
- $content = normalizeDate($content);
126
- } elsif ($tag eq "volume") {
127
- $content = normalizeNumber($content);
128
- } elsif ($tag eq "number") {
129
- $content = normalizeNumber($content);
130
- } elsif ($tag eq "pages") {
131
- $content = normalizePages($content);
132
- } else {
133
- $content = stripPunctuation($content);
134
- }
135
- # Heuristic - only get first instance of tag.
136
- # TODO: we can do better than that...
137
- unless (defined $citeInfo{$tag} || ! defined $content) {
138
- $citeInfo{$tag} = $content;
139
- }
140
- }
141
- push @citeInfos, \%citeInfo;
142
- }
143
- return \@citeInfos;
144
-
145
- } # normalizeFields
146
-
147
-
148
- sub stripPunctuation {
149
- my $text = shift;
150
- $text =~ s/^[^\p{IsLower}\p{IsUpper}0-9]+//;
151
- $text =~ s/[^\p{IsLower}\p{IsUpper}0-9]+$//;
152
- return $text;
153
- }
154
-
155
-
156
- ##
157
- # Tries to split the author tokens into individual author names
158
- # and then normalizes these names individually. Returns a
159
- # reference to a list of author names.
160
- ##
161
- sub normalizeAuthorNames {
162
- my ($authorText) = @_;
163
-
164
- my @tokens = repairAndTokenizeAuthorText($authorText);
165
-
166
- my @authors = ();
167
- my @currentAuth = ();
168
- my $beginAuth = 1;
169
-
170
- foreach my $tok (@tokens) {
171
- if ($tok =~ m/^(&|and)$/i) {
172
- if ($#currentAuth >= 0) {
173
- my $auth = normalizeAuthorName(@currentAuth);
174
- push @authors, $auth;
175
- }
176
- @currentAuth = ();
177
- $beginAuth = 1;
178
- next;
179
- }
180
- if ($beginAuth > 0) {
181
- push @currentAuth, $tok;
182
- $beginAuth = 0;
183
- next;
184
- }
185
- if ($tok =~ m/,$/) {
186
- push @currentAuth, $tok;
187
- if ($#currentAuth>0) {
188
- my $auth = normalizeAuthorName(@currentAuth);
189
- push @authors, $auth;
190
- @currentAuth = ();
191
- $beginAuth = 1;
192
- }
193
- } else {
194
- push @currentAuth, $tok;
195
- }
196
- }
197
- if ($#currentAuth >= 0) {
198
- my $auth = normalizeAuthorName(@currentAuth);
199
- push @authors, $auth;
200
- }
201
- return \@authors;
202
-
203
- } # normalizeAuthorNames
204
-
205
-
206
- ##
207
- # Strips unexpected punctuation and removes tokens that
208
- # are obviously not name words from the token list.
209
- ##
210
- sub repairAndTokenizeAuthorText {
211
- my ($authorText) = @_;
212
-
213
- # Repair obvious parse errors and weird notations.
214
- $authorText =~ s/et\.? al\.?.*$//;
215
- $authorText =~ s/^.*?[\p{IsUpper}\p{IsLower}][\p{IsUpper}\p{IsLower}]+\. //;
216
- $authorText =~ s/\(.*?\)//g;
217
- $authorText =~ s/^.*?\)\.?//g;
218
- $authorText =~ s/\(.*?$//g;
219
-
220
- $authorText =~ s/\[.*?\]//g;
221
- $authorText =~ s/^.*?\]\.?//g;
222
- $authorText =~ s/\[.*?$//g;
223
-
224
- $authorText =~ s/;/,/g;
225
- $authorText =~ s/,/, /g;
226
- $authorText =~ s/\:/ /g;
227
- $authorText =~ s/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g;
228
- $authorText = joinMultiWordNames($authorText);
229
-
230
- my @origTokens = split '\s+', $authorText;
231
- my @tokens = ();
232
-
233
- for (my $i=0; $i<=$#origTokens; $i++) {
234
- my $tok = $origTokens[$i];
235
- if ($tok !~ m/[\p{IsUpper}\p{IsLower}&]/) {
236
- if ($i < $#origTokens/2) {
237
- # Probably got junk up to now.
238
- @tokens = ();
239
- next;
240
- } else {
241
- last;
242
- }
243
- }
244
- if ($tok =~ m/^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i) {
245
- if ($tokens[$#tokens] =~ m/\,$/) {
246
- next;
247
- }
248
- }
249
- if ($tok =~ m/^[IVX][IVX]+\.?\,?$/) {
250
- next;
251
- }
252
- push @tokens, $tok;
253
- }
254
- return @tokens;
255
-
256
- } #repairAndTokenizeAuthorText
257
-
258
-
259
- ##
260
- # Tries to normalize an individual author name into the form
261
- # "First Middle Last", without punctuation.
262
- ##
263
- sub normalizeAuthorName {
264
- my @authTokens = @_;
265
- if ($#authTokens < 0) {
266
- return "";
267
- }
268
-
269
- my $tmpStr = join " ", @authTokens;
270
- if ($tmpStr =~ m/(.+),\s*(.+)/) {
271
- $tmpStr = "$2 $1";
272
- }
273
-
274
- $tmpStr =~ s/\.\-/-/g;
275
- $tmpStr =~ s/[\,\.]/ /g;
276
- $tmpStr =~ s/ +/ /g;
277
- $tmpStr = trim($tmpStr);
278
-
279
- if ($tmpStr =~ m/^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/) {
280
- my @newTokens = split '\s+', $tmpStr;
281
- my @newOrder = @newTokens[1..$#newTokens];
282
- push @newOrder, $newTokens[0];
283
- $tmpStr = join " ", @newOrder;
284
- }
285
-
286
- return $tmpStr;
287
-
288
- } # normalizeAuthorName
289
-
290
-
291
- ##
292
- # Utility for creating an intermediate representation of multi-word
293
- # name components, e.g., transforms "van der Wald" to "van_der_Wald".
294
- # This helps keep things straight during normalization. The
295
- # underscores can be stripped out later.
296
- ##
297
- sub joinMultiWordNames {
298
- my $authorText = shift;
299
- $authorText =~ s/\b((?:van|von|der|den|de|di|le|el))\s/\1_/sgi;
300
- return $authorText;
301
-
302
- } # joinMultiWordNames
303
-
304
-
305
- ##
306
- # Normalizes a date field into just the year. Looks for a string of
307
- # four digits.
308
- ##
309
- sub normalizeDate {
310
- my $dateText = shift;
311
- if ($dateText =~ m/(\d{4})/) {
312
- my $year = $1;
313
- # check to see whether this is a sane year setting
314
- my @timeData = localtime(time);
315
- my $currentYear = $timeData[5]+1900;
316
- if ($year <= $currentYear+3) {
317
- return $1;
318
- }
319
- }
320
-
321
- } # normalizeDate
322
-
323
-
324
- ##
325
- # If a field should be numeric only, this utility is used
326
- # to extract the first number string only.
327
- ##
328
- sub normalizeNumber {
329
- my $numText = shift;
330
- if ($numText =~ m/(\d+)/) {
331
- return $1;
332
- } else {
333
- return $numText;
334
- }
335
-
336
- } # normalizeNumber
337
-
338
-
339
- ##
340
- # Normalizes page fields into the form "start--end". If the page
341
- # field does not appear to be in a standard form, returns undef.
342
- ##
343
- sub normalizePages {
344
- my $pageText = shift;
345
- if ($pageText =~ m/(\d+)[^\d]+?(\d+)/) {
346
- if ($1>=$2) {
347
- return undef;
348
- }
349
- return "$1--$2";
350
- } elsif ($pageText =~ m/(\d+)/) {
351
- return $1;
352
- } else {
353
- return undef;
354
- }
355
-
356
- } # normalizePages
357
-
358
-
359
- sub trim {
360
- my $str = shift;
361
- $str =~ s/^\s+//;
362
- $str =~ s/\s+$//;
363
- return $str;
364
- }
365
-
366
-
367
- 1;
@@ -1,333 +0,0 @@
1
- package ParsCit::PreProcess;
2
- #
3
- # Utilities for finding and normalizing citations within
4
- # text files, including separating citation text from
5
- # body text and segmenting citations.
6
- #
7
- # Isaac Councill, 7/19/07
8
- #
9
-
10
- use strict;
11
- use utf8;
12
- use ParsCit::Citation;
13
-
14
- my %markerTypes = (
15
- 'SQUARE' => '\\[.+?\\]',
16
- 'PAREN' => '\\(.+?\\)',
17
- 'NAKEDNUM' => '\\d+',
18
- 'NAKEDNUMDOT' => '\\d+\\.',
19
- );
20
-
21
-
22
- ##
23
- # Looks for reference section markers in the supplied text and
24
- # separates the citation text from the body text based on these
25
- # indicators. If it looks like there is a reference section marker
26
- # too early in the document, this procedure will try to find later
27
- # ones. If the final reference section is still too long, an empty
28
- # citation text string will be returned. Returns references to
29
- # the citation text, normalized body text, and original body text.
30
- ##
31
- sub findCitationText {
32
- my ($rText) = @_;
33
- my $text = $$rText;
34
- my $bodyText = '0';
35
- my $citeText = '0';
36
-
37
- while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCE?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*\n+/sg) {
38
- $bodyText = substr $text, 0, pos $text;
39
- $citeText = substr $text, pos $text unless (pos $text < 1);
40
- }
41
- if (length($citeText) >= 0.8*length($bodyText)) {
42
- print STDERR "Citation text longer than article body: ignoring\n";
43
- $citeText = "";
44
- return \$citeText, \normalizeBodyText(\$bodyText), \$bodyText;
45
- }
46
- my ($sciteText, $tmp) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citeText);
47
- if (length($sciteText)>0) {
48
- $citeText = $sciteText;
49
- }
50
-
51
- if ($citeText eq '0' || !defined $citeText) {
52
- print STDERR "warning: no citation text found\n";
53
- }
54
-
55
- return (normalizeCiteText(\$citeText),
56
- normalizeBodyText(\$bodyText),
57
- \$bodyText);
58
-
59
- } # findCitationText
60
-
61
-
62
- ##
63
- # Removes lines that appear to be junk from the citation text.
64
- ##
65
- sub normalizeCiteText {
66
- my ($rCiteText) = @_;
67
-
68
- my @lines = split "\n", $$rCiteText;
69
- my @newLines = ();
70
- foreach my $line (@lines) {
71
- if ($line =~ m/^[\s\d]*$/) {
72
- next;
73
- }
74
- push @newLines, $line;
75
- }
76
- my $newText = join "\n", @newLines;
77
- return \$newText;
78
-
79
- } # normalizeCiteText
80
-
81
-
82
- ##
83
- # Removes lines that appear to be junk from the body text,
84
- # de-hyphenates words where a hyphen occurs at the end of
85
- # a line, and normalizes strings of blank spaces to only
86
- # single blanks.
87
- ##
88
- sub normalizeBodyText {
89
- my ($rText) = @_;
90
- my @lines = split "\n", $$rText;
91
- my $text = "";
92
- foreach my $line (@lines) {
93
- if ($line =~ m/^\s*$/) {
94
- next;
95
- }
96
- if ($text =~ s/(\w)\-$/$1/) {
97
- $text .= $line;
98
- } else {
99
- $text .= " ".$line;
100
- }
101
- }
102
- $text =~ s/\s\s+/\s/g;
103
- return \$text;
104
-
105
- } # normalizeBodyText
106
-
107
-
108
- ##
109
- # Controls the process by which citations are segmented,
110
- # based on the result of trying to guess the type of
111
- # citation marker used in the reference section. Returns
112
- # a reference to a list of citation objects.
113
- ##
114
- sub segmentCitations {
115
- my ($rCiteText) = @_;
116
- my $markerType = guessMarkerType($rCiteText);
117
-
118
- my $rCitations;
119
-
120
- if ($markerType ne 'UNKNOWN') {
121
- $rCitations = splitCitationsByMarker($rCiteText, $markerType);
122
- } else {
123
- $rCitations = splitUnmarkedCitations($rCiteText);
124
- }
125
-
126
- return $rCitations;
127
-
128
- } # segmentCitations
129
-
130
-
131
- ##
132
- # Segments citations that have explicit markers in the
133
- # reference section. Whenever a new line starts with an
134
- # expression that matches what we'd expect of a marker,
135
- # a new citation is started. Returns a reference to a
136
- # list of citation objects.
137
- ##
138
- sub splitCitationsByMarker {
139
- my ($rCiteText, $markerType) = @_;
140
- my @citations;
141
- my $currentCitation = new ParsCit::Citation();
142
- my $currentCitationString;
143
-
144
- # TODO: Might want to add a check that marker number is
145
- # increasing as we'd expect, if the marker is numeric.
146
-
147
- foreach my $line (split "\n", $$rCiteText) {
148
- if ($line =~ m/^\s*($markerTypes{$markerType})\s*(.*)$/) {
149
- my ($marker, $citeString) = ($1, $2);
150
- if (defined $currentCitationString) {
151
- $currentCitation->setString($currentCitationString);
152
- push @citations, $currentCitation;
153
- $currentCitationString = undef;
154
- }
155
- $currentCitation = new ParsCit::Citation();
156
- $currentCitation->setMarkerType($markerType);
157
- $currentCitation->setMarker($marker);
158
- $currentCitationString = $citeString;
159
- } else {
160
- if ($currentCitationString =~ m/\w\-$/) {
161
- # merge words when lines are hyphenated
162
- $currentCitationString =~ s/\-$//;
163
- $currentCitationString .= $line;
164
- } else {
165
- $currentCitationString .= " ".$line;
166
- }
167
- }
168
- }
169
- if (defined $currentCitation && defined $currentCitationString) {
170
- $currentCitation->setString($currentCitationString);
171
- push @citations, $currentCitation;
172
- }
173
- return \@citations;
174
-
175
- } # splitCitationsByMarker
176
-
177
-
178
- ##
179
- # Uses several heuristics to decide where individual citations
180
- # begin and end based on the length of previous lines, strings
181
- # that look like author lists, and punctuation. Returns a
182
- # reference to a list of citation objects.
183
- ##
184
- sub splitUnmarkedCitations {
185
- my ($rCiteText) = @_;
186
- my @content = split "\n", $$rCiteText;
187
- my @citeStarts = ();
188
- my $citeStart = 0;
189
- my @citations = ();
190
-
191
- for (my $i=0; $i<=$#content; $i++) {
192
- if ($content[$i] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s) {
193
- for (my $k=$i; $k > $citeStart; $k--) {
194
- if ($content[$k] =~ m/\s*[\p{IsUpper}]/g) {
195
-
196
- # If length of previous line is extremely small,
197
- # start a new citation here.
198
- if (length($content[$k-1]) < 2) {
199
- $citeStart = $k;
200
- last;
201
- }
202
-
203
- # Start looking backwards for lines that could
204
- # be author lists - these usually start the
205
- # citation, have several separation characters (,;),
206
- # and shouldn't contain any numbers.
207
- my $beginningAuthorLine = -1;
208
- for (my $j=$k-1; $j>$citeStart; $j--) {
209
- if ($content[$j] =~ m/\d/) {
210
- last;
211
- }
212
- $_ = $content[$j];
213
- my $nSep = s/([,;])/\1/g;
214
- if ($nSep >= 3) {
215
- if (($content[$j-1] =~ m/\.\s*$/) || $j==0) {
216
- $beginningAuthorLine = $j;
217
- }
218
- } else {
219
- last;
220
- }
221
- }
222
- if ($beginningAuthorLine >= 0) {
223
- $citeStart = $beginningAuthorLine;
224
- last;
225
- }
226
-
227
- # Now that the backwards author search failed
228
- # to find any extra lines, start a new citation
229
- # here if the previous line ends with a ".".
230
- if ($content[$k-1] =~ m/\.\s*$/) {
231
- $citeStart = $k;
232
- last;
233
- }
234
- }
235
- }
236
- push @citeStarts, $citeStart
237
- unless (($citeStart <= $citeStarts[$#citeStarts]) &&
238
- ($citeStart != 0));
239
- }
240
- }
241
- for (my $k=0; $k<$#citeStarts; $k++) {
242
- my $firstLine = $citeStarts[$k];
243
- my $lastLine = ($k==$#citeStarts) ? $#content : ($citeStarts[$k+1]-1);
244
- my $citeString =
245
- mergeLines(join "\n", @content[$firstLine .. $lastLine]);
246
- my $citation = new ParsCit::Citation();
247
- $citation->setString($citeString);
248
- push @citations, $citation;
249
- }
250
- return \@citations;
251
-
252
- } # splitUnmarkedCitations
253
-
254
-
255
- ##
256
- # Merges lines of text by dehyphenating where appropriate,
257
- # with normal spacing.
258
- ##
259
- sub mergeLines {
260
- my ($text) = shift;
261
- my @lines = split "\n", $text;
262
- my $mergedText = "";
263
- foreach my $line (@lines) {
264
- $line = trim($line);
265
- if ($mergedText =~ m/\w\-$/) {
266
- $mergedText =~ s/\-$//;
267
- $mergedText .= $line;
268
- } else {
269
- $mergedText .= " ".$line;
270
- }
271
- }
272
- return trim($mergedText);
273
-
274
- } # mergeLines
275
-
276
-
277
- ##
278
- # Uses a list of regular expressions that match common citation
279
- # markers to count the number of matches for each type in the
280
- # text. If a sufficient number of matches to a particular type
281
- # are found, we can be reasonably sure of the type.
282
- ##
283
- sub guessMarkerType {
284
- my ($rCiteText) = @_;
285
- my $markerType = 'UNKNOWN';
286
- my %markerObservations;
287
- foreach my $type (keys %markerTypes) {
288
- $markerObservations{$type} = 0;
289
- }
290
-
291
- my $citeText = "\n".$$rCiteText;
292
- $_ = $citeText;
293
- my $nLines = s/\n/\n/gs - 1;
294
-
295
- while ($citeText =~ m/\n\s*($markerTypes{'SQUARE'}([^\n]){10})/sg) {
296
- $markerObservations{'SQUARE'}++;
297
- }
298
-
299
- while ($citeText =~ m/\n\s*($markerTypes{'PAREN'}([^\n]){10})/sg) {
300
- $markerObservations{'PAREN'}++;
301
- }
302
-
303
- while ($citeText =~ m/\n\s*($markerTypes{'NAKEDNUM'} [^\n]{10}) /sg) {
304
- $markerObservations{'NAKEDNUM'}++;
305
- }
306
-
307
- while ($citeText =~ m/\n\s*$markerTypes{'NAKEDNUMDOT'}([^\n]){10}/sg) {
308
- $markerObservations{'NAKEDNUMDOT'}++;
309
- }
310
-
311
- my @sortedObservations =
312
- sort {$markerObservations{$b} <=> $markerObservations{$a}}
313
- keys %markerObservations;
314
-
315
- my $minMarkers = $nLines / 6;
316
- if ($markerObservations{$sortedObservations[0]} >= $minMarkers) {
317
- $markerType = $sortedObservations[0];
318
- }
319
- return $markerType;
320
-
321
- } # guessMarkerType
322
-
323
-
324
- sub trim {
325
- my $text = shift;
326
- $text =~ s/^\s+//;
327
- $text =~ s/\s+$//;
328
- return $text;
329
-
330
- } # trim
331
-
332
-
333
- 1;
Binary file