biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322) hide show
  1. data/.gitignore +3 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.TXT +176 -0
  5. data/README.md +120 -0
  6. data/Rakefile +8 -0
  7. data/biblicit.gemspec +33 -0
  8. data/lib/biblicit/cb2bib.rb +83 -0
  9. data/lib/biblicit/citeseer.rb +53 -0
  10. data/lib/biblicit/extractor.rb +37 -0
  11. data/lib/biblicit.rb +6 -0
  12. data/perl/DocFilter/lib/CSXUtil/SafeText.pm +140 -0
  13. data/perl/DocFilter/lib/DocFilter/Config.pm +35 -0
  14. data/perl/DocFilter/lib/DocFilter/Filter.pm +51 -0
  15. data/perl/FileConversionService/README.TXT +11 -0
  16. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  17. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +140 -0
  18. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +77 -0
  19. data/perl/FileConversionService/lib/FileConverter/Compression.pm +137 -0
  20. data/perl/FileConversionService/lib/FileConverter/Config.pm +57 -0
  21. data/perl/FileConversionService/lib/FileConverter/Controller.pm +191 -0
  22. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +61 -0
  23. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +69 -0
  24. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +69 -0
  25. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +88 -0
  26. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +68 -0
  27. data/perl/FileConversionService/lib/FileConverter/TET.pm +75 -0
  28. data/perl/FileConversionService/lib/FileConverter/Utils.pm +130 -0
  29. data/perl/HeaderParseService/README.TXT +80 -0
  30. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +140 -0
  31. data/perl/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +968 -0
  32. data/perl/HeaderParseService/lib/HeaderParse/API/Function.pm +2016 -0
  33. data/perl/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +444 -0
  34. data/perl/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +409 -0
  35. data/perl/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +537 -0
  36. data/perl/HeaderParseService/lib/HeaderParse/API/Parser.pm +68 -0
  37. data/perl/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +1880 -0
  38. data/perl/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +46 -0
  39. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +24330 -0
  40. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +27506 -0
  41. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +26495 -0
  42. data/perl/HeaderParseService/resources/data/tagged_headers.txt +40668 -0
  43. data/perl/HeaderParseService/resources/data/test_header.txt +31 -0
  44. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +31 -0
  45. data/perl/HeaderParseService/resources/database/50states +60 -0
  46. data/perl/HeaderParseService/resources/database/AddrTopWords.txt +17 -0
  47. data/perl/HeaderParseService/resources/database/AffiTopWords.txt +35 -0
  48. data/perl/HeaderParseService/resources/database/AffiTopWordsAll.txt +533 -0
  49. data/perl/HeaderParseService/resources/database/ChineseSurNames.txt +276 -0
  50. data/perl/HeaderParseService/resources/database/Csurnames.bin +0 -0
  51. data/perl/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  52. data/perl/HeaderParseService/resources/database/DomainSuffixes.txt +242 -0
  53. data/perl/HeaderParseService/resources/database/LabeledHeader +18 -0
  54. data/perl/HeaderParseService/resources/database/README +2 -0
  55. data/perl/HeaderParseService/resources/database/TrainMulClassLines +254 -0
  56. data/perl/HeaderParseService/resources/database/TrainMulClassLines1 +510 -0
  57. data/perl/HeaderParseService/resources/database/abstract.txt +1 -0
  58. data/perl/HeaderParseService/resources/database/abstractTopWords +9 -0
  59. data/perl/HeaderParseService/resources/database/addr.txt +28 -0
  60. data/perl/HeaderParseService/resources/database/affi.txt +34 -0
  61. data/perl/HeaderParseService/resources/database/affis.bin +0 -0
  62. data/perl/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  63. data/perl/HeaderParseService/resources/database/allnamewords.bin +0 -0
  64. data/perl/HeaderParseService/resources/database/cities_US.txt +4512 -0
  65. data/perl/HeaderParseService/resources/database/cities_world.txt +4463 -0
  66. data/perl/HeaderParseService/resources/database/city.txt +3150 -0
  67. data/perl/HeaderParseService/resources/database/cityname.txt +3151 -0
  68. data/perl/HeaderParseService/resources/database/country_abbr.txt +243 -0
  69. data/perl/HeaderParseService/resources/database/countryname.txt +262 -0
  70. data/perl/HeaderParseService/resources/database/dateTopWords +30 -0
  71. data/perl/HeaderParseService/resources/database/degree.txt +67 -0
  72. data/perl/HeaderParseService/resources/database/email.txt +3 -0
  73. data/perl/HeaderParseService/resources/database/excludeWords.txt +40 -0
  74. data/perl/HeaderParseService/resources/database/female-names +4960 -0
  75. data/perl/HeaderParseService/resources/database/firstNames.txt +8448 -0
  76. data/perl/HeaderParseService/resources/database/firstnames.bin +0 -0
  77. data/perl/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  78. data/perl/HeaderParseService/resources/database/intro.txt +2 -0
  79. data/perl/HeaderParseService/resources/database/keyword.txt +5 -0
  80. data/perl/HeaderParseService/resources/database/keywordTopWords +7 -0
  81. data/perl/HeaderParseService/resources/database/male-names +3906 -0
  82. data/perl/HeaderParseService/resources/database/middleNames.txt +2 -0
  83. data/perl/HeaderParseService/resources/database/month.txt +35 -0
  84. data/perl/HeaderParseService/resources/database/mul +868 -0
  85. data/perl/HeaderParseService/resources/database/mul.label +869 -0
  86. data/perl/HeaderParseService/resources/database/mul.label.old +869 -0
  87. data/perl/HeaderParseService/resources/database/mul.processed +762 -0
  88. data/perl/HeaderParseService/resources/database/mulAuthor +619 -0
  89. data/perl/HeaderParseService/resources/database/mulClassStat +45 -0
  90. data/perl/HeaderParseService/resources/database/nickname.txt +58 -0
  91. data/perl/HeaderParseService/resources/database/nicknames.bin +0 -0
  92. data/perl/HeaderParseService/resources/database/note.txt +121 -0
  93. data/perl/HeaderParseService/resources/database/page.txt +1 -0
  94. data/perl/HeaderParseService/resources/database/phone.txt +9 -0
  95. data/perl/HeaderParseService/resources/database/postcode.txt +54 -0
  96. data/perl/HeaderParseService/resources/database/pubnum.txt +45 -0
  97. data/perl/HeaderParseService/resources/database/statename.bin +0 -0
  98. data/perl/HeaderParseService/resources/database/statename.txt +73 -0
  99. data/perl/HeaderParseService/resources/database/states_and_abbreviations.txt +118 -0
  100. data/perl/HeaderParseService/resources/database/stopwords +438 -0
  101. data/perl/HeaderParseService/resources/database/stopwords.bin +0 -0
  102. data/perl/HeaderParseService/resources/database/surNames.txt +19613 -0
  103. data/perl/HeaderParseService/resources/database/surnames.bin +0 -0
  104. data/perl/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  105. data/perl/HeaderParseService/resources/database/university_list/A.html +167 -0
  106. data/perl/HeaderParseService/resources/database/university_list/B.html +161 -0
  107. data/perl/HeaderParseService/resources/database/university_list/C.html +288 -0
  108. data/perl/HeaderParseService/resources/database/university_list/D.html +115 -0
  109. data/perl/HeaderParseService/resources/database/university_list/E.html +147 -0
  110. data/perl/HeaderParseService/resources/database/university_list/F.html +112 -0
  111. data/perl/HeaderParseService/resources/database/university_list/G.html +115 -0
  112. data/perl/HeaderParseService/resources/database/university_list/H.html +140 -0
  113. data/perl/HeaderParseService/resources/database/university_list/I.html +138 -0
  114. data/perl/HeaderParseService/resources/database/university_list/J.html +82 -0
  115. data/perl/HeaderParseService/resources/database/university_list/K.html +115 -0
  116. data/perl/HeaderParseService/resources/database/university_list/L.html +131 -0
  117. data/perl/HeaderParseService/resources/database/university_list/M.html +201 -0
  118. data/perl/HeaderParseService/resources/database/university_list/N.html +204 -0
  119. data/perl/HeaderParseService/resources/database/university_list/O.html +89 -0
  120. data/perl/HeaderParseService/resources/database/university_list/P.html +125 -0
  121. data/perl/HeaderParseService/resources/database/university_list/Q.html +49 -0
  122. data/perl/HeaderParseService/resources/database/university_list/R.html +126 -0
  123. data/perl/HeaderParseService/resources/database/university_list/S.html +296 -0
  124. data/perl/HeaderParseService/resources/database/university_list/T.html +156 -0
  125. data/perl/HeaderParseService/resources/database/university_list/U.html +800 -0
  126. data/perl/HeaderParseService/resources/database/university_list/V.html +75 -0
  127. data/perl/HeaderParseService/resources/database/university_list/W.html +144 -0
  128. data/perl/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  129. data/perl/HeaderParseService/resources/database/university_list/X.html +44 -0
  130. data/perl/HeaderParseService/resources/database/university_list/Y.html +53 -0
  131. data/perl/HeaderParseService/resources/database/university_list/Z.html +43 -0
  132. data/perl/HeaderParseService/resources/database/university_list/ae.html +31 -0
  133. data/perl/HeaderParseService/resources/database/university_list/am.html +30 -0
  134. data/perl/HeaderParseService/resources/database/university_list/ar.html +35 -0
  135. data/perl/HeaderParseService/resources/database/university_list/at.html +43 -0
  136. data/perl/HeaderParseService/resources/database/university_list/au.html +82 -0
  137. data/perl/HeaderParseService/resources/database/university_list/bd.html +28 -0
  138. data/perl/HeaderParseService/resources/database/university_list/be.html +41 -0
  139. data/perl/HeaderParseService/resources/database/university_list/bg.html +28 -0
  140. data/perl/HeaderParseService/resources/database/university_list/bh.html +28 -0
  141. data/perl/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  142. data/perl/HeaderParseService/resources/database/university_list/bm.html +28 -0
  143. data/perl/HeaderParseService/resources/database/university_list/bn.html +28 -0
  144. data/perl/HeaderParseService/resources/database/university_list/br.html +66 -0
  145. data/perl/HeaderParseService/resources/database/university_list/ca.html +174 -0
  146. data/perl/HeaderParseService/resources/database/university_list/ch.html +52 -0
  147. data/perl/HeaderParseService/resources/database/university_list/cl.html +40 -0
  148. data/perl/HeaderParseService/resources/database/university_list/cn.html +87 -0
  149. data/perl/HeaderParseService/resources/database/university_list/co.html +39 -0
  150. data/perl/HeaderParseService/resources/database/university_list/cr.html +34 -0
  151. data/perl/HeaderParseService/resources/database/university_list/cy.html +34 -0
  152. data/perl/HeaderParseService/resources/database/university_list/cz.html +44 -0
  153. data/perl/HeaderParseService/resources/database/university_list/de.html +128 -0
  154. data/perl/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  155. data/perl/HeaderParseService/resources/database/university_list/dk.html +42 -0
  156. data/perl/HeaderParseService/resources/database/university_list/ec.html +31 -0
  157. data/perl/HeaderParseService/resources/database/university_list/ee.html +30 -0
  158. data/perl/HeaderParseService/resources/database/university_list/eg.html +29 -0
  159. data/perl/HeaderParseService/resources/database/university_list/es.html +68 -0
  160. data/perl/HeaderParseService/resources/database/university_list/et.html +28 -0
  161. data/perl/HeaderParseService/resources/database/university_list/faq.html +147 -0
  162. data/perl/HeaderParseService/resources/database/university_list/fi.html +49 -0
  163. data/perl/HeaderParseService/resources/database/university_list/fj.html +28 -0
  164. data/perl/HeaderParseService/resources/database/university_list/fo.html +28 -0
  165. data/perl/HeaderParseService/resources/database/university_list/fr.html +106 -0
  166. data/perl/HeaderParseService/resources/database/university_list/geog.html +150 -0
  167. data/perl/HeaderParseService/resources/database/university_list/gr.html +38 -0
  168. data/perl/HeaderParseService/resources/database/university_list/gu.html +28 -0
  169. data/perl/HeaderParseService/resources/database/university_list/hk.html +34 -0
  170. data/perl/HeaderParseService/resources/database/university_list/hr.html +28 -0
  171. data/perl/HeaderParseService/resources/database/university_list/hu.html +46 -0
  172. data/perl/HeaderParseService/resources/database/university_list/id.html +29 -0
  173. data/perl/HeaderParseService/resources/database/university_list/ie.html +49 -0
  174. data/perl/HeaderParseService/resources/database/university_list/il.html +35 -0
  175. data/perl/HeaderParseService/resources/database/university_list/in.html +109 -0
  176. data/perl/HeaderParseService/resources/database/university_list/is.html +32 -0
  177. data/perl/HeaderParseService/resources/database/university_list/it.html +75 -0
  178. data/perl/HeaderParseService/resources/database/university_list/jm.html +28 -0
  179. data/perl/HeaderParseService/resources/database/university_list/jo.html +28 -0
  180. data/perl/HeaderParseService/resources/database/university_list/jp.html +155 -0
  181. data/perl/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  182. data/perl/HeaderParseService/resources/database/university_list/kr.html +65 -0
  183. data/perl/HeaderParseService/resources/database/university_list/kw.html +28 -0
  184. data/perl/HeaderParseService/resources/database/university_list/lb.html +28 -0
  185. data/perl/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  186. data/perl/HeaderParseService/resources/database/university_list/lk.html +30 -0
  187. data/perl/HeaderParseService/resources/database/university_list/lt.html +31 -0
  188. data/perl/HeaderParseService/resources/database/university_list/lu.html +34 -0
  189. data/perl/HeaderParseService/resources/database/university_list/lv.html +30 -0
  190. data/perl/HeaderParseService/resources/database/university_list/ma.html +28 -0
  191. data/perl/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  192. data/perl/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  193. data/perl/HeaderParseService/resources/database/university_list/mk.html +29 -0
  194. data/perl/HeaderParseService/resources/database/university_list/mo.html +29 -0
  195. data/perl/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  196. data/perl/HeaderParseService/resources/database/university_list/mt.html +28 -0
  197. data/perl/HeaderParseService/resources/database/university_list/mx.html +68 -0
  198. data/perl/HeaderParseService/resources/database/university_list/my.html +39 -0
  199. data/perl/HeaderParseService/resources/database/university_list/ni.html +28 -0
  200. data/perl/HeaderParseService/resources/database/university_list/nl.html +51 -0
  201. data/perl/HeaderParseService/resources/database/university_list/no.html +56 -0
  202. data/perl/HeaderParseService/resources/database/university_list/nz.html +41 -0
  203. data/perl/HeaderParseService/resources/database/university_list/pa.html +31 -0
  204. data/perl/HeaderParseService/resources/database/university_list/pe.html +40 -0
  205. data/perl/HeaderParseService/resources/database/university_list/ph.html +41 -0
  206. data/perl/HeaderParseService/resources/database/university_list/pl.html +51 -0
  207. data/perl/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  208. data/perl/HeaderParseService/resources/database/university_list/pr.html +31 -0
  209. data/perl/HeaderParseService/resources/database/university_list/ps.html +28 -0
  210. data/perl/HeaderParseService/resources/database/university_list/pt.html +45 -0
  211. data/perl/HeaderParseService/resources/database/university_list/recognition.html +69 -0
  212. data/perl/HeaderParseService/resources/database/university_list/results.html +71 -0
  213. data/perl/HeaderParseService/resources/database/university_list/ro.html +38 -0
  214. data/perl/HeaderParseService/resources/database/university_list/ru.html +48 -0
  215. data/perl/HeaderParseService/resources/database/university_list/sd.html +28 -0
  216. data/perl/HeaderParseService/resources/database/university_list/se.html +57 -0
  217. data/perl/HeaderParseService/resources/database/university_list/sg.html +33 -0
  218. data/perl/HeaderParseService/resources/database/university_list/si.html +30 -0
  219. data/perl/HeaderParseService/resources/database/university_list/sk.html +35 -0
  220. data/perl/HeaderParseService/resources/database/university_list/th.html +45 -0
  221. data/perl/HeaderParseService/resources/database/university_list/tr.html +44 -0
  222. data/perl/HeaderParseService/resources/database/university_list/tw.html +76 -0
  223. data/perl/HeaderParseService/resources/database/university_list/ua.html +29 -0
  224. data/perl/HeaderParseService/resources/database/university_list/uk.html +168 -0
  225. data/perl/HeaderParseService/resources/database/university_list/univ-full.html +3166 -0
  226. data/perl/HeaderParseService/resources/database/university_list/univ.html +122 -0
  227. data/perl/HeaderParseService/resources/database/university_list/uy.html +31 -0
  228. data/perl/HeaderParseService/resources/database/university_list/ve.html +34 -0
  229. data/perl/HeaderParseService/resources/database/university_list/yu.html +28 -0
  230. data/perl/HeaderParseService/resources/database/university_list/za.html +46 -0
  231. data/perl/HeaderParseService/resources/database/university_list/zm.html +28 -0
  232. data/perl/HeaderParseService/resources/database/university_list.txt +3025 -0
  233. data/perl/HeaderParseService/resources/database/url.txt +1 -0
  234. data/perl/HeaderParseService/resources/database/webTopWords +225 -0
  235. data/perl/HeaderParseService/resources/database/words +45402 -0
  236. data/perl/HeaderParseService/resources/models/10ContextModelfold1 +369 -0
  237. data/perl/HeaderParseService/resources/models/10Modelfold1 +376 -0
  238. data/perl/HeaderParseService/resources/models/11ContextModelfold1 +400 -0
  239. data/perl/HeaderParseService/resources/models/11Modelfold1 +526 -0
  240. data/perl/HeaderParseService/resources/models/12ContextModelfold1 +510 -0
  241. data/perl/HeaderParseService/resources/models/12Modelfold1 +423 -0
  242. data/perl/HeaderParseService/resources/models/13ContextModelfold1 +364 -0
  243. data/perl/HeaderParseService/resources/models/13Modelfold1 +677 -0
  244. data/perl/HeaderParseService/resources/models/14ContextModelfold1 +459 -0
  245. data/perl/HeaderParseService/resources/models/14Modelfold1 +325 -0
  246. data/perl/HeaderParseService/resources/models/15ContextModelfold1 +340 -0
  247. data/perl/HeaderParseService/resources/models/15Modelfold1 +390 -0
  248. data/perl/HeaderParseService/resources/models/1ContextModelfold1 +668 -0
  249. data/perl/HeaderParseService/resources/models/1Modelfold1 +1147 -0
  250. data/perl/HeaderParseService/resources/models/2ContextModelfold1 +755 -0
  251. data/perl/HeaderParseService/resources/models/2Modelfold1 +796 -0
  252. data/perl/HeaderParseService/resources/models/3ContextModelfold1 +1299 -0
  253. data/perl/HeaderParseService/resources/models/3Modelfold1 +1360 -0
  254. data/perl/HeaderParseService/resources/models/4ContextModelfold1 +1062 -0
  255. data/perl/HeaderParseService/resources/models/4Modelfold1 +993 -0
  256. data/perl/HeaderParseService/resources/models/5ContextModelfold1 +1339 -0
  257. data/perl/HeaderParseService/resources/models/5Modelfold1 +2098 -0
  258. data/perl/HeaderParseService/resources/models/6ContextModelfold1 +888 -0
  259. data/perl/HeaderParseService/resources/models/6Modelfold1 +620 -0
  260. data/perl/HeaderParseService/resources/models/7ContextModelfold1 +257 -0
  261. data/perl/HeaderParseService/resources/models/7Modelfold1 +228 -0
  262. data/perl/HeaderParseService/resources/models/8ContextModelfold1 +677 -0
  263. data/perl/HeaderParseService/resources/models/8Modelfold1 +1871 -0
  264. data/perl/HeaderParseService/resources/models/9ContextModelfold1 +198 -0
  265. data/perl/HeaderParseService/resources/models/9Modelfold1 +170 -0
  266. data/perl/HeaderParseService/resources/models/NameSpaceModel +181 -0
  267. data/perl/HeaderParseService/resources/models/NameSpaceTrainF +347 -0
  268. data/perl/HeaderParseService/resources/models/WrapperBaseFeaDict +13460 -0
  269. data/perl/HeaderParseService/resources/models/WrapperContextFeaDict +14045 -0
  270. data/perl/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +510 -0
  271. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +23 -0
  272. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +23 -0
  273. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +23 -0
  274. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +23 -0
  275. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +23 -0
  276. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +23 -0
  277. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +23 -0
  278. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +23 -0
  279. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +23 -0
  280. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +23 -0
  281. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +23 -0
  282. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +23 -0
  283. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +23 -0
  284. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +23 -0
  285. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +23 -0
  286. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +23 -0
  287. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +23 -0
  288. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +23 -0
  289. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +23 -0
  290. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +23 -0
  291. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +23 -0
  292. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +23 -0
  293. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +23 -0
  294. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +23 -0
  295. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +23 -0
  296. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +23 -0
  297. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +23 -0
  298. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +23 -0
  299. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +23 -0
  300. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +23 -0
  301. data/perl/ParsCit/README.TXT +82 -0
  302. data/perl/ParsCit/crfpp/traindata/parsCit.template +60 -0
  303. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +12104 -0
  304. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +500 -0
  305. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +140 -0
  306. data/perl/ParsCit/lib/ParsCit/Citation.pm +462 -0
  307. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +132 -0
  308. data/perl/ParsCit/lib/ParsCit/Config.pm +46 -0
  309. data/perl/ParsCit/lib/ParsCit/Controller.pm +306 -0
  310. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +367 -0
  311. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +333 -0
  312. data/perl/ParsCit/lib/ParsCit/Tr2crfpp.pm +331 -0
  313. data/perl/ParsCit/resources/parsCit.model +0 -0
  314. data/perl/ParsCit/resources/parsCitDict.txt +148783 -0
  315. data/perl/extract.pl +199 -0
  316. data/spec/biblicit/cb2bib_spec.rb +48 -0
  317. data/spec/biblicit/citeseer_spec.rb +40 -0
  318. data/spec/fixtures/pdf/10.1.1.109.4049.pdf +0 -0
  319. data/spec/fixtures/pdf/Bagnoli Watts TAR 2010.pdf +0 -0
  320. data/spec/fixtures/pdf/ICINCO_2010.pdf +0 -0
  321. data/spec/spec_helper.rb +3 -0
  322. metadata +474 -0
@@ -0,0 +1,367 @@
1
+ package ParsCit::PostProcess;
2
+ #
3
+ # Utilities for normalizing the output of CRF++ into standard
4
+ # representations.
5
+ #
6
+ # Isaac Councill, 07/20/07
7
+ #
8
+
9
+ use strict;
10
+ use utf8;
11
+
12
##
# Main normalization subroutine.  Reads a CRF++ output file in which
# each non-blank line holds whitespace-separated fields (the token is
# the first field, the predicted tag the last) and blank lines separate
# citations.  An intermediate XML representation is used to keep track
# of the tags discovered by the model.
#
# Returns a four-element list:
#   - a reference to the raw XML (may not be encoded safely),
#   - a reference to a list of hashes containing the normalized
#     citation subfields, keyed by tag name,
#   - a status flag (1 on success, 0 if the file could not be opened),
#   - a message ("" on success, the open error otherwise).
##
sub readAndNormalize {
    my ($inFile) = @_;

    my $status = 1;
    my $msg = "";

    # A lexical handle (rather than the package-global bareword IN)
    # cannot collide with a handle of the same name used elsewhere.
    open(my $in, "<:utf8", $inFile)
        or return (undef, undef, 0, "couldn't open infile: $!");

    my $currentTag;
    my @currentTokens = ();
    my $inCitation = 0;
    my $xml = "";

    # Read into a lexical so the caller's global $_ is left untouched.
    while (my $line = <$in>) {
        if ($line =~ m/^\s*$/) {
            # A blank line closes the citation in progress, if any.
            # Blank lines seen while no citation is open (leading or
            # repeated separators) are skipped outright, so they never
            # open an empty <citation> or yield an undefined tag.
            if ($inCitation) {
                finishCitation(\$xml, \$currentTag, \@currentTokens);
                @currentTokens = ();
                $inCitation = 0;
            }
            next;
        }

        if (!$inCitation) {
            $xml .= "<citation>\n";
            $inCitation = 1;
        }

        my @fields = split /\s+/, $line;
        my $token = $fields[0];
        my $tag = $fields[$#fields];

        if (!defined $currentTag) {
            $currentTag = $tag;
        }
        if ($tag eq $currentTag) {
            push @currentTokens, $token;
        } else {
            # Tag changed: flush the tokens gathered for the old tag.
            $xml .= makeSegment($currentTag, @currentTokens);
            $currentTag = $tag;
            @currentTokens = ($token);
        }
    }

    close $in;

    # The file may end without a trailing blank line.
    if ($inCitation) {
        finishCitation(\$xml, \$currentTag, \@currentTokens);
        @currentTokens = ();
    }

    my $rCiteInfo = normalizeFields(\$xml);

    return \$xml, $rCiteInfo, $status, $msg;

}  # readAndNormalize
78
+
79
+
80
##
# Closes the citation currently being assembled in the intermediate
# XML.  All three arguments are references: the XML string, the
# current tag, and the list of pending tokens.  If a tag is still
# open, its pending tokens are appended as one last segment before
# the closing </citation> line.  The current tag is reset to undef.
##
sub finishCitation {
    my ($r_xml, $r_currentTag, $r_currentTokens) = @_;

    my $tail = "</citation>\n";
    $tail = makeSegment(${$r_currentTag}, @{$r_currentTokens}) . $tail
        if defined ${$r_currentTag};

    ${$r_xml} .= $tail;
    ${$r_currentTag} = undef;

}  # finishCitation
93
+
94
+
95
##
# Builds one XML segment from the specified tag and token list: the
# tokens are joined with single spaces and wrapped in <tag>...</tag>,
# followed by a newline.  No escaping is applied to the tokens.
##
sub makeSegment {
    my $tag = shift;
    return sprintf("<%s>%s</%s>\n", $tag, join(" ", @_), $tag);
}
103
+
104
+
105
##
# Switching utility for reading through the intermediate XML and
# passing control to an appropriate normalization routine for each
# field encountered.
#
# Takes a reference to the XML string.  Returns a reference to a list
# with one hash per <citation> block, holding the normalized fields
# keyed by tag name.  The "author" tag is stored under the key
# "authors"; tags without a dedicated normalizer only have surrounding
# punctuation stripped.
##
sub normalizeFields {
    my ($rXML) = @_;

    # Tag name => normalization routine.  Anything not listed here
    # falls back to stripPunctuation.
    my %normalizerFor = (
        author => \&normalizeAuthorNames,
        date   => \&normalizeDate,
        volume => \&normalizeNumber,
        number => \&normalizeNumber,
        pages  => \&normalizePages,
    );

    my @citeInfos = ();

    # Match against the dereferenced string directly; assigning it to
    # the global $_ would silently overwrite the caller's $_.
    my @citeBlocks = ${$rXML} =~ m/<citation>(.*?)<\/citation>/gs;

    foreach my $block (@citeBlocks) {
        my %citeInfo;
        while ($block =~ m/<(.*?)>(.*?)<\/\1>/gs) {
            my ($tag, $content) = ($1, $2);

            my $normalize = $normalizerFor{$tag} || \&stripPunctuation;
            $content = $normalize->($content);
            $tag = "authors" if $tag eq "author";

            # Heuristic - only get first instance of tag.
            # TODO: we can do better than that...
            if (!defined $citeInfo{$tag} && defined $content) {
                $citeInfo{$tag} = $content;
            }
        }
        push @citeInfos, \%citeInfo;
    }
    return \@citeInfos;

}  # normalizeFields
146
+
147
+
148
##
# Removes every leading and trailing character that is neither an
# upper/lower-case letter nor an ASCII digit.  Characters inside the
# text are left alone.  Returns the stripped copy.
##
sub stripPunctuation {
    my ($text) = @_;

    # One or more characters that are not letters or digits.
    my $edgeJunk = qr/[^\p{IsLower}\p{IsUpper}0-9]+/;

    $text =~ s/^${edgeJunk}//;
    $text =~ s/${edgeJunk}$//;
    return $text;
}
154
+
155
+
156
##
# Tries to split the author tokens into individual author names and
# then normalizes these names individually.  A name ends at an "and"
# or "&" token, or at a comma-terminated token that is not the first
# token of the name.  Returns a reference to the list of normalized
# author names.
##
sub normalizeAuthorNames {
    my ($authorText) = @_;

    my @authors = ();
    my @pending = ();        # tokens gathered for the name in progress
    my $expectFirst = 1;     # > 0 while the next token starts a name

    # Normalizes the pending tokens and records them as one author.
    my $emit = sub {
        my $name = normalizeAuthorName(@pending);
        push @authors, $name;
    };

    foreach my $word (repairAndTokenizeAuthorText($authorText)) {
        if ($word =~ m/^(&|and)$/i) {
            # Explicit separator between two names.
            $emit->() if $#pending >= 0;
            @pending = ();
            $expectFirst = 1;
        } elsif ($expectFirst > 0) {
            # The first token of a name is always kept, even when it
            # ends in a comma ("Smith, J.").
            push @pending, $word;
            $expectFirst = 0;
        } else {
            push @pending, $word;
            # A trailing comma on a later token closes the name.
            if ($word =~ m/,$/ && $#pending > 0) {
                $emit->();
                @pending = ();
                $expectFirst = 1;
            }
        }
    }

    $emit->() if $#pending >= 0;
    return \@authors;

}  # normalizeAuthorNames
204
+
205
+
206
##
# Strips unexpected punctuation from the author text, splits it on
# whitespace, and removes tokens that are obviously not name words.
# Returns the list of remaining tokens.
##
sub repairAndTokenizeAuthorText {
    my ($authorText) = @_;

    # Repair obvious parse errors and weird notations.  The order of
    # these substitutions matters.
    for ($authorText) {
        s/et\.? al\.?.*$//;
        s/^.*?[\p{IsUpper}\p{IsLower}][\p{IsUpper}\p{IsLower}]+\. //;

        # Parenthesized text, including unbalanced leftovers.
        s/\(.*?\)//g;
        s/^.*?\)\.?//g;
        s/\(.*?$//g;

        # Bracketed text, including unbalanced leftovers.
        s/\[.*?\]//g;
        s/^.*?\]\.?//g;
        s/\[.*?$//g;

        s/;/,/g;
        s/,/, /g;
        s/\:/ /g;
        s/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g;
    }
    $authorText = joinMultiWordNames($authorText);

    my @origTokens = split /\s+/, $authorText;
    my $lastIndex = $#origTokens;
    my @kept = ();

    foreach my $idx (0 .. $lastIndex) {
        my $word = $origTokens[$idx];

        # A token with no letter and no "&" is junk.  In the first half
        # of the text, everything collected so far is discarded; later
        # on, the scan simply stops.
        unless ($word =~ m/[\p{IsUpper}\p{IsLower}&]/) {
            last if $idx >= $lastIndex/2;
            @kept = ();
            next;
        }

        # Drop suffixes and degrees that follow a comma-terminated token.
        next if $word =~ m/^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i
             && $kept[$#kept] =~ m/\,$/;

        # Drop Roman-numeral style suffixes such as "II" or "III,".
        next if $word =~ m/^[IVX][IVX]+\.?\,?$/;

        push @kept, $word;
    }
    return @kept;

}  # repairAndTokenizeAuthorText
257
+
258
+
259
##
# Tries to normalize an individual author name, given as a list of
# tokens, into the form "First Middle Last" without commas or periods.
# Returns "" for an empty token list.
##
sub normalizeAuthorName {
    my @authTokens = @_;
    return "" unless @authTokens;

    my $name = join " ", @authTokens;

    # "Last, First" becomes "First Last" (split at the last comma that
    # still has text after it).
    if (my ($beforeComma, $afterComma) = $name =~ m/(.+),\s*(.+)/) {
        $name = "$afterComma $beforeComma";
    }

    for ($name) {
        s/\.\-/-/g;        # "J.-P." -> "J-P."
        s/[\,\.]/ /g;      # remaining commas and periods become spaces
        s/ +/ /g;          # collapse runs of spaces
        s/^\s+//;          # trim leading whitespace
        s/\s+$//;          # trim trailing whitespace
    }

    # A word of two or more characters followed only by single
    # characters or hyphenated pairs ("Smith J K", "Sartre J-P") is
    # taken to be a surname followed by initials: move it to the end.
    if ($name =~ m/^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/) {
        my ($surname, @initials) = split /\s+/, $name;
        $name = join " ", @initials, $surname;
    }

    return $name;

}  # normalizeAuthorName
289
+
290
+
291
##
# Utility for creating an intermediate representation of multi-word
# name components, e.g., transforms "van der Wald" to "van_der_Wald".
# This helps keep things straight during normalization.  The
# underscores can be stripped out later.
#
# Each of the particles van, von, der, den, de, di, le, el (matched
# case-insensitively at a word boundary) has the single whitespace
# character that follows it replaced by an underscore.  Returns the
# modified copy of the text.
##
sub joinMultiWordNames {
    my $authorText = shift;

    # ${1} rather than \1: inside the replacement part of s///, \1 is
    # the discouraged sed-style spelling of the capture variable.
    $authorText =~ s/\b(van|von|der|den|de|di|le|el)\s/${1}_/sgi;
    return $authorText;

}  # joinMultiWordNames
303
+
304
+
305
##
# Normalizes a date field into just the year.  Looks for the first
# run of four digits and accepts it if it is no more than three years
# past the current year.
#
# Returns the year string, or nothing (undef in scalar context) when
# no acceptable year is found.
##
sub normalizeDate {
    my $dateText = shift;
    return unless defined $dateText;

    if ($dateText =~ m/(\d{4})/) {
        my $year = $1;
        # check to see whether this is a sane year setting
        my $currentYear = (localtime(time))[5] + 1900;
        return $year if $year <= $currentYear + 3;
    }

    # Explicit empty return: falling off the end of the sub would hand
    # back the false-but-defined value of the last condition evaluated,
    # which callers testing with defined() would mistake for a date.
    return;

}  # normalizeDate
322
+
323
+
324
##
# For fields that should be numeric only: returns the first run of
# digits found in the text.  If the text contains no digit at all,
# it is returned unchanged.
##
sub normalizeNumber {
    my ($numText) = @_;

    my ($firstDigits) = $numText =~ m/(\d+)/;
    return defined $firstDigits ? $firstDigits : $numText;

}  # normalizeNumber
337
+
338
+
339
##
# Normalizes page fields into the form "start--end".  A field holding
# a single number is reduced to that number.  Returns undef when the
# field contains no number, or when the first number of a range is
# not smaller than the second.
##
sub normalizePages {
    my ($pageText) = @_;

    # Two numbers separated by at least one non-digit: a page range.
    if (my ($firstPage, $lastPage) = $pageText =~ m/(\d+)[^\d]+?(\d+)/) {
        return undef if $firstPage >= $lastPage;
        return $firstPage . "--" . $lastPage;
    }

    # Otherwise settle for a lone page number, if there is one.
    if ($pageText =~ m/(\d+)/) {
        return $1;
    }

    return undef;

}  # normalizePages
357
+
358
+
359
##
# Returns a copy of the string with leading and trailing whitespace
# removed.
##
sub trim {
    my ($str) = @_;
    for ($str) {
        s/^\s+//;
        s/\s+$//;
    }
    return $str;
}
365
+
366
+
367
+ 1;
@@ -0,0 +1,333 @@
1
+ package ParsCit::PreProcess;
2
+ #
3
+ # Utilities for finding and normalizing citations within
4
+ # text files, including separating citation text from
5
+ # body text and segmenting citations.
6
+ #
7
+ # Isaac Councill, 7/19/07
8
+ #
9
+
10
+ use strict;
11
+ use utf8;
12
+ use ParsCit::Citation;
13
+
14
# Regular-expression source fragments, keyed by marker-type name, for
# the citation-marker styles recognised in a reference list.  The
# values are plain strings that get interpolated into larger patterns
# by guessMarkerType and splitCitationsByMarker.
my %markerTypes = (
    'SQUARE' => '\\[.+?\\]',        # text in square brackets, e.g. "[12]"
    'PAREN' => '\\(.+?\\)',         # text in parentheses, e.g. "(12)"
    'NAKEDNUM' => '\\d+',           # a bare run of digits
    'NAKEDNUMDOT' => '\\d+\\.',     # a run of digits followed by a period
);
20
+
21
+
22
##
# Looks for reference section markers in the supplied text and
# separates the citation text from the body text based on these
# indicators.  Every heading match overwrites the previous split, so
# when a marker shows up too early in the document a later one takes
# over (the last heading wins).  If the resulting reference section is
# still too long (at least 80% of the length of the body), an empty
# citation text string is returned.  A trailing section such as
# acknowledgements, tables, or an appendix is cut off the citation
# text.
#
# Takes a reference to the document text.  Returns, in every case,
# three scalar references: the citation text, the normalized body
# text, and the original body text.  When no heading is found the
# citation text is empty and the whole document is the body.
##
sub findCitationText {
    my ($rText) = @_;
    my $text = $$rText;
    $text = '' unless defined $text;

    # Until a heading is seen, everything is body text.
    my $bodyText = $text;
    my $citeText = '';
    my $foundMarker = 0;

    while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*\n+/sg) {
        # The body keeps the heading itself; the citations start
        # right after it.
        my $headingEnd = pos $text;
        $bodyText = substr $text, 0, $headingEnd;
        $citeText = substr $text, $headingEnd;
        $foundMarker = 1;
    }

    if (!$foundMarker) {
        print STDERR "warning: no citation text found\n";
        return (normalizeCiteText(\$citeText),
                normalizeBodyText(\$bodyText),
                \$bodyText);
    }

    if (length($citeText) >= 0.8*length($bodyText)) {
        print STDERR "Citation text longer than article body: ignoring\n";
        $citeText = "";
        # normalizeBodyText already returns a reference; do not wrap
        # it in a second one.
        return (\$citeText, normalizeBodyText(\$bodyText), \$bodyText);
    }

    # Keep only what precedes the first trailing-section heading.  An
    # empty first field means the citation text itself starts with
    # such a heading, in which case the text is left as it is.
    my ($sciteText) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citeText);
    if (defined $sciteText && length($sciteText) > 0) {
        $citeText = $sciteText;
    }

    return (normalizeCiteText(\$citeText),
            normalizeBodyText(\$bodyText),
            \$bodyText);

} # findCitationText
60
+
61
+
62
##
# Removes lines that appear to be junk from the citation text: any
# line made up solely of whitespace and/or digits (including empty
# lines) is dropped.  Takes a reference to the citation text and
# returns a reference to the cleaned text.
##
sub normalizeCiteText {
    my ($rCiteText) = @_;

    my $cleaned = join "\n",
        grep { $_ !~ m/^[\s\d]*$/ }
        split "\n", $$rCiteText;
    return \$cleaned;

} # normalizeCiteText
80
+
81
+
82
##
# Removes lines that appear to be junk from the body text,
# de-hyphenates words where a hyphen occurs at the end of
# a line, and normalizes strings of blank spaces to only
# single blanks.
#
# Takes a reference to the body text and returns a reference to the
# normalized text.  Lines are joined with a single space, so non-empty
# output starts with one space.
##
sub normalizeBodyText {
    my ($rText) = @_;
    my $text = "";

    foreach my $line (split "\n", $$rText) {
        next if $line =~ m/^\s*$/;

        # A word character followed by a hyphen at the end of what has
        # been collected so far is taken to be a word broken across
        # lines: drop the hyphen and glue the next line on directly.
        if ($text =~ s/(\w)\-$/$1/) {
            $text .= $line;
        } else {
            $text .= " ".$line;
        }
    }

    # Collapse whitespace runs to one blank.  The replacement must be
    # a literal space: "\s" on the replacement side is not a character
    # class and produced the letter "s".
    $text =~ s/\s\s+/ /g;
    return \$text;

} # normalizeBodyText
106
+
107
+
108
##
# Entry point for citation segmentation.  First guesses which kind of
# citation marker the reference section uses; if a marker type is
# recognised the text is split on that marker, otherwise the
# heuristics for unmarked reference lists are applied.  Takes a
# reference to the citation text and returns a reference to a list of
# citation objects.
##
sub segmentCitations {
    my ($rCiteText) = @_;
    my $detectedType = guessMarkerType($rCiteText);

    my $rSegments = ($detectedType eq 'UNKNOWN')
        ? splitUnmarkedCitations($rCiteText)
        : splitCitationsByMarker($rCiteText, $detectedType);

    return $rSegments;

} # segmentCitations
129
+
130
+
131
##
# Segments citations that have explicit markers in the
# reference section.  Whenever a new line starts with an
# expression that matches what we'd expect of a marker,
# a new citation is started; any other line is appended to the
# citation in progress (removing a word-breaking hyphen at the end of
# the previous line).  Lines that precede the first marker are
# collected into a citation that carries no marker.
#
# Takes a reference to the citation text and a key of %markerTypes.
# Returns a reference to a list of citation objects.
##
sub splitCitationsByMarker {
    my ($rCiteText, $markerType) = @_;
    my @citations;
    my $currentCitation = ParsCit::Citation->new();
    my $currentCitationString;

    # The marker pattern does not change between lines, so build the
    # line-matching regex once.
    my $markerPattern = $markerTypes{$markerType};
    my $markedLine = qr/^\s*($markerPattern)\s*(.*)$/;

    # TODO: Might want to add a check that marker number is
    # increasing as we'd expect, if the marker is numeric.

    foreach my $line (split "\n", $$rCiteText) {
        if ($line =~ $markedLine) {
            my ($marker, $citeString) = ($1, $2);

            # Close the citation collected so far before opening the
            # one this marker announces.
            if (defined $currentCitationString) {
                $currentCitation->setString($currentCitationString);
                push @citations, $currentCitation;
                $currentCitationString = undef;
            }
            $currentCitation = ParsCit::Citation->new();
            $currentCitation->setMarkerType($markerType);
            $currentCitation->setMarker($marker);
            $currentCitationString = $citeString;
        } elsif (defined $currentCitationString
                 && $currentCitationString =~ m/\w\-$/) {
            # merge words when lines are hyphenated
            $currentCitationString =~ s/\-$//;
            $currentCitationString .= $line;
        } else {
            $currentCitationString .= " ".$line;
        }
    }

    if (defined $currentCitationString) {
        $currentCitation->setString($currentCitationString);
        push @citations, $currentCitation;
    }
    return \@citations;

} # splitCitationsByMarker
176
+
177
+
178
##
# Uses several heuristics to decide where individual citations
# begin and end based on the length of previous lines, strings
# that look like author lists, and punctuation.  Only lines that
# appear to contain a year trigger a search; the search walks
# backwards from that line to locate the line on which the citation
# starts.
#
# Takes a reference to the citation text.  Returns a reference to a
# list of citation objects, one per detected start line, each
# covering the lines up to the next start (the last one runs to the
# end of the text).
##
sub splitUnmarkedCitations {
    my ($rCiteText) = @_;
    my @content = split "\n", $$rCiteText;
    my @citeStarts = ();
    my $citeStart = 0;
    my @citations = ();

    LINE:
    for my $i (0 .. $#content) {
        next LINE
            unless $content[$i] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s;

        CANDIDATE:
        for (my $k=$i; $k > $citeStart; $k--) {
            # No /g here: in a boolean test it would leave pos() set
            # on the line and make a later re-test of the same line
            # depend on this one.
            next CANDIDATE unless $content[$k] =~ m/\s*[\p{IsUpper}]/;

            # If length of previous line is extremely small,
            # start a new citation here.
            if (length($content[$k-1]) < 2) {
                $citeStart = $k;
                last CANDIDATE;
            }

            # Start looking backwards for lines that could
            # be author lists - these usually start the
            # citation, have several separation characters (,;),
            # and shouldn't contain any numbers.
            my $beginningAuthorLine = -1;
            AUTHOR:
            for (my $j=$k-1; $j>$citeStart; $j--) {
                last AUTHOR if $content[$j] =~ m/\d/;

                # tr/// counts the separators without touching the
                # line or the global $_.
                my $nSep = ($content[$j] =~ tr/,;//);
                last AUTHOR if $nSep < 3;

                if (($content[$j-1] =~ m/\.\s*$/) || $j==0) {
                    $beginningAuthorLine = $j;
                }
            }
            if ($beginningAuthorLine >= 0) {
                $citeStart = $beginningAuthorLine;
                last CANDIDATE;
            }

            # Now that the backwards author search failed
            # to find any extra lines, start a new citation
            # here if the previous line ends with a ".".
            if ($content[$k-1] =~ m/\.\s*$/) {
                $citeStart = $k;
                last CANDIDATE;
            }
        }

        # Record each start line once.  Start values never decrease,
        # so anything not beyond the last recorded start (including a
        # repeated 0) is a duplicate and would yield an empty citation.
        push @citeStarts, $citeStart
            unless (@citeStarts && $citeStart <= $citeStarts[-1]);
    }

    # Visit every recorded start, the last one included; its citation
    # extends to the final line of the text.
    for my $k (0 .. $#citeStarts) {
        my $firstLine = $citeStarts[$k];
        my $lastLine = ($k==$#citeStarts) ? $#content : ($citeStarts[$k+1]-1);
        my $citeString =
            mergeLines(join "\n", @content[$firstLine .. $lastLine]);
        my $citation = ParsCit::Citation->new();
        $citation->setString($citeString);
        push @citations, $citation;
    }
    return \@citations;

} # splitUnmarkedCitations
253
+
254
+
255
##
# Joins the lines of a multi-line string into a single line.  Every
# line is trimmed first.  When the text collected so far ends in a
# word character plus hyphen, the hyphen is removed and the next line
# is attached directly; otherwise lines are separated by one space.
# The result is trimmed before it is returned.
##
sub mergeLines {
    my $rawText = shift;
    my $joined = "";

    for my $piece (map { trim($_) } split "\n", $rawText) {
        my $hyphenBreak = ($joined =~ m/\w\-$/);
        $joined =~ s/\-$// if $hyphenBreak;
        $joined .= ($hyphenBreak ? "" : " ") . $piece;
    }
    return trim($joined);

} # mergeLines
275
+
276
+
277
##
# Uses a list of regular expressions that match common citation
# markers to count the number of matches for each type in the
# text.  If a sufficient number of matches to a particular type
# are found, we can be reasonably sure of the type.
#
# A type is accepted when it is the most frequently observed one, was
# observed at least once, and its count reaches one sixth of the
# number of newlines in the text.  Equal counts are resolved by type
# name so that the answer does not depend on hash ordering.
#
# Takes a reference to the citation text.  Returns a key of
# %markerTypes, or 'UNKNOWN'.
##
sub guessMarkerType {
    my ($rCiteText) = @_;
    my $markerType = 'UNKNOWN';
    my %markerObservations = map { $_ => 0 } keys %markerTypes;

    # A newline is put in front so that a marker on the very first
    # line can be matched by the "\n..." patterns below.
    my $citeText = "\n".$$rCiteText;

    # Newlines in the original text.  tr/// only counts here; it
    # neither alters the string nor touches the global $_.
    my $nLines = ($citeText =~ tr/\n//) - 1;

    while ($citeText =~ m/\n\s*($markerTypes{'SQUARE'}([^\n]){10})/sg) {
        $markerObservations{'SQUARE'}++;
    }

    while ($citeText =~ m/\n\s*($markerTypes{'PAREN'}([^\n]){10})/sg) {
        $markerObservations{'PAREN'}++;
    }

    # No /x flag is in effect: the blanks in this pattern are literal.
    while ($citeText =~ m/\n\s*($markerTypes{'NAKEDNUM'} [^\n]{10}) /sg) {
        $markerObservations{'NAKEDNUM'}++;
    }

    while ($citeText =~ m/\n\s*$markerTypes{'NAKEDNUMDOT'}([^\n]){10}/sg) {
        $markerObservations{'NAKEDNUMDOT'}++;
    }

    my @sortedObservations =
        sort { $markerObservations{$b} <=> $markerObservations{$a}
               or $a cmp $b }
        keys %markerObservations;

    my $minMarkers = $nLines / 6;
    my $bestType = $sortedObservations[0];

    # Insist on at least one observation: for text without newlines
    # the threshold is 0 and a type that was never seen would
    # otherwise qualify.
    if (defined $bestType
        && $markerObservations{$bestType} > 0
        && $markerObservations{$bestType} >= $minMarkers) {
        $markerType = $bestType;
    }
    return $markerType;

} # guessMarkerType
322
+
323
+
324
# Hands back a copy of the given text without leading or trailing
# whitespace.  The argument itself is left as it was.
sub trim {
    (my $stripped = shift) =~ s/^\s+//;
    $stripped =~ s/\s+$//;
    return $stripped;

} # trim
331
+
332
+
333
+ 1;