biblicit 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (246) hide show
  1. data/biblicit.gemspec +0 -1
  2. data/lib/biblicit/extractor.rb +2 -7
  3. data/lib/biblicit/parscit.rb +18 -6
  4. data/lib/biblicit/version.rb +1 -1
  5. data/parscit/bin/citeExtract.pl +16 -4
  6. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  7. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/Function.pm +0 -0
  8. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/LoadInformation.pm +0 -0
  9. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/MultiClassChunking.pm +0 -0
  10. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  11. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/Parser.pm +21 -0
  12. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/API/ParserMethods.pm +0 -0
  13. data/{svm-header-parse/HeaderParseService → parscit}/lib/HeaderParse/Config/API_Config.pm +11 -10
  14. data/{svm-header-parse/HeaderParseService → parscit/lib/HeaderParse}/README.TXT +0 -0
  15. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/50states +0 -0
  16. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AddrTopWords.txt +0 -0
  17. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AffiTopWords.txt +0 -0
  18. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/AffiTopWordsAll.txt +0 -0
  19. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/ChineseSurNames.txt +0 -0
  20. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/Csurnames.bin +0 -0
  21. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/Csurnames_spec.bin +0 -0
  22. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/DomainSuffixes.txt +0 -0
  23. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/LabeledHeader +0 -0
  24. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/README +0 -0
  25. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/TrainMulClassLines +0 -0
  26. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/TrainMulClassLines1 +0 -0
  27. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/abstract.txt +0 -0
  28. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/abstractTopWords +0 -0
  29. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/addr.txt +0 -0
  30. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/affi.txt +0 -0
  31. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/affis.bin +0 -0
  32. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/all_namewords_spec.bin +0 -0
  33. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/allnamewords.bin +0 -0
  34. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cities_US.txt +0 -0
  35. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cities_world.txt +0 -0
  36. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/city.txt +0 -0
  37. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/cityname.txt +0 -0
  38. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/country_abbr.txt +0 -0
  39. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/countryname.txt +0 -0
  40. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/dateTopWords +0 -0
  41. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/degree.txt +0 -0
  42. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/email.txt +0 -0
  43. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/excludeWords.txt +0 -0
  44. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/female-names +0 -0
  45. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstNames.txt +0 -0
  46. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstnames.bin +0 -0
  47. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/firstnames_spec.bin +0 -0
  48. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/intro.txt +0 -0
  49. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/keyword.txt +0 -0
  50. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/keywordTopWords +0 -0
  51. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/male-names +0 -0
  52. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/middleNames.txt +0 -0
  53. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/month.txt +0 -0
  54. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul +0 -0
  55. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.label +0 -0
  56. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.label.old +0 -0
  57. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mul.processed +0 -0
  58. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mulAuthor +0 -0
  59. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/mulClassStat +0 -0
  60. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/nickname.txt +0 -0
  61. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/nicknames.bin +0 -0
  62. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/note.txt +0 -0
  63. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/page.txt +0 -0
  64. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/phone.txt +0 -0
  65. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/postcode.txt +0 -0
  66. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/pubnum.txt +0 -0
  67. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/statename.bin +0 -0
  68. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/statename.txt +0 -0
  69. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/states_and_abbreviations.txt +0 -0
  70. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/stopwords +0 -0
  71. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/stopwords.bin +0 -0
  72. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surNames.txt +0 -0
  73. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surnames.bin +0 -0
  74. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/surnames_spec.bin +0 -0
  75. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list.txt +0 -0
  76. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/A.html +0 -0
  77. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/B.html +0 -0
  78. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/C.html +0 -0
  79. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/D.html +0 -0
  80. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/E.html +0 -0
  81. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/F.html +0 -0
  82. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/G.html +0 -0
  83. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/H.html +0 -0
  84. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/I.html +0 -0
  85. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/J.html +0 -0
  86. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/K.html +0 -0
  87. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/L.html +0 -0
  88. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/M.html +0 -0
  89. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/N.html +0 -0
  90. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/O.html +0 -0
  91. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/P.html +0 -0
  92. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Q.html +0 -0
  93. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/R.html +0 -0
  94. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/S.html +0 -0
  95. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/T.html +0 -0
  96. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/U.html +0 -0
  97. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/V.html +0 -0
  98. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/W.html +0 -0
  99. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/WCSelect.gif +0 -0
  100. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/X.html +0 -0
  101. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Y.html +0 -0
  102. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/Z.html +0 -0
  103. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ae.html +0 -0
  104. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/am.html +0 -0
  105. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ar.html +0 -0
  106. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/at.html +0 -0
  107. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/au.html +0 -0
  108. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bd.html +0 -0
  109. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/be.html +0 -0
  110. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bg.html +0 -0
  111. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bh.html +0 -0
  112. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/blueribbon.gif +0 -0
  113. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bm.html +0 -0
  114. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/bn.html +0 -0
  115. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/br.html +0 -0
  116. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ca.html +0 -0
  117. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ch.html +0 -0
  118. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cl.html +0 -0
  119. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cn.html +0 -0
  120. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/co.html +0 -0
  121. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cr.html +0 -0
  122. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cy.html +0 -0
  123. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/cz.html +0 -0
  124. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/de.html +0 -0
  125. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/dean-mainlink.jpg +0 -0
  126. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/dk.html +0 -0
  127. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ec.html +0 -0
  128. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ee.html +0 -0
  129. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/eg.html +0 -0
  130. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/es.html +0 -0
  131. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/et.html +0 -0
  132. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/faq.html +0 -0
  133. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fi.html +0 -0
  134. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fj.html +0 -0
  135. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fo.html +0 -0
  136. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/fr.html +0 -0
  137. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/geog.html +0 -0
  138. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/gr.html +0 -0
  139. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/gu.html +0 -0
  140. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hk.html +0 -0
  141. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hr.html +0 -0
  142. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/hu.html +0 -0
  143. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/id.html +0 -0
  144. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ie.html +0 -0
  145. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/il.html +0 -0
  146. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/in.html +0 -0
  147. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/is.html +0 -0
  148. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/it.html +0 -0
  149. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jm.html +0 -0
  150. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jo.html +0 -0
  151. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/jp.html +0 -0
  152. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kaplan.gif +0 -0
  153. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kr.html +0 -0
  154. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/kw.html +0 -0
  155. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lb.html +0 -0
  156. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/linkbw2.gif +0 -0
  157. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lk.html +0 -0
  158. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lt.html +0 -0
  159. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lu.html +0 -0
  160. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/lv.html +0 -0
  161. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ma.html +0 -0
  162. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/maczynski.gif +0 -0
  163. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mirror.tar +0 -0
  164. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mk.html +0 -0
  165. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mo.html +0 -0
  166. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mseawdm.gif +0 -0
  167. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mt.html +0 -0
  168. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/mx.html +0 -0
  169. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/my.html +0 -0
  170. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ni.html +0 -0
  171. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/nl.html +0 -0
  172. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/no.html +0 -0
  173. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/nz.html +0 -0
  174. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pa.html +0 -0
  175. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pe.html +0 -0
  176. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ph.html +0 -0
  177. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pl.html +0 -0
  178. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pointcom.gif +0 -0
  179. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pr.html +0 -0
  180. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ps.html +0 -0
  181. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/pt.html +0 -0
  182. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/recognition.html +0 -0
  183. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/results.html +0 -0
  184. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ro.html +0 -0
  185. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ru.html +0 -0
  186. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sd.html +0 -0
  187. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/se.html +0 -0
  188. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sg.html +0 -0
  189. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/si.html +0 -0
  190. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/sk.html +0 -0
  191. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/th.html +0 -0
  192. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/tr.html +0 -0
  193. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/tw.html +0 -0
  194. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ua.html +0 -0
  195. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/uk.html +0 -0
  196. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/univ-full.html +0 -0
  197. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/univ.html +0 -0
  198. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/uy.html +0 -0
  199. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/ve.html +0 -0
  200. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/yu.html +0 -0
  201. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/za.html +0 -0
  202. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/university_list/zm.html +0 -0
  203. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/url.txt +0 -0
  204. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/webTopWords +0 -0
  205. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/database/words +0 -0
  206. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/10ContextModelfold1 +0 -0
  207. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/10Modelfold1 +0 -0
  208. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/11ContextModelfold1 +0 -0
  209. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/11Modelfold1 +0 -0
  210. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/12ContextModelfold1 +0 -0
  211. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/12Modelfold1 +0 -0
  212. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/13ContextModelfold1 +0 -0
  213. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/13Modelfold1 +0 -0
  214. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/14ContextModelfold1 +0 -0
  215. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/14Modelfold1 +0 -0
  216. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/15ContextModelfold1 +0 -0
  217. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/15Modelfold1 +0 -0
  218. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/1ContextModelfold1 +0 -0
  219. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/1Modelfold1 +0 -0
  220. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/2ContextModelfold1 +0 -0
  221. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/2Modelfold1 +0 -0
  222. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/3ContextModelfold1 +0 -0
  223. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/3Modelfold1 +0 -0
  224. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/4ContextModelfold1 +0 -0
  225. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/4Modelfold1 +0 -0
  226. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/5ContextModelfold1 +0 -0
  227. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/5Modelfold1 +0 -0
  228. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/6ContextModelfold1 +0 -0
  229. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/6Modelfold1 +0 -0
  230. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/7ContextModelfold1 +0 -0
  231. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/7Modelfold1 +0 -0
  232. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/8ContextModelfold1 +0 -0
  233. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/8Modelfold1 +0 -0
  234. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/9ContextModelfold1 +0 -0
  235. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/9Modelfold1 +0 -0
  236. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/NameSpaceModel +0 -0
  237. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/NameSpaceTrainF +0 -0
  238. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperBaseFeaDict +0 -0
  239. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperContextFeaDict +0 -0
  240. data/{svm-header-parse/HeaderParseService/resources → parscit/resources/headerParse}/models/WrapperSpaceAuthorFeaDict +0 -0
  241. data/sh/convert_to_text.sh +2 -1
  242. metadata +267 -282
  243. data/lib/biblicit/citeseer.rb +0 -42
  244. data/svm-header-parse/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  245. data/svm-header-parse/HeaderParseService/tmp/.gitignore +0 -4
  246. data/svm-header-parse/extract.pl +0 -75
@@ -1,42 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- require 'tmpdir'
4
- require 'shellwords'
5
- require 'nokogiri'
6
-
7
- module CiteSeer
8
-
9
- PERL_DIR = "#{File.dirname(__FILE__)}/../../svm-header-parse"
10
-
11
- def self.extract(in_file, opts={})
12
- ParseOperation.new(in_file).result
13
- end
14
-
15
- class ParseOperation
16
-
17
- attr_reader :result
18
-
19
- def initialize(in_file)
20
- Dir.mktmpdir do |out_dir|
21
- `#{PERL_DIR}/extract.pl #{in_file.path} #{out_dir}`
22
- output = IO.read("#{out_dir}/out.header")
23
- xml = Nokogiri::XML output
24
- @result = parse(xml)
25
- end
26
- end
27
-
28
- private
29
-
30
- def parse(xml)
31
- {
32
- title: xml.css('title').text,
33
- authors: xml.css('author > name').map { |n| n.text.strip }.reject(&:blank?).uniq,
34
- author_emails: xml.css('author > email').map { |n| n.text.strip }.reject(&:blank?).uniq,
35
- abstract: xml.css('abstract').text,
36
- valid: xml.css('validHeader').first.text == '1',
37
- }
38
- end
39
-
40
- end
41
-
42
- end
@@ -1,140 +0,0 @@
1
- #
2
- # Copyright 2007 Penn State University
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- package CSXUtil::SafeText;
14
- ##
15
- ## Methods for stripping bad (XML unsafe) characters
16
- ## from strings and performing basic HTML entity
17
- ## translations. Also contains a utility (stripArtifacts)
18
- ## for getting rid of crazy control characters and
19
- ## other things that probably aren't proper text.
20
- ##
21
- ## Isaac Councill, 12/06/06
22
- ##
23
- #######################################################
24
- ##
25
- use strict;
26
- use utf8;
27
- require Exporter;
28
-
29
- our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
30
-
31
- $VERSION = 1.00;
32
-
33
- @ISA = qw(Exporter);
34
- @EXPORT_OK = qw(@badChars %htmlSpecialChars
35
- %htmlCharEntities &stripBadChars
36
- &encodeHTMLSpecialChars
37
- &decodeHTMLSpecialChars
38
- &cleanXML &cleanAll &stripArtifacts);
39
-
40
-
41
- ##
42
- #######################################################
43
- ##
44
- ## Sharable encoding data.
45
- ##
46
-
47
- ## Hex codes for characters that should never be put into
48
- ## XML - or else parsers will barf.
49
- our @badChars = qw(\x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07
50
- \x08 \x0B \x0C \x0E \x0F \x10 \x11 \x12
51
- \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A
52
- \x1B \x1C \x1D \x1E \x1F \x7F);
53
-
54
- ## Subset of HTML characters that could be problematic
55
- ## for XML. This is not a complete list of HTML
56
- ## special characters, but more mappings can be added
57
- ## as needed.
58
- our %htmlSpecialCharEncodings = ("&" => "&amp;",
59
- ">" => "&gt;",
60
- "<" => "&lt;",
61
- "\"" => "&quot;"
62
- );
63
-
64
- ## The reverse map.
65
- our %htmlSpecialCharDecodings;
66
- foreach my $key (keys %htmlSpecialCharEncodings) {
67
- my $val = $htmlSpecialCharEncodings{$key};
68
- $htmlSpecialCharDecodings{$val} = $key;
69
- }
70
-
71
-
72
- ##
73
- #######################################################
74
- ##
75
- ## Subroutines
76
- ##
77
-
78
- ## Delete all occurences of bad characters in text,
79
- ## returns a new string that is clean.
80
- sub stripBadChars {
81
- my $rtext = shift;
82
- foreach my $char (@badChars) {
83
- $$rtext =~ s/$char//g;
84
- }
85
- }
86
-
87
-
88
- ## Encodes special characters into HTML equivalents
89
- ## and returns the encoded string.
90
- sub encodeHTMLSpecialChars {
91
- my $rtext = shift;
92
- foreach my $char (keys %htmlSpecialCharEncodings) {
93
- my $code = $htmlSpecialCharEncodings{$char};
94
- $$rtext =~ s/$char/$code/g;
95
- }
96
- }
97
-
98
-
99
- ## Decodes a HTML entities in the supplied string
100
- ## into non-HTML character equivalents and returns
101
- ## the decoded string.
102
- sub decodeHTMLSpecialChars {
103
- my $rtext = shift;
104
- foreach my $code (keys %htmlSpecialCharDecodings) {
105
- my $char = $htmlSpecialCharDecodings{$code};
106
- $$rtext =~ s/$code/$char/g;
107
- }
108
- }
109
-
110
-
111
- ## Strip out any characters that don't look like they
112
- ## belong in a proper, readable text string.
113
- ##
114
- sub stripArtifacts {
115
- my $rtext = shift;
116
- $$rtext =~ s/[^\p{IsAlnum}\p{IsPunct}\p{IsSpace}\p{IsS}]//g;
117
- }
118
-
119
-
120
- ## Convenience routine for executing both XML safety
121
- ## routines in a single call.
122
- ##
123
- sub cleanXML {
124
- my $rtext = shift;
125
- stripBadChars($rtext);
126
- encodeHTMLSpecialChars($rtext);
127
- }
128
-
129
-
130
- ## Clean for XML and also strip out strange characters.
131
- ##
132
- sub cleanAll {
133
- my $rtext = shift;
134
- stripBadChars($rtext);
135
- stripArtifacts($rtext);
136
- encodeHTMLSpecialChars($rtext);
137
- }
138
-
139
-
140
- 1;
@@ -1,4 +0,0 @@
1
- # Ignore everything in this directory
2
- *
3
- # Except this file
4
- !.gitignore
@@ -1,75 +0,0 @@
1
- #!/usr/bin/perl -CSD
2
- use strict;
3
- use FindBin;
4
-
5
- use lib "$FindBin::Bin/HeaderParseService/lib";
6
-
7
- use HeaderParse::API::Parser;
8
- use HeaderParse::Config::API_Config;
9
-
10
- my $argc = scalar(@ARGV);
11
-
12
- if ($argc != 2) {
13
- print "Usage: ./extract.pl path_to_input path_to_output\n";
14
- exit 1;
15
- }
16
-
17
- my $inputPath = $ARGV[0];
18
- my $outputPath = $ARGV[1];
19
-
20
- import($inputPath, $outputPath);
21
-
22
- exit;
23
-
24
- sub import {
25
- my ($filePath, $id) = @_;
26
-
27
- system("mkdir","-p","$id");
28
-
29
- my ($status, $msg) = prep($filePath, $id);
30
- if ($status == 0) {
31
- print STDERR "$id: $msg\n";
32
- }
33
- if ($status == 1) {
34
- print STDOUT "$id\n";
35
- }
36
- }
37
-
38
-
39
- sub prep {
40
- my ($textFile, $id) = @_;
41
-
42
- my ($ehstatus, $msg) = extractHeader($textFile, $id);
43
- if ($ehstatus <= 0) {
44
- return ($ehstatus, $msg);
45
- }
46
-
47
- return (1, "");
48
- }
49
-
50
- sub extractHeader {
51
- my ($textFile, $id) = @_;
52
-
53
- my $jobID;
54
- while($jobID = rand(time)) {
55
- unless(-f $offlineD."$jobID") {
56
- last;
57
- }
58
- }
59
-
60
- my ($status, $msg, $rXML) =
61
- HeaderParse::API::Parser::_parseHeader($textFile, $jobID);
62
-
63
- if ($status <= 0) {
64
- return ($status, $msg);
65
- }
66
-
67
- unless(open(HEAD, ">:utf8", "$outputPath/out.header")) {
68
- return (0, "Unable to open header file: $!");
69
- }
70
-
71
- print HEAD $$rXML;
72
- close HEAD;
73
- return (1);
74
-
75
- }