biblicit 1.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,320 @@
1
+ a
2
+ about
3
+ above
4
+ after
5
+ again
6
+ ago
7
+ all
8
+ almost
9
+ along
10
+ already
11
+ also
12
+ although
13
+ always
14
+ am
15
+ among
16
+ an
17
+ and
18
+ another
19
+ any
20
+ anybody
21
+ anything
22
+ anywhere
23
+ are
24
+ aren't
25
+ around
26
+ as
27
+ at
28
+ back
29
+ else
30
+ be
31
+ been
32
+ before
33
+ being
34
+ below
35
+ beneath
36
+ beside
37
+ between
38
+ beyond
39
+ billion
40
+ billionth
41
+ both
42
+ each
43
+ but
44
+ by
45
+ can
46
+ can't
47
+ could
48
+ couldn't
49
+ did
50
+ didn't
51
+ do
52
+ does
53
+ doesn't
54
+ doing
55
+ done
56
+ don't
57
+ down
58
+ during
59
+ eight
60
+ eighteen
61
+ eighteenth
62
+ eighth
63
+ eightieth
64
+ eighty
65
+ either
66
+ eleven
67
+ eleventh
68
+ enough
69
+ even
70
+ ever
71
+ every
72
+ everybody
73
+ everyone
74
+ everything
75
+ everywhere
76
+ except
77
+ far
78
+ few
79
+ fewer
80
+ fifteen
81
+ fifteenth
82
+ fifth
83
+ fiftieth
84
+ fifty
85
+ first
86
+ five
87
+ for
88
+ fortieth
89
+ forty
90
+ four
91
+ fourteen
92
+ fourteenth
93
+ fourth
94
+ hundred
95
+ from
96
+ get
97
+ gets
98
+ getting
99
+ got
100
+ had
101
+ hadn't
102
+ has
103
+ hasn't
104
+ have
105
+ haven't
106
+ having
107
+ he
108
+ he'd
109
+ he'll
110
+ hence
111
+ her
112
+ here
113
+ hers
114
+ herself
115
+ he's
116
+ him
117
+ himself
118
+ his
119
+ hither
120
+ how
121
+ however
122
+ near
123
+ hundredth
124
+ i
125
+ i'd
126
+ if
127
+ i'll
128
+ i'm
129
+ in
130
+ into
131
+ is
132
+ i've
133
+ isn't
134
+ it
135
+ its
136
+ it's
137
+ itself
138
+ last
139
+ less
140
+ many
141
+ me
142
+ may
143
+ might
144
+ million
145
+ millionth
146
+ mine
147
+ more
148
+ most
149
+ much
150
+ must
151
+ mustn't
152
+ my
153
+ myself
154
+ near
155
+ nearby
156
+ nearly
157
+ neither
158
+ never
159
+ next
160
+ nine
161
+ nineteen
162
+ nineteenth
163
+ ninetieth
164
+ ninety
165
+ ninth
166
+ no
167
+ nobody
168
+ none
169
+ noone
170
+ nothing
171
+ nor
172
+ not
173
+ now
174
+ nowhere
175
+ of
176
+ off
177
+ often
178
+ on
179
+ or
180
+ once
181
+ one
182
+ only
183
+ other
184
+ others
185
+ ought
186
+ oughtn't
187
+ our
188
+ ours
189
+ ourselves
190
+ out
191
+ over
192
+ quite
193
+ rather
194
+ round
195
+ second
196
+ seven
197
+ seventeen
198
+ seventeenth
199
+ seventh
200
+ seventieth
201
+ seventy
202
+ shall
203
+ shan't
204
+ she'd
205
+ she
206
+ she'll
207
+ she's
208
+ should
209
+ shouldn't
210
+ since
211
+ six
212
+ sixteen
213
+ sixteenth
214
+ sixth
215
+ sixtieth
216
+ sixty
217
+ so
218
+ some
219
+ somebody
220
+ someone
221
+ something
222
+ sometimes
223
+ somewhere
224
+ soon
225
+ still
226
+ such
227
+ ten
228
+ tenth
229
+ than
230
+ that
231
+ that
232
+ that's
233
+ the
234
+ their
235
+ theirs
236
+ them
237
+ themselves
238
+ these
239
+ then
240
+ thence
241
+ there
242
+ therefore
243
+ they
244
+ they'd
245
+ they'll
246
+ they're
247
+ third
248
+ thirteen
249
+ thirteenth
250
+ thirtieth
251
+ thirty
252
+ this
253
+ thither
254
+ those
255
+ though
256
+ thousand
257
+ thousandth
258
+ three
259
+ thrice
260
+ through
261
+ thus
262
+ till
263
+ to
264
+ towards
265
+ today
266
+ tomorrow
267
+ too
268
+ twelfth
269
+ twelve
270
+ twentieth
271
+ twenty
272
+ twice
273
+ two
274
+ under
275
+ underneath
276
+ unless
277
+ until
278
+ up
279
+ us
280
+ very
281
+ when
282
+ was
283
+ wasn't
284
+ we
285
+ we'd
286
+ we'll
287
+ were
288
+ we're
289
+ weren't
290
+ we've
291
+ what
292
+ whence
293
+ where
294
+ whereas
295
+ which
296
+ while
297
+ whither
298
+ who
299
+ whom
300
+ whose
301
+ why
302
+ will
303
+ with
304
+ within
305
+ without
306
+ won't
307
+ would
308
+ wouldn't
309
+ yes
310
+ yesterday
311
+ yet
312
+ you
313
+ your
314
+ you'd
315
+ you'll
316
+ you're
317
+ yours
318
+ yourself
319
+ yourselves
320
+ you've
@@ -0,0 +1,42 @@
1
+ 1token=1
2
+ 2token=1
3
+ 3token=1
4
+ 4token=1
5
+ parscit=1
6
+ parscit_char=1
7
+ tokenCapital=1
8
+ tokenNumber=1
9
+ tokenName=1
10
+ tokenPunct=1
11
+ tokenKeyword=0
12
+
13
+ lineNum=1
14
+ linePunct=1
15
+ lineLength=1
16
+ lineCapital=0
17
+ linePos=1
18
+
19
+ 1gram=0
20
+ 2gram=0
21
+ 3gram=0
22
+ 4gram=0
23
+
24
+ xmlLoc=0
25
+ xmlAlign=0
26
+
27
+ xmlBold=0
28
+ xmlItalic=0
29
+ xmlFontSize=0
30
+
31
+ xmlPic=0
32
+ xmlTable=0
33
+ xmlBullet=0
34
+
35
+ bi_xmlA=0
36
+ bi_xmlF=0
37
+ bi_xmlSF=0
38
+ bi_xmlSFBI=0
39
+ bi_xmlSFBIA=0
40
+ bi_xmlPara=0
41
+
42
+ xmlSpace=0
@@ -0,0 +1,42 @@
1
+ 1token=1
2
+ 2token=1
3
+ 3token=1
4
+ 4token=1
5
+ parscit=1
6
+ parscit_char=1
7
+ tokenCapital=1
8
+ tokenNumber=1
9
+ tokenName=1
10
+ tokenPunct=1
11
+ tokenKeyword=0
12
+
13
+ lineNum=1
14
+ linePunct=1
15
+ lineLength=1
16
+ lineCapital=0
17
+ linePos=1
18
+
19
+ 1gram=0
20
+ 2gram=0
21
+ 3gram=0
22
+ 4gram=0
23
+
24
+ xmlLoc=1
25
+ xmlAlign=0
26
+
27
+ xmlBold=1
28
+ xmlItalic=1
29
+ xmlFontSize=1
30
+
31
+ xmlPic=1
32
+ xmlTable=1
33
+ xmlBullet=1
34
+
35
+ bi_xmlA=0
36
+ bi_xmlF=0
37
+ bi_xmlSF=0
38
+ bi_xmlSFBI=0
39
+ bi_xmlSFBIA=1
40
+ bi_xmlPara=1
41
+
42
+ xmlSpace=0
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+
3
+ if [ -z "$2" ]; then
4
+ echo "Usage: ./convert_to_text.sh in.pdf out.txt";
5
+ exit 1;
6
+ fi
7
+
8
+ INPUT=$1
9
+ OUTPUT=$2
10
+ TYPE=`file -b --mime-type "$INPUT"`
11
+
12
+ if [[ "$TYPE" == "text/plain" ]]; then
13
+ cp "$INPUT" "$OUTPUT";
14
+ elif [[ "$TYPE" == "application/pdf" && -n `which pdftotext` ]]; then
15
+ pdftotext -raw -enc UTF-8 "$INPUT" "$OUTPUT";
16
+ elif [[ "$TYPE" == "application/postscript" && -n `which ps2ascii` ]]; then
17
+ ps2ascii "$INPUT" > "$OUTPUT";
18
+ elif [[ -n `which abiword` ]]; then
19
+ abiword -t txt "$INPUT" -o "$OUTPUT";
20
+ fi
@@ -0,0 +1,121 @@
1
+ # encoding: UTF-8
2
+
3
+ module Biblicit
4
+
5
+ describe Extractor do
6
+
7
+ unless ENV['LOCAL']
8
+ it "handles 'Multi-scale collaborative...' using remote cb2bib" do
9
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/pdf/ICINCO_2010.pdf", tools: [:cb2bib], remote: true)
10
+
11
+ cb2bib = result[:cb2bib]
12
+ cb2bib[:title].should == "Multiscale collaborative searching through swarming"
13
+ cb2bib[:authors].should == ["W. Liu", "M. B. Short", "Y. E. Taima", "A. L. Bertozzi"]
14
+ cb2bib[:year].should == 2010
15
+ end
16
+ end
17
+
18
+ it "handles 'Multi-scale collaborative...' " do
19
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/pdf/ICINCO_2010.pdf")
20
+
21
+ citeseer = result[:citeseer]
22
+ citeseer[:valid].should be_true
23
+ citeseer[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
24
+ citeseer[:authors].should == ["Wangyi Liu", "Yasser E. Taima", "Martin B. Short", "Andrea L. Bertozzi"]
25
+
26
+ parshed = result[:parshed]
27
+ parshed[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
28
+ parshed[:authors].should == ["Wangyi Liu", "Yasser E Taima", "Martin B Short", "Andrea L Bertozzi"]
29
+ end
30
+
31
+ it "handles 'Multi-scale collaborative...' " do
32
+ content = IO.read("#{FIXTURES_DIR}/pdf/ICINCO_2010.pdf")
33
+ result = Extractor.extract(content: content)
34
+
35
+ citeseer = result[:citeseer]
36
+ citeseer[:valid].should be_true
37
+ citeseer[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
38
+ citeseer[:authors].should == ["Wangyi Liu", "Yasser E. Taima", "Martin B. Short", "Andrea L. Bertozzi"]
39
+
40
+ parshed = result[:parshed]
41
+ parshed[:title].should == 'MULTI-SCALE COLLABORATIVE SEARCHING THROUGH SWARMING'
42
+ parshed[:authors].should == ["Wangyi Liu", "Yasser E Taima", "Martin B Short", "Andrea L Bertozzi"]
43
+ end
44
+
45
+ it "handles 'Oligopoly, Disclosure...'" do
46
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/pdf/Bagnoli Watts TAR 2010.pdf")
47
+
48
+ citeseer = result[:citeseer]
49
+ citeseer[:valid].should be_true
50
+ citeseer[:title].should == 'Oligopoly, Disclosure, and Earnings Management'
51
+ citeseer[:authors].should == ["Mark Bagnoli", "Susan G. Watts"]
52
+
53
+ parshed = result[:parshed]
54
+ parshed[:title].should == 'Oligopoly, Disclosure, and Earnings Management'
55
+ parshed[:authors].should == ["Mark Bagnoli Susan G Watts"]
56
+ end
57
+
58
+ it "handles Google paper" do
59
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/pdf/10.1.1.109.4049.pdf")
60
+
61
+ citeseer = result[:citeseer]
62
+ citeseer[:valid].should be_true
63
+ citeseer[:title].should == 'The Anatomy of a Large-Scale Hypertextual Web Search Engine'
64
+ citeseer[:authors].should == ['Sergey Brin']
65
+
66
+ parshed = result[:parshed]
67
+ parshed[:title].should == 'The Anatomy of a Large-Scale Hypertextual Web Search Engine'
68
+ parshed[:authors].should == ['Sergey Brin', 'Lawrence Page']
69
+ end
70
+
71
+ it "handles docx file" do
72
+ pending "Fails because Abiword can't be installed on current versions of OS X"
73
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/Review_of_Michael_Tyes_Consciousness_Revisited.docx")
74
+ end
75
+
76
+ it "handles ps file" do
77
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/critical-infrastructures.ps")
78
+
79
+ citeseer = result[:citeseer]
80
+ citeseer[:valid].should be_true
81
+ citeseer[:title].should == "Critical Infrastructures You Can Trust: Where Telecommunications Fits"
82
+ citeseer[:authors].should == ["Productname Gpl Ghostscript", "Fred B. Schneider", "Steven M. Bellovin", "Alan S. Inouye"]
83
+
84
+ parshed = result[:parshed]
85
+ parshed[:title].should == "Critical Infrastructures You Can Trust: Where Telecommunications Fits"
86
+ parshed[:authors].should == ["Fred B Schneider", "Cornell University Steven M Bellovin", "AT&T Labs-Research Alan S Inouye", "Computer Science", "Telecommunications Board"] # not ideal, but ok to match against
87
+ end
88
+
89
+ it "handles ParsCit sample1" do
90
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/txt/sample1.txt")
91
+
92
+ result[:citeseer][:title].should == "A Calculus of Program Transformations and Its Applications"
93
+ result[:citeseer][:authors].should == ["Rahma Ben Ayed", "Jules Desharnais", "Ali Mili"]
94
+
95
+ result[:parshed][:title].should == "A Calculus of Program Transformations and Its Applications"
96
+ result[:parshed][:authors].should == ["Rahma Ben Ayed", "Jules Desharnais", "Marc Frappier", "Ali Mili"]
97
+ end
98
+
99
+ it "handles ParsCit sample2" do
100
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/txt/sample2.txt")
101
+
102
+ result[:citeseer][:title].should == "Explanation-Based Learning of Indirect Speech Act Interpretation Rules"
103
+ result[:citeseer][:authors].should == ["David Schulenburg", "Michael J. Pazzani"]
104
+
105
+ result[:parshed][:title].should == "Explanation-Based Learning of Indirect Speech Act Interpretation Rules"
106
+ result[:parshed][:authors].should == ["David Schulenburg Michael J Pazzani"]
107
+ end
108
+
109
+ it "handles ParsCit sample E06" do
110
+ result = Extractor.extract(file: "#{FIXTURES_DIR}/txt/E06-1050.txt")
111
+
112
+ result[:citeseer][:title].should == "A Probabilistic Answer Type Model"
113
+ result[:citeseer][:authors].should == ["Christopher Pinchak", "Dekang Lin"]
114
+
115
+ result[:parshed][:title].should == "A Probabilistic Answer Type Model"
116
+ result[:parshed][:authors].should == ["Christopher Pinchak", "Dekang Lin"]
117
+ end
118
+
119
+ end
120
+
121
+ end