extcite 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +36 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +14 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +81 -0
- data/README.md +71 -0
- data/Rakefile +41 -0
- data/bin/extcite +17 -0
- data/extcite.gemspec +36 -0
- data/extra/fetch.rb +49 -0
- data/extra/fetch_method.rb +17 -0
- data/lib/extcite/methods_array.rb +8 -0
- data/lib/extcite/methods_string.rb +32 -0
- data/lib/extcite/utils.rb +47 -0
- data/lib/extcite/version.rb +3 -0
- data/lib/extcite.rb +224 -0
- metadata +246 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 59c11ddc6a3c2055e6d32941cf1e5227d759d8bb
|
4
|
+
data.tar.gz: 8bb4d062337f6caf5a272cca33900476ff29049c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8319b4a0eaacadc82b97d3780df98ae6669b3d62bb017f348dd32c30b09d433ae5b610afeae9d4566f2714b2681dd57526b826819872182c108dafe6a1b780b5
|
7
|
+
data.tar.gz: e26c399bf22cd9e498bf302ae2b484238aed579b6994a8f4e9d81c0da949b60b6b694681459c02af611d0c5664b5d37d2c1f89e071eea86e43c15f13487413f6
|
data/.gitignore
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/test/tmp/
|
9
|
+
/test/version_tmp/
|
10
|
+
/tmp/
|
11
|
+
|
12
|
+
## Specific to RubyMotion:
|
13
|
+
.dat*
|
14
|
+
.repl_history
|
15
|
+
build/
|
16
|
+
|
17
|
+
## Documentation cache and generated files:
|
18
|
+
/.yardoc/
|
19
|
+
/_yardoc/
|
20
|
+
/doc/
|
21
|
+
/rdoc/
|
22
|
+
|
23
|
+
## Environment normalisation:
|
24
|
+
/.bundle/
|
25
|
+
/lib/bundler/man/
|
26
|
+
|
27
|
+
# for a library or gem, you might want to ignore these files since the code is
|
28
|
+
# intended to run in multiple environments; otherwise, check them in:
|
29
|
+
#Gemfile.lock
|
30
|
+
.ruby-version
|
31
|
+
.ruby-gemset
|
32
|
+
|
33
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
34
|
+
.rvmrc
|
35
|
+
|
36
|
+
cache/
|
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
## 0.1.0 (2017-04-06)
|
2
|
+
|
3
|
+
* First version to Rubygems
|
4
|
+
|
5
|
+
## 0.0.9 (2016-06-17)
|
6
|
+
|
7
|
+
* battle tested more, fixed a number of bugs
|
8
|
+
* now works with arxiv papers
|
9
|
+
* now works with biorxiv, or at least should
|
10
|
+
* improved extraction of DOIs
|
11
|
+
|
12
|
+
## 0.1.0 (2016-06-07)
|
13
|
+
|
14
|
+
* just started, :)
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
extcite (0.1.0)
|
5
|
+
bibtex-ruby (~> 4.4)
|
6
|
+
faraday (~> 0.12.0.1)
|
7
|
+
faraday_middleware (~> 0.11.0.1)
|
8
|
+
oga (~> 2.2)
|
9
|
+
pdf-reader (~> 2.0)
|
10
|
+
serrano (~> 0.3.6)
|
11
|
+
thor (~> 0.19.4)
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: https://rubygems.org/
|
15
|
+
specs:
|
16
|
+
Ascii85 (1.0.2)
|
17
|
+
afm (0.2.2)
|
18
|
+
ansi (1.5.0)
|
19
|
+
ast (2.3.0)
|
20
|
+
bibtex-ruby (4.4.3)
|
21
|
+
latex-decode (~> 0.0)
|
22
|
+
codecov (0.1.10)
|
23
|
+
json
|
24
|
+
simplecov
|
25
|
+
url
|
26
|
+
docile (1.1.5)
|
27
|
+
faraday (0.12.0.1)
|
28
|
+
multipart-post (>= 1.2, < 3)
|
29
|
+
faraday_middleware (0.11.0.1)
|
30
|
+
faraday (>= 0.7.4, < 1.0)
|
31
|
+
hashery (2.1.2)
|
32
|
+
json (2.0.3)
|
33
|
+
latex-decode (0.2.2)
|
34
|
+
unicode (~> 0.4)
|
35
|
+
multi_json (1.12.1)
|
36
|
+
multipart-post (2.0.0)
|
37
|
+
oga (2.9)
|
38
|
+
ast
|
39
|
+
ruby-ll (~> 2.1)
|
40
|
+
pdf-reader (2.0.0)
|
41
|
+
Ascii85 (~> 1.0.0)
|
42
|
+
afm (~> 0.2.1)
|
43
|
+
hashery (~> 2.0)
|
44
|
+
ruby-rc4
|
45
|
+
ttfunk
|
46
|
+
power_assert (1.0.1)
|
47
|
+
rake (12.0.0)
|
48
|
+
ruby-ll (2.1.2)
|
49
|
+
ansi
|
50
|
+
ast
|
51
|
+
ruby-rc4 (0.1.5)
|
52
|
+
serrano (0.3.6)
|
53
|
+
faraday (~> 0.12.0.1)
|
54
|
+
faraday_middleware (~> 0.11.0.1)
|
55
|
+
multi_json (~> 1.12, >= 1.12.1)
|
56
|
+
thor (~> 0.19.4)
|
57
|
+
simplecov (0.14.1)
|
58
|
+
docile (~> 1.1.0)
|
59
|
+
json (>= 1.8, < 3)
|
60
|
+
simplecov-html (~> 0.10.0)
|
61
|
+
simplecov-html (0.10.0)
|
62
|
+
test-unit (3.2.3)
|
63
|
+
power_assert
|
64
|
+
thor (0.19.4)
|
65
|
+
ttfunk (1.5.0)
|
66
|
+
unicode (0.4.4.2)
|
67
|
+
url (0.3.2)
|
68
|
+
|
69
|
+
PLATFORMS
|
70
|
+
ruby
|
71
|
+
|
72
|
+
DEPENDENCIES
|
73
|
+
bundler (~> 1.14, >= 1.14.6)
|
74
|
+
codecov (~> 0.1.10)
|
75
|
+
extcite!
|
76
|
+
rake (~> 12.0, >= 12.0.0)
|
77
|
+
simplecov (~> 0.14.1)
|
78
|
+
test-unit (~> 3.2, >= 3.2.1)
|
79
|
+
|
80
|
+
BUNDLED WITH
|
81
|
+
1.14.6
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
extcite
|
2
|
+
=======
|
3
|
+
|
4
|
+
[![gem version](https://img.shields.io/gem/v/extcite.svg)](https://rubygems.org/gems/extcite)
|
5
|
+
[![Build Status](https://travis-ci.org/sckott/extcite.svg?branch=master)](https://travis-ci.org/sckott/extcite)
|
6
|
+
[![codecov.io](http://codecov.io/github/sckott/extcite/coverage.svg?branch=master)](http://codecov.io/github/sckott/extcite?branch=master)
|
7
|
+
|
8
|
+
__`extcite` gets DOIs and generates citations for your papers__
|
9
|
+
|
10
|
+
## Install
|
11
|
+
|
12
|
+
### Release version
|
13
|
+
|
14
|
+
```
|
15
|
+
gem install extcite
|
16
|
+
```
|
17
|
+
|
18
|
+
### Development version
|
19
|
+
|
20
|
+
```
|
21
|
+
git clone git@github.com:sckott/extcite.git
|
22
|
+
cd extcite
|
23
|
+
rake install
|
24
|
+
```
|
25
|
+
|
26
|
+
> if `rake install` fails, try `sudo rake install`. If that fails, open an issue with what `rake install --trace` gives you
|
27
|
+
|
28
|
+
## Examples
|
29
|
+
|
30
|
+
### Within Ruby
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
require 'extcite'
|
34
|
+
```
|
35
|
+
|
36
|
+
#### Search
|
37
|
+
|
38
|
+
A single paper
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
require 'net/http'
|
42
|
+
File.write("foo.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
|
43
|
+
Extcite.extract(path: 'foo.pdf')
|
44
|
+
```
|
45
|
+
|
46
|
+
The BibTeX citation is appended to the file given by the `file` parameter (`out.bib` by default).
|
47
|
+
|
48
|
+
Many papers at once
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
Dir.mkdir('bar')
|
52
|
+
File.write("bar/foo1.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/Chamberlain&Szocs2013F1000Research.pdf")))
|
53
|
+
File.write("bar/foo2.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
|
54
|
+
Extcite.extract(path: 'bar')
|
55
|
+
```
|
56
|
+
|
57
|
+
### On the CLI
|
58
|
+
|
59
|
+
All pdfs in the current directory:
|
60
|
+
|
61
|
+
```shell
|
62
|
+
extcite extract .
|
63
|
+
```
|
64
|
+
|
65
|
+
Single paper
|
66
|
+
|
67
|
+
```shell
|
68
|
+
extcite extract foo.pdf
|
69
|
+
```
|
70
|
+
|
71
|
+
[changelog]: https://github.com/sckott/extcite/blob/master/CHANGELOG.md
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
3
|
+
|
4
|
+
# Unit tests: runs every test/test-*.rb file with test/ on the load path.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/test-*.rb']
  t.verbose = true
end

desc "Run tests"
task :default => :test

desc "Build extcite docs"
task :docs do
  system "yardoc"
end

desc "bundle install"
task :bundle do
  system "bundle install"
end

desc "clean out builds"
task :clean do
  # Delete built gem archives (e.g. extcite-0.1.0.gem) without shelling out;
  # rm_f is a no-op when nothing matches, so the task succeeds on a clean tree.
  rm_f FileList['*[0-9].gem']
end

# NOTE(review): Extcite::VERSION is not required in this file; presumably it is
# loaded when bundler/gem_tasks evaluates extcite.gemspec - confirm.
desc "Build extcite"
task :build do
  system "gem build extcite.gemspec"
end

desc "Install extcite"
task :install => [:bundle, :build] do
  system "gem install extcite-#{Extcite::VERSION}.gem"
end

desc "Release to Rubygems"
task :release => :build do
  system "gem push extcite-#{Extcite::VERSION}.gem"
end
|
data/bin/extcite
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "thor"
|
4
|
+
|
5
|
+
# Thor command-line front end for the extcite gem.
class Dz < Thor
  include Thor::Actions
  require 'extcite'

  desc "extract STRING", "Get bib data from PDFs"
  # method_option :path => :string
  # Hands the given path (a PDF file or a folder) to Extcite.extract,
  # coercing the argument to a String first.
  def extract(tt)
    target = tt.to_s
    Extcite.extract(path: target)
  end
end
|
16
|
+
|
17
|
+
Dz.start(ARGV)
|
data/extcite.gemspec
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'extcite/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = 'extcite'
|
8
|
+
s.version = Extcite::VERSION
|
9
|
+
s.date = '2017-04-06'
|
10
|
+
s.summary = "Citations from PDFs"
|
11
|
+
s.description = "Gets DOIS and generates citations for your papers"
|
12
|
+
s.authors = "Scott Chamberlain"
|
13
|
+
s.email = 'myrmecocystus@gmail.com'
|
14
|
+
s.homepage = 'http://github.com/sckott/extcite'
|
15
|
+
s.licenses = 'MIT'
|
16
|
+
|
17
|
+
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
18
|
+
s.require_paths = ["lib"]
|
19
|
+
|
20
|
+
s.bindir = 'bin'
|
21
|
+
s.executables = ['extcite']
|
22
|
+
|
23
|
+
s.add_development_dependency 'bundler', '~> 1.14', '>= 1.14.6'
|
24
|
+
s.add_development_dependency 'rake', '~> 12.0', '>= 12.0.0'
|
25
|
+
s.add_development_dependency 'test-unit', '~> 3.2', '>= 3.2.1'
|
26
|
+
s.add_development_dependency 'simplecov', '~> 0.14.1'
|
27
|
+
s.add_development_dependency 'codecov', '~> 0.1.10'
|
28
|
+
|
29
|
+
s.add_runtime_dependency 'faraday', '~> 0.12.0.1'
|
30
|
+
s.add_runtime_dependency 'faraday_middleware', '~> 0.11.0.1'
|
31
|
+
s.add_runtime_dependency 'thor', '~> 0.19.4'
|
32
|
+
s.add_runtime_dependency 'oga', '~> 2.2'
|
33
|
+
s.add_runtime_dependency 'serrano', '~> 0.3.6'
|
34
|
+
s.add_runtime_dependency 'bibtex-ruby', '~> 4.4'
|
35
|
+
s.add_runtime_dependency 'pdf-reader', '~> 2.0'
|
36
|
+
end
|
data/extra/fetch.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'fileutils'

module Textminer
  # Downloads the full text behind a DOI, either as XML responses or as a PDF
  # saved to disk. Links are looked up with Textminer.links and fetched with
  # HTTParty.
  class Fetch #:nodoc:
    attr_accessor :doi, :type

    # @param doi [String] DOI passed on to Textminer.links
    # @param type [String] "xml" or "pdf"
    def initialize(doi, type)
      self.doi = doi
      self.type = type
    end

    # Fetch the full text.
    #
    # @return [Array, String, nil] for "xml" an array with one HTTParty
    #   response per link; for "pdf" the path of the file that was written;
    #   nil for any other type
    def fetchtext
      lk = pick_link(Textminer.links(doi))
      case type
      when "xml"
        Array(lk).map { |x| HTTParty.get(x) }
      when "pdf"
        serialize_pdf(lk, doi)
      end
    end

    private

    # Select the xml or pdf link(s) from the links object; prints a message
    # and yields nil for an unsupported type.
    def pick_link(x)
      case type
      when "xml" then x.xml
      when "pdf" then x.pdf
      else
        puts "type must be xml or pdf"
      end
    end

    # Download the PDF at url +x+ into ~/.textminer, naming the file after the
    # DOI +y+ with slashes replaced by underscores.
    #
    # The cache directory lives under the current user's home directory
    # (instead of one developer's machine-specific path) and is created on
    # demand so the write cannot fail on a missing folder.
    def serialize_pdf(x, y)
      dir = File.join(Dir.home, '.textminer')
      FileUtils.mkdir_p(dir)
      path = File.join(dir, y.gsub('/', '_') + ".pdf")
      File.open(path, "wb") do |f|
        f.write HTTParty.get(x).parsed_response
      end
      path
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
##
# Fetch the full text of an article by DOI
#
# Builds a Fetch object for the given DOI and type and returns the result of
# its +fetchtext+ call.
#
# @param doi [String] A DOI, digital object identifier (the example below also
#   passes an Array of DOIs - TODO confirm that Fetch supports it)
# @param type [String] One of two options to download: xml (default) or pdf
#
# @example
#     require 'textminer'
#     # fetch full text by DOI - xml by default
#     Textminer.fetch("10.3897/phytokeys.42.7604")
#     # many DOIs - xml output
#     res = Textminer.fetch(["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
#     # fetch full text - pdf
#     Textminer.fetch("10.3897/phytokeys.42.7604", "pdf")
def self.fetch(doi, type = 'xml')
  fetcher = Fetch.new(doi, type)
  fetcher.fetchtext
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'oga'
|
2
|
+
require 'bibtex'
|
3
|
+
|
4
|
+
# String methods
|
5
|
+
class String
  # Append this string, followed by a newline when it does not already end
  # with one, to +file+. The file is opened in append mode, so existing
  # content is kept.
  #
  # @param file [String] path of the file to append to
  def write_bib(file)
    File.open(file, 'a') { |handle| handle.puts(self) }
  end
end
|
12
|
+
|
13
|
+
require 'date'

class String
  # Build a BibTeX citation from an arXiv API (Atom XML) response held in
  # this string.
  #
  # The year is read from the entry's own <updated> element. A plain
  # '//updated' query also matches the feed-level <updated> element, which
  # comes first in the document and carries the time of the API query rather
  # than anything about the paper.
  #
  # @param id [String] arXiv identifier, stored in the entry's eprint field
  # @return [String] the bibliography rendered as text
  def make_bib_arxiv(id)
    # prep xml
    xml = Oga.parse_xml(self)
    # author = xml.xpath('//author//name')[0].text.downcase.gsub(/\s|\./, '_')
    updated = xml.xpath('//entry/updated')[0]

    fields = {
      :bibtex_type => :article,
      :url => xml.xpath('//entry/id').text,
      :author => xml.xpath('//author//name').map { |x| x.text }.join(' and '),
      :eprint => id,
      :title => xml.xpath('//entry//title').text
    }
    # leave the year out when the response has no entry instead of crashing
    fields[:year] = DateTime.strptime(updated.text).year unless updated.nil?

    # make bib citation
    bib = BibTeX::Bibliography.new
    bib << BibTeX::Entry.new(fields)
    bib.to_s
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "oga"
|
2
|
+
|
3
|
+
# Unwrap a one-element Array: returns its only element when +x+ is an Array
# of length 1, otherwise returns +x+ unchanged.
def singlearray2hash(x)
  x.length == 1 && x.class == Array ? x[0] : x
end
|
10
|
+
|
11
|
+
# List the regular (non-directory) entries directly inside directory +x+.
#
# @param x [String] path to a directory
# @return [Array<String>] each entry joined onto +x+
def dir_files(x)
  Dir.entries(x)
     .reject { |entry| File.directory?(File.join(x, entry)) } # also drops '.' and '..'
     .map { |entry| File.join(x, entry) }
end

# Normalise the +path+ argument into a list of existing file paths.
#
# A single directory is expanded to the PDF files it directly contains; any
# other input is taken as one or more file paths and used as given.
#
# @param x [String, Array<String>] a file path, a directory, or several file paths
# @return [Array<String>] file paths
# @raise [RuntimeError] "<path> not found" when a listed file does not exist
def make_paths(x)
  path = Array(x)
  if path.length == 1 && File.directory?(path[0])
    # keep only files with a .pdf extension; the pattern is anchored to the
    # end of the name and the dot is escaped so that names such as
    # "notes_pdf.txt" or "paper.pdf.bak" are not picked up
    path = dir_files(path[0]).select { |z| z.match(/\.pdf\z/i) }
  end

  # check that files exist
  path.each do |z|
    raise z + ' not found' unless File.exist?(z)
  end

  path
end
|
38
|
+
|
39
|
+
# Read a DOI from XML metadata text.
#
# Parses +x+ with Oga, takes the first dc:identifier attribute found on the
# rdf:Description nodes and strips the first "doi:" from its text.
#
# @param x [String] XML metadata
# @return [String, nil] the identifier, or nil when the lookup raises
def pdf_doi(x)
  doc = Oga.parse_xml(x)
  begin
    identifier = doc.xpath('//rdf:Description').attr('dc:identifier')[0]
    identifier.text.sub(/doi:/, '')
  rescue
    nil
  end
end
|
data/lib/extcite.rb
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
require "extcite/utils"
|
2
|
+
require "extcite/methods_array"
|
3
|
+
require "extcite/methods_string"
|
4
|
+
require "extcite/version"
|
5
|
+
|
6
|
+
|
7
|
+
require 'serrano'
|
8
|
+
require 'pdf-reader'
|
9
|
+
require 'faraday'
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
module Extcite
  # XPath queries for XMP metadata nodes that may hold a DOI, in the order
  # they are tried after the dc:identifier attribute.
  DOI_NODE_XPATHS = [
    '//rdf:Description//prism:doi',
    '//rdf:Description//pdf:WPS-ARTICLEDOI',
    '//rdf:Description//pdfx:WPS-ARTICLEDOI'
  ].freeze

  # DOI matcher used on extracted text
  DOI_PATTERN = %r{(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\S)+)}.freeze

  # arXiv identifier matcher used on extracted text
  ARXIV_PATTERN = /arxiv:[0-9]+\.[0-9A-Za-z]+/i.freeze

  ##
  # Extract DOIs from one or more PDFs and look up their citations
  #
  # For each PDF the DOI is looked for in the PDF metadata first and, if not
  # found there, in the text of the PDF. arXiv identifiers are resolved with
  # the arXiv API, everything else with Crossref content negotiation.
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  # @param file [String] File name to write data to - or nil to return the
  #   citations instead
  # @param output [String] Type of output. only bibtex for now (currently
  #   not used by the method body)
  #
  # @return [Array] when +file+ is nil, the citations found for all PDFs;
  #   otherwise the list of PDF paths that were processed.
  #   When writing to a file, `extract` appends to the end of the file so
  #   you can build up your bibtex file with your citations
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract doi from the pdf
  #     Extcite.extract(path: path)
  #     Extcite.extract(path: path, file: nil)
  def self.extract(path:, file: "out.bib", output: "bib")
    path = make_paths(path)
    collected = []

    path.each do |x|
      # try PDF metadata first
      pdfmeta = PDF::Reader.new(x).metadata
      ids = pdfmeta.nil? ? nil : Extcite.metadata_doi(pdfmeta)

      # if not found, try regexing for DOI
      ids = Extcite.get_ids(txt: Extcite.extract_text_one(x)) if ids.nil?

      if ids.nil? || ids.empty?
        puts "no DOI found in " + x
        next
      end

      if !ids.match(/arxiv/i).nil? && ids.length < 200
        arxiv_id = ids.gsub(/arxiv:/i, '')
        conn = Faraday.new(:url => 'http://export.arxiv.org/api/query?id_list=' + arxiv_id).get
        bibs = conn.body.make_bib_arxiv(arxiv_id)
      else
        bibs = Extcite.cont_neg(ids: ids)
      end

      # The arXiv branch yields a String and content negotiation an Array;
      # Array() lets both be inspected (and collected) the same way.
      bibstest = Array(bibs).first
      next if bibstest.nil?

      # if an error or not found, skip
      if !bibstest.match(/error|not found/i).nil? || !bibstest.match(/<\/html>/i).nil?
        puts "DOI found: " + ids + " ; but citation not found via content negotation - passing"
      elsif file.nil?
        # keep going so that every PDF contributes, not only the first one
        collected.concat(Array(bibs))
      else
        puts "writing " + ids + " to " + file
        bibs.write_bib(file)
      end
    end

    file.nil? ? collected : path
  end

  ##
  # Extract DOIs from one or more PDFs after extracting text
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  #
  # @return [Array<String>] one (possibly empty) match per PDF
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract doi from the pdf
  #     Extcite.extract_dois(path: path)
  def self.extract_dois(path:)
    Extcite.extract_text(path: path).map do |z|
      z.match("[0-9]+\\.[0-9]+/.+").to_s.gsub(/\s.+/, '')
    end
  end

  ##
  # Get the DOI (or arXiv identifier) from a String or Array of String's
  #
  # Only the first string is examined. An arXiv identifier
  # ("arXiv:1234.5678") takes precedence over a DOI; trailing punctuation
  # such as ".", ",", ";", "]", "}" or ")" is stripped from the result.
  #
  # @param txt [String] String or Array of String's
  #
  # @return [String, nil] the identifier ("" when none is found), or nil when
  #   +txt+ is empty
  #
  # @example
  #     require 'extcite'
  #     Extcite.get_ids(txt: '10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd')
  def self.get_ids(txt:)
    z = Array(txt).first
    return nil if z.nil?

    # detect if is an arxiv paper; if so, keep the arxiv id for later
    # extraction of the arxiv citation via their API
    arxiv = z.match(ARXIV_PATTERN)
    id = arxiv ? arxiv.to_s : z.match(DOI_PATTERN).to_s.gsub(/\s.+/, '')

    # clean up doi
    id = id.gsub(/\.$|\.;$|\.\]$|\.\}$|\.\)$|,$/, '')
    id.gsub(/;$|\]$|\}$|\)$/, '')
  end

  ##
  # Extract text from a pdf, or many pdfs
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  #
  # @return [Array<String>] the text of each PDF, pages joined by newlines
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract text from the pdf
  #     Extcite.extract_text(path: path)
  def self.extract_text(path:)
    path = Array(path)
    if path.length == 1 && File.directory?(path[0])
      # keep only files with .pdf extension (escaped dot, anchored at the end)
      path = dir_files(path[0]).select { |z| z.match(/\.pdf\z/i) }
    end

    path.map { |x| Extcite.extract_text_one(x) }
  end

  ##
  # Get citation(s) using Crossref content negotiation
  #
  # @param ids [Array[String]] One or more DOIs in an array
  #
  # @return whatever Serrano.content_negotiation returns for +ids+
  #
  # @example
  #     require 'extcite'
  #     Extcite.cont_neg(ids: "10.1016/j.dendro.2014.01.004")
  def self.cont_neg(ids:)
    Serrano.content_negotiation(ids: ids)
  end

  # Internal helpers. They stay publicly callable because they are invoked
  # with an explicit receiver above (a bare `protected` does not affect
  # methods defined with `def self.`).

  # Text of a single PDF, pages joined by newlines.
  def self.extract_text_one(x)
    PDF::Reader.new(x).pages.map { |page| page.text }.join("\n")
  end

  # Look for a DOI in PDF XMP metadata: first the dc:identifier attribute of
  # rdf:Description, then each node in DOI_NODE_XPATHS (used only when exactly
  # one node matches). Returns nil when nothing is found or the metadata
  # cannot be parsed or queried, so the caller falls back to scanning the text.
  def self.metadata_doi(pdfmeta)
    xml = Oga.parse_xml(pdfmeta)

    # try dc:identifier attribute
    ss = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
    return ss.text.sub(/doi:/, '') unless ss.nil?

    DOI_NODE_XPATHS.each do |query|
      nodes = xml.xpath(query)
      return nodes.text if nodes.length == 1
    end
    nil
  rescue StandardError
    nil
  end
end
|
metadata
ADDED
@@ -0,0 +1,246 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extcite
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Scott Chamberlain
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-04-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.14'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.14.6
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.14'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.14.6
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rake
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '12.0'
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 12.0.0
|
43
|
+
type: :development
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '12.0'
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 12.0.0
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: test-unit
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '3.2'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 3.2.1
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3.2'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 3.2.1
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: simplecov
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.14.1
|
80
|
+
type: :development
|
81
|
+
prerelease: false
|
82
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: 0.14.1
|
87
|
+
- !ruby/object:Gem::Dependency
|
88
|
+
name: codecov
|
89
|
+
requirement: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - "~>"
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 0.1.10
|
94
|
+
type: :development
|
95
|
+
prerelease: false
|
96
|
+
version_requirements: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - "~>"
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 0.1.10
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: faraday
|
103
|
+
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - "~>"
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 0.12.0.1
|
108
|
+
type: :runtime
|
109
|
+
prerelease: false
|
110
|
+
version_requirements: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - "~>"
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: 0.12.0.1
|
115
|
+
- !ruby/object:Gem::Dependency
|
116
|
+
name: faraday_middleware
|
117
|
+
requirement: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - "~>"
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: 0.11.0.1
|
122
|
+
type: :runtime
|
123
|
+
prerelease: false
|
124
|
+
version_requirements: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - "~>"
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: 0.11.0.1
|
129
|
+
- !ruby/object:Gem::Dependency
|
130
|
+
name: thor
|
131
|
+
requirement: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - "~>"
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: 0.19.4
|
136
|
+
type: :runtime
|
137
|
+
prerelease: false
|
138
|
+
version_requirements: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - "~>"
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 0.19.4
|
143
|
+
- !ruby/object:Gem::Dependency
|
144
|
+
name: oga
|
145
|
+
requirement: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - "~>"
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '2.2'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - "~>"
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '2.2'
|
157
|
+
- !ruby/object:Gem::Dependency
|
158
|
+
name: serrano
|
159
|
+
requirement: !ruby/object:Gem::Requirement
|
160
|
+
requirements:
|
161
|
+
- - "~>"
|
162
|
+
- !ruby/object:Gem::Version
|
163
|
+
version: 0.3.6
|
164
|
+
type: :runtime
|
165
|
+
prerelease: false
|
166
|
+
version_requirements: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - "~>"
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: 0.3.6
|
171
|
+
- !ruby/object:Gem::Dependency
|
172
|
+
name: bibtex-ruby
|
173
|
+
requirement: !ruby/object:Gem::Requirement
|
174
|
+
requirements:
|
175
|
+
- - "~>"
|
176
|
+
- !ruby/object:Gem::Version
|
177
|
+
version: '4.4'
|
178
|
+
type: :runtime
|
179
|
+
prerelease: false
|
180
|
+
version_requirements: !ruby/object:Gem::Requirement
|
181
|
+
requirements:
|
182
|
+
- - "~>"
|
183
|
+
- !ruby/object:Gem::Version
|
184
|
+
version: '4.4'
|
185
|
+
- !ruby/object:Gem::Dependency
|
186
|
+
name: pdf-reader
|
187
|
+
requirement: !ruby/object:Gem::Requirement
|
188
|
+
requirements:
|
189
|
+
- - "~>"
|
190
|
+
- !ruby/object:Gem::Version
|
191
|
+
version: '2.0'
|
192
|
+
type: :runtime
|
193
|
+
prerelease: false
|
194
|
+
version_requirements: !ruby/object:Gem::Requirement
|
195
|
+
requirements:
|
196
|
+
- - "~>"
|
197
|
+
- !ruby/object:Gem::Version
|
198
|
+
version: '2.0'
|
199
|
+
description: Gets DOIS and generates citations for your papers
|
200
|
+
email: myrmecocystus@gmail.com
|
201
|
+
executables:
|
202
|
+
- extcite
|
203
|
+
extensions: []
|
204
|
+
extra_rdoc_files: []
|
205
|
+
files:
|
206
|
+
- ".gitignore"
|
207
|
+
- ".travis.yml"
|
208
|
+
- CHANGELOG.md
|
209
|
+
- Gemfile
|
210
|
+
- Gemfile.lock
|
211
|
+
- README.md
|
212
|
+
- Rakefile
|
213
|
+
- bin/extcite
|
214
|
+
- extcite.gemspec
|
215
|
+
- extra/fetch.rb
|
216
|
+
- extra/fetch_method.rb
|
217
|
+
- lib/extcite.rb
|
218
|
+
- lib/extcite/methods_array.rb
|
219
|
+
- lib/extcite/methods_string.rb
|
220
|
+
- lib/extcite/utils.rb
|
221
|
+
- lib/extcite/version.rb
|
222
|
+
homepage: http://github.com/sckott/extcite
|
223
|
+
licenses:
|
224
|
+
- MIT
|
225
|
+
metadata: {}
|
226
|
+
post_install_message:
|
227
|
+
rdoc_options: []
|
228
|
+
require_paths:
|
229
|
+
- lib
|
230
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
231
|
+
requirements:
|
232
|
+
- - ">="
|
233
|
+
- !ruby/object:Gem::Version
|
234
|
+
version: '0'
|
235
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
236
|
+
requirements:
|
237
|
+
- - ">="
|
238
|
+
- !ruby/object:Gem::Version
|
239
|
+
version: '0'
|
240
|
+
requirements: []
|
241
|
+
rubyforge_project:
|
242
|
+
rubygems_version: 2.6.8
|
243
|
+
signing_key:
|
244
|
+
specification_version: 4
|
245
|
+
summary: Citations from PDFs
|
246
|
+
test_files: []
|