extcite 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +36 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +14 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +81 -0
- data/README.md +71 -0
- data/Rakefile +41 -0
- data/bin/extcite +17 -0
- data/extcite.gemspec +36 -0
- data/extra/fetch.rb +49 -0
- data/extra/fetch_method.rb +17 -0
- data/lib/extcite/methods_array.rb +8 -0
- data/lib/extcite/methods_string.rb +32 -0
- data/lib/extcite/utils.rb +47 -0
- data/lib/extcite/version.rb +3 -0
- data/lib/extcite.rb +224 -0
- metadata +246 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 59c11ddc6a3c2055e6d32941cf1e5227d759d8bb
|
4
|
+
data.tar.gz: 8bb4d062337f6caf5a272cca33900476ff29049c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8319b4a0eaacadc82b97d3780df98ae6669b3d62bb017f348dd32c30b09d433ae5b610afeae9d4566f2714b2681dd57526b826819872182c108dafe6a1b780b5
|
7
|
+
data.tar.gz: e26c399bf22cd9e498bf302ae2b484238aed579b6994a8f4e9d81c0da949b60b6b694681459c02af611d0c5664b5d37d2c1f89e071eea86e43c15f13487413f6
|
data/.gitignore
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/test/tmp/
|
9
|
+
/test/version_tmp/
|
10
|
+
/tmp/
|
11
|
+
|
12
|
+
## Specific to RubyMotion:
|
13
|
+
.dat*
|
14
|
+
.repl_history
|
15
|
+
build/
|
16
|
+
|
17
|
+
## Documentation cache and generated files:
|
18
|
+
/.yardoc/
|
19
|
+
/_yardoc/
|
20
|
+
/doc/
|
21
|
+
/rdoc/
|
22
|
+
|
23
|
+
## Environment normalisation:
|
24
|
+
/.bundle/
|
25
|
+
/lib/bundler/man/
|
26
|
+
|
27
|
+
# for a library or gem, you might want to ignore these files since the code is
|
28
|
+
# intended to run in multiple environments; otherwise, check them in:
|
29
|
+
#Gemfile.lock
|
30
|
+
.ruby-version
|
31
|
+
.ruby-gemset
|
32
|
+
|
33
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
34
|
+
.rvmrc
|
35
|
+
|
36
|
+
cache/
|
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
## 0.1.0 (2017-04-06)
|
2
|
+
|
3
|
+
* First version to Rubygems
|
4
|
+
|
5
|
+
## 0.0.9 (2016-06-17)
|
6
|
+
|
7
|
+
* battle tested more, fixed a number of bugs
|
8
|
+
* now works with arxiv papers
|
9
|
+
* now works with biorxiv, or at least should
|
10
|
+
* improved extraction of DOIs
|
11
|
+
|
12
|
+
## 0.1.0 (2016-06-07)
|
13
|
+
|
14
|
+
* just started, :)
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
extcite (0.1.0)
|
5
|
+
bibtex-ruby (~> 4.4)
|
6
|
+
faraday (~> 0.12.0.1)
|
7
|
+
faraday_middleware (~> 0.11.0.1)
|
8
|
+
oga (~> 2.2)
|
9
|
+
pdf-reader (~> 2.0)
|
10
|
+
serrano (~> 0.3.6)
|
11
|
+
thor (~> 0.19.4)
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: https://rubygems.org/
|
15
|
+
specs:
|
16
|
+
Ascii85 (1.0.2)
|
17
|
+
afm (0.2.2)
|
18
|
+
ansi (1.5.0)
|
19
|
+
ast (2.3.0)
|
20
|
+
bibtex-ruby (4.4.3)
|
21
|
+
latex-decode (~> 0.0)
|
22
|
+
codecov (0.1.10)
|
23
|
+
json
|
24
|
+
simplecov
|
25
|
+
url
|
26
|
+
docile (1.1.5)
|
27
|
+
faraday (0.12.0.1)
|
28
|
+
multipart-post (>= 1.2, < 3)
|
29
|
+
faraday_middleware (0.11.0.1)
|
30
|
+
faraday (>= 0.7.4, < 1.0)
|
31
|
+
hashery (2.1.2)
|
32
|
+
json (2.0.3)
|
33
|
+
latex-decode (0.2.2)
|
34
|
+
unicode (~> 0.4)
|
35
|
+
multi_json (1.12.1)
|
36
|
+
multipart-post (2.0.0)
|
37
|
+
oga (2.9)
|
38
|
+
ast
|
39
|
+
ruby-ll (~> 2.1)
|
40
|
+
pdf-reader (2.0.0)
|
41
|
+
Ascii85 (~> 1.0.0)
|
42
|
+
afm (~> 0.2.1)
|
43
|
+
hashery (~> 2.0)
|
44
|
+
ruby-rc4
|
45
|
+
ttfunk
|
46
|
+
power_assert (1.0.1)
|
47
|
+
rake (12.0.0)
|
48
|
+
ruby-ll (2.1.2)
|
49
|
+
ansi
|
50
|
+
ast
|
51
|
+
ruby-rc4 (0.1.5)
|
52
|
+
serrano (0.3.6)
|
53
|
+
faraday (~> 0.12.0.1)
|
54
|
+
faraday_middleware (~> 0.11.0.1)
|
55
|
+
multi_json (~> 1.12, >= 1.12.1)
|
56
|
+
thor (~> 0.19.4)
|
57
|
+
simplecov (0.14.1)
|
58
|
+
docile (~> 1.1.0)
|
59
|
+
json (>= 1.8, < 3)
|
60
|
+
simplecov-html (~> 0.10.0)
|
61
|
+
simplecov-html (0.10.0)
|
62
|
+
test-unit (3.2.3)
|
63
|
+
power_assert
|
64
|
+
thor (0.19.4)
|
65
|
+
ttfunk (1.5.0)
|
66
|
+
unicode (0.4.4.2)
|
67
|
+
url (0.3.2)
|
68
|
+
|
69
|
+
PLATFORMS
|
70
|
+
ruby
|
71
|
+
|
72
|
+
DEPENDENCIES
|
73
|
+
bundler (~> 1.14, >= 1.14.6)
|
74
|
+
codecov (~> 0.1.10)
|
75
|
+
extcite!
|
76
|
+
rake (~> 12.0, >= 12.0.0)
|
77
|
+
simplecov (~> 0.14.1)
|
78
|
+
test-unit (~> 3.2, >= 3.2.1)
|
79
|
+
|
80
|
+
BUNDLED WITH
|
81
|
+
1.14.6
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
extcite
|
2
|
+
=======
|
3
|
+
|
4
|
+
[](https://rubygems.org/gems/extcite)
|
5
|
+
[](https://travis-ci.org/sckott/extcite)
|
6
|
+
[](http://codecov.io/github/sckott/extcite?branch=master)
|
7
|
+
|
8
|
+
__`extcite` gets DOIs and generates citations for your papers__
|
9
|
+
|
10
|
+
## Install
|
11
|
+
|
12
|
+
### Release version
|
13
|
+
|
14
|
+
```
|
15
|
+
gem install extcite
|
16
|
+
```
|
17
|
+
|
18
|
+
### Development version
|
19
|
+
|
20
|
+
```
|
21
|
+
git clone git@github.com:sckott/extcite.git
|
22
|
+
cd extcite
|
23
|
+
rake install
|
24
|
+
```
|
25
|
+
|
26
|
+
> if `rake install` fails, try `sudo rake install`. If that fails, open an issue with what `rake install --trace` gives you
|
27
|
+
|
28
|
+
## Examples
|
29
|
+
|
30
|
+
### Within Ruby
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
require 'extcite'
|
34
|
+
```
|
35
|
+
|
36
|
+
#### Search
|
37
|
+
|
38
|
+
A single paper
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
require 'net/http'
|
42
|
+
File.write("foo.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
|
43
|
+
Extcite.extract(path: 'foo.pdf')
|
44
|
+
```
|
45
|
+
|
46
|
+
The BibTeX citation is appended to the file named by the `file` parameter (`out.bib` by default).
|
47
|
+
|
48
|
+
Many papers at once
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
Dir.mkdir('bar')
|
52
|
+
File.write("bar/foo1.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/Chamberlain&Szocs2013F1000Research.pdf")))
|
53
|
+
File.write("bar/foo2.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
|
54
|
+
Extcite.extract(path: 'bar')
|
55
|
+
```
|
56
|
+
|
57
|
+
### On the CLI
|
58
|
+
|
59
|
+
All pdfs in the current directory:
|
60
|
+
|
61
|
+
```shell
|
62
|
+
extcite extract .
|
63
|
+
```
|
64
|
+
|
65
|
+
Single paper
|
66
|
+
|
67
|
+
```shell
|
68
|
+
extcite extract foo.pdf
|
69
|
+
```
|
70
|
+
|
71
|
+
[changelog]: https://github.com/sckott/extcite/blob/master/CHANGELOG.md
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require "bundler/gem_tasks"
require 'rake/testtask'

# Test runner: picks up every test/test-*.rb file with test/ on the load path.
Rake::TestTask.new do |test_task|
  test_task.libs << "test"
  test_task.test_files = FileList['test/test-*.rb']
  test_task.verbose = true
end

desc "Run tests"
task default: :test

desc "Build extcite docs"
task :docs do
  system("yardoc")
end

desc "bundle install"
task :bundle do
  system("bundle install")
end

# Removes built gem archives from the project root.
desc "clean out builds"
task :clean do
  system("ls | grep [0-9].gem | xargs rm")
end

desc "Build extcite"
task :build do
  system("gem build extcite.gemspec")
end

# Installs the gem archive produced by the :build task.
desc "Install extcite"
task install: %i[bundle build] do
  system("gem install extcite-#{Extcite::VERSION}.gem")
end

# Pushes the gem archive produced by the :build task.
desc "Release to Rubygems"
task release: :build do
  system("gem push extcite-#{Extcite::VERSION}.gem")
end
|
data/bin/extcite
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "thor"
|
4
|
+
|
5
|
+
# Command-line front end: exposes Extcite.extract as `extcite extract PATH`.
class Dz < Thor
  include Thor::Actions
  require 'extcite'

  desc "extract STRING", "Get bib data from PDFs"
  # method_option :path => :string
  # Hands the given path (a PDF file or a folder of PDFs) to Extcite.extract
  # as a string.
  def extract(tt)
    Extcite.extract(path: "#{tt}")
  end
end
|
16
|
+
|
17
|
+
Dz.start(ARGV)
|
data/extcite.gemspec
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Make lib/ loadable so the version constant can be read below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'extcite/version'

Gem::Specification.new do |spec|
  spec.name        = 'extcite'
  spec.version     = Extcite::VERSION
  spec.date        = '2017-04-06'
  spec.summary     = "Citations from PDFs"
  spec.description = "Gets DOIS and generates citations for your papers"
  spec.authors     = "Scott Chamberlain"
  spec.email       = 'myrmecocystus@gmail.com'
  spec.homepage    = 'http://github.com/sckott/extcite'
  spec.licenses    = 'MIT'

  # Ship everything tracked by git except test/spec/feature files.
  tracked = `git ls-files -z`.split("\x0")
  spec.files         = tracked.reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ["lib"]

  spec.bindir      = 'bin'
  spec.executables = ['extcite']

  # Development-only dependencies, in declaration order.
  [
    ['bundler',   '~> 1.14', '>= 1.14.6'],
    ['rake',      '~> 12.0', '>= 12.0.0'],
    ['test-unit', '~> 3.2',  '>= 3.2.1'],
    ['simplecov', '~> 0.14.1'],
    ['codecov',   '~> 0.1.10']
  ].each do |gem_name, *requirements|
    spec.add_development_dependency gem_name, *requirements
  end

  # Runtime dependencies, in declaration order.
  [
    ['faraday',            '~> 0.12.0.1'],
    ['faraday_middleware', '~> 0.11.0.1'],
    ['thor',               '~> 0.19.4'],
    ['oga',                '~> 2.2'],
    ['serrano',            '~> 0.3.6'],
    ['bibtex-ruby',        '~> 4.4'],
    ['pdf-reader',         '~> 2.0']
  ].each do |gem_name, *requirements|
    spec.add_runtime_dependency gem_name, *requirements
  end
end
|
data/extra/fetch.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Textminer
  # Downloads the full text behind a DOI, either as XML responses or as a
  # PDF saved to disk.
  #
  # NOTE(review): relies on Textminer.links and HTTParty, neither of which is
  # defined or required in this file - confirm they are loaded by the caller.
  class Fetch #:nodoc:
    attr_accessor :doi, :type

    # @param doi [String] the DOI handed to Textminer.links
    # @param type [String] "xml" or "pdf"
    def initialize(doi, type)
      self.doi = doi
      self.type = type
    end

    # Resolve the link(s) for the DOI and download them.
    #
    # @return [Array] one HTTParty response per link when type is "xml"
    # @return [String] path of the written file when type is "pdf"
    # @return [nil] for any other type (pick_link prints a message)
    def fetchtext
      lk = pick_link(Textminer.links(doi))
      case type
      when "xml"
        Array(lk).map { |x| HTTParty.get(x) }
      when "pdf"
        serialize_pdf(lk, doi)
      end
    end

    private

    # Pick the xml or pdf member of the links object according to +type+.
    # Prints a message and returns nil for an unsupported type.
    def pick_link(x)
      case type
      when "xml"
        x.xml
      when "pdf"
        x.pdf
      else
        puts "type must be xml or pdf"
      end
    end

    # Download +x+ and write it to ~/.textminer/<doi with "/" replaced by "_">.pdf.
    #
    # The cache directory lives under the current user's home directory
    # (previously it was hard-coded to one developer's machine) and is
    # created on first use.
    #
    # @return [String] path of the file that was written
    def serialize_pdf(x, y)
      dir = File.join(Dir.home, ".textminer")
      Dir.mkdir(dir) unless Dir.exist?(dir)
      path = File.join(dir, y.gsub('/', '_') + ".pdf")
      File.open(path, "wb") do |f|
        f.write HTTParty.get(x).parsed_response
      end
      path
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
##
# Fetch the full text of one or more articles by DOI.
#
# Builds a Fetch from the arguments and returns whatever its fetchtext
# call returns.
#
# NOTE(review): the examples call this as Textminer.fetch, so this definition
# is presumably meant to be evaluated inside the Textminer module - confirm.
#
# @param doi [String, Array<String>] One or more DOIs (digital object identifiers)
# @param type [String] One of two options to download: xml (default) or pdf
#
# @example
#     require 'textminer'
#     # fetch full text by DOI - xml by default
#     Textminer.fetch("10.3897/phytokeys.42.7604")
#     # many DOIs - xml output
#     res = Textminer.fetch(["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
#     # fetch full text - pdf
#     Textminer.fetch("10.3897/phytokeys.42.7604", "pdf")
def self.fetch(doi, type = 'xml')
  Fetch.new(doi, type).fetchtext
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'date'

require 'bibtex'
require 'oga'
|
3
|
+
|
4
|
+
# String methods
|
5
|
+
class String
  # Append this string, followed by a newline when it does not already end
  # with one, to +file+. The file is created if it does not exist.
  #
  # @param file [String] path of the file to append to
  def write_bib(file)
    File.open(file, 'a') { |io| io.puts(self) }
  end
end
|
12
|
+
|
13
|
+
class String
  # Build a BibTeX @article entry from this string, which is parsed as XML
  # and queried for //updated, //entry/id, //author//name and //entry//title.
  #
  # DateTime comes from the standard `date` library, which is required
  # explicitly at the top of this file rather than relied on transitively.
  #
  # @param id [String] value stored in the entry's eprint field
  # @return [String] the bibliography rendered as BibTeX text
  def make_bib_arxiv(id)
    xml = Oga.parse_xml(self)

    # publication year comes from the first <updated> timestamp
    year = DateTime.strptime(xml.xpath('//updated')[0].text).year

    entry = BibTeX::Entry.new({
      :bibtex_type => :article,
      :url => xml.xpath('//entry/id').text,
      :author => xml.xpath('//author//name').map { |x| x.text }.join(' and '),
      :eprint => id,
      :title => xml.xpath('//entry//title').text,
      :year => year
    })

    bib = BibTeX::Bibliography.new
    bib << entry
    bib.to_s
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "oga"
|
2
|
+
|
3
|
+
# Unwrap a one-element Array; return anything else untouched.
#
# The Array check runs before the length check, so values that do not
# respond to +length+ (nil, Integer, ...) are returned as-is instead of
# raising NoMethodError.
#
# @param x [Object] any value
# @return [Object] x[0] when x is an Array of length 1, otherwise x
def singlearray2hash(x)
  return x[0] if x.is_a?(Array) && x.length == 1

  x
end
|
10
|
+
|
11
|
+
# List the non-directory entries directly inside directory +x+
# (not recursive).
#
# Paths are built with File.join, so a directory given with a trailing
# slash no longer yields "dir//file". "." and ".." are directories and are
# dropped by the same check as every other subdirectory.
#
# @param x [String] path to a directory
# @return [Array<String>] paths of the non-directory entries, each prefixed with +x+
def dir_files(x)
  Dir.entries(x)
     .reject { |entry| File.directory?(File.join(x, entry)) }
     .map { |entry| File.join(x, entry) }
end
|
18
|
+
|
19
|
+
# Normalise +x+ into a list of existing file paths.
#
# A single directory is expanded (via dir_files) to the files in it whose
# extension is ".pdf", compared case-insensitively. The previous filter,
# /.pdf/, had an unescaped dot and no anchor, so names such as "mypdfs.txt"
# slipped through. Anything else is taken as a list of file paths.
#
# @param x [String, Array<String>] a file path, a directory, or several file paths
# @return [Array<String>] the resolved paths
# @raise [RuntimeError] "<path> not found" for the first path that does not exist
def make_paths(x)
  path = Array(x)
  if path.length == 1 && File.directory?(path[0])
    path = dir_files(path[0]).select { |z| File.extname(z).downcase == '.pdf' }
  end

  path.each do |z|
    raise z + ' not found' unless File.exist?(z)
  end

  path
end
|
38
|
+
|
39
|
+
# Read a DOI from an XML metadata string.
#
# Looks up the dc:identifier attribute on the rdf:Description nodes and
# strips the first "doi:" from its text. Any error raised during that
# lookup results in nil; errors raised while parsing the XML itself are
# not rescued.
#
# @param x [String] XML text
# @return [String, nil] the identifier without "doi:", or nil when the lookup fails
def pdf_doi(x)
  doc = Oga.parse_xml(x)
  begin
    descriptions = doc.xpath('//rdf:Description')
    identifier = descriptions.attr('dc:identifier')[0]
    identifier.text.sub(/doi:/, '')
  rescue
    nil
  end
end
|
data/lib/extcite.rb
ADDED
@@ -0,0 +1,224 @@
|
|
1
|
+
require "extcite/utils"
|
2
|
+
require "extcite/methods_array"
|
3
|
+
require "extcite/methods_string"
|
4
|
+
require "extcite/version"
|
5
|
+
|
6
|
+
|
7
|
+
require 'serrano'
|
8
|
+
require 'pdf-reader'
|
9
|
+
require 'faraday'
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
module Extcite
  ##
  # Extract identifiers from one or more PDFs and look up their citations
  #
  # For each PDF the identifier is taken from the XMP metadata when present
  # (dc:identifier, prism:doi, pdf:WPS-ARTICLEDOI, pdfx:WPS-ARTICLEDOI, in
  # that order); otherwise the page text is searched for an arXiv id or DOI.
  # arXiv ids are resolved through the arXiv API, DOIs through Crossref
  # content negotiation.
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  # @param file [String] File name to write data to - or nil to return the data
  # @param output [String] Type of output. only bibtex for now (currently unused)
  #
  # Return: when `file` is nil, an array with the citations found for all
  # PDFs (empty when none was found); otherwise the array of PDF paths that
  # were processed.
  # When writing to a file, `extract` by default appends to the end
  # of the file so you can build up your bibtex file with your
  # citations
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract doi from the pdf
  #     Extcite.extract(path: path)
  #     Extcite.extract(path: path, file: nil)
  def self.extract(path:, file: "out.bib", output: "bib")
    paths = make_paths(path)
    collected = []

    paths.each do |x|
      # try PDF metadata first, then fall back to regexing the page text
      ids = doi_from_metadata(PDF::Reader.new(x).metadata)
      if ids.nil? || ids.empty?
        ids = Extcite.get_ids(txt: Extcite.extract_text_one(x))
      end

      if ids.nil? || ids.empty?
        puts "no DOI found in " + x
        next
      end

      if ids.match(/arxiv/i) && ids.length < 200
        arxiv_id = ids.gsub(/arxiv:/i, '')
        conn = Faraday.new(:url => 'http://export.arxiv.org/api/query?id_list=' + arxiv_id).get
        bibs = conn.body.make_bib_arxiv(arxiv_id)
      else
        bibs = Extcite.cont_neg(ids: ids)
      end

      # The arXiv branch yields a String while content negotiation yields an
      # Array; Array() lets both be inspected the same way. (Previously a
      # String result was silently discarded.)
      bibstest = Array(bibs).first
      next if bibstest.nil?

      # if an error or not found, skip
      if bibstest.match(/error|not found/i) || bibstest.match(/<\/html>/i)
        puts "DOI found: " + ids + " ; but citation not found via content negotation - passing"
        next
      end

      if file.nil?
        # keep going so every PDF contributes, instead of returning after the first
        collected.concat(Array(bibs))
      else
        puts "writing " + ids + " to " + file
        bibs.write_bib(file)
      end
    end

    file.nil? ? collected : paths
  end

  ##
  # Extract DOIs from one or more PDFs after extracting text
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  #
  # Return: an array with one string per PDF - the first DOI-like match in
  # its text, or an empty string when nothing matched
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract doi from the pdf
  #     Extcite.extract_dois(path: path)
  def self.extract_dois(path:)
    txt = Extcite.extract_text(path: path)
    txt.map { |z| z.match("[0-9]+\\.[0-9]+/.+").to_s.gsub(/\s.+/, '') }
  end

  ##
  # Get an identifier (arXiv id or DOI) from a String or Array of String's
  #
  # Only the first element of the input is examined.
  #
  # @param txt [String] String or Array of String's
  #
  # Return: a String - the arXiv id (including its "arxiv:" prefix) or the
  # DOI, with trailing punctuation removed; an empty String when nothing
  # matched; nil when the input is empty
  #
  # @example
  #     require 'extcite'
  #     Extcite.get_ids(txt: '10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd')
  def self.get_ids(txt:)
    z = Array(txt).first
    return nil if z.nil?

    # detect if is an arxiv paper
    arxiv = z.match(/arxiv:[0-9]+\.[0-9A-Za-z]+/i)
    if arxiv
      # if so, return arxiv id for later extraction of arxiv citation via their API
      z = arxiv.to_s
    else
      doi_pattern = '(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)'
      z = z.match(doi_pattern).to_s.gsub(/\s.+/, '')
    end

    # clean up doi: strip sentence punctuation and closing brackets at the end
    z = z.gsub(/\.$|\.;$|\.\]$|\.\}$|\.\)$|,$/, '')
    z.gsub(/;$|\]$|\}$|\)$/, '')
  end

  ##
  # Extract text from a pdf, or many pdfs
  #
  # @param path [String] Path to a pdf file, or a folder of PDF files
  #
  # A single directory is expanded to the files in it whose name ends in
  # ".pdf" (case-insensitive).
  #
  # Return: an array with the text of each PDF, pages joined by newlines
  #
  # @example
  #     require 'extcite'
  #     require 'faraday'
  #     # get a paper in pdf format
  #     path = '2068.pdf'
  #     res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
  #     f = File.new(path, "wb")
  #     f.write(res.body)
  #     f.close()
  #     # extract text from the pdf
  #     Extcite.extract_text(path: path)
  def self.extract_text(path:)
    path = Array(path)
    if path.length == 1 && File.directory?(path[0])
      # keep only files with .pdf extension
      path = dir_files(path[0]).select { |z| z.match(/\.pdf\z/i) }
    end

    path.map { |x| Extcite.extract_text_one(x) }
  end

  ##
  # Get citation(s) using Crossref content negotiation
  #
  # @param ids [Array[String]] One or more DOIs in an array
  #
  # Return: whatever Serrano.content_negotiation returns for the ids
  #
  # @example
  #     require 'extcite'
  #     Extcite.cont_neg(ids: "10.1016/j.dendro.2014.01.004")
  def self.cont_neg(ids:)
    Serrano.content_negotiation(ids: ids)
  end

  # Text of a single PDF, pages joined by newlines.
  #
  # This stays publicly callable because `extract` invokes it with an
  # explicit receiver. The `protected` keyword that used to precede it had
  # no effect on a `def self.` method and was removed.
  def self.extract_text_one(x)
    rr = PDF::Reader.new(x)
    rr.pages.map { |page| page.text }.join("\n")
  end

  # Pull an identifier out of a PDF's XMP metadata packet.
  #
  # Returns nil when there is no metadata, when none of the known fields is
  # present, or when the packet cannot be parsed or queried, so the caller
  # can fall back to searching the page text.
  def self.doi_from_metadata(pdfmeta)
    return nil if pdfmeta.nil?

    xml = Oga.parse_xml(pdfmeta)

    # try dc:identifier attribute
    identifier = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
    return identifier.text.sub(/doi:/, '') unless identifier.nil?

    # then the prism:doi, pdf:WPS-ARTICLEDOI and pdfx:WPS-ARTICLEDOI nodes
    ['prism:doi', 'pdf:WPS-ARTICLEDOI', 'pdfx:WPS-ARTICLEDOI'].each do |node|
      found = xml.xpath('//rdf:Description//' + node)
      return found.text if found.length == 1
    end

    nil
  rescue StandardError
    nil
  end
  private_class_method :doi_from_metadata
end
|
metadata
ADDED
@@ -0,0 +1,246 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extcite
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Scott Chamberlain
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-04-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.14'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.14.6
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.14'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.14.6
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rake
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '12.0'
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 12.0.0
|
43
|
+
type: :development
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '12.0'
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 12.0.0
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: test-unit
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '3.2'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 3.2.1
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3.2'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 3.2.1
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: simplecov
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.14.1
|
80
|
+
type: :development
|
81
|
+
prerelease: false
|
82
|
+
version_requirements: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: 0.14.1
|
87
|
+
- !ruby/object:Gem::Dependency
|
88
|
+
name: codecov
|
89
|
+
requirement: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - "~>"
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 0.1.10
|
94
|
+
type: :development
|
95
|
+
prerelease: false
|
96
|
+
version_requirements: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - "~>"
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 0.1.10
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: faraday
|
103
|
+
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - "~>"
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 0.12.0.1
|
108
|
+
type: :runtime
|
109
|
+
prerelease: false
|
110
|
+
version_requirements: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - "~>"
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: 0.12.0.1
|
115
|
+
- !ruby/object:Gem::Dependency
|
116
|
+
name: faraday_middleware
|
117
|
+
requirement: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - "~>"
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: 0.11.0.1
|
122
|
+
type: :runtime
|
123
|
+
prerelease: false
|
124
|
+
version_requirements: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - "~>"
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: 0.11.0.1
|
129
|
+
- !ruby/object:Gem::Dependency
|
130
|
+
name: thor
|
131
|
+
requirement: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - "~>"
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: 0.19.4
|
136
|
+
type: :runtime
|
137
|
+
prerelease: false
|
138
|
+
version_requirements: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - "~>"
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 0.19.4
|
143
|
+
- !ruby/object:Gem::Dependency
|
144
|
+
name: oga
|
145
|
+
requirement: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - "~>"
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '2.2'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - "~>"
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: '2.2'
|
157
|
+
- !ruby/object:Gem::Dependency
|
158
|
+
name: serrano
|
159
|
+
requirement: !ruby/object:Gem::Requirement
|
160
|
+
requirements:
|
161
|
+
- - "~>"
|
162
|
+
- !ruby/object:Gem::Version
|
163
|
+
version: 0.3.6
|
164
|
+
type: :runtime
|
165
|
+
prerelease: false
|
166
|
+
version_requirements: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - "~>"
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: 0.3.6
|
171
|
+
- !ruby/object:Gem::Dependency
|
172
|
+
name: bibtex-ruby
|
173
|
+
requirement: !ruby/object:Gem::Requirement
|
174
|
+
requirements:
|
175
|
+
- - "~>"
|
176
|
+
- !ruby/object:Gem::Version
|
177
|
+
version: '4.4'
|
178
|
+
type: :runtime
|
179
|
+
prerelease: false
|
180
|
+
version_requirements: !ruby/object:Gem::Requirement
|
181
|
+
requirements:
|
182
|
+
- - "~>"
|
183
|
+
- !ruby/object:Gem::Version
|
184
|
+
version: '4.4'
|
185
|
+
- !ruby/object:Gem::Dependency
|
186
|
+
name: pdf-reader
|
187
|
+
requirement: !ruby/object:Gem::Requirement
|
188
|
+
requirements:
|
189
|
+
- - "~>"
|
190
|
+
- !ruby/object:Gem::Version
|
191
|
+
version: '2.0'
|
192
|
+
type: :runtime
|
193
|
+
prerelease: false
|
194
|
+
version_requirements: !ruby/object:Gem::Requirement
|
195
|
+
requirements:
|
196
|
+
- - "~>"
|
197
|
+
- !ruby/object:Gem::Version
|
198
|
+
version: '2.0'
|
199
|
+
description: Gets DOIS and generates citations for your papers
|
200
|
+
email: myrmecocystus@gmail.com
|
201
|
+
executables:
|
202
|
+
- extcite
|
203
|
+
extensions: []
|
204
|
+
extra_rdoc_files: []
|
205
|
+
files:
|
206
|
+
- ".gitignore"
|
207
|
+
- ".travis.yml"
|
208
|
+
- CHANGELOG.md
|
209
|
+
- Gemfile
|
210
|
+
- Gemfile.lock
|
211
|
+
- README.md
|
212
|
+
- Rakefile
|
213
|
+
- bin/extcite
|
214
|
+
- extcite.gemspec
|
215
|
+
- extra/fetch.rb
|
216
|
+
- extra/fetch_method.rb
|
217
|
+
- lib/extcite.rb
|
218
|
+
- lib/extcite/methods_array.rb
|
219
|
+
- lib/extcite/methods_string.rb
|
220
|
+
- lib/extcite/utils.rb
|
221
|
+
- lib/extcite/version.rb
|
222
|
+
homepage: http://github.com/sckott/extcite
|
223
|
+
licenses:
|
224
|
+
- MIT
|
225
|
+
metadata: {}
|
226
|
+
post_install_message:
|
227
|
+
rdoc_options: []
|
228
|
+
require_paths:
|
229
|
+
- lib
|
230
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
231
|
+
requirements:
|
232
|
+
- - ">="
|
233
|
+
- !ruby/object:Gem::Version
|
234
|
+
version: '0'
|
235
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
236
|
+
requirements:
|
237
|
+
- - ">="
|
238
|
+
- !ruby/object:Gem::Version
|
239
|
+
version: '0'
|
240
|
+
requirements: []
|
241
|
+
rubyforge_project:
|
242
|
+
rubygems_version: 2.6.8
|
243
|
+
signing_key:
|
244
|
+
specification_version: 4
|
245
|
+
summary: Citations from PDFs
|
246
|
+
test_files: []
|