extcite 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 59c11ddc6a3c2055e6d32941cf1e5227d759d8bb
4
+ data.tar.gz: 8bb4d062337f6caf5a272cca33900476ff29049c
5
+ SHA512:
6
+ metadata.gz: 8319b4a0eaacadc82b97d3780df98ae6669b3d62bb017f348dd32c30b09d433ae5b610afeae9d4566f2714b2681dd57526b826819872182c108dafe6a1b780b5
7
+ data.tar.gz: e26c399bf22cd9e498bf302ae2b484238aed579b6994a8f4e9d81c0da949b60b6b694681459c02af611d0c5664b5d37d2c1f89e071eea86e43c15f13487413f6
data/.gitignore ADDED
@@ -0,0 +1,36 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ #Gemfile.lock
30
+ .ruby-version
31
+ .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
35
+
36
+ cache/
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - 2.1.7
5
+ - 2.2.3
6
+ - 2.2.4
7
+ - 2.3.1
8
+ script:
9
+ bundle exec rake test TESTOPTS="-v"
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
1
+ ## 0.1.0 (2017-04-06)
2
+
3
+ * First version released to RubyGems
4
+
5
+ ## 0.0.9 (2016-06-17)
6
+
7
+ * battle tested more, fixed a number of bugs
8
+ * now works with arxiv papers
9
+ * now works with biorxiv, or at least should
10
+ * improved extraction of DOIs
11
+
12
+ ## 0.0.1 (2016-06-07)
13
+
14
+ * just started, :)
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,81 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ extcite (0.1.0)
5
+ bibtex-ruby (~> 4.4)
6
+ faraday (~> 0.12.0.1)
7
+ faraday_middleware (~> 0.11.0.1)
8
+ oga (~> 2.2)
9
+ pdf-reader (~> 2.0)
10
+ serrano (~> 0.3.6)
11
+ thor (~> 0.19.4)
12
+
13
+ GEM
14
+ remote: https://rubygems.org/
15
+ specs:
16
+ Ascii85 (1.0.2)
17
+ afm (0.2.2)
18
+ ansi (1.5.0)
19
+ ast (2.3.0)
20
+ bibtex-ruby (4.4.3)
21
+ latex-decode (~> 0.0)
22
+ codecov (0.1.10)
23
+ json
24
+ simplecov
25
+ url
26
+ docile (1.1.5)
27
+ faraday (0.12.0.1)
28
+ multipart-post (>= 1.2, < 3)
29
+ faraday_middleware (0.11.0.1)
30
+ faraday (>= 0.7.4, < 1.0)
31
+ hashery (2.1.2)
32
+ json (2.0.3)
33
+ latex-decode (0.2.2)
34
+ unicode (~> 0.4)
35
+ multi_json (1.12.1)
36
+ multipart-post (2.0.0)
37
+ oga (2.9)
38
+ ast
39
+ ruby-ll (~> 2.1)
40
+ pdf-reader (2.0.0)
41
+ Ascii85 (~> 1.0.0)
42
+ afm (~> 0.2.1)
43
+ hashery (~> 2.0)
44
+ ruby-rc4
45
+ ttfunk
46
+ power_assert (1.0.1)
47
+ rake (12.0.0)
48
+ ruby-ll (2.1.2)
49
+ ansi
50
+ ast
51
+ ruby-rc4 (0.1.5)
52
+ serrano (0.3.6)
53
+ faraday (~> 0.12.0.1)
54
+ faraday_middleware (~> 0.11.0.1)
55
+ multi_json (~> 1.12, >= 1.12.1)
56
+ thor (~> 0.19.4)
57
+ simplecov (0.14.1)
58
+ docile (~> 1.1.0)
59
+ json (>= 1.8, < 3)
60
+ simplecov-html (~> 0.10.0)
61
+ simplecov-html (0.10.0)
62
+ test-unit (3.2.3)
63
+ power_assert
64
+ thor (0.19.4)
65
+ ttfunk (1.5.0)
66
+ unicode (0.4.4.2)
67
+ url (0.3.2)
68
+
69
+ PLATFORMS
70
+ ruby
71
+
72
+ DEPENDENCIES
73
+ bundler (~> 1.14, >= 1.14.6)
74
+ codecov (~> 0.1.10)
75
+ extcite!
76
+ rake (~> 12.0, >= 12.0.0)
77
+ simplecov (~> 0.14.1)
78
+ test-unit (~> 3.2, >= 3.2.1)
79
+
80
+ BUNDLED WITH
81
+ 1.14.6
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ extcite
2
+ =======
3
+
4
+ [![gem version](https://img.shields.io/gem/v/extcite.svg)](https://rubygems.org/gems/extcite)
5
+ [![Build Status](https://travis-ci.org/sckott/extcite.svg?branch=master)](https://travis-ci.org/sckott/extcite)
6
+ [![codecov.io](http://codecov.io/github/sckott/extcite/coverage.svg?branch=master)](http://codecov.io/github/sckott/extcite?branch=master)
7
+
8
+ __`extcite` gets DOIs and generates citations for your papers__
9
+
10
+ ## Install
11
+
12
+ ### Release version
13
+
14
+ ```
15
+ gem install extcite
16
+ ```
17
+
18
+ ### Development version
19
+
20
+ ```
21
+ git clone git@github.com:sckott/extcite.git
22
+ cd extcite
23
+ rake install
24
+ ```
25
+
26
+ > If `rake install` fails, try `sudo rake install`. If that also fails, open an issue and include the output of `rake install --trace`.
27
+
28
+ ## Examples
29
+
30
+ ### Within Ruby
31
+
32
+ ```ruby
33
+ require 'extcite'
34
+ ```
35
+
36
+ #### Search
37
+
38
+ A single paper
39
+
40
+ ```ruby
41
+ require 'net/http'
42
+ File.write("foo.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
43
+ Extcite.extract(path: 'foo.pdf')
44
+ ```
45
+
46
+ The BibTeX citation is appended to the file given in the `file` param (`out.bib` by default)
47
+
48
+ Many papers at once
49
+
50
+ ```ruby
51
+ Dir.mkdir('bar')
52
+ File.write("bar/foo1.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/Chamberlain&Szocs2013F1000Research.pdf")))
53
+ File.write("bar/foo2.pdf", Net::HTTP.get(URI.parse("https://scottchamberlain.info/pdfs/GuoEtal2015PlosOne.pdf")))
54
+ Extcite.extract(path: 'bar')
55
+ ```
56
+
57
+ ### On the CLI
58
+
59
+ All PDFs in the current directory:
60
+
61
+ ```shell
62
+ extcite extract .
63
+ ```
64
+
65
+ Single paper
66
+
67
+ ```shell
68
+ extcite extract foo.pdf
69
+ ```
70
+
71
+ [changelog]: https://github.com/sckott/extcite/blob/master/CHANGELOG.md
data/Rakefile ADDED
@@ -0,0 +1,41 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList['test/test-*.rb']
7
+ t.verbose = true
8
+ end
9
+
10
+ desc "Run tests"
11
+ task :default => :test
12
+
13
+ desc "Build extcite docs"
14
+ task :docs do
15
+ system "yardoc"
16
+ end
17
+
18
+ desc "bundle install"
19
+ task :bundle do
20
+ system "bundle install"
21
+ end
22
+
23
+ desc "clean out builds"
24
+ task :clean do
25
+ system "ls | grep [0-9].gem | xargs rm"
26
+ end
27
+
28
+ desc "Build extcite"
29
+ task :build do
30
+ system "gem build extcite.gemspec"
31
+ end
32
+
33
+ desc "Install extcite"
34
+ task :install => [:bundle, :build] do
35
+ system "gem install extcite-#{Extcite::VERSION}.gem"
36
+ end
37
+
38
+ desc "Release to Rubygems"
39
+ task :release => :build do
40
+ system "gem push extcite-#{Extcite::VERSION}.gem"
41
+ end
data/bin/extcite ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "thor"
4
+
5
+ class Dz < Thor
6
+ include Thor::Actions
7
+ require 'extcite'
8
+
9
+ desc "extract STRING", "Get bib data from PDFs"
10
+ # method_option :path => :string
11
+ def extract(tt)
12
+ tt = "#{tt}"
13
+ Extcite.extract(path: tt)
14
+ end
15
+ end
16
+
17
+ Dz.start(ARGV)
data/extcite.gemspec ADDED
@@ -0,0 +1,36 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'extcite/version'
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = 'extcite'
8
+ s.version = Extcite::VERSION
9
+ s.date = '2017-04-06'
10
+ s.summary = "Citations from PDFs"
11
+ s.description = "Gets DOIS and generates citations for your papers"
12
+ s.authors = "Scott Chamberlain"
13
+ s.email = 'myrmecocystus@gmail.com'
14
+ s.homepage = 'http://github.com/sckott/extcite'
15
+ s.licenses = 'MIT'
16
+
17
+ s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
18
+ s.require_paths = ["lib"]
19
+
20
+ s.bindir = 'bin'
21
+ s.executables = ['extcite']
22
+
23
+ s.add_development_dependency 'bundler', '~> 1.14', '>= 1.14.6'
24
+ s.add_development_dependency 'rake', '~> 12.0', '>= 12.0.0'
25
+ s.add_development_dependency 'test-unit', '~> 3.2', '>= 3.2.1'
26
+ s.add_development_dependency 'simplecov', '~> 0.14.1'
27
+ s.add_development_dependency 'codecov', '~> 0.1.10'
28
+
29
+ s.add_runtime_dependency 'faraday', '~> 0.12.0.1'
30
+ s.add_runtime_dependency 'faraday_middleware', '~> 0.11.0.1'
31
+ s.add_runtime_dependency 'thor', '~> 0.19.4'
32
+ s.add_runtime_dependency 'oga', '~> 2.2'
33
+ s.add_runtime_dependency 'serrano', '~> 0.3.6'
34
+ s.add_runtime_dependency 'bibtex-ruby', '~> 4.4'
35
+ s.add_runtime_dependency 'pdf-reader', '~> 2.0'
36
+ end
data/extra/fetch.rb ADDED
@@ -0,0 +1,49 @@
1
+ module Textminer
2
+ class Fetch #:nodoc:
3
+ attr_accessor :doi, :type
4
+
5
+ def initialize(doi, type)
6
+ self.doi = doi
7
+ self.type = type
8
+ end
9
+
10
+ def fetchtext
11
+ lks = Textminer.links(self.doi)
12
+ lk = pick_link(lks)
13
+ case self.type
14
+ when "xml"
15
+ # HTTParty.get(lk)
16
+ coll = []
17
+ Array(lk).each do |x|
18
+ coll << HTTParty.get(x)
19
+ end
20
+ return coll
21
+ when "pdf"
22
+ serialize_pdf(lk, self.doi)
23
+ end
24
+ end
25
+
26
+ private
27
+
28
+ def pick_link(x)
29
+ case self.type
30
+ when "xml"
31
+ x.xml
32
+ when "pdf"
33
+ x.pdf
34
+ else
35
+ puts "type must be xml or pdf"
36
+ end
37
+ end
38
+
39
+ def serialize_pdf(x, y)
40
+ path = "/Users/sacmac/.textminer/" + y.gsub('/', '_') + ".pdf"
41
+ File.open(path, "wb") do |f|
42
+ f.write HTTParty.get(x).parsed_response
43
+ end
44
+
45
+ return path
46
+ end
47
+
48
+ end
49
+ end
@@ -0,0 +1,17 @@
1
+ ##
2
+ # Fetch the full text of an article by DOI (delegates to Fetch#fetchtext)
3
+ #
4
+ # @param doi [String, Array<String>] One or more DOIs (digital object identifiers)
5
+ # @param type [String] Format to download: 'xml' (default) or 'pdf'
6
+ #
7
+ # @example
8
+ # require 'textminer'
9
+ # # fetch full text by DOI - xml by default
10
+ # Textminer.fetch("10.3897/phytokeys.42.7604")
11
+ # # many DOIs - xml output
12
+ # res = Textminer.fetch(["10.3897/phytokeys.42.7604", "10.3897/zookeys.516.9439"])
13
+ # # fetch full text - pdf
14
+ # Textminer.fetch("10.3897/phytokeys.42.7604", "pdf")
15
+ def self.fetch(doi, type = 'xml')
16
+ Fetch.new(doi, type).fetchtext
17
+ end
@@ -0,0 +1,8 @@
1
+ # Array methods
2
+ class Array
3
+ def write_bib(file)
4
+ File.open(file, 'a') do |f|
5
+ f.puts self
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,32 @@
1
+ require 'oga'
2
+ require 'bibtex'
3
+
4
+ # String methods
5
+ class String
6
+ def write_bib(file)
7
+ File.open(file, 'a') do |f|
8
+ f.puts self
9
+ end
10
+ end
11
+ end
12
+
13
+ class String
14
+ def make_bib_arxiv(id)
15
+ # prep xml
16
+ xml = Oga.parse_xml(self)
17
+ # author = xml.xpath('//author//name')[0].text.downcase.gsub(/\s|\./, '_')
18
+ year = DateTime.strptime(xml.xpath('//updated')[0].text).year
19
+
20
+ # make bib citation
21
+ bib = BibTeX::Bibliography.new
22
+ bib << BibTeX::Entry.new({
23
+ :bibtex_type => :article,
24
+ :url => xml.xpath('//entry/id').text,
25
+ :author => xml.xpath('//author//name').collect { |x| x.text }.join(' and '),
26
+ :eprint => id,
27
+ :title => xml.xpath('//entry//title').text,
28
+ :year => year
29
+ })
30
+ return bib.to_s
31
+ end
32
+ end
@@ -0,0 +1,47 @@
1
+ require "oga"
2
+
3
+ def singlearray2hash(x)
4
+ if x.length == 1 && x.class == Array
5
+ return x[0]
6
+ else
7
+ return x
8
+ end
9
+ end
10
+
11
+ def dir_files(x)
12
+ Dir.entries(x).select { |entry|
13
+ !File.directory? File.join(x, entry) and !(entry =='.' || entry == '..')
14
+ }.map { |z|
15
+ x + '/' + z
16
+ }
17
+ end
18
+
19
+ def make_paths(x)
20
+ path = Array(x)
21
+ if path.length == 1
22
+ # if a directory
23
+ if File.directory?(path[0])
24
+ # keep only files with .pdf extension
25
+ path = dir_files(path[0]).keep_if { |z| !!z.match(/.pdf/) }
26
+ end
27
+ end
28
+
29
+ # check that files exist
30
+ path.each do |z|
31
+ if !File.exist?(z)
32
+ raise z + ' not found'
33
+ end
34
+ end
35
+
36
+ return path
37
+ end
38
+
39
+ def pdf_doi(x)
40
+ xml = Oga.parse_xml(x)
41
+ begin
42
+ tt = xml.xpath('//rdf:Description')
43
+ return tt.attr('dc:identifier')[0].text.sub(/doi:/, '')
44
+ rescue
45
+ return nil
46
+ end
47
+ end
@@ -0,0 +1,3 @@
1
+ module Extcite
2
+ VERSION = "0.1.0"
3
+ end
data/lib/extcite.rb ADDED
@@ -0,0 +1,224 @@
1
+ require "extcite/utils"
2
+ require "extcite/methods_array"
3
+ require "extcite/methods_string"
4
+ require "extcite/version"
5
+
6
+
7
+ require 'serrano'
8
+ require 'pdf-reader'
9
+ require 'faraday'
10
+
11
+
12
+
13
+ module Extcite
14
+ ##
15
+ # Extract DOIs from one or more PDFs and fetch a citation for each
16
+ #
17
+ # @param path [String] Path to a pdf file, or a folder of PDF files
18
+ # @param file [String] File name to append citations to, or nil to return them instead
19
+ # @param output [String] Type of output. Only bibtex for now
20
+ #
21
+ # Return: appends bib entries to a .bib file, or returns them as an array if file is nil
22
+ # When writing to a file, `extract` by default appends to the end
23
+ # of the file so you can build up your bibtex file with your
24
+ # citations
25
+ #
26
+ # @example
27
+ # require 'extcite'
28
+ # require 'faraday'
29
+ # # get a paper in pdf format
30
+ # path = '2068.pdf'
31
+ # res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
32
+ # f = File.new(path, "wb")
33
+ # f.write(res.body)
34
+ # f.close()
35
+ # # extract doi from the pdf
36
+ # Extcite.extract(path: path)
37
+ # Extcite.extract(path: path, file: nil)
38
+ def self.extract(path:, file: "out.bib", output: "bib")
39
+ path = make_paths(path)
40
+ path.each do |x|
41
+ # try PDF metadata first
42
+ ids = nil
43
+ rr = PDF::Reader.new(x)
44
+ pdfmeta = rr.metadata
45
+ if !pdfmeta.nil?
46
+ xml = Oga.parse_xml(pdfmeta)
47
+ begin
48
+ tt = xml.xpath('//rdf:Description')
49
+ # try dc:identifier attribute
50
+ ss = tt.attr('dc:identifier')[0]
51
+ if !ss.nil?
52
+ ids = ss.text.sub(/doi:/, '')
53
+ else
54
+ # try prism:doi node
55
+ pdoi = xml.xpath('//rdf:Description//prism:doi')
56
+ if pdoi.length == 1
57
+ ids = pdoi.text
58
+ else
59
+ # try pdf:WPS-ARTICLEDOI node
60
+ wpsdoi = xml.xpath('//rdf:Description//pdf:WPS-ARTICLEDOI')
61
+ if wpsdoi.length == 1
62
+ ids = wpsdoi.text
63
+ else
64
+ # try pdfx:WPS-ARTICLEDOI node
65
+ pdfxwpsdoi = xml.xpath('//rdf:Description//pdfx:WPS-ARTICLEDOI')
66
+ if pdfxwpsdoi.length == 1
67
+ ids = pdfxwpsdoi.text
68
+ else
69
+ ids = nil
70
+ end
71
+ end
72
+ end
73
+ end
74
+ rescue
75
+ ids = nil
76
+ end
77
+ end
78
+
79
+ # if not found, try regexing for DOI
80
+ if ids.nil?
81
+ ids = Extcite.get_ids(txt: Extcite.extract_text_one(x))
82
+ end
83
+
84
+ if ids.length == 0
85
+ puts "no DOI found in " + x
86
+ else
87
+ if !ids.match(/arxiv/i).nil? && ids.length < 200
88
+ conn = Faraday.new(:url => 'http://export.arxiv.org/api/query?id_list=' + ids.gsub(/arxiv:/i, '')).get
89
+ bibs = conn.body.make_bib_arxiv(ids.gsub(/arxiv:/i, ''))
90
+ else
91
+ bibs = Extcite.cont_neg(ids: ids)
92
+ end
93
+
94
+ # if an error or not found, skip
95
+ bibstest = nil
96
+ if bibs.class == Array
97
+ bibstest = bibs[0]
98
+ end
99
+
100
+ if !bibstest.nil?
101
+ if !bibstest.match(/error|not found/i).nil? || !bibstest.match(/<\/html>/i).nil?
102
+ puts "DOI found: " + ids + " ; but citation not found via content negotation - passing"
103
+ # do something else?
104
+ else
105
+ if file.nil?
106
+ return bibs
107
+ else
108
+ puts "writing " + ids + " to " + file
109
+ bibs.write_bib(file)
110
+ end
111
+ end
112
+ end
113
+ end
114
+ end
115
+ end
116
+
117
+ ##
118
+ # Extract DOIs from one or more PDFs after extracting text
119
+ #
120
+ # @param path [String] Path to a pdf file, or a folder of PDF files
121
+ #
122
+ # @example
123
+ # require 'extcite'
124
+ # require 'faraday'
125
+ # # get a paper in pdf format
126
+ # path = '2068.pdf'
127
+ # res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
128
+ # f = File.new(path, "wb")
129
+ # f.write(res.body)
130
+ # f.close()
131
+ # # extract doi from the pdf
132
+ # Extcite.extract_dois(path: path)
133
+ def self.extract_dois(path:)
134
+ txt = Extcite.extract_text(path: path)
135
+ return txt.map { |z| z.match("[0-9]+\\.[0-9]+/.+").to_s.gsub(/\s.+/, '') }
136
+ end
137
+
138
+ ##
139
+ # Get a DOI or arXiv id from a String or Array of Strings
140
+ #
141
+ # @param txt [String, Array<String>] Text to search
142
+ #
143
+ # Return: String with the DOI or arXiv id found in the first element of txt (empty if none found)
144
+ #
145
+ # @example
146
+ # require 'extcite'
147
+ # Extcite.get_ids(txt: '10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd')
148
+ def self.get_ids(txt:)
149
+ # look for an arXiv id first; otherwise fall back to a DOI
150
+
151
+ return Array(txt).map { |z|
152
+ # detect if is an arxiv paper
153
+ if !z.match(/arxiv:[0-9]+\.[0-9A-Za-z]+/i).nil?
154
+ # if so, return arxiv id for later extraction of arxiv citation via their API
155
+ z = z.match(/arxiv:[0-9]+\.[0-9A-Za-z]+/i).to_s
156
+ else
157
+ doi_pattern = '(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)'
158
+ z = z.match(doi_pattern).to_s.gsub(/\s.+/, '')
159
+ # z = z.match("10\\.[0-9]+/.+").to_s.gsub(/\s.+/, '')
160
+ end
161
+ # clean up doi
162
+ z = z.gsub(/\.$|\.;$|\.\]$|\.\}$|\.\)$|,$/, '')
163
+ return z.gsub(/;$|\]$|\}$|\)$/, '')
164
+ }[0]
165
+ end
166
+
167
+ ##
168
+ # Extract text from a pdf, or many pdfs
169
+ #
170
+ # @param path [String] Path to a pdf file, or a folder of PDF files
171
+ #
172
+ # This method is used internally by extract_dois to read the text of PDFs.
173
+ #
174
+ # @example
175
+ # require 'extcite'
176
+ # require 'faraday'
177
+ # # get a paper in pdf format
178
+ # path = '2068.pdf'
179
+ # res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
180
+ # f = File.new(path, "wb")
181
+ # f.write(res.body)
182
+ # f.close()
183
+ # # extract text from the pdf
184
+ # Extcite.extract_text(path: path)
185
+ def self.extract_text(path:)
186
+ path = Array(path)
187
+ if path.length == 1
188
+ if File.directory?(path[0])
189
+ # keep only files with .pdf extension
190
+ path = dir_files(path[0]).keep_if { |z| !!z.match(/.pdf/) }
191
+ end
192
+ end
193
+
194
+ out = []
195
+ path.each do |x|
196
+ rr = PDF::Reader.new(x)
197
+ out << rr.pages.map { |page| page.text }.join("\n")
198
+ end
199
+ return out
200
+ end
201
+
202
+ ##
203
+ # Get citation(s) using Crossref content negotiation
204
+ #
205
+ # @param ids [String, Array<String>] One or more DOIs
206
+ #
207
+ # Return: an array of bib data
208
+ #
209
+ # @example
210
+ # require 'extcite'
211
+ # Extcite.cont_neg(ids: "10.1016/j.dendro.2014.01.004")
212
+ def self.cont_neg(ids:)
213
+ out = Serrano.content_negotiation(ids: ids)
214
+ return out
215
+ end
216
+
217
+ protected
218
+
219
+ def self.extract_text_one(x)
220
+ rr = PDF::Reader.new(x)
221
+ return rr.pages.map { |page| page.text }.join("\n")
222
+ end
223
+
224
+ end
metadata ADDED
@@ -0,0 +1,246 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extcite
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Scott Chamberlain
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-04-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.14.6
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.14'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.14.6
33
+ - !ruby/object:Gem::Dependency
34
+ name: rake
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '12.0'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 12.0.0
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '12.0'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 12.0.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: test-unit
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '3.2'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 3.2.1
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '3.2'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 3.2.1
73
+ - !ruby/object:Gem::Dependency
74
+ name: simplecov
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: 0.14.1
80
+ type: :development
81
+ prerelease: false
82
+ version_requirements: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: 0.14.1
87
+ - !ruby/object:Gem::Dependency
88
+ name: codecov
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: 0.1.10
94
+ type: :development
95
+ prerelease: false
96
+ version_requirements: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: 0.1.10
101
+ - !ruby/object:Gem::Dependency
102
+ name: faraday
103
+ requirement: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - "~>"
106
+ - !ruby/object:Gem::Version
107
+ version: 0.12.0.1
108
+ type: :runtime
109
+ prerelease: false
110
+ version_requirements: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - "~>"
113
+ - !ruby/object:Gem::Version
114
+ version: 0.12.0.1
115
+ - !ruby/object:Gem::Dependency
116
+ name: faraday_middleware
117
+ requirement: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - "~>"
120
+ - !ruby/object:Gem::Version
121
+ version: 0.11.0.1
122
+ type: :runtime
123
+ prerelease: false
124
+ version_requirements: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - "~>"
127
+ - !ruby/object:Gem::Version
128
+ version: 0.11.0.1
129
+ - !ruby/object:Gem::Dependency
130
+ name: thor
131
+ requirement: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - "~>"
134
+ - !ruby/object:Gem::Version
135
+ version: 0.19.4
136
+ type: :runtime
137
+ prerelease: false
138
+ version_requirements: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - "~>"
141
+ - !ruby/object:Gem::Version
142
+ version: 0.19.4
143
+ - !ruby/object:Gem::Dependency
144
+ name: oga
145
+ requirement: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - "~>"
148
+ - !ruby/object:Gem::Version
149
+ version: '2.2'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - "~>"
155
+ - !ruby/object:Gem::Version
156
+ version: '2.2'
157
+ - !ruby/object:Gem::Dependency
158
+ name: serrano
159
+ requirement: !ruby/object:Gem::Requirement
160
+ requirements:
161
+ - - "~>"
162
+ - !ruby/object:Gem::Version
163
+ version: 0.3.6
164
+ type: :runtime
165
+ prerelease: false
166
+ version_requirements: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - "~>"
169
+ - !ruby/object:Gem::Version
170
+ version: 0.3.6
171
+ - !ruby/object:Gem::Dependency
172
+ name: bibtex-ruby
173
+ requirement: !ruby/object:Gem::Requirement
174
+ requirements:
175
+ - - "~>"
176
+ - !ruby/object:Gem::Version
177
+ version: '4.4'
178
+ type: :runtime
179
+ prerelease: false
180
+ version_requirements: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - "~>"
183
+ - !ruby/object:Gem::Version
184
+ version: '4.4'
185
+ - !ruby/object:Gem::Dependency
186
+ name: pdf-reader
187
+ requirement: !ruby/object:Gem::Requirement
188
+ requirements:
189
+ - - "~>"
190
+ - !ruby/object:Gem::Version
191
+ version: '2.0'
192
+ type: :runtime
193
+ prerelease: false
194
+ version_requirements: !ruby/object:Gem::Requirement
195
+ requirements:
196
+ - - "~>"
197
+ - !ruby/object:Gem::Version
198
+ version: '2.0'
199
+ description: Gets DOIS and generates citations for your papers
200
+ email: myrmecocystus@gmail.com
201
+ executables:
202
+ - extcite
203
+ extensions: []
204
+ extra_rdoc_files: []
205
+ files:
206
+ - ".gitignore"
207
+ - ".travis.yml"
208
+ - CHANGELOG.md
209
+ - Gemfile
210
+ - Gemfile.lock
211
+ - README.md
212
+ - Rakefile
213
+ - bin/extcite
214
+ - extcite.gemspec
215
+ - extra/fetch.rb
216
+ - extra/fetch_method.rb
217
+ - lib/extcite.rb
218
+ - lib/extcite/methods_array.rb
219
+ - lib/extcite/methods_string.rb
220
+ - lib/extcite/utils.rb
221
+ - lib/extcite/version.rb
222
+ homepage: http://github.com/sckott/extcite
223
+ licenses:
224
+ - MIT
225
+ metadata: {}
226
+ post_install_message:
227
+ rdoc_options: []
228
+ require_paths:
229
+ - lib
230
+ required_ruby_version: !ruby/object:Gem::Requirement
231
+ requirements:
232
+ - - ">="
233
+ - !ruby/object:Gem::Version
234
+ version: '0'
235
+ required_rubygems_version: !ruby/object:Gem::Requirement
236
+ requirements:
237
+ - - ">="
238
+ - !ruby/object:Gem::Version
239
+ version: '0'
240
+ requirements: []
241
+ rubyforge_project:
242
+ rubygems_version: 2.6.8
243
+ signing_key:
244
+ specification_version: 4
245
+ summary: Citations from PDFs
246
+ test_files: []