serrano 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +36 -0
- data/.travis.yml +4 -0
- data/CONDUCT.md +25 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +56 -0
- data/LICENSE +7 -0
- data/NEWS.md +3 -0
- data/README.md +128 -0
- data/Rakefile +41 -0
- data/lib/serrano.rb +440 -0
- data/lib/serrano/cn.rb +30 -0
- data/lib/serrano/cnrequest.rb +83 -0
- data/lib/serrano/constants.rb +36 -0
- data/lib/serrano/errors.rb +67 -0
- data/lib/serrano/filterhandler.rb +58 -0
- data/lib/serrano/filters.rb +84 -0
- data/lib/serrano/helpers/configuration.rb +26 -0
- data/lib/serrano/mine_utils.rb +65 -0
- data/lib/serrano/mined.rb +31 -0
- data/lib/serrano/miner.rb +42 -0
- data/lib/serrano/request.rb +99 -0
- data/lib/serrano/version.rb +3 -0
- data/serrano.gemspec +35 -0
- metadata +253 -0
data/lib/serrano/cn.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require "serrano/version"
|
2
|
+
require "serrano/cnrequest"
|
3
|
+
|
4
|
+
##
|
5
|
+
# ContentNegotiation - Content Negotiation class
|
6
|
+
#
|
7
|
+
# @see http://www.crosscite.org/cn/ for details
|
8
|
+
module Serrano

  # Holds the parameters of a content negotiation lookup (identifiers,
  # output format, citation style and locale) and hands them to
  # Serrano::CNRequest when #cn is called.
  class ContentNegotiation

    attr_accessor :ids, :format, :style, :locale

    # ids    - identifier(s) to look up
    # format - output format name (default "bibtex")
    # style  - citation style name (default "apa")
    # locale - locale name (default "en-US")
    def initialize(ids, format = "bibtex", style = "apa", locale = "en-US")
      self.ids, self.format, self.style, self.locale = ids, format, style, locale
    end

    # Builds a CNRequest from the current attribute values and returns
    # whatever its #perform returns.
    def cn
      request = CNRequest.new(self.ids, self.format, self.style, self.locale)
      request.perform
    end

  end

end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require "faraday"
|
2
|
+
require "faraday_middleware"
|
3
|
+
require "multi_json"
|
4
|
+
require "serrano/errors"
|
5
|
+
require "serrano/constants"
|
6
|
+
require 'serrano/helpers/configuration'
|
7
|
+
|
8
|
+
##
|
9
|
+
# Serrano::CNRequest
|
10
|
+
#
|
11
|
+
# Class to perform DOI content negotiation HTTP requests (via dx.doi.org, or the Crossref API for citeproc-json)
|
12
|
+
module Serrano
  class CNRequest #:nodoc:

    attr_accessor :ids
    attr_accessor :format
    attr_accessor :style
    attr_accessor :locale

    # ids    - a single identifier String or an Array of identifiers
    # format - one of the names listed in $cn_formats
    # style  - citation style name
    # locale - locale name
    def initialize(ids, format, style, locale)
      self.ids = ids
      self.format = format
      self.style = style
      self.locale = locale
    end

    # Validates the format, (re)creates the shared $conn connection that
    # make_request reads, and issues one request per identifier.
    #
    # Returns the single response body when +ids+ has length 1, otherwise
    # an Array with one body per identifier.
    # Raises RuntimeError when +format+ is not listed in $cn_formats.
    def perform
      unless $cn_formats.include? self.format
        raise "format not one of accepted types"
      end

      $conn = Faraday.new "http://dx.doi.org/" do |c|
        c.use FaradayMiddleware::FollowRedirects
        c.adapter :net_http
      end

      if self.ids.length == 1
        # A one-element Array must be unwrapped: make_request concatenates
        # the identifier into a String and cannot take an Array.
        single = self.ids.is_a?(Array) ? self.ids.first : self.ids
        make_request(single, self.format, self.style, self.locale)
      else
        Array(self.ids).map do |id|
          make_request(id, self.format, self.style, self.locale)
        end
      end
    end
  end
end
|
49
|
+
|
50
|
+
# Performs one content negotiation request and returns the response body.
#
# ids    - a single identifier String
# format - format name used to look up the Accept media type in
#          $cn_format_headers
# style  - citation style, only used when format is "text"
# locale - locale, only used when format is "text"
#
# For "citeproc-json" the Crossref works endpoint is queried directly;
# every other format goes through the shared $conn connection with an
# Accept header.
def make_request(ids, format, style, locale)
  # Exact key lookup: a substring scan over the keys would depend on hash
  # ordering as soon as one format name is a prefix of another.
  type = $cn_format_headers[format]

  if format == "citeproc-json"
    endpt = "http://api.crossref.org/works/" + ids + "/" + type
    res = Faraday.new(:url => endpt).get
  else
    if format == "text"
      type = type + "; style = " + style + "; locale = " + locale
    end

    res = $conn.get do |req|
      req.url ids
      req.headers['Accept'] = type
    end
  end

  res.body
end
|
70
|
+
|
71
|
+
# parser <- cn_types[[self.format]]
|
72
|
+
# if (raw) {
|
73
|
+
# content(response, "text")
|
74
|
+
# } else {
|
75
|
+
# out <- content(response, "parsed", parser, "UTF-8")
|
76
|
+
# if (format == "text") {
|
77
|
+
# out <- gsub("\n", "", out)
|
78
|
+
# }
|
79
|
+
# if (format == "bibentry") {
|
80
|
+
# out <- parse_bibtex(out)
|
81
|
+
# }
|
82
|
+
# out
|
83
|
+
# }
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
# Exception classes grouped under one name so callers can rescue them
# together.
NETWORKABLE_EXCEPTIONS = [
  Faraday::Error::ClientError,
  URI::InvalidURIError,
  Encoding::UndefinedConversionError,
  ArgumentError,
  NoMethodError,
  TypeError
]
|
9
|
+
|
10
|
+
# Format names accepted by Serrano::CNRequest#perform.
$cn_formats = %w[
  rdf-xml
  turtle
  citeproc-json
  citeproc-json-ish
  text
  ris
  bibtex
  crossref-xml
  datacite-xml
  bibentry
  crossref-tdm
]
|
14
|
+
|
15
|
+
# Accept media type sent for each content negotiation format name.
# Every name in $cn_formats needs an entry here, otherwise the request is
# sent with a nil Accept header.
$cn_format_headers = {
  "rdf-xml" => "application/rdf+xml",
  "turtle" => "text/turtle",
  "citeproc-json" => "transform/application/vnd.citationstyles.csl+json",
  # Kept after "citeproc-json" so lookups that scan keys in insertion
  # order still resolve "citeproc-json" to the entry above.
  "citeproc-json-ish" => "application/vnd.citationstyles.csl+json",
  "text" => "text/x-bibliography",
  "ris" => "application/x-research-info-systems",
  "bibtex" => "application/x-bibtex",
  "crossref-xml" => "application/vnd.crossref.unixref+xml",
  "datacite-xml" => "application/vnd.datacite.datacite+xml",
  "bibentry" => "application/x-bibtex",
  "crossref-tdm" => "application/vnd.crossref.unixsd+xml"
}
|
25
|
+
|
26
|
+
# Media type associated with each content negotiation format name.
$cn_types = {
  "rdf-xml"           => "text/xml",
  "turtle"            => "text/plain",
  "citeproc-json"     => "application/json",
  "citeproc-json-ish" => "application/json",
  "text"              => "text/plain",
  "ris"               => "text/plain",
  "bibtex"            => "text/plain",
  "crossref-xml"      => "text/xml",
  "datacite-xml"      => "text/xml",
  "bibentry"          => "text/plain",
  "crossref-tdm"      => "text/xml"
}
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
# Turns an error raised during a request into a Hash describing it.
#
# url     - the URL that was requested; appended to the message
# error   - the rescued error object
# options - optional Hash; options[:data][:rev] is appended to the message
#           for 409 Conflict errors
#
# Returns { error: <message>, status: <HTTP status Integer> }.
def rescue_faraday_error(url, error, options={})
  details = nil

  if error.is_a?(Faraday::Error::TimeoutError)
    status = 408
  elsif error.respond_to?('status')
    # Call the reader: exceptions do not implement #[].
    status = error.status
  elsif error.respond_to?('response') &&
        (response = error.response) &&
        !(response.respond_to?(:empty?) && response.empty?)
    # Plain-Ruby presence check so ActiveSupport's #present? is not needed.
    status = response[:status]
    details = response[:body]
  else
    status = 400
  end

  # Some sources use a different status for rate-limiting errors.
  # details is nil unless a response body was available, hence to_s.
  status = 429 if status == 403 && details.to_s.include?("Excessive use detected")

  class_name = class_name_by_status(status) || error.class

  message = parse_error_response(error.message)
  message = "#{message} for #{url}"
  if class_name == Net::HTTPConflict
    # options[:data] may be missing; do not crash while building the message.
    rev = (options[:data] || {})[:rev]
    message = "#{message} with rev #{rev}"
  end

  { error: message, status: status }
end
|
36
|
+
|
37
|
+
# Extracts a readable message from an error string.
#
# string - the raw error message, possibly a JSON document
#
# When the string is valid JSON it is replaced by the parsed value, and a
# parsed Hash with a truthy 'error' entry is reduced to that entry. A
# string that is not JSON, or that parses to nil/false, is returned as is.
def parse_error_response(string)
  parsed = begin
    MultiJson.load(string)
  rescue MultiJson::ParseError
    # Not JSON: keep the plain-text message.
    nil
  end

  string = parsed if parsed
  string = string['error'] if string.is_a?(Hash) && string['error']
  string
end
|
44
|
+
|
45
|
+
# Reports whether +string+ can be parsed as JSON by MultiJson.
#
# Returns true when parsing succeeds and false when MultiJson raises a
# ParseError. Returning the rescued error's cause, as before, is truthy
# and made every non-JSON string look like JSON.
def is_json?(string)
  MultiJson.load(string)
  true
rescue MultiJson::ParseError
  false
end
|
51
|
+
|
52
|
+
# Looks up the Net::HTTP response class for an HTTP status code.
#
# status - Integer status code
#
# Returns the matching Net::HTTP class, or nil for codes not in the table.
def class_name_by_status(status)
  response_classes = {
    400 => Net::HTTPBadRequest,
    401 => Net::HTTPUnauthorized,
    403 => Net::HTTPForbidden,
    404 => Net::HTTPNotFound,
    406 => Net::HTTPNotAcceptable,
    408 => Net::HTTPRequestTimeOut,
    409 => Net::HTTPConflict,
    417 => Net::HTTPExpectationFailed,
    429 => Net::HTTPTooManyRequests,
    500 => Net::HTTPInternalServerError,
    502 => Net::HTTPBadGateway,
    503 => Net::HTTPServiceUnavailable,
    504 => Net::HTTPGatewayTimeOut
  }
  response_classes[status]
end
|
67
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# helper functions
|
2
|
+
module Serrano
  class Request #:nodoc:

    private

    # Filter names whose API spelling uses a dot instead of an underscore.
    $others = ['license_url','license_version','license_delay','full_text_version','full_text_type',
      'award_number','award_funder']

    # Converts a Hash of filters into a "name:value,name:value" String.
    #
    # Names listed in $others are mapped to their dotted form, then every
    # remaining underscore becomes a hyphen. Returns nil when x is nil.
    def filter_handler(x = nil)
      return nil if x.nil?

      dotted = {
        'license_url' => 'license.url',
        'license_version' => 'license.version',
        'license_delay' => 'license.delay',
        'full_text_version' => 'full-text.version',
        'full_text_type' => 'full-text.type',
        'award_number' => 'award.number',
        'award_funder' => 'award.funder'
      }

      filters = stringify(x)
      api_names = filters.keys.map do |key|
        name = key.to_s
        name = dotted[name] if $others.include? name
        name.gsub("_", "-")
      end

      rename_keys(filters, api_names).map { |pair| pair.join(":") }.join(',')
    end

    # Returns a copy of x whose keys have been converted with to_s.
    def stringify(x)
      x.map { |key, value| [key.to_s, value] }.to_h
    end

    # Returns a Hash pairing each name in y with the value at the same
    # position in x.
    def rename_keys(x, y)
      Hash[y.zip(x.values)]
    end

  end

end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
##
|
2
|
+
# Serrano::Filters
|
3
|
+
#
|
4
|
+
# Information on Crossref API filters
|
5
|
+
#
|
6
|
+
# @example
|
7
|
+
# # List filter names
|
8
|
+
# Serrano::Filters.names
|
9
|
+
# # List filter values and description
|
10
|
+
# Serrano::Filters.filters
|
11
|
+
# Serrano::Filters.filters['has_funder']
|
12
|
+
# Serrano::Filters.filters['has_funder']['description']
|
13
|
+
module Serrano
  module Filters
    class << self
      # Returns the global list of filter names ($filter_list).
      def names
        $filter_list
      end

      # Returns the global Hash of per-filter details ($filter_details).
      def filters
        $filter_details
      end
    end
  end
end
|
24
|
+
|
25
|
+
# Names of the filters exposed through Serrano::Filters.names.
# 'award_number' and 'award_funder' are listed because $filter_details
# documents them and the filter handler translates them.
$filter_list = [
  'has_funder','funder','prefix','member','from_index_date','until_index_date',
  'from_deposit_date','until_deposit_date','from_update_date','until_update_date',
  'from_first_deposit_date','until_first_deposit_date','from_pub_date','until_pub_date',
  'has_license','license_url','license_version','license_delay','has_full_text',
  'full_text_version','full_text_type','public_references','has_references','has_archive',
  'archive','has_orcid','orcid','issn','type','directory','doi','updates','is_update',
  'has_update_policy','container_title','publisher_name','category_name','type_name',
  'from_created_date', 'until_created_date', 'affiliation', 'has_affiliation',
  'assertion_group', 'assertion', 'article_number', 'alternative_id',
  'award_number', 'award_funder'
]
|
36
|
+
|
37
|
+
# Details for each filter, keyed by filter name. Every entry is a Hash
# with "possible_values" (a placeholder String or nil) and "description".
#
# NOTE(review): several descriptions contain what look like typos
# ("nunber_", "exacty", "elements_", an unclosed "(in days"). They are
# runtime strings and are reproduced verbatim here.
filter_entry = lambda do |possible_values, description|
  { "possible_values" => possible_values, "description" => description }
end

$filter_details = {
  "has_funder" => filter_entry.call(nil, "metadata which includes one or more funder entry"),
  "funder" => filter_entry.call("{funder_id}", "metadata which include the {funder_id} in FundRef data"),
  "prefix" => filter_entry.call("{owner_prefix}", "metadata belonging to a DOI owner prefix {owner_prefix} (e.g. '10.1016' )"),
  "member" => filter_entry.call("{member_id}", "metadata belonging to a CrossRef member"),
  "from_index_date" => filter_entry.call('{date}', "metadata indexed since (inclusive) {date}"),
  "until_index_date" => filter_entry.call('{date}', "metadata indexed before (inclusive) {date}"),
  "from_deposit_date" => filter_entry.call('{date}', "metadata last (re)deposited since (inclusive) {date}"),
  "until_deposit_date" => filter_entry.call('{date}', "metadata last (re)deposited before (inclusive) {date}"),
  "from_update_date" => filter_entry.call('{date}', "Metadata updated since (inclusive) {date} Currently the same as 'from_deposit_date'"),
  "until_update_date" => filter_entry.call('{date}', "Metadata updated before (inclusive) {date} Currently the same as 'until_deposit_date'"),
  "from_created_date" => filter_entry.call('{date}', "metadata first deposited since (inclusive) {date}"),
  "until_created_date" => filter_entry.call('{date}', "metadata first deposited before (inclusive) {date}"),
  "from_pub_date" => filter_entry.call('{date}', "metadata where published date is since (inclusive) {date}"),
  "until_pub_date" => filter_entry.call('{date}', "metadata where published date is before (inclusive) {date}"),
  "has_license" => filter_entry.call(nil, "metadata that includes any '<license_ref>' elements"),
  "license_url" => filter_entry.call('{url}', "metadata where '<license_ref>' value equals {url}"),
  "license_version" => filter_entry.call('{string}', "metadata where the '<license_ref>''s 'applies_to' attribute is '{string}'"),
  "license_delay" => filter_entry.call("{integer}", "metadata where difference between publication date and the '<license_ref>''s 'start_date' attribute is <= '{integer}' (in days"),
  "has_full_text" => filter_entry.call(nil, "metadata that includes any full text '<resource>' elements_"),
  "full_text_version" => filter_entry.call('{string}', "metadata where '<resource>' element's 'content_version' attribute is '{string}'"),
  "full_text_type" => filter_entry.call('{mime_type}', "metadata where '<resource>' element's 'content_type' attribute is '{mime_type}' (e.g. 'application/pdf')"),
  "public_references" => filter_entry.call(nil, "metadata where publishers allow references to be distributed publically"),
  "has_references" => filter_entry.call(nil, "metadata for works that have a list of references"),
  "has_archive" => filter_entry.call(nil, "metadata which include name of archive partner"),
  "archive" => filter_entry.call('{string}', "metadata which where value of archive partner is '{string}'"),
  "has_orcid" => filter_entry.call(nil, "metadata which includes one or more ORCIDs"),
  "orcid" => filter_entry.call('{orcid}', "metadata where '<orcid>' element's value = '{orcid}'"),
  "issn" => filter_entry.call('{issn}', "metadata where record has an ISSN = '{issn}' Format is 'xxxx_xxxx'."),
  "type" => filter_entry.call('{type}', "metadata records whose type = '{type}' Type must be an ID value from the list of types returned by the '/types' resource"),
  "directory" => filter_entry.call("{directory}", "metadata records whose article or serial are mentioned in the given '{directory}'. Currently the only supported value is 'doaj'"),
  "doi" => filter_entry.call('{doi}', "metadata describing the DOI '{doi}'"),
  "updates" => filter_entry.call('{doi}', "metadata for records that represent editorial updates to the DOI '{doi}'"),
  "is_update" => filter_entry.call(nil, "metadata for records that represent editorial updates"),
  "has_update_policy" => filter_entry.call(nil, "metadata for records that include a link to an editorial update policy"),
  "container_title" => filter_entry.call(nil, "metadata for records with a publication title exactly with an exact match"),
  "publisher_name" => filter_entry.call(nil, "metadata for records with an exact matching publisher name"),
  "category_name" => filter_entry.call(nil, "metadata for records with an exact matching category label"),
  "type_name" => filter_entry.call(nil, "metadata for records with an exacty matching type label"),
  "award_number" => filter_entry.call("{award_number}", "metadata for records with a matching award nunber_ Optionally combine with 'award_funder'"),
  "award_funder" => filter_entry.call('{funder doi or id}', "metadata for records with an award with matching funder. Optionally combine with 'award_number'"),
  "assertion_group" => filter_entry.call(nil, "metadata for records with an assertion in a particular group"),
  "assertion" => filter_entry.call(nil, "metadata for records with a particular named assertion"),
  "affiliation" => filter_entry.call(nil, "metadata for records with at least one contributor with the given affiliation"),
  "has_affiliation" => filter_entry.call(nil, "metadata for records that have any affiliation information"),
  "alternative_id" => filter_entry.call(nil, "metadata for records with the given alternative ID, which may be a publisher_specific ID, or any other identifier a publisher may have provided"),
  "article_number" => filter_entry.call(nil, "metadata for records with a given article number")
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# taken from: https://viget.com/extend/easy-gem-configuration-variables-with-defaults
|
2
|
+
module Configuration

  # Yields the receiver so settings can be assigned inside a block.
  def configuration
    yield self
  end

  # Creates a setting stored in the class variable @@<name>, initialised
  # to +default+, and defines singleton reader and writer methods for it.
  def define_setting(name, default = nil)
    variable = "@@#{name}"
    class_variable_set(variable, default)

    define_class_method("#{name}=") { |value| class_variable_set(variable, value) }
    define_class_method(name) { class_variable_get(variable) }
  end

  private

  # Defines +block+ as a method named +name+ on the receiver's singleton
  # class.
  def define_class_method(name, &block)
    singleton_class.send(:define_method, name, &block)
  end

end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uuidtools'
|
3
|
+
|
4
|
+
# Maps a response's content-type header to a short type name.
#
# x - an object whose #headers returns a Hash-like with 'content-type'
#
# Parameters after the media type (e.g. "; charset=UTF-8") and letter case
# are ignored, so "text/xml; charset=UTF-8" is still detected as 'xml'.
#
# Returns 'xml', 'plain' or 'pdf', or nil for a missing or unrecognised
# content type.
def detect_type(x)
  ctype = x.headers['content-type']
  return nil if ctype.nil?

  media_type = ctype.to_s.split(';').first.to_s.strip.downcase
  case media_type
  when 'text/xml'
    'xml'
  when 'text/plain'
    'plain'
  when 'application/pdf'
    'pdf'
  end
end
|
15
|
+
|
16
|
+
# Maps a short type name ('xml', 'plain', 'pdf') to a file extension.
# Returns nil for any other value.
def make_ext(x)
  extensions = { 'xml' => 'xml', 'plain' => 'txt', 'pdf' => 'pdf' }
  extensions[x]
end
|
26
|
+
|
27
|
+
# Builds a file name of the form "<random uuid>.<extension>".
#
# type - short type name understood by make_ext
def make_path(type)
  extension = make_ext(type)
  UUIDTools::UUID.random_create.to_s + '.' + extension
end
|
36
|
+
|
37
|
+
# Writes a response body to +path+ in binary mode.
#
# res  - an object responding to #body
# path - destination file path
#
# The block form of File.open closes the handle even when the write
# raises. Returns nil, as the previous explicit close did.
def write_disk(res, path)
  File.open(path, "wb") { |f| f.write(res.body) }
  nil
end
|
42
|
+
|
43
|
+
# Returns the full contents of the file at +path+.
def read_disk(path)
  File.open(path) { |file| file.read }
end
|
46
|
+
|
47
|
+
# Reads the file at path +x+ and returns it parsed by Nokogiri.
def parse_xml(x)
  Nokogiri.parse(read_disk(x))
end
|
52
|
+
|
53
|
+
# Returns the text of the file at path +x+, unmodified.
def parse_plain(x)
  read_disk(x)
end
|
57
|
+
|
58
|
+
# PDF parsing is not implemented; always raises RuntimeError.
def parse_pdf(x)
  raise RuntimeError, "not ready yet"
end
|
61
|
+
|
62
|
+
# Returns true when +x+ matches the pattern 'elsevier', false otherwise.
def is_elsevier(x)
  x.match('elsevier') ? true : false
end
|