kishu 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +36 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +222 -0
- data/LICENSE.txt +21 -0
- data/README.md +47 -0
- data/Rakefile +20 -0
- data/bin/kishu +5 -0
- data/kishu.gemspec +54 -0
- data/lib/kishu.rb +30 -0
- data/lib/kishu/base.rb +14 -0
- data/lib/kishu/cli.rb +42 -0
- data/lib/kishu/client.rb +89 -0
- data/lib/kishu/lagotto_job.rb +22 -0
- data/lib/kishu/log.rb +33 -0
- data/lib/kishu/merger.rb +69 -0
- data/lib/kishu/pipeline.rb +29 -0
- data/lib/kishu/report.rb +149 -0
- data/lib/kishu/resolution_event.rb +83 -0
- data/lib/kishu/s3.rb +24 -0
- data/lib/kishu/sushi.rb +59 -0
- data/lib/kishu/usage_event.rb +124 -0
- data/lib/kishu/utils.rb +115 -0
- data/lib/kishu/version.rb +3 -0
- data/spec/factories/default.rb +71 -0
- data/spec/fixtures/vcr_cassettes/Kishu_Sushi/wrap_event/when_doi_doesn_t_exist/should_fail.yml +3867 -0
- data/spec/kishu_spec.rb +9 -0
- data/spec/report_spec.rb +79 -0
- data/spec/resolution_event_spec.rb +80 -0
- data/spec/spec_helper.rb +93 -0
- metadata +400 -0
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'logger'
|
3
|
+
require 'maremma'
|
4
|
+
require 'sucker_punch'
|
5
|
+
|
6
|
+
require_relative 'utils'
|
7
|
+
require_relative 'base'
|
8
|
+
require_relative 'lagotto_job'
|
9
|
+
|
10
|
+
module Kishu
|
11
|
+
class ResolutionEvent
|
12
|
+
|
13
|
+
include Kishu::Utils
|
14
|
+
|
15
|
+
# Stores the aggregation bucket to wrap, the reporting period and a logger.
#
# @param event [Hash] aggregation bucket that #wrap_event reads from
# @param options [Hash] only :period is read; it is copied unchanged into
#   the "performance" section built by #wrap_event
def initialize(event, options={})
  @logger = Logger.new(STDOUT)
  @event, @period = event, options[:period]
end
|
20
|
+
|
21
|
+
# Converts the aggregation bucket in @event into a dataset entry.
#
# Reads the "totale" and "unique" sub-aggregations of @event, splits their
# buckets by access method (bucket "key" containing "regular" or "machine")
# and builds one instance per metric:
# * total-resolutions  -> "doc_count" of the first matching "totale" bucket
# * unique-resolutions -> number of matching "unique" buckets
# Instances whose count is below 1 are dropped.
#
# A missing aggregation, a missing "buckets" list or a bucket without
# "doc_count" is treated as a count of 0 instead of raising.
#
# Side effect: stores the DOI in @doi.
#
# @return [Hash] {"dataset-id" => [...], "performance" => [{"period", "instance"}]}
def wrap_event
  totale = @event.dig("totale", "buckets") || []
  unique = @event.dig("unique", "buckets") || []

  # Buckets of +buckets+ whose key mentions the given access method.
  by_method = lambda do |buckets, access_method|
    buckets.select { |bucket| bucket.fetch("key", "").to_s.include?(access_method) }
  end

  # "doc_count" of the first bucket, or 0 when there is no bucket / no count.
  first_doc_count = ->(buckets) { buckets.dig(0, "doc_count") || 0 }

  dataset = {
    doi: @event.dig("key", "doi"),
    unique_counts_regular: by_method.call(unique, "regular").size,
    unique_counts_machine: by_method.call(unique, "machine").size,
    total_counts_regular: first_doc_count.call(by_method.call(totale, "regular")),
    total_counts_machine: first_doc_count.call(by_method.call(totale, "machine"))
  }

  @doi = dataset[:doi]

  instances = [
    {
      "count" => dataset[:total_counts_regular],
      "access-method" => "regular",
      "metric-type" => "total-resolutions"
    },
    {
      "count" => dataset[:unique_counts_regular],
      "access-method" => "regular",
      "metric-type" => "unique-resolutions"
    },
    {
      "count" => dataset[:unique_counts_machine],
      "access-method" => "machine",
      "metric-type" => "unique-resolutions"
    },
    {
      "count" => dataset[:total_counts_machine],
      "access-method" => "machine",
      "metric-type" => "total-resolutions"
    }
  ]

  # Metrics with nothing to report are left out of the entry.
  instances.delete_if { |instance| instance["count"] < 1 }

  {
    "dataset-id" => [{ "type" => "doi", "value" => dataset[:doi] }],
    "performance" => [{
      "period" => @period,
      "instance" => instances
    }]
  }
end
|
80
|
+
|
81
|
+
|
82
|
+
end
|
83
|
+
end
|
data/lib/kishu/s3.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
require 'aws-sdk-s3'
|
3
|
+
|
4
|
+
require_relative 'utils'
|
5
|
+
require_relative 'base'
|
6
|
+
|
7
|
+
module Kishu
  # Small wrapper around Aws::S3::Client for downloading log objects.
  class S3

    # Bucket names returned by list_buckets when the instance was created.
    attr_reader :bucket_names

    # Creates the S3 client and lists the buckets it can see.
    #
    # The client is kept in an instance variable: #download_logs needs it, and
    # a method-local variable is not visible from another method.
    def initialize
      @s3 = Aws::S3::Client.new
      @bucket_names = @s3.list_buckets.buckets.map(&:name)
    end

    # Downloads one object from S3_RESOLUTION_LOGS_BUCKET.
    #
    # @param key [String] object key to fetch (default is the original placeholder)
    # @param target [String] value passed to get_object as :response_target
    # @return the metadata of the get_object response
    def download_logs(key: 'object-key', target: '/logs')
      resp = @s3.get_object(
        response_target: target,
        bucket: S3_RESOLUTION_LOGS_BUCKET,
        key: key
      )
      resp.metadata
    end
  end
end
|
data/lib/kishu/sushi.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
require 'thor'
|
3
|
+
|
4
|
+
|
5
|
+
require_relative 'resolution_event'
|
6
|
+
require_relative 'report'
|
7
|
+
require_relative 'utils'
|
8
|
+
require_relative 'base'
|
9
|
+
|
10
|
+
module Kishu
  # Thor command set for building, continuing, cleaning and sending reports.
  class Sushi < Thor

    include Kishu::Base
    include Kishu::Utils

    desc "get sushi", "get resolution report"
    # method_option :username, :default => ENV['MDS_USERNAME']
    method_option :aggs_size, :type => :numeric, :default => 1000
    method_option :month_year, :type => :string, :default => "2018-04"
    # Builds a report by calling Report#make_report with the command options.
    def get
      report = Report.new()
      report.make_report(options)
    end

    # Thor only registers a public method as a command when `desc` precedes
    # it; without this line it prints a warning and skips continue_report.
    desc "continue_report sushi", "generate report files (accepts after_key)"
    method_option :month_year, :type => :string, :default => "2018-04"
    method_option :after_key, :type => :string
    # Calls Report#generate_files with the command options.
    def continue_report
      report = Report.new()
      report.generate_files(options)
    end

    desc "clean_all sushi", "clean index"
    method_option :month_year, :type => :string, :default => "2018-04"
    method_option :after_key, :type => :string
    # Calls Client#clear_index; the declared options are not read here.
    def clean_all
      client = Client.new()
      client.clear_index
    end

    desc "send_report_events sushi", "send_report_events index"
    method_option :month_year, :type => :string, :default => "2018-04"
    method_option :after_key, :type => :string
    method_option :chunk_size, :type => :numeric, :default => 40000
    method_option :aggs_size, :type => :numeric, :default => 500
    # Builds a report after checking that HUB_TOKEN is set.
    #
    # NOTE(review): Report.new receives the options here but not in #get or
    # #continue_report -- confirm which form Report#initialize expects.
    def send_report_events
      fail "You need to set your JWT" if HUB_TOKEN.blank?
      report = Report.new(options)
      report.make_report(options)
    end

  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'logger'
|
3
|
+
require 'maremma'
|
4
|
+
|
5
|
+
require_relative 'utils'
|
6
|
+
require_relative 'base'
|
7
|
+
|
8
|
+
module Kishu
  # Converts one aggregation bucket into a dataset entry with descriptive
  # fields and per-access-method resolution counts.
  class UsageEvent

    API_URL = "https://api.datacite.org"

    # @param options [Hash] only :period is read; it is copied unchanged into
    #   the :performance section built by #wrap_event. Calling `new` without
    #   arguments keeps working and leaves the period nil.
    def initialize(options = {})
      @period = options[:period]
    end

    # Builds the dataset entry for +event+.
    #
    # Reads the "totale" and "unique" sub-aggregations, splits their buckets
    # by access method (bucket "key" containing "regular" or "machine") and
    # emits one instance per metric; instances with a count of 0 or less are
    # dropped. A missing aggregation, a missing "buckets" list or a bucket
    # without "doc_count" counts as 0 instead of raising.
    #
    # Prints a separator line and logs the event's "doc_count", the computed
    # counts and the final entry to STDOUT.
    #
    # @param event [Hash] aggregation bucket; must contain "doc_count"
    # @return [Hash] the dataset entry
    # @raise [KeyError] when +event+ has no "doc_count"
    def wrap_event(event)
      puts "------------------ \n"
      totale = event.dig("totale", "buckets") || []
      unique = event.dig("unique", "buckets") || []

      dataset = {
        doi: event.dig("key", "doi"),
        unique_counts_regular: buckets_for(unique, "regular").size,
        unique_counts_machine: buckets_for(unique, "machine").size,
        total_counts_regular: buckets_for(totale, "regular").dig(0, "doc_count") || 0,
        total_counts_machine: buckets_for(totale, "machine").dig(0, "doc_count") || 0
      }

      logger = Logger.new(STDOUT)
      logger.info event.fetch("doc_count")
      logger.info dataset

      instances = [
        {
          count: dataset[:total_counts_regular],
          "access-method": "regular",
          "metric-type": "total-resolutions"
        },
        {
          count: dataset[:unique_counts_regular],
          "access-method": "regular",
          "metric-type": "unique-resolutions"
        },
        {
          count: dataset[:unique_counts_machine],
          "access-method": "machine",
          "metric-type": "unique-resolutions"
        },
        {
          count: dataset[:total_counts_machine],
          "access-method": "machine",
          "metric-type": "total-resolutions"
        }
      ]
      instances.delete_if { |instance| instance[:count] <= 0 }

      # The metadata lookup against API_URL is currently disabled, so every
      # descriptive field below resolves to nil / empty.
      attributes = {}
      resource_type = ""

      instanced = {
        "dataset-id" => [{ type: "doi", value: dataset[:doi] }],
        "data-type" => resource_type,
        yop: attributes.fetch("published", nil),
        uri: attributes.fetch("identifier", nil),
        publisher: attributes.fetch("container-title", nil),
        "dataset-title": attributes.fetch("title", nil),
        "publisher-id": [{
          type: "client-id",
          value: attributes.fetch("data-center-id", nil)
        }],
        "dataset-dates": [{
          type: "pub-date",
          value: attributes.fetch("published", nil)
        }],
        "dataset-contributors": attributes.fetch("author", []).map { |a| get_authors(a) },
        platform: "datacite",
        performance: [{
          period: @period,
          instance: instances
        }]
      }
      logger.info instanced

      instanced
    end

    # Turns an author hash into a {type:, value:} name entry.
    #
    # Uses "given" + "family" when both keys exist, otherwise "literal",
    # otherwise an empty string. Nil name parts are skipped instead of raising.
    #
    # @param author [Hash]
    # @return [Hash]
    def get_authors author
      if author.key?("given") && author.key?("family")
        { type: "name",
          value: [author["given"], author["family"]].compact.join(" ") }
      elsif author.key?("literal")
        { type: "name",
          value: author["literal"] }
      else
        { type: "name",
          value: "" }
      end
    end

    private

    # Buckets whose "key" mentions +access_method+.
    def buckets_for(buckets, access_method)
      buckets.select { |bucket| bucket.fetch("key", "").to_s.include?(access_method) }
    end

  end
end
|
data/lib/kishu/utils.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
require "bolognese"
|
2
|
+
require "time"
|
3
|
+
|
4
|
+
module Kishu
|
5
|
+
module Utils
|
6
|
+
include ::Bolognese::MetadataUtils
|
7
|
+
|
8
|
+
# Deletes every tmp/datasets-*.json file (relative to the current working
# directory) and prints a confirmation line.
#
# Uses Dir.glob/File.delete rather than shelling out to `rm`, so it does not
# depend on a shell and does nothing (quietly) when no file matches.
def clean_tmp
  File.delete(*Dir.glob("tmp/datasets-*.json"))
  puts "/tmp Files deleted"
end
|
12
|
+
|
13
|
+
# Relative path of the merged report file for the month of report_period.
#
# @return [String]
def merged_file
  month = report_period.strftime("%Y-%m")
  "reports/datacite_resolution_report_#{month}_2.json"
end
|
16
|
+
|
17
|
+
# Relative path of the encoded report file for the month of report_period.
#
# @return [String]
def encoded_file
  month = report_period.strftime("%Y-%m")
  "reports/datacite_resolution_report_#{month}_encoded.json"
end
|
20
|
+
|
21
|
+
# Writes the opening and closing fragments of the report JSON into tmp/.
#
# The header fragment holds the JSON of get_header and opens the
# "report-datasets" array; the footer fragment closes the array and object.
#
# @return [Integer] number of bytes written to the footer file
def generate_header_footer
  opening = ['{"report-header": ', get_header.to_json.to_s, ',"report-datasets": [ ', "\n"].join
  File.write("tmp/datasets-00-report-header.json", opening)

  closing = [']', "\n", '}'].join
  File.write("tmp/datasets-zz99-report-footer.json", closing)
end
|
33
|
+
|
34
|
+
# Turns an author hash into a {type:, value:} name entry.
#
# Uses "given" + "family" when both keys exist, otherwise "literal",
# otherwise an empty string. A key that is present with a nil value is
# skipped; the previous String#+ concatenation raised in that case.
#
# @param author [Hash]
# @return [Hash]
def get_authors author
  if author.key?("given") && author.key?("family")
    { type: "name",
      value: [author["given"], author["family"]].compact.join(" ") }
  elsif author.key?("literal")
    { type: "name",
      value: author["literal"] }
  else
    { type: "name",
      value: "" }
  end
end
|
46
|
+
|
47
|
+
# Builds the event payload for one metric instance of a dataset.
#
# @param data [Hash] one instance with count, "metric-type" and
#   "access-method"; symbol keys and string keys are both accepted, because
#   wrap_event implementations in this gem emit either form
# @param options [Hash] :dataset_id (passed to get_metadata and used as
#   "obj-id") and :report_id (used as "subj-id" and the subj id)
# @return [Hash] {"data" => {"type" => "events", "attributes" => {...}}}
def format_instance data, options={}
  read = ->(key) { data[key.to_sym] || data[key] }

  obj = get_metadata(options[:dataset_id])
  subj = { id: options[:report_id] }
  relation_type = "#{read.call("metric-type")}-#{read.call("access-method")}"
  # Sample the clock once so "occurred-at" and "timestamp" always agree.
  now = Time.now.iso8601

  {
    "data" => {
      "type" => "events",
      "attributes" => {
        "message-action" => "create",
        "subj-id" => options[:report_id],
        "total" => read.call("count"),
        "obj-id" => options[:dataset_id],
        "relation-type-id" => relation_type.dasherize,
        "source-id" => "datacite-resolution",
        "source-token" => SOURCE_TOKEN,
        "occurred-at" => now, # need modify
        "timestamp" => now,
        "license" => LICENSE,
        "subj" => subj,
        "obj" => obj } } }
end
|
71
|
+
|
72
|
+
# Fetches the DOI record for +id+ from API_URL and maps it to a metadata hash.
#
# Returns {} when no DOI can be extracted from +id+ or when the API does not
# answer with status 200. Missing "attributes"/"relationships" sections are
# treated as empty instead of raising.
#
# @param id the identifier passed to doi_from_url
# @return [Hash] keys "id", "type", "name", "author", "publisher", "version",
#   "date_published", "date_modified", "registrant_id" (nil values removed)
def get_metadata id
  doi = doi_from_url(id)
  return {} unless doi.present?

  url = API_URL + "/dois/#{doi}"
  response = Maremma.get(url)
  return {} if response.status != 200

  attributes = response.body.dig("data", "attributes") || {}
  relationships = response.body.dig("data", "relationships") || {}

  # NOTE(review): resource_type is the whole "relationships" section, so the
  # CR_TO_SO_TRANSLATIONS lookup below presumably never matches -- confirm
  # which field was intended. Left unchanged to preserve the current result.
  resource_type = response.body.dig("data", "relationships")
  resource_type_general = relationships.dig("resource-type", "data", "id")
  type = Bolognese::Utils::CR_TO_SO_TRANSLATIONS[resource_type.to_s.underscore.camelcase] ||
         Bolognese::Utils::DC_TO_SO_TRANSLATIONS[resource_type_general.to_s.underscore.camelcase(:upper)] ||
         "CreativeWork"

  author = Array.wrap(attributes["author"]).map do |a|
    {
      "given_name" => a["givenName"],
      "family_name" => a["familyName"],
      "name" => a["familyName"].present? ? nil : a["name"] }.compact
  end
  client_id = relationships.dig("client", "data", "id")

  {
    "id" => id,
    "type" => type.underscore.dasherize,
    "name" => attributes["title"],
    "author" => author,
    "publisher" => attributes["publisher"],
    "version" => attributes["version"],
    "date_published" => attributes["published"],
    "date_modified" => attributes["updated"],
    # Omitted (via compact) when there is no client, rather than "datacite.".
    "registrant_id" => client_id ? "datacite.#{client_id}" : nil }.compact
end
|
105
|
+
|
106
|
+
# Strict Base64 encoding of whatever compress_merged_file returns.
#
# @return [String]
def encoded
  payload = compress_merged_file
  Base64.strict_encode64(payload)
end
|
109
|
+
|
110
|
+
# Hex-encoded SHA-256 digest of whatever compress_merged_file returns.
#
# @return [String]
def checksum
  payload = compress_merged_file
  Digest::SHA256.hexdigest(payload)
end
|
113
|
+
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
|
2
|
+
# Factories for the specs. Attribute values are given as blocks: a bare
# `event` followed by a hash on the next line never assigned the hash, and
# block-less (static) attribute values are not accepted by current FactoryBot.
FactoryBot.define do
  factory :resolution_event do
    period { { "begin_date": "2018-03-01", "end_date": "2018-03-31" } }

    # NOTE(review): the `"key":` syntax produces symbol keys, and this sample
    # uses :total / :access_method, while ResolutionEvent#wrap_event reads the
    # string keys "totale", "unique" and "key" => "doi" -- confirm the fixture
    # matches what the spec feeds in. The "unqiue" key also looks like a typo
    # of "unique"; data is kept unchanged here.
    event do
      {
        "key": "10.5065/D6V1236Q",
        "doc_count": 5566,
        "total": {
          "doc_count_error_upper_bound": 0,
          "sum_other_doc_count": 0,
          "buckets": [
            {
              "key": "machine",
              "doc_count": 5093
            },
            {
              "key": "regular",
              "doc_count": 473
            }
          ]
        },
        "access_method": {
          "doc_count_error_upper_bound": 0,
          "sum_other_doc_count": 0,
          "buckets": [
            {
              "key": "machine",
              "doc_count": 5093,
              "session": {
                "doc_count_error_upper_bound": 10,
                "sum_other_doc_count": 5072,
                "buckets": [
                  {
                    "key": "2018-09-18_16_10.5065/D6V1236Q_54.71.12.185_curl/7.38.0",
                    "doc_count": 5
                  },
                  {
                    "key": "2018-09-01_05_10.5065/D6V1236Q_45.79.139.170_curl/7.38.0",
                    "doc_count": 4
                  },
                  {
                    "key": "2018-09-03_16_10.5065/D6V1236Q_52.40.104.81_curl/7.38.0",
                    "doc_count": 4
                  },
                  {
                    "key": "2018-09-12_00_10.5065/D6V1236Q_52.39.7.168_curl/7.38.0",
                    "doc_count": 4
                  },
                  {
                    "key": "2018-09-26_06_10.5065/D6V1236Q_52.39.7.168_curl/7.38.0",
                    "doc_count": 4
                  }
                ]
              },
              "unqiue": {
                "value": 3084
              }
            }
          ]
        }
      }
    end
  end

  factory :usage_event do
  end

  factory :report do
  end
end
|