kishu 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+
+ require File.expand_path("../../lib/kishu", __FILE__)
+
+ Kishu::CLI.start
@@ -0,0 +1,54 @@
+ require "date"
+ require File.expand_path("../lib/kishu/version", __FILE__)
+
+
+ Gem::Specification.new do |spec|
+   spec.name = "kishu"
+   spec.version = Kishu::VERSION
+   spec.authors = ["Kristian Garza"]
+   spec.email = ["kgarza@datacite.org"]
+
+   spec.summary = "Client for DOI Resolution Logs processing pipeline"
+   spec.description = "This client helps you to prepare logs to be consumed for the pipeline as well as for creating DOI resolution reports using the COUNTER CoP "
+   spec.homepage = "https://github.com/datacite/kishu"
+   spec.license = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   # if spec.respond_to?(:metadata)
+   #   spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
+   # else
+   #   raise "RubyGems 2.0 or newer is required to protect against " \
+   #     "public gem pushes."
+   # end
+
+   # Specify which files should be added to the gem when it is released.
+   # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+
+   # spec.add_dependency 'maremma', '>= 4.1', '< 5'
+   spec.add_dependency 'faraday', "~>0.15.3"
+   spec.add_dependency 'builder', '~> 3.2', '>= 3.2.2'
+   spec.add_dependency 'dotenv', '~> 2.1', '>= 2.1.1'
+   spec.add_dependency 'thor', '~> 0.19'
+   spec.add_dependency 'maremma', '>= 4.1', '< 5'
+   spec.add_dependency 'faraday_middleware-aws-sigv4', '~> 0.2.4'
+   spec.add_development_dependency "bundler", "~> 1.16"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.0"
+   spec.add_development_dependency 'elasticsearch', '~> 6.1.0'
+   spec.add_development_dependency "thor", '~> 0.19'
+   spec.add_development_dependency "faraday", "~>0.15.3"
+   spec.add_development_dependency 'rack-test', '~> 0'
+   spec.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
+   spec.add_development_dependency 'webmock', '~> 3.0', '>= 3.0.1'
+   spec.add_development_dependency 'simplecov', '~> 0.14.1'
+   spec.add_development_dependency 'factory_bot', '~> 4.0'
+   spec.add_dependency 'sucker_punch', '~> 2.0'
+   spec.add_dependency 'bolognese', '~> 0.9', '>= 0.10'
+   spec.add_dependency 'elasticsearch', '~> 6.1.0'
+
+   spec.files = `git ls-files`.split($/)
+   spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   spec.executables = ["kishu"]
+   spec.require_paths = ["lib"]
+ end
@@ -0,0 +1,30 @@
+ require "kishu/resolution_event"
+ require "kishu/usage_event"
+ require "kishu/report"
+ require "kishu/cli"
+ require "kishu/sushi"
+ require "kishu/version"
+ require "kishu/client"
+ require "kishu/log"
+ require "kishu/pipeline"
+ require "kishu/lagotto_job"
+
+
+ API_URL = ENV['API_URL'] ? ENV['API_URL'] : "https://api.datacite.org"
+ HUB_URL = ENV['HUB_URL'] ? ENV['HUB_URL'] : "https://api.test.datacite.org"
+ HUB_TOKEN = ENV['HUB_TOKEN'] ? ENV['HUB_TOKEN'] : ""
+ ES_HOST = ENV['ES_HOST'] ? ENV['ES_HOST'] : "localhost:9200"
+ ES_INDEX = ENV['ES_INDEX'] ? ENV['ES_INDEX'] : "resolutions"
+ LOGSTASH_HOST = ENV['LOGSTASH_HOST'] ? ENV['LOGSTASH_HOST'] : "localhost:9600"
+ LAGOTTINO_URL = ENV['LAGOTTINO_URL'] ? ENV['LAGOTTINO_URL'] : "https://api.test.datacite.org"
+ LAGOTTINO_TOKEN = ENV['LAGOTTINO_TOKEN'] ? ENV['LAGOTTINO_TOKEN'] : ""
+ LICENSE = ENV['LICENSE'] ? ENV['LICENSE'] : "https://creativecommons.org/publicdomain/zero/1.0/"
+ SOURCE_TOKEN = ENV['SOURCE_TOKEN'] ? ENV['SOURCE_TOKEN'] : "65903a54-01c8-4a3f-9bf2-04ecc658247a"
+ S3_MERGED_LOGS_BUCKET = ENV['S3_MERGED_LOGS_BUCKET'] ? ENV['S3_MERGED_LOGS_BUCKET'] : "./monthly_logs"
+ S3_RESOLUTION_LOGS_BUCKET = ENV['S3_RESOLUTION_LOGS_BUCKET'] ? ENV['S3_RESOLUTION_LOGS_BUCKET'] : "./"
+ AWS_REGION = ENV['AWS_REGION'] ? ENV['AWS_REGION'] : ""
+ AWS_ACCESS_KEY_ID = ENV['AWS_ACCESS_KEY_ID'] ? ENV['AWS_ACCESS_KEY_ID'] : ""
+ AWS_SECRET_ACCESS_KEY = ENV['AWS_SECRET_ACCESS_KEY'] ? ENV['AWS_SECRET_ACCESS_KEY'] : ""
+ ELASTIC_PASSWORD = ENV['ELASTIC_PASSWORD'] ? ENV['ELASTIC_PASSWORD'] : ""
+ LOGS_TAG = "[Resolution Logs]"
+ puts ENV.to_a
@@ -0,0 +1,14 @@
+ require 'elasticsearch'
+ require 'json'
+ require 'faraday'
+
+
+
+ module Kishu
+   module Base
+     ES_HOST = ENV['ES_HOST'] ? ENV['ES_HOST'] : "localhost:9200"
+
+     # __elasticsearch__ = Faraday.new(url: ES_HOST)
+     __elasticsearch__ = Elasticsearch::Client.new host: ES_HOST, transport_options: { request: { timeout: 3600, open_timeout: 3600 } }
+   end
+ end
@@ -0,0 +1,42 @@
+ require 'thor'
+
+
+ require_relative 'sushi'
+ require_relative 'log'
+
+
+ module Kishu
+   class CLI < Thor
+
+     include Kishu::Base
+     include Kishu::Utils
+     # include Kishu::Report
+     include Kishu::Merger
+     # include Kishu::Event
+
+     # load ENV variables from .env file if it exists
+     env_file = File.expand_path("../../.env", __FILE__)
+     if File.exist?(env_file)
+       require 'dotenv'
+       Dotenv.overload env_file
+     end
+
+     def self.exit_on_failure?
+       true
+     end
+
+     # from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
+     map %w[--version -v] => :__print_version
+
+     desc "--version, -v", "print the version"
+     def __print_version
+       puts Kishu::VERSION
+     end
+
+     desc "sushi SUBCOMMAND", "sushi commands"
+     subcommand "sushi", Kishu::Sushi
+
+     desc "log SUBCOMMAND", "log commands"
+     subcommand "log", Kishu::Log
+   end
+ end
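
For orientation only (not part of the released files): bin/kishu above simply calls Kishu::CLI.start, so Thor dispatches whatever arguments it receives. A minimal sketch of driving the same class directly from Ruby, assuming the gem and its requires load cleanly:

    require "kishu"

    # Equivalent to running `kishu --version` from the shell; Thor maps
    # --version and -v to __print_version, which prints Kishu::VERSION.
    Kishu::CLI.start(["--version"])
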
@@ -0,0 +1,89 @@
+ require 'faraday_middleware'
+ require 'faraday_middleware/aws_sigv4'
+ require 'logger'
+
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Client
+
+     def initialize
+
+       if ES_HOST == "localhost:9200" || ES_HOST == "elasticsearch:9200"
+         @client = Elasticsearch::Client.new(host: ES_HOST, user: "elastic", password: ELASTIC_PASSWORD, transport_options: { request: { timeout: 3600, open_timeout: 3600 }}) do |f|
+           f.adapter Faraday.default_adapter
+         end
+       else
+         @client = Elasticsearch::Client.new(host: ES_HOST, port: '80', scheme: 'http') do |f|
+           f.request :aws_sigv4,
+             service: 'es',
+             region: AWS_REGION,
+             access_key_id: AWS_ACCESS_KEY_ID,
+             secret_access_key: AWS_SECRET_ACCESS_KEY
+           f.adapter Faraday.default_adapter
+         end
+       end
+       @client
+     end
+
+
+     def get options={}
+
+       x =@client.search(body:{
+         size: options[:size] ||= 0,
+         query: {
+           query_string: {
+             query: "*"
+           }
+         },
+         aggregations: aggregations(options)
+       },
+       index: ES_INDEX
+       )
+       x
+     end
+
+     def is_empty?
+       return true unless get
+       nil
+     end
+
+     def clear_index
+       @client.indices.delete index: ES_INDEX
+       puts "Resolutions index has been deleted"
+     end
+
+
+     def get_logdate options={}
+       @client.search(body:{
+         size: 1,
+         query: {
+           query_string: {
+             query: "*"
+           }
+         },
+         aggregations: aggregations(options)
+       },
+       index: "resolutions"
+       ).dig("hits","hits",0,"_source","logdate")
+     end
+
+     def aggregations options={}
+       {
+         doi: {composite: {
+             sources: [{doi: {terms: {field: :doi }}}],
+             after: { doi: options.fetch(:after_key,"")},
+             size: options[:aggs_size]
+           },
+           aggs: {
+             unique: {terms: {field: "unique_usage"}},
+             totale: {terms: {field: "total_usage" }}
+           }
+         }
+       }
+     end
+
+   end
+ end
+
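
For orientation only (not part of the released files): Client#get issues a composite aggregation keyed on doi with size: 0, so the Elasticsearch 6.x response that Report#get_events later digs into looks roughly like the sketch below; the DOI, counts and after_key cursor are illustrative, and the terms sub-aggregations are abbreviated:

    {
      "hits" => { "total" => 120000, "hits" => [] },              # size: 0, so no documents are returned
      "aggregations" => {
        "doi" => {
          "after_key" => { "doi" => "10.5281/zenodo.1043571" },   # cursor passed back in as :after_key
          "buckets" => [
            { "key" => { "doi" => "10.5281/zenodo.1043571" },
              "doc_count" => 42,
              "unique" => { "buckets" => [{ "key" => 7,  "doc_count" => 42 }] },   # terms agg on unique_usage
              "totale" => { "buckets" => [{ "key" => 12, "doc_count" => 42 }] } }  # terms agg on total_usage
          ]
        }
      }
    }
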
@@ -0,0 +1,22 @@
+ require_relative 'resolution_event'
+
+
+ class LagottoJob
+   include SuckerPunch::Job
+   include Kishu::Utils
+   workers 4
+
+
+
+   def perform(report, options={})
+     # data = format_instance event, options
+
+     # push_url = LAGOTTINO_URL + "/events"
+     # response = Maremma.post(push_url, data: data.to_json,
+     #   bearer: LAGOTTINO_TOKEN,
+     #   content_type: 'application/vnd.api+json')
+     # puts data
+     # puts response.status
+     Report.send_report report
+   end
+ end
@@ -0,0 +1,33 @@
+
+ require 'thor'
+
+ require_relative 'merger'
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Log < Thor
+
+     include Kishu::Base
+     include Kishu::Merger
+     include Kishu::Utils
+
+
+     desc "create logs", "create logs"
+     method_option :logs_bucket, :default => ENV['S3_RESOLUTION_LOGS_BUCKET']
+     method_option :output_bucket, :default => ENV['S3_MERGED_LOGS_BUCKET']
+     method_option :month_year, :type => :string, :default => "201804"
+
+     def create
+       return "Logs don't exist" unless File.directory?(options[:month_year])
+       return "Pipeline has events" unless Pipeline.new.is_empty?
+       @log_date = get_date options[:month_year]
+       @folder = options[:month_year]
+       puts @log_date
+       uncompress_files
+       # add_bookends
+       merge_files
+       sort_files
+     end
+   end
+ end
@@ -0,0 +1,69 @@
+
+ require 'date'
+
+ module Kishu
+   module Merger
+
+     FILE_STEM = "DataCite-access.log"
+
+     def get_date filename
+       Date.parse("#{filename}01")
+     end
+
+
+     def uncompress_files
+       system("gunzip #{resolution_logs_folder}/#{FILE_STEM}-*")
+     end
+
+
+     # def add_bookends
+     #   File.delete("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log") if File.exist?("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log")
+     #   File.delete("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log") if File.exist?("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log")
+
+     #   begin_date = Date.civil(@log_date.year,@log_date.month,1).strftime("%Y-%m-%d")
+     #   end_date = Date.civil(@log_date.year,@log_date.month+1, 1).strftime("%Y-%m-%d")
+
+     #   begin_line = '0.0.0.0 HTTP:HDL "'+begin_date+' 00:00:00.000Z" 1 1 22ms 10.5281/zenodo.1043571 "300:10.admin/codata" "" "Mozilla"'+"\n"
+     #   puts begin_line
+
+     #   end_line = '0.0.0.0 HTTP:HDL "'+end_date+' 00:01:00.000Z" 1 1 22ms 10.5281/zenodo.1043571 "300:10.admin/codata" "" "Mozilla"'+"\n"
+     #   puts end_line
+
+     #   File.open("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log","w") {|f| f.write(begin_line) }
+     #   File.open("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log","w") {|f| f.write(end_line) }
+     # end
+
+     def merged_file
+       "#{merged_logs_folder}/datacite_resolution_logs_#{@log_date}.log"
+     end
+
+     def sorted_file
+       "#{resolution_logs_folder}/datacite_resolution_logs_#{@log_date}_sorted.log"
+     end
+
+     def resolution_logs_folder
+       bucket = S3_RESOLUTION_LOGS_BUCKET
+       "#{bucket}#{@folder}"
+     end
+
+     def merged_logs_folder
+       bucket = S3_MERGED_LOGS_BUCKET
+       "#{bucket}#{@folder}"
+     end
+
+     def merge_files
+       File.delete(merged_file) if File.exist?(merged_file)
+
+       system("cat #{resolution_logs_folder}/#{FILE_STEM}-* > #{merged_file}")
+       puts "Merged Completed"
+     end
+
+     def sort_files
+       File.delete(sorted_file) if File.exist?(sorted_file)
+
+       system("sort -k3 #{merged_file} > #{sorted_file}")
+       puts "Sorted Completed"
+       puts sorted_file
+     end
+   end
+ end
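
For orientation only (not part of the released files): with the defaults defined in lib/kishu above (S3_RESOLUTION_LOGS_BUCKET = "./", S3_MERGED_LOGS_BUCKET = "./monthly_logs") and options[:month_year] = "201804" from the Log command, the Merger helpers expand roughly as follows; note that bucket and folder are concatenated with no separator:

    get_date("201804")      #=> Date for 2018-04-01 (stored in @log_date)
    resolution_logs_folder  #=> "./201804"
    merged_logs_folder      #=> "./monthly_logs201804"
    merged_file             #=> "./monthly_logs201804/datacite_resolution_logs_2018-04-01.log"
    sorted_file             #=> "./201804/datacite_resolution_logs_2018-04-01_sorted.log"

    # uncompress_files, merge_files and sort_files then shell out to roughly:
    #   gunzip ./201804/DataCite-access.log-*
    #   cat ./201804/DataCite-access.log-* > ./monthly_logs201804/datacite_resolution_logs_2018-04-01.log
    #   sort -k3 ./monthly_logs201804/datacite_resolution_logs_2018-04-01.log > ./201804/datacite_resolution_logs_2018-04-01_sorted.log
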
@@ -0,0 +1,29 @@
+ require 'faraday'
+ require 'logger'
+
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Pipeline
+
+     def initialize
+       @conn = Faraday.new(:url => LOGSTASH_HOST)
+       # logger = Logger.new(STDOUT)
+       # logger.info
+     end
+
+     def is_ready?
+       main = @conn.get do |req|
+         req.url '/_node/stats/pipelines/main'
+       end
+       return nil unless main.dig("pipelines","main","events","out") == 0
+     end
+
+     def is_running?
+
+     end
+
+   end
+ end
+
@@ -0,0 +1,149 @@
+ require 'json'
+ require 'date'
+ require "faraday"
+ require 'securerandom'
+ require 'zlib'
+ require 'digest'
+
+ require_relative 'resolution_event'
+ require_relative 'client'
+
+ module Kishu
+   class Report
+
+     include Kishu::Base
+     include Kishu::Utils
+
+     def initialize options={}
+       set_period
+       @es_client = Client.new()
+       @logger = Logger.new(STDOUT)
+       @report_id = options[:report_id] ? options[:report_id] : ""
+       @total = 0
+       @aggs_size = options[:aggs_size]
+       @chunk_size = options[:chunk_size]
+       @after = options[:after_key] ||=""
+     end
+
+     def report_period options={}
+       es_client = Client.new()
+
+       logdate = es_client.get_logdate({aggs_size: 1})
+       puts logdate
+       Date.parse(logdate)
+     end
+
+
+     def get_events options={}
+       logger = Logger.new(STDOUT)
+       es_client = Client.new()
+       response = es_client.get({aggs_size: @aggs_size || 500, after_key: options[:after_key] ||=""})
+       aggs = response.dig("aggregations","doi","buckets")
+       x = aggs.map do |agg|
+         ResolutionEvent.new(agg,{period: @period, report_id: @report_id}).wrap_event
+       end
+       after = response.dig("aggregations","doi").fetch("after_key",{"doi"=>nil}).dig("doi")
+       logger.info "After_key for pagination #{after}"
+       y = {data: x, after: after}
+       y
+     end
+
+
+     def generate_dataset_array
+       @datasets = []
+       loop do
+         response = get_events({after_key: @after ||=""})
+         @datasets = @datasets.concat response[:data]
+         @after = response[:after]
+         @total += @datasets.size
+         generate_chunk_report if @datasets.size > @chunk_size
+         break if @after.nil?
+       end
+     end
+
+     def compress report
+       # report = File.read(hash)
+       gzip = Zlib::GzipWriter.new(StringIO.new)
+       string = report.to_json
+       gzip << string
+       body = gzip.close.string
+       body
+     end
+
+
+     def generate_chunk_report
+       # puts get_template
+       # LagottoJob.perform_async(get_template(@datasets))
+       file = merged_file #+ 'after_key_' + @after
+       File.open(file,"w") do |f|
+         f.write(JSON.pretty_generate get_template)
+       end
+       send_report get_template
+       @datasets = []
+     end
+
+     def make_report options={}
+       generate_dataset_array
+       @logger.info "#{LOGS_TAG} Month of #{@period.dig("begin-date")} sent to Hub in report #{@uid} with stats for #{@total} datasets"
+     end
+
+
+     def set_period
+       report_period
+       @period = {
+         "begin-date": Date.civil(report_period.year, report_period.mon, 1).strftime("%Y-%m-%d"),
+         "end-date": Date.civil(report_period.year, report_period.mon, -1).strftime("%Y-%m-%d"),
+       }
+     end
+
+     def send_report report, options={}
+       uri = HUB_URL+'/reports'
+       puts uri
+
+       headers = {
+         content_type: "application/gzip",
+         content_encoding: 'gzip',
+         accept: 'gzip'
+       }
+
+       body = compress(report)
+       n = 0
+       loop do
+         request = Maremma.post(uri, data: body,
+           bearer: ENV['HUB_TOKEN'],
+           headers: headers,
+           timeout: 100)
+
+         @uid = request.body.dig("data","report","id")
+         @logger.info "#{LOGS_TAG} Hub response #{request.status} for Report finishing in #{@after}"
+         @logger.info "#{LOGS_TAG} Hub response #{@uid} for Report finishing in #{@after}"
+         n += 1
+         break if request.status == 201
+         fail "#{LOGS_TAG} Too many attempts were tried to push this report" if n > 1
+         sleep 1
+       end
+     end
+
+     def get_template
+       {
+         "report-header": get_header,
+         "report-datasets": @datasets
+       }
+     end
+
+     def get_header
+       {
+         "report-name": "resolution report",
+         "report-id": "dsr",
+         release: "drl",
+         created: Date.today.strftime("%Y-%m-%d"),
+         "created-by": "datacite",
+         "reporting-period": @period,
+         "report-filters": [],
+         "report-attributes": [],
+         exceptions: [{code: 69,severity: "warning", message: "Report is compressed using gzip","help-url": "https://github.com/datacite/sashimi",data: "usage data needs to be uncompressed"}]
+       }
+     end
+
+   end
+ end