kishu 0.0.0

@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+
+ require File.expand_path("../../lib/kishu", __FILE__)
+
+ Kishu::CLI.start
@@ -0,0 +1,54 @@
+ require "date"
+ require File.expand_path("../lib/kishu/version", __FILE__)
+
+
+ Gem::Specification.new do |spec|
+   spec.name = "kishu"
+   spec.version = Kishu::VERSION
+   spec.authors = ["Kristian Garza"]
+   spec.email = ["kgarza@datacite.org"]
+
+   spec.summary = "Client for the DOI Resolution Logs processing pipeline"
+   spec.description = "This client helps you prepare logs for consumption by the pipeline and create DOI resolution reports following the COUNTER Code of Practice."
+   spec.homepage = "https://github.com/datacite/kishu"
+   spec.license = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   # if spec.respond_to?(:metadata)
+   #   spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
+   # else
+   #   raise "RubyGems 2.0 or newer is required to protect against " \
+   #     "public gem pushes."
+   # end
+
+   # Specify which files should be added to the gem when it is released.
+   # `git ls-files` lists the files in the RubyGem that have been added to git.
+
+   # spec.add_dependency 'maremma', '>= 4.1', '< 5'
+   spec.add_dependency 'faraday', "~> 0.15.3"
+   spec.add_dependency 'builder', '~> 3.2', '>= 3.2.2'
+   spec.add_dependency 'dotenv', '~> 2.1', '>= 2.1.1'
+   spec.add_dependency 'thor', '~> 0.19'
+   spec.add_dependency 'maremma', '>= 4.1', '< 5'
+   spec.add_dependency 'faraday_middleware-aws-sigv4', '~> 0.2.4'
+   spec.add_development_dependency "bundler", "~> 1.16"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.0"
+   spec.add_development_dependency 'elasticsearch', '~> 6.1.0'
+   spec.add_development_dependency "thor", '~> 0.19'
+   spec.add_development_dependency "faraday", "~> 0.15.3"
+   spec.add_development_dependency 'rack-test', '~> 0'
+   spec.add_development_dependency 'vcr', '~> 3.0', '>= 3.0.3'
+   spec.add_development_dependency 'webmock', '~> 3.0', '>= 3.0.1'
+   spec.add_development_dependency 'simplecov', '~> 0.14.1'
+   spec.add_development_dependency 'factory_bot', '~> 4.0'
+   spec.add_dependency 'sucker_punch', '~> 2.0'
+   spec.add_dependency 'bolognese', '~> 0.9', '>= 0.10'
+   spec.add_dependency 'elasticsearch', '~> 6.1.0'
+
+   spec.files = `git ls-files`.split($/)
+   spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   spec.executables = ["kishu"]
+   spec.require_paths = ["lib"]
+ end
@@ -0,0 +1,30 @@
+ require "kishu/resolution_event"
+ require "kishu/usage_event"
+ require "kishu/report"
+ require "kishu/cli"
+ require "kishu/sushi"
+ require "kishu/version"
+ require "kishu/client"
+ require "kishu/log"
+ require "kishu/pipeline"
+ require "kishu/lagotto_job"
+
+
+ API_URL = ENV['API_URL'] ? ENV['API_URL'] : "https://api.datacite.org"
+ HUB_URL = ENV['HUB_URL'] ? ENV['HUB_URL'] : "https://api.test.datacite.org"
+ HUB_TOKEN = ENV['HUB_TOKEN'] ? ENV['HUB_TOKEN'] : ""
+ ES_HOST = ENV['ES_HOST'] ? ENV['ES_HOST'] : "localhost:9200"
+ ES_INDEX = ENV['ES_INDEX'] ? ENV['ES_INDEX'] : "resolutions"
+ LOGSTASH_HOST = ENV['LOGSTASH_HOST'] ? ENV['LOGSTASH_HOST'] : "localhost:9600"
+ LAGOTTINO_URL = ENV['LAGOTTINO_URL'] ? ENV['LAGOTTINO_URL'] : "https://api.test.datacite.org"
+ LAGOTTINO_TOKEN = ENV['LAGOTTINO_TOKEN'] ? ENV['LAGOTTINO_TOKEN'] : ""
+ LICENSE = ENV['LICENSE'] ? ENV['LICENSE'] : "https://creativecommons.org/publicdomain/zero/1.0/"
+ SOURCE_TOKEN = ENV['SOURCE_TOKEN'] ? ENV['SOURCE_TOKEN'] : "65903a54-01c8-4a3f-9bf2-04ecc658247a"
+ S3_MERGED_LOGS_BUCKET = ENV['S3_MERGED_LOGS_BUCKET'] ? ENV['S3_MERGED_LOGS_BUCKET'] : "./monthly_logs"
+ S3_RESOLUTION_LOGS_BUCKET = ENV['S3_RESOLUTION_LOGS_BUCKET'] ? ENV['S3_RESOLUTION_LOGS_BUCKET'] : "./"
+ AWS_REGION = ENV['AWS_REGION'] ? ENV['AWS_REGION'] : ""
+ AWS_ACCESS_KEY_ID = ENV['AWS_ACCESS_KEY_ID'] ? ENV['AWS_ACCESS_KEY_ID'] : ""
+ AWS_SECRET_ACCESS_KEY = ENV['AWS_SECRET_ACCESS_KEY'] ? ENV['AWS_SECRET_ACCESS_KEY'] : ""
+ ELASTIC_PASSWORD = ENV['ELASTIC_PASSWORD'] ? ENV['ELASTIC_PASSWORD'] : ""
+ LOGS_TAG = "[Resolution Logs]"
+ # puts ENV.to_a # debug only: dumps the whole environment (including credentials) to stdout
@@ -0,0 +1,14 @@
+ require 'elasticsearch'
+ require 'json'
+ require 'faraday'
+
+
+
+ module Kishu
+   module Base
+     ES_HOST = ENV['ES_HOST'] ? ENV['ES_HOST'] : "localhost:9200"
+
+     # __elasticsearch__ = Faraday.new(url: ES_HOST)
+     __elasticsearch__ = Elasticsearch::Client.new host: ES_HOST, transport_options: { request: { timeout: 3600, open_timeout: 3600 } }
+   end
+ end
@@ -0,0 +1,42 @@
+ require 'thor'
+
+
+ require_relative 'sushi'
+ require_relative 'log'
+
+
+ module Kishu
+   class CLI < Thor
+
+     include Kishu::Base
+     include Kishu::Utils
+     # include Kishu::Report
+     include Kishu::Merger
+     # include Kishu::Event
+
+     # load ENV variables from .env file if it exists
+     env_file = File.expand_path("../../.env", __FILE__)
+     if File.exist?(env_file)
+       require 'dotenv'
+       Dotenv.overload env_file
+     end
+
+     def self.exit_on_failure?
+       true
+     end
+
+     # from http://stackoverflow.com/questions/22809972/adding-a-version-option-to-a-ruby-thor-cli
+     map %w[--version -v] => :__print_version
+
+     desc "--version, -v", "print the version"
+     def __print_version
+       puts Kishu::VERSION
+     end
+
+     desc "sushi SUBCOMMAND", "sushi commands"
+     subcommand "sushi", Kishu::Sushi
+
+     desc "log SUBCOMMAND", "log commands"
+     subcommand "log", Kishu::Log
+   end
+ end
@@ -0,0 +1,89 @@
+ require 'faraday_middleware'
+ require 'faraday_middleware/aws_sigv4'
+ require 'logger'
+
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Client
+
+     def initialize
+
+       if ES_HOST == "localhost:9200" || ES_HOST == "elasticsearch:9200"
+         @client = Elasticsearch::Client.new(host: ES_HOST, user: "elastic", password: ELASTIC_PASSWORD, transport_options: { request: { timeout: 3600, open_timeout: 3600 }}) do |f|
+           f.adapter Faraday.default_adapter
+         end
+       else
+         @client = Elasticsearch::Client.new(host: ES_HOST, port: '80', scheme: 'http') do |f|
+           f.request :aws_sigv4,
+                     service: 'es',
+                     region: AWS_REGION,
+                     access_key_id: AWS_ACCESS_KEY_ID,
+                     secret_access_key: AWS_SECRET_ACCESS_KEY
+           f.adapter Faraday.default_adapter
+         end
+       end
+       @client
+     end
+
+
+     def get options={}
+
+       x = @client.search(body: {
+           size: options[:size] ||= 0,
+           query: {
+             query_string: {
+               query: "*"
+             }
+           },
+           aggregations: aggregations(options)
+         },
+         index: ES_INDEX
+       )
+       x
+     end
+
+     def is_empty?
+       return true unless get
+       nil
+     end
+
+     def clear_index
+       @client.indices.delete index: ES_INDEX
+       puts "Resolutions index has been deleted"
+     end
+
+
+     def get_logdate options={}
+       @client.search(body: {
+           size: 1,
+           query: {
+             query_string: {
+               query: "*"
+             }
+           },
+           aggregations: aggregations(options)
+         },
+         index: "resolutions"
+       ).dig("hits", "hits", 0, "_source", "logdate")
+     end
+
+     def aggregations options={}
+       {
+         doi: { composite: {
+             sources: [{ doi: { terms: { field: :doi } } }],
+             after: { doi: options.fetch(:after_key, "") },
+             size: options[:aggs_size]
+           },
+           aggs: {
+             unique: { terms: { field: "unique_usage" } },
+             totale: { terms: { field: "total_usage" } }
+           }
+         }
+       }
+     end
+
+   end
+ end
+
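The aggregations method above builds an Elasticsearch composite aggregation keyed on doi, so results are paged with an after_key cursor rather than from/size offsets. A minimal sketch of walking every page through Client#get; the loop and variable names are illustrative and mirror how Report#get_events consumes the same response later in this diff:

    require "kishu"

    client = Kishu::Client.new
    after  = ""
    loop do
      response = client.get(aggs_size: 500, after_key: after)
      response.dig("aggregations", "doi", "buckets").each do |bucket|
        # Each composite bucket carries the DOI key plus the unique/total usage sub-aggregations.
        puts "#{bucket.dig('key', 'doi')}: #{bucket['doc_count']} resolutions"
      end
      # Elasticsearch returns an after_key until the last page has been served.
      after = response.dig("aggregations", "doi", "after_key", "doi")
      break if after.nil?
    end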
@@ -0,0 +1,22 @@
+ require_relative 'resolution_event'
+
+
+ class LagottoJob
+   include SuckerPunch::Job
+   include Kishu::Utils
+   workers 4
+
+
+
+   def perform(report, options={})
+     # data = format_instance event, options
+
+     # push_url = LAGOTTINO_URL + "/events"
+     # response = Maremma.post(push_url, data: data.to_json,
+     #   bearer: LAGOTTINO_TOKEN,
+     #   content_type: 'application/vnd.api+json')
+     # puts data
+     # puts response.status
+     Report.send_report report
+   end
+ end
@@ -0,0 +1,33 @@
+
+ require 'thor'
+
+ require_relative 'merger'
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Log < Thor
+
+     include Kishu::Base
+     include Kishu::Merger
+     include Kishu::Utils
+
+
+     desc "create logs", "create logs"
+     method_option :logs_bucket, :default => ENV['S3_RESOLUTION_LOGS_BUCKET']
+     method_option :output_bucket, :default => ENV['S3_MERGED_LOGS_BUCKET']
+     method_option :month_year, :type => :string, :default => "201804"
+
+     def create
+       return "Logs don't exist" unless File.directory?(options[:month_year])
+ return "Pipeline has events" unless Pipeline.new.is_empty?
24
+ @log_date = get_date options[:month_year]
25
+ @folder = options[:month_year]
26
+ puts @log_date
27
+ uncompress_files
28
+ # add_bookends
29
+ merge_files
30
+ sort_files
31
+ end
32
+ end
33
+ end
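Kishu::Log is registered as the log subcommand of Kishu::CLI (see the CLI hunk above), and the gemspec declares a kishu executable that calls Kishu::CLI.start. A sketch of invoking the create command programmatically; the month value is illustrative and the option spelling follows Thor's defaults:

    require "kishu"

    # Roughly what `kishu log create --month_year 201811` does from the shell:
    # merge and sort that month's resolution logs, provided the folder exists
    # and the Logstash pipeline has not emitted any events yet.
    Kishu::CLI.start(["log", "create", "--month_year", "201811"])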
@@ -0,0 +1,69 @@
+
+ require 'date'
+
+ module Kishu
+   module Merger
+
+     FILE_STEM = "DataCite-access.log"
+
+     def get_date filename
+       Date.parse("#{filename}01")
+     end
+
+
+     def uncompress_files
+       system("gunzip #{resolution_logs_folder}/#{FILE_STEM}-*")
+     end
+
+
+     # def add_bookends
+     #   File.delete("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log") if File.exist?("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log")
+     #   File.delete("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log") if File.exist?("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log")
+
+     #   begin_date = Date.civil(@log_date.year,@log_date.month,1).strftime("%Y-%m-%d")
+     #   end_date = Date.civil(@log_date.year,@log_date.month+1, 1).strftime("%Y-%m-%d")
+
+     #   begin_line = '0.0.0.0 HTTP:HDL "'+begin_date+' 00:00:00.000Z" 1 1 22ms 10.5281/zenodo.1043571 "300:10.admin/codata" "" "Mozilla"'+"\n"
+     #   puts begin_line
+
+     #   end_line = '0.0.0.0 HTTP:HDL "'+end_date+' 00:01:00.000Z" 1 1 22ms 10.5281/zenodo.1043571 "300:10.admin/codata" "" "Mozilla"'+"\n"
+     #   puts end_line
+
+     #   File.open("#{resolution_logs_folder}/#{FILE_STEM}-1-begin.log","w") {|f| f.write(begin_line) }
+     #   File.open("#{resolution_logs_folder}/#{FILE_STEM}-9-eof.log","w") {|f| f.write(end_line) }
+     # end
+
+     def merged_file
+       "#{merged_logs_folder}/datacite_resolution_logs_#{@log_date}.log"
+     end
+
+     def sorted_file
+       "#{resolution_logs_folder}/datacite_resolution_logs_#{@log_date}_sorted.log"
+     end
+
+     def resolution_logs_folder
+       bucket = S3_RESOLUTION_LOGS_BUCKET
+       "#{bucket}#{@folder}"
+     end
+
+     def merged_logs_folder
+       bucket = S3_MERGED_LOGS_BUCKET
+       "#{bucket}#{@folder}"
+     end
+
+     def merge_files
+       File.delete(merged_file) if File.exist?(merged_file)
+
+       system("cat #{resolution_logs_folder}/#{FILE_STEM}-* > #{merged_file}")
+       puts "Merge completed"
+     end
+
+     def sort_files
+       File.delete(sorted_file) if File.exist?(sorted_file)
+
+       system("sort -k3 #{merged_file} > #{sorted_file}")
+       puts "Sort completed"
+       puts sorted_file
+     end
+   end
+ end
@@ -0,0 +1,29 @@
+ require 'faraday'
+ require 'logger'
+
+ require_relative 'utils'
+ require_relative 'base'
+
+ module Kishu
+   class Pipeline
+
+     def initialize
+       @conn = Faraday.new(:url => LOGSTASH_HOST)
+       # logger = Logger.new(STDOUT)
+       # logger.info
+     end
+
+     def is_ready?
+       main = @conn.get do |req|
+         req.url '/_node/stats/pipelines/main'
+       end
+       JSON.parse(main.body).dig("pipelines", "main", "events", "out") == 0 # ready while the main pipeline has emitted no events
+     end
+
+     def is_running?
+
+     end
+
+   end
+ end
+
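is_ready? polls the Logstash node stats API, so the dig path follows the shape of that payload. Roughly, the parsed response looks like the following (abridged; the counts are illustrative):

    # Parsed body of GET /_node/stats/pipelines/main (abridged, values illustrative):
    {
      "pipelines" => {
        "main" => {
          "events" => { "in" => 0, "filtered" => 0, "out" => 0 }
        }
      }
    }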
@@ -0,0 +1,149 @@
+ require 'json'
+ require 'date'
+ require "faraday"
+ require 'securerandom'
+ require 'zlib'
+ require 'digest'
+
+ require_relative 'resolution_event'
+ require_relative 'client'
+
+ module Kishu
+   class Report
+
+     include Kishu::Base
+     include Kishu::Utils
+
+     def initialize options={}
+       set_period
+       @es_client = Client.new()
+       @logger = Logger.new(STDOUT)
+       @report_id = options[:report_id] ? options[:report_id] : ""
+       @total = 0
+       @aggs_size = options[:aggs_size]
+       @chunk_size = options[:chunk_size]
+       @after = options[:after_key] ||= ""
+     end
+
+     def report_period options={}
+       es_client = Client.new()
+
+       logdate = es_client.get_logdate({aggs_size: 1})
+       puts logdate
+       Date.parse(logdate)
+     end
+
+
+     def get_events options={}
+       logger = Logger.new(STDOUT)
+       es_client = Client.new()
+       response = es_client.get({aggs_size: @aggs_size || 500, after_key: options[:after_key] ||= ""})
+       aggs = response.dig("aggregations", "doi", "buckets")
+       x = aggs.map do |agg|
+         ResolutionEvent.new(agg, {period: @period, report_id: @report_id}).wrap_event
+       end
+       after = response.dig("aggregations", "doi").fetch("after_key", {"doi" => nil}).dig("doi")
+       logger.info "After_key for pagination #{after}"
+       y = {data: x, after: after}
+       y
+     end
+
+
+     def generate_dataset_array
+       @datasets = []
+       loop do
+         response = get_events({after_key: @after ||= ""})
+         @datasets = @datasets.concat response[:data]
+         @after = response[:after]
+         @total += @datasets.size
+         generate_chunk_report if @datasets.size > @chunk_size
+         break if @after.nil?
+       end
+     end
+
+     def compress report
+       # report = File.read(hash)
+       gzip = Zlib::GzipWriter.new(StringIO.new)
+       string = report.to_json
+       gzip << string
+       body = gzip.close.string
+       body
+     end
+
+
+     def generate_chunk_report
+       # puts get_template
+       # LagottoJob.perform_async(get_template(@datasets))
+       file = merged_file #+ 'after_key_' + @after
+       File.open(file, "w") do |f|
+         f.write(JSON.pretty_generate get_template)
+       end
+       send_report get_template
+       @datasets = []
+     end
+
+     def make_report options={}
+       generate_dataset_array
+       @logger.info "#{LOGS_TAG} Month of #{@period.dig("begin-date")} sent to Hub in report #{@uid} with stats for #{@total} datasets"
+     end
+
+
+     def set_period
+       report_period
+       @period = {
+         "begin-date": Date.civil(report_period.year, report_period.mon, 1).strftime("%Y-%m-%d"),
+         "end-date": Date.civil(report_period.year, report_period.mon, -1).strftime("%Y-%m-%d"),
+       }
+     end
+
+     def send_report report, options={}
+       uri = HUB_URL + '/reports'
+       puts uri
+
+       headers = {
+         content_type: "application/gzip",
+         content_encoding: 'gzip',
+         accept: 'gzip'
+       }
+
+       body = compress(report)
+       n = 0
+       loop do
+         request = Maremma.post(uri, data: body,
+                                bearer: ENV['HUB_TOKEN'],
+                                headers: headers,
+                                timeout: 100)
+
+         @uid = request.body.dig("data", "report", "id")
+         @logger.info "#{LOGS_TAG} Hub response #{request.status} for Report finishing in #{@after}"
+         @logger.info "#{LOGS_TAG} Hub response #{@uid} for Report finishing in #{@after}"
+         n += 1
+         break if request.status == 201
+         fail "#{LOGS_TAG} Too many failed attempts to push this report" if n > 1
+         sleep 1
+       end
+     end
+
+     def get_template
+       {
+         "report-header": get_header,
+         "report-datasets": @datasets
+       }
+     end
+
+     def get_header
+       {
+         "report-name": "resolution report",
+         "report-id": "dsr",
+         release: "drl",
+         created: Date.today.strftime("%Y-%m-%d"),
+         "created-by": "datacite",
+         "reporting-period": @period,
+         "report-filters": [],
+         "report-attributes": [],
+         exceptions: [{code: 69, severity: "warning", message: "Report is compressed using gzip", "help-url": "https://github.com/datacite/sashimi", data: "usage data needs to be uncompressed"}]
+       }
+     end
+
+   end
+ end
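Taken together, make_report pages through the resolution index with get_events, buffers the wrapped events, and whenever the buffer grows past chunk_size writes a gzipped COUNTER chunk and POSTs it to the hub at HUB_URL. A minimal sketch of driving it directly; the sizes are illustrative tuning values, and the ES_*, HUB_URL and HUB_TOKEN variables from the top of this diff must point at real services:

    require "kishu"

    # Build a resolution report for the month found in the index and send it
    # to the hub in chunks of roughly 20,000 datasets (sizes are illustrative).
    report = Kishu::Report.new(aggs_size: 500, chunk_size: 20_000)
    report.make_report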