log_analysis 0.1.0 → 0.1.1

@@ -2,28 +2,35 @@ require 'log_analysis/version'
  require 'log_analysis/preprocess'
  require 'log_analysis/user_identification'
  require 'log_analysis/session_identification'
+ require 'log_analysis/transformation'
+ require 'log_analysis/rule_generation'

  class LogAnalysis
    class Error < StandardError; end
    # Your code goes here...

-   attr_reader :path, :origin_logs
+   attr_reader :path, :type, :cleaned_data

    def initialize(path, type = nil)
-     @path = path
-     @type = type
-     @origin_logs = PreProcess.input(path, type)
-   end
-
-   def cleaned_data
-     PreProcess.data_cleaning(@origin_logs)
+     @path = path
+     @type = type
+     @cleaned_data = PreProcess.input(path, type)
+     system('mkdir', '-p', LogAnalysis::DATA_PATH)
    end

    def identified_user
-     UserIdentification.execute(cleaned_data)
+     UserIdentification.execute(@cleaned_data)
    end

    def identified_session
-     SessionIdentification.execute(cleaned_data)
+     SessionIdentification.execute(@cleaned_data)
+   end
+
+   def transformation
+     Transformation.execute(identified_session)
+   end
+
+   def rule_generation
+     RuleGeneration.execute(transformation)
    end
  end
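The top-level class now exposes the whole mining pipeline as chained steps: parsing and cleaning happen in the constructor, and each later step feeds the next (identified_user, identified_session, transformation, rule_generation). A minimal usage sketch, assuming an nginx access log sitting at access.log next to the script:

    require 'log_analysis'

    # With no explicit type the file is treated as an nginx access log;
    # parsing and cleaning run inside the constructor.
    analysis = LogAnalysis.new('access.log')

    analysis.identified_user     # unique users (host + user agent)
    analysis.identified_session  # SessionIdentity objects per user
    analysis.transformation      # { host => [uri indices, ...] }
    analysis.rule_generation     # [[sequence URIs, sub-sequence URIs, confidence], ...]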
@@ -25,7 +25,7 @@ class Record
    attr_reader :params

-   DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg].freeze
+   DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
    REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|-|Crawler|spider|Detection/.freeze

    def initialize(params)
@@ -70,7 +70,7 @@ class Record
  p['apptime'] = @params['apptime'].to_f
  p['cache'] = validate_string(@params['cache'])
  p['vhost'] = validate_string(@params['vhost'])
- p['user'] = UserIdentity.new(host: IPAddr.new(@params['host']), user_agent: UserAgent.parse(@params['ua']))
+ p['user'] = @params['user'] || nil
  p['forwardedfor'] = validate_string(@params['forwardedfor'])
  p['forwardedproto'] = validate_string(@params['forwardedproto'])
  end
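Cleaning in PreProcess keeps a request only when record.status_200? && record.method_get? && record.uri_without_data && !record.robot?. Those Record predicates are not part of this diff, but DATA_TYPE (which now also covers .otf) and REGEX_BOT suggest their shape; a hypothetical sketch of the asset and bot checks, where the method names and logic are assumptions and only the two constants come from the gem:

    # Abridged copies of the Record constants, for illustration only.
    DATA_TYPE = %w[.js .css .jpg .png .svg .otf .woff2].freeze
    REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|-|Crawler|spider|Detection/.freeze

    # Assumed shape of Record#uri_without_data: drop static-asset requests.
    def uri_without_data?(uri)
      DATA_TYPE.none? { |ext| File.extname(uri.split('?').first.to_s) == ext }
    end

    # Assumed shape of Record#robot?: flag crawler user agents.
    def robot?(user_agent)
      !(user_agent.to_s =~ REGEX_BOT).nil?
    end

    uri_without_data?('/products/12')      # => true  (kept)
    uri_without_data?('/fonts/icons.otf')  # => false (now filtered out)
    robot?('Baiduspider/2.0')              # => true  (matches 'spider')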
@@ -1,6 +1,6 @@
  require 'log_analysis/model/record'
+ require 'log_analysis/model/user_identity'
  require 'json'
- require 'pry'

  module PreProcess
    class Error < StandardError; end
@@ -13,63 +13,54 @@ module PreProcess
    CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_records' }.freeze

    def self.input(file_path, type)
-     arr_logs = File.readlines(file_path).each_with_object([]) do |line, arr|
-       preprocess_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
-       arr.push(preprocess_log)
-     end
-
-     send(CONVERT_RECORD[type.nil? ? 'nginx' : type], arr_logs)
-   end
-
-   def self.data_cleaning(logs)
-     logs.select { |record| record.status_200? && record.method_get? && record.uri_without_data && !record.robot? }
-   end
-
-   def self.to_records(logs)
-     logs.each_with_object([]).with_index do |(log, arrays), i|
-       next if log.nil?
-
-       o = log.split(REGEX_KEYS)
-       o = o.map(&:strip)
-       o.delete('')
+     @users = []

-       log = o.each_slice(2).to_a.each_with_object({}) do |pair, log_obj|
-         log_obj.merge!(to_json(pair))
-       end
+     File.readlines(file_path).each_with_object([]).with_index do |(line, arr), i|
+       preprocessed_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
+       record = Record.new(send(CONVERT_RECORD[type.nil? ? 'nginx' : type], preprocessed_log)) unless preprocessed_log.nil?

-       arrays << Record.new(log)
+       arr.push(record) if record.status_200? && record.method_get? && record.uri_without_data && !record.robot?

-       puts "#{i}/#{logs.size}"
+       puts arr.size
      end
    end

-   def self.convert_nginx_logs(logs)
-     logs.each_with_object([]).with_index do |(log, arrays), i|
-       next if log.nil?
-
-       o = log.split(REGEX_NGINX)
-       o.delete('')
-
-       obj = {}.tap do |p|
-         p['host'] = o[0]
-         p['user'] = o[2]
-         p['time'] = o[3]
-         p['method'] = o[4]
-         p['uri'] = o[5]
-         p['status'] = o[6]
-         p['size'] = o[7]
-         p['referer'] = o[8]
-         p['ua'] = o[9]
-         p['forwarded'] = o[10]
-       end
-
-       arrays << Record.new(obj)
+   def self.to_record(log)
+     o = log.gsub!('\t', ' ')
+     o = log.split(REGEX_KEYS)
+     o = o.map(&:strip)
+     o.delete('')
+     o.each_slice(2).to_a.each_with_object({}) { |pair, log_obj| log_obj.merge!(to_json(pair)) }
+   end

-       puts "#{i}/#{logs.size}"
+   def self.convert_nginx_logs(log)
+     o = log.split(REGEX_NGINX)
+     o.delete('')
+
+     {}.tap do |p|
+       p['host'] = o[0]
+       p['user'] = o[2]
+       p['time'] = o[3]
+       p['method'] = o[4]
+       p['uri'] = o[5]
+       p['status'] = o[6]
+       p['size'] = o[7]
+       p['referer'] = o[8]
+       p['ua'] = o[9]
+       p['forwarded'] = o[10]
+       p['user'] = save_user(o)
      end
    end

    def self.to_json(pair)
      { pair.first.delete(':') => pair.last }
    end
+
+   def self.save_user(log)
+     user = @users.find { |i| i.host == log[0] && i.user_agent.to_s == log[9] }
+     return user unless user.nil?
+
+     @users.push(UserIdentity.new(host: IPAddr.new(log[0]), user_agent: UserAgent.parse(log[9])))
+     @users.last
+   end
  end
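PreProcess now parses, filters and assigns users in a single pass over the file, and save_user keeps exactly one UserIdentity per host/user-agent pair, so every Record from the same client shares the same user object (which is what lets SessionIdentification compare s.user == record.user directly). A minimal sketch of that find-or-create pattern, with a Struct standing in for the gem's UserIdentity and plain strings in place of IPAddr / UserAgent values:

    # Stand-in for the gem's UserIdentity model (an assumption for this sketch).
    UserIdentity = Struct.new(:host, :user_agent, keyword_init: true)

    @users = []

    # Same find-or-create logic as PreProcess.save_user, on plain strings.
    def save_user(host, user_agent)
      user = @users.find { |u| u.host == host && u.user_agent.to_s == user_agent }
      return user unless user.nil?

      @users.push(UserIdentity.new(host: host, user_agent: user_agent))
      @users.last
    end

    a = save_user('66.249.73.135', 'Mozilla/5.0')
    b = save_user('66.249.73.135', 'Mozilla/5.0')
    c = save_user('10.0.0.7', 'Mozilla/5.0')

    a.equal?(b)  # => true  -- repeated requests reuse one identity object
    a.equal?(c)  # => false
    @users.size  # => 2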
@@ -0,0 +1,59 @@
+ require 'time'
+ require 'log_analysis/version'
+
+ module RuleGeneration
+   JAR_FILE_PATH = File.expand_path('spmf.jar')
+   TRANSFORM_DATA_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}transform_data_#{Time.now.strftime('%Y%m%d')}.txt")
+   RULE_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}output_#{Time.now.strftime('%Y%m%d')}.txt")
+   MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
+
+   class Error < StandardError; end
+   # Your code goes here...
+
+   def self.execute(transform_data)
+     File.open(TRANSFORM_DATA_PATH, 'w+') { |f| transform_data.keys.each { |e| f.puts(transform_data[e].map { |i| i.is_a?(Array) ? i.join(' ') : i }.join(' -1 ').concat(' -1 -2')) } }
+     system("java -jar #{JAR_FILE_PATH} run SPADE #{TRANSFORM_DATA_PATH} #{RULE_FILE_PATH} 65%")
+     result = rule_gen(get_seq(File.read(RULE_FILE_PATH)), 0.5)
+     map_uri = File.read(MAP_URI_FILE_PATH).split(' ')
+
+     result.map do |rule|
+       seq, sub, rea = rule
+       [seq.map { |i| map_uri[i.to_i] }, sub.map { |i| map_uri[i.to_i] }, rea]
+     end
+   end
+
+   def self.rule_gen(seqs, min_conf)
+     seqs.each_with_object([]) { |seq, arr| seqs.each { |sub| arr.push([seq[0], sub[0], seq[1] / sub[1]]) if sub[0] != seq[0] && sub_seq?(sub[0], seq[0]) && seq[1] / sub[1] >= min_conf } }
+   end
+
+   def self.sub_seq?(first, second)
+     ptr = 0
+     first.each do |sub|
+       return false if ptr >= second.size
+
+       (ptr..second.size - 1).each do |n|
+         if sub?(second[n], sub)
+           ptr = n + 1
+           break
+         end
+         return false if ptr == second.size - 1
+       end
+     end
+     true
+   end
+
+   def self.sub?(str, sub)
+     mark_sub = 0
+     sub.split(' ').each { |char| mark_sub += 1 if str.include?(char) }
+
+     mark_sub == sub.split(' ').size
+   end
+
+   def self.get_seq(seq_str)
+     seq = seq_str.split("\n")
+     seq.each_with_object([]) do |s, arr|
+       split_seq = s.split('-1')
+       arr.push([split_seq[0..-2], split_seq[-1][-1].to_f])
+     end
+   end
+ end
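RuleGeneration serialises the transform hash into SPMF's sequence-database format (items within an itemset are space-separated, itemsets are separated by -1, and -2 terminates the sequence), runs the bundled spmf.jar with the SPADE algorithm at 65% minimum support, then keeps rules whose confidence (support of a sequence divided by the support of one of its sub-sequences) is at least 0.5. What one transform entry turns into, with illustrative values:

    # One host's sessions as URI indices; a later multi-URI session is a nested array.
    transform_data = { '66.249.73.135' => [0, 1, [2, 3]] }

    line = transform_data['66.249.73.135']
             .map { |i| i.is_a?(Array) ? i.join(' ') : i }
             .join(' -1 ')
             .concat(' -1 -2')

    line  # => "0 -1 1 -1 2 3 -1 -2"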
@@ -8,18 +8,10 @@ module SessionIdentification
    # Your code goes here...

    def self.execute(cleaned_data)
-     session_identity = []
-     cleaned_data.each do |record|
-       isession = session_identity.rindex { |s| s.user.host == record.user.host && s.user.user_agent == record.user.user_agent }
-
-       if isession.present? && validate_time_session(session_identity[isession].records.last.time, record.time)
-         session_identity[isession].records << record
-       else
-         session_identity << SessionIdentity.new(session_identity_params(record))
-       end
+     cleaned_data.each_with_object([]) do |record, arr|
+       isession = arr.rindex { |s| s.user == record.user }
+       isession.present? && validate_time_session(arr[isession].records.last.time, record.time) ? arr[isession].records << record : arr << SessionIdentity.new(session_identity_params(record))
      end
-
-     session_identity.map { |i| i.records.map(&:uri) }
    end

    private
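Session identification now extends a user's most recent session when validate_time_session accepts the gap between requests, and starts a new SessionIdentity otherwise. validate_time_session itself is private and untouched by this diff; a hedged sketch of what such a check typically looks like, assuming nginx-style timestamps and the conventional 30-minute inactivity timeout (neither value is taken from the gem):

    require 'time'

    SESSION_TIMEOUT = 30 * 60  # assumed 30-minute inactivity window
    TIME_FORMAT = '%d/%b/%Y:%H:%M:%S %z'

    def validate_time_session(last_time, current_time)
      gap = Time.strptime(current_time, TIME_FORMAT) - Time.strptime(last_time, TIME_FORMAT)
      gap <= SESSION_TIMEOUT
    end

    validate_time_session('12/Jul/2020:10:00:00 +0700', '12/Jul/2020:10:20:00 +0700')  # => true,  same session
    validate_time_session('12/Jul/2020:10:00:00 +0700', '12/Jul/2020:11:00:00 +0700')  # => false, new session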
@@ -0,0 +1,26 @@
+ require 'log_analysis/model/session_identity'
+ require 'log_analysis/model/user_identity'
+ require 'log_analysis/version'
+
+ module Transformation
+   MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
+
+   class Error < StandardError; end
+   # Your code goes here...
+
+   def self.execute(identified_session)
+     map_uri = []
+     transform = identified_session.each_with_object({}) do |v, hash|
+       uries = v.records.map(&:uri)
+       uries.each { |i| map_uri.push(i) unless map_uri.include?(i) }
+       if hash.key?(v.user.host.to_s)
+         uries.size == 1 ? hash[v.user.host.to_s] += v.records.map { |i| map_uri.index(i.uri) } : hash[v.user.host.to_s].push(v.records.map { |i| map_uri.index(i.uri) })
+       else
+         hash.merge!(v.user.host.to_s => v.records.map { |i| map_uri.index(i.uri) })
+       end
+     end
+
+     File.open(MAP_URI_FILE_PATH, 'w+') { |f| f.write(map_uri.join(' ')) }
+     transform
+   end
+ end
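Transformation gives every distinct URI an integer index in first-seen order, rewrites each session as a list of those indices grouped under the client's host, and writes the index-to-URI mapping to map_uri_YYYYMMDD.txt so RuleGeneration can translate the mined rules back into URIs. The core of that encoding on made-up sessions:

    # Illustrative sessions only; in the gem these come from SessionIdentity records.
    map_uri = []
    sessions = [
      %w[/home /products /cart],
      %w[/home /checkout]
    ]

    encoded = sessions.map do |uris|
      uris.each { |u| map_uri.push(u) unless map_uri.include?(u) }
      uris.map { |u| map_uri.index(u) }
    end

    encoded  # => [[0, 1, 2], [0, 3]]
    map_uri  # => ["/home", "/products", "/cart", "/checkout"]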
@@ -6,10 +6,6 @@ module UserIdentification
    # Your code goes here...

    def self.execute(cleaned_data)
-     data = cleaned_data.map { |record| [record.host, record.ua.to_s] }.uniq
-     data.each_with_object([]) do |record, arrs|
-       o = UserIdentity.new(host: record.first, user_agent: record.last)
-       arrs << o
-     end
+     cleaned_data.map(&:user).uniq
    end
  end
@@ -1,3 +1,4 @@
  class LogAnalysis
-   VERSION = '0.1.0'.freeze
+   VERSION = '0.1.1'.freeze
+   DATA_PATH = '~/data/waazabag/'.freeze
  end
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: log_analysis
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.1
  platform: ruby
  authors:
  - Michael Tran
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-06-27 00:00:00.000000000 Z
+ date: 2020-07-12 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: useragent
@@ -53,6 +53,7 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - access.log
  - bin/console
  - bin/setup
  - lib/log_analysis.rb
@@ -60,10 +61,13 @@ files:
  - lib/log_analysis/model/session_identity.rb
  - lib/log_analysis/model/user_identity.rb
  - lib/log_analysis/preprocess.rb
+ - lib/log_analysis/rule_generation.rb
  - lib/log_analysis/session_identification.rb
+ - lib/log_analysis/transformation.rb
  - lib/log_analysis/user_identification.rb
  - lib/log_analysis/version.rb
  - log_analysis.gemspec
+ - spmf.jar
  homepage: https://github.com/michaelt0520/log_analysis_thesis
  licenses:
  - MIT