log_analysis 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,28 +2,35 @@ require 'log_analysis/version'
2
2
  require 'log_analysis/preprocess'
3
3
  require 'log_analysis/user_identification'
4
4
  require 'log_analysis/session_identification'
5
+ require 'log_analysis/transformation'
6
+ require 'log_analysis/rule_generation'
5
7
 
6
8
  class LogAnalysis
7
9
  class Error < StandardError; end
8
10
  # Your code goes here...
9
11
 
10
- attr_reader :path, :origin_logs
12
+ attr_reader :path, :type, :cleaned_data
11
13
 
12
14
  def initialize(path, type = nil)
13
- @path = path
14
- @type = type
15
- @origin_logs = PreProcess.input(path, type)
16
- end
17
-
18
- def cleaned_data
19
- PreProcess.data_cleaning(@origin_logs)
15
+ @path = path
16
+ @type = type
17
+ @cleaned_data = PreProcess.input(path, type)
18
+ system('mkdir', '-p', LogAnalysis::DATA_PATH)
20
19
  end
21
20
 
22
21
  def identified_user
23
- UserIdentification.execute(cleaned_data)
22
+ UserIdentification.execute(@cleaned_data)
24
23
  end
25
24
 
26
25
  def identified_session
27
- SessionIdentification.execute(cleaned_data)
26
+ SessionIdentification.execute(@cleaned_data)
27
+ end
28
+
29
+ def transformation
30
+ Transformation.execute(identified_session)
31
+ end
32
+
33
+ def rule_generation
34
+ RuleGeneration.execute(transformation)
28
35
  end
29
36
  end
@@ -25,7 +25,7 @@ class Record
25
25
 
26
26
  attr_reader :params
27
27
 
28
- DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg].freeze
28
+ DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
29
29
  REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|-|Crawler|spider|Detection/.freeze
30
30
 
31
31
  def initialize(params)
@@ -70,7 +70,7 @@ class Record
70
70
  p['apptime'] = @params['apptime'].to_f
71
71
  p['cache'] = validate_string(@params['cache'])
72
72
  p['vhost'] = validate_string(@params['vhost'])
73
- p['user'] = UserIdentity.new(host: IPAddr.new(@params['host']), user_agent: UserAgent.parse(@params['ua']))
73
+ p['user'] = @params['user'] || nil
74
74
  p['forwardedfor'] = validate_string(@params['forwardedfor'])
75
75
  p['forwardedproto'] = validate_string(@params['forwardedproto'])
76
76
  end
@@ -1,6 +1,6 @@
1
1
  require 'log_analysis/model/record'
2
+ require 'log_analysis/model/user_identity'
2
3
  require 'json'
3
- require 'pry'
4
4
 
5
5
  module PreProcess
6
6
  class Error < StandardError; end
@@ -13,63 +13,54 @@ module PreProcess
13
13
  CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_records' }.freeze
14
14
 
15
15
  def self.input(file_path, type)
16
- arr_logs = File.readlines(file_path).each_with_object([]) do |line, arr|
17
- preprocess_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
18
- arr.push(preprocess_log)
19
- end
20
-
21
- send(CONVERT_RECORD[type.nil? ? 'nginx' : type], arr_logs)
22
- end
23
-
24
- def self.data_cleaning(logs)
25
- logs.select { |record| record.status_200? && record.method_get? && record.uri_without_data && !record.robot? }
26
- end
27
-
28
- def self.to_records(logs)
29
- logs.each_with_object([]).with_index do |(log, arrays), i|
30
- next if log.nil?
31
-
32
- o = log.split(REGEX_KEYS)
33
- o = o.map(&:strip)
34
- o.delete('')
16
+ @users = []
35
17
 
36
- log = o.each_slice(2).to_a.each_with_object({}) do |pair, log_obj|
37
- log_obj.merge!(to_json(pair))
38
- end
18
+ File.readlines(file_path).each_with_object([]).with_index do |(line, arr), i|
19
+ preprocessed_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
20
+ record = Record.new(send(CONVERT_RECORD[type.nil? ? 'nginx' : type], preprocessed_log)) unless preprocessed_log.nil?
39
21
 
40
- arrays << Record.new(log)
22
+ arr.push(record) if record.status_200? && record.method_get? && record.uri_without_data && !record.robot?
41
23
 
42
- puts "#{i}/#{logs.size}"
24
+ puts arr.size
43
25
  end
44
26
  end
45
27
 
46
- def self.convert_nginx_logs(logs)
47
- logs.each_with_object([]).with_index do |(log, arrays), i|
48
- next if log.nil?
49
-
50
- o = log.split(REGEX_NGINX)
51
- o.delete('')
52
-
53
- obj = {}.tap do |p|
54
- p['host'] = o[0]
55
- p['user'] = o[2]
56
- p['time'] = o[3]
57
- p['method'] = o[4]
58
- p['uri'] = o[5]
59
- p['status'] = o[6]
60
- p['size'] = o[7]
61
- p['referer'] = o[8]
62
- p['ua'] = o[9]
63
- p['forwarded'] = o[10]
64
- end
65
-
66
- arrays << Record.new(obj)
28
+ def self.to_record(log)
29
+ o = log.gsub!('\t', ' ')
30
+ o = log.split(REGEX_KEYS)
31
+ o = o.map(&:strip)
32
+ o.delete('')
33
+ o.each_slice(2).to_a.each_with_object({}) { |pair, log_obj| log_obj.merge!(to_json(pair)) }
34
+ end
67
35
 
68
- puts "#{i}/#{logs.size}"
36
+ def self.convert_nginx_logs(log)
37
+ o = log.split(REGEX_NGINX)
38
+ o.delete('')
39
+
40
+ {}.tap do |p|
41
+ p['host'] = o[0]
42
+ p['user'] = o[2]
43
+ p['time'] = o[3]
44
+ p['method'] = o[4]
45
+ p['uri'] = o[5]
46
+ p['status'] = o[6]
47
+ p['size'] = o[7]
48
+ p['referer'] = o[8]
49
+ p['ua'] = o[9]
50
+ p['forwarded'] = o[10]
51
+ p['user'] = save_user(o)
69
52
  end
70
53
  end
71
54
 
72
55
  def self.to_json(pair)
73
56
  { pair.first.delete(':') => pair.last }
74
57
  end
58
+
59
+ def self.save_user(log)
60
+ user = @users.find { |i| i.host == log[0] && i.user_agent.to_s == log[9] }
61
+ return user unless user.nil?
62
+
63
+ @users.push(UserIdentity.new(host: IPAddr.new(log[0]), user_agent: UserAgent.parse(log[9])))
64
+ @users.last
65
+ end
75
66
  end
@@ -0,0 +1,59 @@
1
+ require 'time'
2
+ require 'log_analysis/version'
3
+
4
+ module RuleGeneration
5
+ JAR_FILE_PATH = File.expand_path('spmf.jar')
6
+ TRANSFORM_DATA_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}transform_data_#{Time.now.strftime('%Y%m%d')}.txt")
7
+ RULE_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}output_#{Time.now.strftime('%Y%m%d')}.txt")
8
+ MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
9
+
10
+ class Error < StandardError; end
11
+ # Your code goes here...
12
+
13
+ def self.execute(transform_data)
14
+ File.open(TRANSFORM_DATA_PATH, 'w+') { |f| transform_data.keys.each { |e| f.puts(transform_data[e].map { |i| i.is_a?(Array) ? i.join(' ') : i }.join(' -1 ').concat(' -1 -2')) } }
15
+ system("java -jar #{JAR_FILE_PATH} run SPADE #{TRANSFORM_DATA_PATH} #{RULE_FILE_PATH} 65%")
16
+ result = rule_gen(get_seq(File.read(RULE_FILE_PATH)), 0.5)
17
+ map_uri = File.read(MAP_URI_FILE_PATH).split(' ')
18
+
19
+ result.map do |rule|
20
+ seq, sub, rea = rule
21
+ [seq.map { |i| map_uri[i.to_i] }, sub.map { |i| map_uri[i.to_i] }, rea]
22
+ end
23
+ end
24
+
25
+ def self.rule_gen(seqs, min_conf)
26
+ seqs.each_with_object([]) { |seq, arr| seqs.each { |sub| arr.push([seq[0], sub[0], seq[1] / sub[1]]) if sub[0] != seq[0] && sub_seq?(sub[0], seq[0]) && seq[1] / sub[1] >= min_conf } }
27
+ end
28
+
29
+ def self.sub_seq?(first, second)
30
+ ptr = 0
31
+ first.each do |sub|
32
+ return false if ptr >= second.size
33
+
34
+ (ptr..second.size - 1).each do |n|
35
+ if sub?(second[n], sub)
36
+ ptr = n + 1
37
+ break
38
+ end
39
+ return false if ptr == second.size - 1
40
+ end
41
+ end
42
+ true
43
+ end
44
+
45
+ def self.sub?(str, sub)
46
+ mark_sub = 0
47
+ sub.split(' ').each { |char| mark_sub += 1 if str.include?(char) }
48
+
49
+ mark_sub == sub.split(' ').size
50
+ end
51
+
52
+ def self.get_seq(seq_str)
53
+ seq = seq_str.split("\n")
54
+ seq.each_with_object([]) do |s, arr|
55
+ split_seq = s.split('-1')
56
+ arr.push([split_seq[0..-2], split_seq[-1][-1].to_f])
57
+ end
58
+ end
59
+ end
@@ -8,18 +8,10 @@ module SessionIdentification
8
8
  # Your code goes here...
9
9
 
10
10
  def self.execute(cleaned_data)
11
- session_identity = []
12
- cleaned_data.each do |record|
13
- isession = session_identity.rindex { |s| s.user.host == record.user.host && s.user.user_agent == record.user.user_agent }
14
-
15
- if isession.present? && validate_time_session(session_identity[isession].records.last.time, record.time)
16
- session_identity[isession].records << record
17
- else
18
- session_identity << SessionIdentity.new(session_identity_params(record))
19
- end
11
+ cleaned_data.each_with_object([]) do |record, arr|
12
+ isession = arr.rindex { |s| s.user == record.user }
13
+ isession.present? && validate_time_session(arr[isession].records.last.time, record.time) ? arr[isession].records << record : arr << SessionIdentity.new(session_identity_params(record))
20
14
  end
21
-
22
- session_identity.map { |i| i.records.map(&:uri) }
23
15
  end
24
16
 
25
17
  private
@@ -0,0 +1,26 @@
1
+ require 'log_analysis/model/session_identity'
2
+ require 'log_analysis/model/user_identity'
3
+ require 'log_analysis/version'
4
+
5
+ module Transformation
6
+ MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
7
+
8
+ class Error < StandardError; end
9
+ # Your code goes here...
10
+
11
+ def self.execute(identified_session)
12
+ map_uri = []
13
+ transform = identified_session.each_with_object({}) do |v, hash|
14
+ uries = v.records.map(&:uri)
15
+ uries.each { |i| map_uri.push(i) unless map_uri.include?(i) }
16
+ if hash.key?(v.user.host.to_s)
17
+ uries.size == 1 ? hash[v.user.host.to_s] += v.records.map { |i| map_uri.index(i.uri) } : hash[v.user.host.to_s].push(v.records.map { |i| map_uri.index(i.uri) })
18
+ else
19
+ hash.merge!(v.user.host.to_s => v.records.map { |i| map_uri.index(i.uri) })
20
+ end
21
+ end
22
+
23
+ File.open(MAP_URI_FILE_PATH, 'w+') { |f| f.write(map_uri.join(' ')) }
24
+ transform
25
+ end
26
+ end
@@ -6,10 +6,6 @@ module UserIdentification
6
6
  # Your code goes here...
7
7
 
8
8
  def self.execute(cleaned_data)
9
- data = cleaned_data.map { |record| [record.host, record.ua.to_s] }.uniq
10
- data.each_with_object([]) do |record, arrs|
11
- o = UserIdentity.new(host: record.first, user_agent: record.last)
12
- arrs << o
13
- end
9
+ cleaned_data.map(&:user).uniq
14
10
  end
15
11
  end
@@ -1,3 +1,4 @@
1
1
  class LogAnalysis
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.1.1'.freeze
3
+ DATA_PATH = '~/data/waazabag/'.freeze
3
4
  end
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_analysis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Tran
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-27 00:00:00.000000000 Z
11
+ date: 2020-07-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: useragent
@@ -53,6 +53,7 @@ files:
53
53
  - LICENSE.txt
54
54
  - README.md
55
55
  - Rakefile
56
+ - access.log
56
57
  - bin/console
57
58
  - bin/setup
58
59
  - lib/log_analysis.rb
@@ -60,10 +61,13 @@ files:
60
61
  - lib/log_analysis/model/session_identity.rb
61
62
  - lib/log_analysis/model/user_identity.rb
62
63
  - lib/log_analysis/preprocess.rb
64
+ - lib/log_analysis/rule_generation.rb
63
65
  - lib/log_analysis/session_identification.rb
66
+ - lib/log_analysis/transformation.rb
64
67
  - lib/log_analysis/user_identification.rb
65
68
  - lib/log_analysis/version.rb
66
69
  - log_analysis.gemspec
70
+ - spmf.jar
67
71
  homepage: https://github.com/michaelt0520/log_analysis_thesis
68
72
  licenses:
69
73
  - MIT