log_analysis 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +17 -19
- data/access.log +7549 -0
- data/lib/log_analysis.rb +17 -10
- data/lib/log_analysis/model/record.rb +2 -2
- data/lib/log_analysis/preprocess.rb +38 -47
- data/lib/log_analysis/rule_generation.rb +59 -0
- data/lib/log_analysis/session_identification.rb +3 -11
- data/lib/log_analysis/transformation.rb +26 -0
- data/lib/log_analysis/user_identification.rb +1 -5
- data/lib/log_analysis/version.rb +2 -1
- data/spmf.jar +0 -0
- metadata +6 -2
data/lib/log_analysis.rb
CHANGED
@@ -2,28 +2,35 @@ require 'log_analysis/version'
|
|
2
2
|
require 'log_analysis/preprocess'
|
3
3
|
require 'log_analysis/user_identification'
|
4
4
|
require 'log_analysis/session_identification'
|
5
|
+
require 'log_analysis/transformation'
|
6
|
+
require 'log_analysis/rule_generation'
|
5
7
|
|
6
8
|
class LogAnalysis
|
7
9
|
class Error < StandardError; end
|
8
10
|
# Your code goes here...
|
9
11
|
|
10
|
-
attr_reader :path, :
|
12
|
+
attr_reader :path, :type, :cleaned_data
|
11
13
|
|
12
14
|
def initialize(path, type = nil)
|
13
|
-
@path
|
14
|
-
@type
|
15
|
-
@
|
16
|
-
|
17
|
-
|
18
|
-
def cleaned_data
|
19
|
-
PreProcess.data_cleaning(@origin_logs)
|
15
|
+
@path = path
|
16
|
+
@type = type
|
17
|
+
@cleaned_data = PreProcess.input(path, type)
|
18
|
+
system('mkdir', '-p', LogAnalysis::DATA_PATH)
|
20
19
|
end
|
21
20
|
|
22
21
|
def identified_user
|
23
|
-
UserIdentification.execute(cleaned_data)
|
22
|
+
UserIdentification.execute(@cleaned_data)
|
24
23
|
end
|
25
24
|
|
26
25
|
def identified_session
|
27
|
-
SessionIdentification.execute(cleaned_data)
|
26
|
+
SessionIdentification.execute(@cleaned_data)
|
27
|
+
end
|
28
|
+
|
29
|
+
def transformation
|
30
|
+
Transformation.execute(identified_session)
|
31
|
+
end
|
32
|
+
|
33
|
+
def rule_generation
|
34
|
+
RuleGeneration.execute(transformation)
|
28
35
|
end
|
29
36
|
end
|
@@ -25,7 +25,7 @@ class Record
|
|
25
25
|
|
26
26
|
attr_reader :params
|
27
27
|
|
28
|
-
DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg].freeze
|
28
|
+
DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
|
29
29
|
REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|-|Crawler|spider|Detection/.freeze
|
30
30
|
|
31
31
|
def initialize(params)
|
@@ -70,7 +70,7 @@ class Record
|
|
70
70
|
p['apptime'] = @params['apptime'].to_f
|
71
71
|
p['cache'] = validate_string(@params['cache'])
|
72
72
|
p['vhost'] = validate_string(@params['vhost'])
|
73
|
-
p['user'] =
|
73
|
+
p['user'] = @params['user'] || nil
|
74
74
|
p['forwardedfor'] = validate_string(@params['forwardedfor'])
|
75
75
|
p['forwardedproto'] = validate_string(@params['forwardedproto'])
|
76
76
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'log_analysis/model/record'
|
2
|
+
require 'log_analysis/model/user_identity'
|
2
3
|
require 'json'
|
3
|
-
require 'pry'
|
4
4
|
|
5
5
|
module PreProcess
|
6
6
|
class Error < StandardError; end
|
@@ -13,63 +13,54 @@ module PreProcess
|
|
13
13
|
CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_records' }.freeze
|
14
14
|
|
15
15
|
def self.input(file_path, type)
|
16
|
-
|
17
|
-
preprocess_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
|
18
|
-
arr.push(preprocess_log)
|
19
|
-
end
|
20
|
-
|
21
|
-
send(CONVERT_RECORD[type.nil? ? 'nginx' : type], arr_logs)
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.data_cleaning(logs)
|
25
|
-
logs.select { |record| record.status_200? && record.method_get? && record.uri_without_data && !record.robot? }
|
26
|
-
end
|
27
|
-
|
28
|
-
def self.to_records(logs)
|
29
|
-
logs.each_with_object([]).with_index do |(log, arrays), i|
|
30
|
-
next if log.nil?
|
31
|
-
|
32
|
-
o = log.split(REGEX_KEYS)
|
33
|
-
o = o.map(&:strip)
|
34
|
-
o.delete('')
|
16
|
+
@users = []
|
35
17
|
|
36
|
-
|
37
|
-
|
38
|
-
|
18
|
+
File.readlines(file_path).each_with_object([]).with_index do |(line, arr), i|
|
19
|
+
preprocessed_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
|
20
|
+
record = Record.new(send(CONVERT_RECORD[type.nil? ? 'nginx' : type], preprocessed_log)) unless preprocessed_log.nil?
|
39
21
|
|
40
|
-
|
22
|
+
arr.push(record) if record.status_200? && record.method_get? && record.uri_without_data && !record.robot?
|
41
23
|
|
42
|
-
puts
|
24
|
+
puts arr.size
|
43
25
|
end
|
44
26
|
end
|
45
27
|
|
46
|
-
def self.
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
obj = {}.tap do |p|
|
54
|
-
p['host'] = o[0]
|
55
|
-
p['user'] = o[2]
|
56
|
-
p['time'] = o[3]
|
57
|
-
p['method'] = o[4]
|
58
|
-
p['uri'] = o[5]
|
59
|
-
p['status'] = o[6]
|
60
|
-
p['size'] = o[7]
|
61
|
-
p['referer'] = o[8]
|
62
|
-
p['ua'] = o[9]
|
63
|
-
p['forwarded'] = o[10]
|
64
|
-
end
|
65
|
-
|
66
|
-
arrays << Record.new(obj)
|
28
|
+
# Parses one key/value formatted log line into a Hash.
#
# The line is split on REGEX_KEYS, every piece is stripped of surrounding
# whitespace, empty pieces are dropped, and the remaining pieces are consumed
# two at a time as [key, value] pairs that are handed to to_json.
#
# @param log [String] raw log line; it is left unmodified
# @return [Hash] the merged to_json result of every pair (empty for an empty line)
def self.to_record(log)
  # The previous `log.gsub!('\t', ' ')` was single-quoted, so it targeted the
  # two characters backslash + "t" rather than TAB, mutated the caller's string,
  # and its result was overwritten on the next line. Tabs around a field are
  # already removed by `strip` below, so no replacement is needed.
  fields = log.split(REGEX_KEYS).map(&:strip)
  fields.delete('')
  fields.each_slice(2).each_with_object({}) { |pair, record| record.merge!(to_json(pair)) }
end
|
67
35
|
|
68
|
-
|
36
|
+
# Converts one nginx access-log line into the params Hash used to build a record.
#
# The line is split on REGEX_NGINX and empty pieces are removed; the remaining
# pieces are mapped to named keys by position.
#
# @param log [String] a single log line
# @return [Hash] string-keyed params: host, user, time, method, uri, status,
#   size, referer, ua, forwarded
def self.convert_nginx_logs(log)
  fields = log.split(REGEX_NGINX)
  fields.delete('')

  {
    'host' => fields[0],
    # 'user' used to be set to fields[2] first and then unconditionally replaced
    # by save_user; only the effective value is kept. The key stays in second
    # position so the key order of the returned Hash is unchanged.
    'user' => save_user(fields),
    'time' => fields[3],
    'method' => fields[4],
    'uri' => fields[5],
    'status' => fields[6],
    'size' => fields[7],
    'referer' => fields[8],
    'ua' => fields[9],
    'forwarded' => fields[10]
  }
end
|
71
54
|
|
72
55
|
# Builds a one-entry Hash from a [key, value] pair.
# Every ':' character is removed from the key; the value is used as-is.
# NOTE(review): for a one-element pair `first` and `last` are the same element,
# so the (colon-stripped) element ends up mapped to itself.
def self.to_json(pair)
|
73
56
|
{ pair.first.delete(':') => pair.last }
|
74
57
|
end
|
58
|
+
|
59
|
+
# Returns the identity stored in @users for this log entry, creating and
# storing a new one when no stored identity matches.
#
# A stored identity matches when its host equals log[0] and the string form of
# its user agent equals log[9].
#
# @param log [Array] split log fields; index 0 is read as the host and index 9
#   as the user-agent string
# @return [UserIdentity] the matching or newly stored identity
def self.save_user(log)
  host = log[0]
  agent = log[9]

  known = @users.find { |identity| identity.host == host && identity.user_agent.to_s == agent }
  return known unless known.nil?

  created = UserIdentity.new(host: IPAddr.new(host), user_agent: UserAgent.parse(agent))
  @users.push(created)
  created
end
|
75
66
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'time'
|
2
|
+
require 'log_analysis/version'
|
3
|
+
|
4
|
+
module RuleGeneration
|
5
|
+
# Location of the SPMF jar. File.expand_path resolves a relative name against
# the process's current working directory.
# NOTE(review): this presumably needs the process to be started from the
# directory that contains spmf.jar — confirm.
JAR_FILE_PATH = File.expand_path('spmf.jar')
|
6
|
+
# The three data-file paths below embed the date taken from Time.now at the
# moment the constant is assigned, and append the file name directly to
# LogAnalysis::DATA_PATH with no separator in between.
# NOTE(review): assumes DATA_PATH ends with a path separator — confirm.
TRANSFORM_DATA_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}transform_data_#{Time.now.strftime('%Y%m%d')}.txt")
|
7
|
+
RULE_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}output_#{Time.now.strftime('%Y%m%d')}.txt")
|
8
|
+
MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
|
9
|
+
|
10
|
+
class Error < StandardError; end
|
11
|
+
# Your code goes here...
|
12
|
+
|
13
|
+
# Mines sequential rules from transformed session data with SPMF's SPADE.
#
# Steps:
#   1. Writes one line per value of transform_data to TRANSFORM_DATA_PATH:
#      elements are separated by " -1 " (Array elements are space-joined first)
#      and each line is terminated with " -1 -2".
#   2. Runs `java -jar JAR_FILE_PATH run SPADE <input> <output> 65%`.
#   3. Parses RULE_FILE_PATH with get_seq and derives rules with rule_gen
#      using a minimum confidence of 0.5.
#   4. Replaces every itemset string with map_uri[itemset.to_i], where map_uri
#      is the space-separated content of MAP_URI_FILE_PATH.
#
# @param transform_data [Hash] values are Arrays whose elements are either
#   single entries or Arrays of entries
# @return [Array<Array>] [mapped pattern, mapped sub-pattern, confidence] triples
# @raise [Error] when the java command cannot be started or exits unsuccessfully
def self.execute(transform_data)
  File.open(TRANSFORM_DATA_PATH, 'w+') do |file|
    transform_data.each_value do |itemsets|
      encoded = itemsets.map { |itemset| itemset.is_a?(Array) ? itemset.join(' ') : itemset }
      file.puts("#{encoded.join(' -1 ')} -1 -2")
    end
  end

  # Argument-list form: no shell is involved, so paths containing spaces or
  # shell metacharacters are passed to java unchanged.
  mined = system('java', '-jar', JAR_FILE_PATH, 'run', 'SPADE', TRANSFORM_DATA_PATH, RULE_FILE_PATH, '65%')
  # Without this check a failed run would go on to read a missing or stale
  # output file.
  raise Error, "SPMF SPADE run failed (java -jar #{JAR_FILE_PATH})" unless mined

  rules = rule_gen(get_seq(File.read(RULE_FILE_PATH)), 0.5)
  map_uri = File.read(MAP_URI_FILE_PATH).split(' ')

  rules.map do |pattern, sub_pattern, confidence|
    [pattern.map { |i| map_uri[i.to_i] }, sub_pattern.map { |i| map_uri[i.to_i] }, confidence]
  end
end
|
24
|
+
|
25
|
+
# Derives rules from mined patterns.
#
# For every ordered pair (pattern, sub_pattern) where sub_pattern differs from
# pattern and sub_seq?(sub_pattern, pattern) holds, the confidence is
# support(pattern) / support(sub_pattern); the pair is kept when the confidence
# reaches min_conf.
#
# @param seqs [Array<Array>] [pattern, support] pairs as produced by get_seq
# @param min_conf [Numeric] minimum confidence for a rule to be kept
# @return [Array<Array>] [pattern, sub_pattern, confidence] triples, in the
#   iteration order of seqs (outer: pattern, inner: sub_pattern)
def self.rule_gen(seqs, min_conf)
  seqs.each_with_object([]) do |(pattern, support), rules|
    seqs.each do |sub_pattern, sub_support|
      next if sub_pattern == pattern
      # A zero support would give Infinity/NaN (or ZeroDivisionError for
      # Integers) instead of a meaningful confidence.
      next if sub_support.zero?
      next unless sub_seq?(sub_pattern, pattern)

      # to_f keeps the ratio fractional even when supports are Integers.
      confidence = support.to_f / sub_support
      rules.push([pattern, sub_pattern, confidence]) if confidence >= min_conf
    end
  end
end
|
28
|
+
|
29
|
+
# Tells whether `first` is a subsequence of `second`.
#
# The itemsets of `first` must be matched, in order, by distinct itemsets of
# `second` (gaps allowed); an itemset of `second` matches when sub? says it
# contains the itemset of `first`.
#
# @param first [Array<String>] candidate subsequence (itemset strings)
# @param second [Array<String>] sequence to search in (itemset strings)
# @return [Boolean] true when every itemset of `first` was matched; an empty
#   `first` is a subsequence of anything
def self.sub_seq?(first, second)
  ptr = 0 # index in `second` where the search for the next itemset starts
  first.each do |sub|
    # The range is empty once `second` is exhausted, which yields nil below.
    hit = (ptr...second.size).find { |n| sub?(second[n], sub) }
    # The previous implementation compared `ptr` (not the scan index) with the
    # last index, so an unmatched itemset was silently skipped.
    return false if hit.nil?

    ptr = hit + 1
  end
  true
end
|
44
|
+
|
45
|
+
# Tells whether itemset `str` contains every item of itemset `sub`.
#
# Both arguments are whitespace-separated item lists. Items are compared as
# whole tokens, so item "1" is not considered part of itemset "12 3".
#
# @param str [String] itemset to search in
# @param sub [String] itemset whose items must all be present
# @return [Boolean] true when all items of `sub` occur in `str`; true for an
#   empty `sub`
def self.sub?(str, sub)
  items = str.split(' ')
  sub.split(' ').all? { |item| items.include?(item) }
end
|
51
|
+
|
52
|
+
# Parses the miner's output text into [itemsets, support] pairs.
#
# Each non-blank line is split on "-1"; every piece but the last is an itemset
# string (kept verbatim, including surrounding spaces) and the last piece
# carries the support count.
#
# @param seq_str [String] whole content of the output file
# @return [Array<Array>] one [Array<String>, Float] pair per non-blank line;
#   the support is 0.0 when the last piece holds no number
def self.get_seq(seq_str)
  seq_str.split("\n").each_with_object([]) do |line, arr|
    next if line.strip.empty? # blank lines carry no pattern

    *itemsets, tail = line.split('-1')
    tail = tail.to_s
    # Read the whole number, not just its last character ("#SUP: 12" is 12).
    # Falls back to the first number in the piece when no "#SUP:" label exists.
    support = tail[/#SUP:\s*(\d+(?:\.\d+)?)/, 1] || tail[/\d+(?:\.\d+)?/]
    arr.push([itemsets, support.to_f])
  end
end
|
59
|
+
end
|
@@ -8,18 +8,10 @@ module SessionIdentification
|
|
8
8
|
# Your code goes here...
|
9
9
|
|
10
10
|
def self.execute(cleaned_data)
|
11
|
-
|
12
|
-
|
13
|
-
isession
|
14
|
-
|
15
|
-
if isession.present? && validate_time_session(session_identity[isession].records.last.time, record.time)
|
16
|
-
session_identity[isession].records << record
|
17
|
-
else
|
18
|
-
session_identity << SessionIdentity.new(session_identity_params(record))
|
19
|
-
end
|
11
|
+
cleaned_data.each_with_object([]) do |record, arr|
|
12
|
+
isession = arr.rindex { |s| s.user == record.user }
|
13
|
+
isession.present? && validate_time_session(arr[isession].records.last.time, record.time) ? arr[isession].records << record : arr << SessionIdentity.new(session_identity_params(record))
|
20
14
|
end
|
21
|
-
|
22
|
-
session_identity.map { |i| i.records.map(&:uri) }
|
23
15
|
end
|
24
16
|
|
25
17
|
private
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'log_analysis/model/session_identity'
|
2
|
+
require 'log_analysis/model/user_identity'
|
3
|
+
require 'log_analysis/version'
|
4
|
+
|
5
|
+
module Transformation
|
6
|
+
# File that Transformation.execute writes the URI list to. The date part comes
# from Time.now at the moment this constant is assigned, and the file name is
# appended directly to LogAnalysis::DATA_PATH with no separator.
# RuleGeneration builds its own MAP_URI_FILE_PATH from the same expression.
MAP_URI_FILE_PATH = File.expand_path("#{LogAnalysis::DATA_PATH}map_uri_#{Time.now.strftime('%Y%m%d')}.txt")
|
7
|
+
|
8
|
+
class Error < StandardError; end
|
9
|
+
# Your code goes here...
|
10
|
+
|
11
|
+
# Encodes identified sessions as per-host sequences of URI indexes and writes
# the index-to-URI table to MAP_URI_FILE_PATH.
#
# Every distinct URI gets the index of its first appearance. For each session,
# keyed by `session.user.host.to_s`:
#   * first session of a host: the session's index list becomes the host's value;
#   * later session with exactly one record: its index is appended flat;
#   * later session otherwise: its index list is pushed as one nested Array.
#
# @param identified_session [Enumerable] sessions responding to `user` (with
#   `host`) and `records` (each with `uri`)
# @return [Hash{String => Array}] host => sequence of indexes / nested index lists
def self.execute(identified_session)
  # uri => index; a Hash keeps insertion order and replaces the repeated
  # Array#include? / Array#index scans with constant-time lookups.
  uri_index = {}

  transform = identified_session.each_with_object({}) do |session, by_host|
    uris = session.records.map(&:uri)
    uris.each { |uri| uri_index[uri] = uri_index.size unless uri_index.key?(uri) }
    ids = uris.map { |uri| uri_index[uri] }
    host = session.user.host.to_s

    unless by_host.key?(host)
      by_host[host] = ids
      next
    end

    if uris.size == 1
      by_host[host] += ids
    else
      by_host[host].push(ids)
    end
  end

  File.open(MAP_URI_FILE_PATH, 'w+') { |f| f.write(uri_index.keys.join(' ')) }
  transform
end
|
26
|
+
end
|
@@ -6,10 +6,6 @@ module UserIdentification
|
|
6
6
|
# Your code goes here...
|
7
7
|
|
8
8
|
def self.execute(cleaned_data)
|
9
|
-
|
10
|
-
data.each_with_object([]) do |record, arrs|
|
11
|
-
o = UserIdentity.new(host: record.first, user_agent: record.last)
|
12
|
-
arrs << o
|
13
|
-
end
|
9
|
+
cleaned_data.map(&:user).uniq
|
14
10
|
end
|
15
11
|
end
|
data/lib/log_analysis/version.rb
CHANGED
data/spmf.jar
ADDED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: log_analysis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Tran
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: useragent
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- LICENSE.txt
|
54
54
|
- README.md
|
55
55
|
- Rakefile
|
56
|
+
- access.log
|
56
57
|
- bin/console
|
57
58
|
- bin/setup
|
58
59
|
- lib/log_analysis.rb
|
@@ -60,10 +61,13 @@ files:
|
|
60
61
|
- lib/log_analysis/model/session_identity.rb
|
61
62
|
- lib/log_analysis/model/user_identity.rb
|
62
63
|
- lib/log_analysis/preprocess.rb
|
64
|
+
- lib/log_analysis/rule_generation.rb
|
63
65
|
- lib/log_analysis/session_identification.rb
|
66
|
+
- lib/log_analysis/transformation.rb
|
64
67
|
- lib/log_analysis/user_identification.rb
|
65
68
|
- lib/log_analysis/version.rb
|
66
69
|
- log_analysis.gemspec
|
70
|
+
- spmf.jar
|
67
71
|
homepage: https://github.com/michaelt0520/log_analysis_thesis
|
68
72
|
licenses:
|
69
73
|
- MIT
|