log_analysis 0.1.0 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,29 +1,48 @@
  require 'log_analysis/version'
- require 'log_analysis/preprocess'
+ require 'log_analysis/loading_data'
  require 'log_analysis/user_identification'
  require 'log_analysis/session_identification'
+ require 'log_analysis/transformation'
+ require 'log_analysis/data_mining'
+ require 'log_analysis/intepretation'
+ require 'time'

  class LogAnalysis
  class Error < StandardError; end
  # Your code goes here...

- attr_reader :path, :origin_logs
+ attr_accessor :path, :type, :match_uri, :conf, :sup, :origin_data

  def initialize(path, type = nil)
- @path = path
- @type = type
- @origin_logs = PreProcess.input(path, type)
+ @path = path
+ @type = type
+ @origin_data = LoadingData.input(path, type)
  end

- def cleaned_data
- PreProcess.data_cleaning(@origin_logs)
+ def selecting_data
+ return @origin_data if @match_uri.nil?
+
+ @origin_data.select { |record| record.uri.match?(@match_uri) }
+ end
+
+ def preprocessing_data
+ filter = selecting_data.select { |record| record.status_200? && record.method_get? && record.uri_without_data && !record.robot? }
+ user = UserIdentification.execute(filter)
+ session = SessionIdentification.execute(user)
+ session
+ end
+
+ def transformation
+ Transformation.execute(preprocessing_data)
  end

- def identified_user
- UserIdentification.execute(cleaned_data)
+ def data_mining
+ @conf ||= 0.5
+ @sup ||= 60
+ DataMining.execute(transformation, @conf, @sup)
  end

- def identified_session
- SessionIdentification.execute(cleaned_data)
+ def intepretation
+ Intepretation.execute(data_mining)
  end
  end
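
Note: the rewritten LogAnalysis class above chains the whole pipeline, each method calling the one before it. A minimal usage sketch, assuming an nginx access log; the file name, the URI filter and the threshold values below are only illustrative:

    require 'log_analysis'

    analysis = LogAnalysis.new('access.log')   # type = nil falls back to the nginx parser
    analysis.match_uri = %r{^/courses}          # optional filter applied by selecting_data
    analysis.conf = 0.5                         # minimum confidence (defaults to 0.5)
    analysis.sup  = 60                          # minimum support in percent (defaults to 60)

    # Each step calls the previous one:
    # selecting_data -> preprocessing_data -> transformation -> data_mining -> intepretation
    rules = analysis.intepretation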
@@ -0,0 +1,48 @@
+ require 'time'
+ require 'log_analysis/version'
+
+ module DataMining
+ class Error < StandardError; end
+ # Your code goes here...
+
+ def self.execute(transform_data, min_conf, min_sup)
+ File.open(LogAnalysis::TRANSFORM_DATA_PATH, 'w+') { |f| transform_data.keys.each { |e| f.puts(transform_data[e].map { |i| i.is_a?(Array) ? i.join(' ') : i }.join(' -1 ').concat(' -1 -2')) } }
+ system("java -jar #{LogAnalysis::JAR_FILE_PATH} run SPADE #{LogAnalysis::TRANSFORM_DATA_PATH} #{LogAnalysis::RULE_FILE_PATH} #{min_sup}%")
+ rule_gen(get_seq(File.read(LogAnalysis::RULE_FILE_PATH)), min_conf)
+ end
+
+ def self.rule_gen(seqs, min_conf)
+ seqs.each_with_object([]) { |seq, arr| seqs.each { |sub| arr.push([seq[0], sub[0], seq[1] / sub[1]]) if sub[0] != seq[0] && sub_seq?(sub[0], seq[0]) && seq[1] / sub[1] >= min_conf } }
+ end
+
+ def self.sub_seq?(first, second)
+ ptr = 0
+ first.each do |sub|
+ return false if ptr >= second.size
+
+ (ptr..second.size - 1).each do |n|
+ if sub?(second[n], sub)
+ ptr = n + 1
+ break
+ end
+ return false if ptr == second.size - 1
+ end
+ end
+ true
+ end
+
+ def self.sub?(str, sub)
+ mark_sub = 0
+ sub.split(' ').each { |char| mark_sub += 1 if str.include?(char) }
+
+ mark_sub == sub.split(' ').size
+ end
+
+ def self.get_seq(seq_str)
+ seq = seq_str.split("\n")
+ seq.each_with_object([]) do |s, arr|
+ split_seq = s.split('-1')
+ arr.push([split_seq[0..-2], split_seq[-1][-1].to_f])
+ end
+ end
+ end
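
Above, execute serializes each client's sessions into the SPMF sequence format (itemsets joined with ' -1 ', each sequence terminated with ' -1 -2'), shells out to the bundled spmf.jar to run the SPADE algorithm, then turns the frequent sequences it reads back into rules. A self-contained sketch of the rule-generation step only; the sequences and support values are made up:

    # Each entry is [itemsets, support], the shape produced by get_seq.
    seqs = [
      [['1', '3'], 4.0],   # <1>,<3> with support 4
      [['1'], 8.0]         # <1> with support 8
    ]

    # Keeps [sequence, sub-sequence, confidence] triples whose
    # confidence = sup(sequence) / sup(sub-sequence) clears min_conf.
    DataMining.rule_gen(seqs, 0.4)
    # => [[['1', '3'], ['1'], 0.5]]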
@@ -0,0 +1,22 @@
+ require 'log_analysis/version'
+
+ module Intepretation
+ class Error < StandardError; end
+ # Your code goes here...
+
+ def self.execute(data_mining)
+ map_uri = File.read(LogAnalysis::MAP_URI_FILE_PATH).split(' ')
+ move_data
+
+ data_mining.map do |data|
+ seq, sub, rea = data
+ [seq.map { |i| map_uri[i.to_i] }, sub.map { |i| map_uri[i.to_i] }, rea]
+ end
+ end
+
+ def self.move_data
+ return unless File.directory?(LogAnalysis::DATA_PATH)
+
+ system('mv', "*_#{Time.now.strftime('%Y%m%d')}.txt", LogAnalysis::DATA_PATH)
+ end
+ end
@@ -0,0 +1,70 @@
+ require 'log_analysis/model/record'
+ require 'log_analysis/model/user_identity'
+ require 'json'
+
+ module LoadingData
+ class Error < StandardError; end
+ # Your code goes here...
+
+ REGEX_KEYS = /(time:| host:| status:| size:| request_length:| req:| method:| uri:| referer:| ua:| reqtime:| runtime:| apptime:| cache:| vhost:| server:| user:| forwardedfor:| forwardedproto:)/.freeze
+ REGEX_NGINX = /\A^(?<host>\S*) (?<identity>\S*) (?<user>\S*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?:\s+\S*)?)?" (?<code>\S*) (?<size>\S*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)"(?:\s+(?<http_x_forwarded_for>\S+))?)?$/.freeze
+ REGEX_APACHE = %r{(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (.{0})- \[([^\]]+?)\] "(GET|POST|PUT|DELETE) ([^\s]+?) (HTTP\/1\.1)" (\d+) (\d+) "-" "(.*)"}.freeze
+
+ CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_records' }.freeze
+
+ def self.input(file_path, type)
+ @users = []
+ text_file = File.readlines(file_path)
+
+ text_file.each_with_object([]).with_index do |(line, arr), i|
+ preprocessed_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
+ record_params = send(CONVERT_RECORD[type.nil? ? 'nginx' : type], preprocessed_log)
+ record = Record.new(record_params) if record_params && preprocessed_log
+
+ system('clear')
+ puts "#{((i.to_f / text_file.size) * 100).round}/100"
+ arr.push(record) if record
+ end
+ end
+
+ def self.to_record(log)
+ o = log.gsub!('\t', ' ')
+ o = log.split(REGEX_KEYS)
+ o = o.map(&:strip)
+ o.delete('')
+ o.each_slice(2).to_a.each_with_object({}) { |pair, log_obj| log_obj.merge!(to_json(pair)) }
+ end
+
+ def self.convert_nginx_logs(log)
+ o = log.split(REGEX_NGINX)
+
+ return false if o.size <= 1
+ o.delete('')
+
+ {}.tap do |p|
+ p['host'] = o[0]
+ p['user'] = o[2]
+ p['time'] = o[3]
+ p['method'] = o[4]
+ p['uri'] = o[5]
+ p['status'] = o[6]
+ p['size'] = o[7]
+ p['referer'] = o[8]
+ p['ua'] = o[9]
+ p['forwarded'] = o[10]
+ p['user'] = save_user(o)
+ end
+ end
+
+ def self.to_json(pair)
+ { pair.first.delete(':') => pair.last }
+ end
+
+ def self.save_user(log)
+ user = @users.find { |i| i.host == log[0] && i.user_agent.to_s == log[9] }
+ return user unless user.nil?
+
+ @users.push(UserIdentity.new(host: IPAddr.new(log[0]), user_agent: UserAgent.parse(log[9])))
+ @users.last
+ end
+ end
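
For reference, REGEX_NGINX targets the combined log format; a made-up line of the kind LoadingData.input expects to find in the file at path when no type is given:

    line = '203.0.113.7 - - [18/Jul/2020:09:15:02 +0700] ' \
           '"GET /courses/ruby HTTP/1.1" 200 5124 "https://example.com/" "Mozilla/5.0"'
    # input splits each such line on REGEX_NGINX and maps the pieces
    # positionally into host, user, time, method, uri, status, size,
    # referer, ua and forwarded before wrapping them in a Record.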
@@ -25,7 +25,7 @@ class Record
 
  attr_reader :params
 
- DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg].freeze
+ DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
  REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|-|Crawler|spider|Detection/.freeze
 
  def initialize(params)
@@ -70,7 +70,7 @@ class Record
  p['apptime'] = @params['apptime'].to_f
  p['cache'] = validate_string(@params['cache'])
  p['vhost'] = validate_string(@params['vhost'])
- p['user'] = UserIdentity.new(host: IPAddr.new(@params['host']), user_agent: UserAgent.parse(@params['ua']))
+ p['user'] = @params['user'] || nil
  p['forwardedfor'] = validate_string(@params['forwardedfor'])
  p['forwardedproto'] = validate_string(@params['forwardedproto'])
  end
@@ -1,13 +1,15 @@
  require 'active_support/core_ext/module/delegation'
+ require 'log_analysis/model/record'
  require 'useragent'

  class UserIdentity
- attr_accessor :host, :user_agent
+ attr_accessor :host, :user_agent, :records

  delegate :browser, :version, :os, :platform, :mobile?, :application, :localization, to: :user_agent

  def initialize(params)
  @host = params[:host]
  @user_agent = params[:user_agent]
+ @records = params[:records]
  end
  end
@@ -7,19 +7,13 @@ module SessionIdentification
  class Error < StandardError; end
  # Your code goes here...

- def self.execute(cleaned_data)
- session_identity = []
- cleaned_data.each do |record|
- isession = session_identity.rindex { |s| s.user.host == record.user.host && s.user.user_agent == record.user.user_agent }
-
- if isession.present? && validate_time_session(session_identity[isession].records.last.time, record.time)
- session_identity[isession].records << record
- else
- session_identity << SessionIdentity.new(session_identity_params(record))
+ def self.execute(identified_user)
+ identified_user.each_with_object([]) do |user, arr|
+ user.records.each do |record|
+ isession = arr.rindex { |s| s.user == user }
+ isession.present? && validate_time_session(arr[isession].records.last.time, record.time) ? arr[isession].records << record : arr << SessionIdentity.new(session_identity_params(record))
  end
  end
-
- session_identity.map { |i| i.records.map(&:uri) }
  end

  private
@@ -0,0 +1,24 @@
+ require 'log_analysis/model/session_identity'
+ require 'log_analysis/model/user_identity'
+ require 'log_analysis/version'
+
+ module Transformation
+ class Error < StandardError; end
+ # Your code goes here...
+
+ def self.execute(identified_session)
+ map_uri = []
+ transform = identified_session.each_with_object({}) do |v, hash|
+ uries = v.records.map(&:uri)
+ uries.each { |i| map_uri.push(i) unless map_uri.include?(i) }
+ if hash.key?(v.user.host.to_s)
+ uries.size == 1 ? hash[v.user.host.to_s] += v.records.map { |i| map_uri.index(i.uri) } : hash[v.user.host.to_s].push(v.records.map { |i| map_uri.index(i.uri) })
+ else
+ hash.merge!(v.user.host.to_s => v.records.map { |i| map_uri.index(i.uri) })
+ end
+ end
+
+ File.open(LogAnalysis::MAP_URI_FILE_PATH, 'w+') { |f| f.write(map_uri.join(' ')) }
+ transform
+ end
+ end
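
The hash returned above is keyed by client host, with every URI replaced by its index in map_uri; further sessions for the same host are appended, nested when a session has more than one record. An illustrative shape only, with made-up hosts and indices:

    # map_uri written to MAP_URI_FILE_PATH: ['/home', '/courses', '/courses/ruby']
    {
      '203.0.113.7'  => [0, 1, 2],     # single session of three requests
      '198.51.100.4' => [0, [1, 2]]    # one-request session, then a two-request session
    }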
@@ -6,10 +6,14 @@ module UserIdentification
  # Your code goes here...

  def self.execute(cleaned_data)
- data = cleaned_data.map { |record| [record.host, record.ua.to_s] }.uniq
- data.each_with_object([]) do |record, arrs|
- o = UserIdentity.new(host: record.first, user_agent: record.last)
- arrs << o
+ cleaned_data.each_with_object([]) do |record, arr|
+ user = arr.detect { |i| i == record.user }
+ if user
+ user.records.push(record)
+ else
+ record.user.records = [record]
+ arr << record.user
+ end
  end
  end
  end
@@ -1,3 +1,8 @@
  class LogAnalysis
- VERSION = '0.1.0'.freeze
+ VERSION = '0.1.5'.freeze
+ TRANSFORM_DATA_PATH = "transform_data_#{Time.now.strftime('%Y%m%d')}.txt".freeze
+ RULE_FILE_PATH = "output_#{Time.now.strftime('%Y%m%d')}.txt".freeze
+ MAP_URI_FILE_PATH = "map_uri_#{Time.now.strftime('%Y%m%d')}.txt".freeze
+ JAR_FILE_PATH = File.join(File.dirname(__FILE__), './files/spmf.jar')
+ DATA_PATH = File.expand_path('data/log_analysis', '~')
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: log_analysis
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.1.5
  platform: ruby
  authors:
  - Michael Tran
- autorequire:
+ autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-06-27 00:00:00.000000000 Z
+ date: 2020-07-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: useragent
@@ -53,14 +53,19 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - access.log
  - bin/console
  - bin/setup
  - lib/log_analysis.rb
+ - lib/log_analysis/data_mining.rb
+ - lib/log_analysis/files/spmf.jar
+ - lib/log_analysis/intepretation.rb
+ - lib/log_analysis/loading_data.rb
  - lib/log_analysis/model/record.rb
  - lib/log_analysis/model/session_identity.rb
  - lib/log_analysis/model/user_identity.rb
- - lib/log_analysis/preprocess.rb
  - lib/log_analysis/session_identification.rb
+ - lib/log_analysis/transformation.rb
  - lib/log_analysis/user_identification.rb
  - lib/log_analysis/version.rb
  - log_analysis.gemspec
@@ -70,7 +75,7 @@ licenses:
  metadata:
  homepage_uri: https://github.com/michaelt0520/log_analysis_thesis
  source_code_uri: https://github.com/michaelt0520/log_analysis_thesis
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -85,8 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.1.3
- signing_key:
+ rubygems_version: 3.1.2
+ signing_key:
  specification_version: 4
  summary: Log Analysis for thesis Huflit
  test_files: []
@@ -1,75 +0,0 @@
- require 'log_analysis/model/record'
- require 'json'
- require 'pry'
-
- module PreProcess
- class Error < StandardError; end
- # Your code goes here...
-
- REGEX_KEYS = /(time:| host:| status:| size:| request_length:| req:| method:| uri:| referer:| ua:| reqtime:| runtime:| apptime:| cache:| vhost:| server:| user:| forwardedfor:| forwardedproto:)/.freeze
- REGEX_NGINX = /\A^(?<host>\S*) (?<identity>\S*) (?<user>\S*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?:\s+\S*)?)?" (?<code>\S*) (?<size>\S*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)"(?:\s+(?<http_x_forwarded_for>\S+))?)?$/.freeze
- REGEX_APACHE = %r{(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (.{0})- \[([^\]]+?)\] "(GET|POST|PUT|DELETE) ([^\s]+?) (HTTP\/1\.1)" (\d+) (\d+) "-" "(.*)"}.freeze
-
- CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_records' }.freeze
-
- def self.input(file_path, type)
- arr_logs = File.readlines(file_path).each_with_object([]) do |line, arr|
- preprocess_log = type.nil? ? line.gsub(/[\t]/i, ' ').chomp! : line
- arr.push(preprocess_log)
- end
-
- send(CONVERT_RECORD[type.nil? ? 'nginx' : type], arr_logs)
- end
-
- def self.data_cleaning(logs)
- logs.select { |record| record.status_200? && record.method_get? && record.uri_without_data && !record.robot? }
- end
-
- def self.to_records(logs)
- logs.each_with_object([]).with_index do |(log, arrays), i|
- next if log.nil?
-
- o = log.split(REGEX_KEYS)
- o = o.map(&:strip)
- o.delete('')
-
- log = o.each_slice(2).to_a.each_with_object({}) do |pair, log_obj|
- log_obj.merge!(to_json(pair))
- end
-
- arrays << Record.new(log)
-
- puts "#{i}/#{logs.size}"
- end
- end
-
- def self.convert_nginx_logs(logs)
- logs.each_with_object([]).with_index do |(log, arrays), i|
- next if log.nil?
-
- o = log.split(REGEX_NGINX)
- o.delete('')
-
- obj = {}.tap do |p|
- p['host'] = o[0]
- p['user'] = o[2]
- p['time'] = o[3]
- p['method'] = o[4]
- p['uri'] = o[5]
- p['status'] = o[6]
- p['size'] = o[7]
- p['referer'] = o[8]
- p['ua'] = o[9]
- p['forwarded'] = o[10]
- end
-
- arrays << Record.new(obj)
-
- puts "#{i}/#{logs.size}"
- end
- end
-
- def self.to_json(pair)
- { pair.first.delete(':') => pair.last }
- end
- end
- end