log_analysis 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and opens an IRB session
# so the library can be explored interactively.
require 'bundler/setup'
require 'log_analysis'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console can be used instead, e.g. Pry
# (remember to add pry to the Gemfile first):
#
#   require "pry"
#   Pry.start

require 'irb'
IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap script: installs the gem's dependencies with Bundler.

# Abort on the first failing command, on use of an unset variable and on a
# failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only.
IFS=$'\n\t'
# Echo every line as it is read and every command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,52 @@
1
+ require 'log_analysis/version'
2
+ require 'log_analysis/preprocess'
3
+ require 'log_analysis/user_identification'
4
+ require 'log_analysis/session_identification'
5
+ require 'log_analysis/transformation'
6
+ require 'log_analysis/rule_generation'
7
+ require 'time'
8
+
9
# Entry point of the gem. Cleans a web-server access log on construction and
# exposes the successive mining steps: user identification, session
# identification, transformation and rule generation.
class LogAnalysis
  class Error < StandardError; end

  # Directory the generated data files are moved to by default.
  DATA_DIR = '/home/app/data/waazabag/'.freeze

  attr_reader :path, :type, :cleaned_data

  # @param path [String] path of the access log to analyse
  # @param type [String, nil] log format, forwarded as-is to PreProcess.input
  # @yield [record] optional filter; a record is kept only when the block
  #   returns a truthy value for it
  def initialize(path, type = nil)
    @path = path
    @type = type
    @cleaned_data = PreProcess.input(path, type) do |record|
      record if !block_given? || yield(record)
    end
  end

  # @return the result of UserIdentification.execute on the cleaned records
  def identified_user
    UserIdentification.execute(@cleaned_data)
  end

  # @return the result of SessionIdentification.execute on the cleaned records
  def identified_session
    SessionIdentification.execute(@cleaned_data)
  end

  # @return the result of Transformation.execute on the identified sessions
  def transformation
    Transformation.execute(identified_session)
  end

  # Runs rule generation and then moves the generated files away.
  #
  # @param conf [Numeric] minimum confidence passed to RuleGeneration.execute
  # @param sup [Numeric] minimum support passed to RuleGeneration.execute
  # @return the rules returned by RuleGeneration.execute
  def rule_generation(conf = 0.5, sup = 60)
    result = RuleGeneration.execute(transformation, conf, sup)
    move_data
    result
  end

  # Moves the files written during rule generation into +destination+.
  # Does nothing when +destination+ is not an existing directory.
  #
  # The file names are taken from the RuleGeneration constants instead of
  # being rebuilt from Time.now: those constants are fixed when the library is
  # loaded, so a name rebuilt here after a date change would not match the
  # files that were actually written.
  #
  # @param destination [String] target directory
  # @return [void]
  def move_data(destination = DATA_DIR)
    return unless File.directory?(destination)

    generated_files.each do |file|
      # Skip files that were never produced instead of letting `mv` fail.
      system('mv', file, destination) if File.exist?(file)
    end
    nil
  end

  private

  # Names of the files written by the transformation and rule generation steps.
  def generated_files
    [
      RuleGeneration::TRANSFORM_DATA_PATH,
      RuleGeneration::RULE_FILE_PATH,
      RuleGeneration::MAP_URI_FILE_PATH
    ]
  end
end
@@ -0,0 +1,84 @@
1
+ require 'ipaddr'
2
+ require 'useragent'
3
+ require 'log_analysis/model/user_identity'
4
+ require 'active_support/core_ext/numeric/time'
5
+
6
# One parsed access-log entry. Built from a Hash of raw string fields and
# exposes typed attributes plus the predicates used to filter records.
class Record
  attr_accessor :time,
                :host,
                :status,
                :size,
                :request_length,
                :req,
                :method,
                :uri,
                :referer,
                :ua,
                :reqtime,
                :runtime,
                :apptime,
                :cache,
                :vhost,
                :user,
                :forwardedfor,
                :forwardedproto

  # The raw Hash the record was built from.
  attr_reader :params

  # URI suffixes treated as static data rather than page views.
  DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
  # User agents treated as robots. The dash alternative is anchored so it only
  # matches a user agent that is exactly "-"; unanchored, it matched every
  # user agent containing a hyphen (e.g. "SM-G975F").
  REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|\A-\z|Crawler|spider|Detection/.freeze

  # @param params [Hash{String => Object}] raw fields keyed by name
  #   ('time', 'host', 'status', 'ua', ...)
  def initialize(params)
    @params = params
    # Parse once; the parsed hash is then copied into instance variables.
    record_params.each { |key, value| instance_variable_set("@#{key}", value) }
  end

  # @return [Boolean] true when the status is 200
  def status_200?
    status == 200
  end

  # @return [Boolean] true when the request method is GET
  def method_get?
    method == 'GET'
  end

  # @return [Boolean] false when the URI is missing or ends with one of the
  #   DATA_TYPE suffixes, true otherwise
  def uri_without_data
    return false if uri.nil? || uri.end_with?(*DATA_TYPE)

    true
  end

  # @return [Boolean] true when the parsed user agent reports a bot or its
  #   string form matches REGEX_BOT
  def robot?
    ua.bot? || ua.to_s.match?(REGEX_BOT)
  end

  private

  # Converts the raw fields into typed values.
  def record_params
    {}.tap do |p|
      p['time']           = parse_time(@params['time'])
      p['host']           = parse_host(@params['host'])
      p['status']         = @params['status'].to_i
      p['size']           = @params['size'].to_i
      p['request_length'] = @params['request_length'].to_i
      p['req']            = validate_string(@params['req'])
      p['method']         = validate_string(@params['method'])
      p['uri']            = validate_string(@params['uri'])
      p['referer']        = validate_string(@params['referer'])
      p['ua']             = UserAgent.parse(@params['ua'])
      p['reqtime']        = @params['reqtime'].to_f
      p['runtime']        = @params['runtime'].to_f
      p['apptime']        = @params['apptime'].to_f
      p['cache']          = validate_string(@params['cache'])
      p['vhost']          = validate_string(@params['vhost'])
      p['user']           = @params['user']
      p['forwardedfor']   = validate_string(@params['forwardedfor'])
      p['forwardedproto'] = validate_string(@params['forwardedproto'])
    end
  end

  # Parses a timestamp such as "14/Jul/2020:10:00:00 +0000". Only the first
  # whitespace-separated token is used (so a trailing offset is ignored) and
  # its first colon is replaced by a space before parsing.
  # Returns nil when the value is missing or blank.
  def parse_time(value)
    stamp = value.to_s.split.first
    return if stamp.nil?

    Time.parse(stamp.sub(/:/, ' '))
  end

  # Parses the host as an IP address. An invalid address is reported on
  # standard output and yields nil, so the rest of the record is still built.
  def parse_host(value)
    IPAddr.new(value)
  rescue IPAddr::InvalidAddressError => e
    puts e
    nil
  end

  # Maps the "-" placeholder to nil and returns any other value unchanged.
  def validate_string(value)
    value == '-' ? nil : value
  end
end
@@ -0,0 +1,8 @@
1
# Value holder for one session: the user it belongs to and the records
# collected for it.
class SessionIdentity
  attr_accessor :user, :records

  # @param params [Hash] read for the :user and :records keys
  def initialize(params)
    @user, @records = params.values_at(:user, :records)
  end
end
@@ -0,0 +1,13 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'useragent'
3
+
4
# Value holder for one identified user: a host plus a parsed user agent.
# The user-agent details are exposed directly on the identity by delegation.
class UserIdentity
  attr_accessor :host, :user_agent

  delegate :browser,
           :version,
           :os,
           :platform,
           :mobile?,
           :application,
           :localization,
           to: :user_agent

  # @param params [Hash] read for the :host and :user_agent keys
  def initialize(params)
    @host, @user_agent = params.values_at(:host, :user_agent)
  end
end
@@ -0,0 +1,72 @@
1
+ require 'log_analysis/model/record'
2
+ require 'log_analysis/model/user_identity'
3
+ require 'json'
4
+
5
# Reads an access log, converts every line into a Record and keeps only the
# records that are successful GET page views not issued by robots.
module PreProcess
  class Error < StandardError; end

  REGEX_KEYS = /(time:| host:| status:| size:| request_length:| req:| method:| uri:| referer:| ua:| reqtime:| runtime:| apptime:| cache:| vhost:| server:| user:| forwardedfor:| forwardedproto:)/.freeze
  REGEX_NGINX = /\A^(?<host>\S*) (?<identity>\S*) (?<user>\S*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?:\s+\S*)?)?" (?<code>\S*) (?<size>\S*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)"(?:\s+(?<http_x_forwarded_for>\S+))?)?$/.freeze
  REGEX_APACHE = %r{(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (.{0})- \[([^\]]+?)\] "(GET|POST|PUT|DELETE) ([^\s]+?) (HTTP\/1\.1)" (\d+) (\d+) "-" "(.*)"}.freeze

  # Log type => name of the module method converting one line into a params
  # Hash. Every value names a method defined below.
  CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_record' }.freeze

  # Reads +file_path+ and returns the cleaned records.
  #
  # @param file_path [String] path of the log file
  # @param type [String, nil] one of the CONVERT_RECORD keys; nil means 'nginx'
  # @yield [record] optional hook; its return value replaces the record, and a
  #   falsy return value drops it
  # @return [Array] records with status 200, method GET, a non-data URI and a
  #   non-robot user agent
  # @raise [PreProcess::Error] when +type+ is not a known log type
  def self.input(file_path, type)
    converter = CONVERT_RECORD[(type || 'nginx').to_s]
    raise Error, "unsupported log type: #{type.inspect}" if converter.nil?

    @users = {}
    lines = File.readlines(file_path)
    records = []
    last_percent = nil

    lines.each_with_index do |line, index|
      # Non-destructive tr/chomp: chomp! returns nil when the line has no
      # trailing newline, which is the case for the last line of many files.
      params = send(converter, line.tr("\t", ' ').chomp)
      record = Record.new(params) if params
      record = yield(record) if block_given? && record
      records.push(record) if keep?(record)

      percent = ((index.to_f / lines.size) * 100).round
      next if percent == last_percent

      # Redraw the progress only when the percentage changes instead of
      # spawning a `clear` process for every log line.
      last_percent = percent
      report_progress(percent)
    end

    records
  end

  # Parses a line made of "key:value" fields, using the keys of REGEX_KEYS.
  #
  # NOTE(review): the 'user' field stays the raw string from the log here,
  # whereas the nginx/apache converters store a UserIdentity; confirm what the
  # later pipeline steps expect for this log type.
  #
  # @param log [String] one log line; it is not modified
  # @return [Hash{String => String}, false] fields keyed by name without the
  #   colon, or false when the line contains no known key
  def self.to_record(log)
    parts = log.tr("\t", ' ').split(REGEX_KEYS)
    parts.shift # text before the first key (empty for a well-formed line)

    fields = parts.each_slice(2).each_with_object({}) do |(key, value), log_obj|
      log_obj.merge!(to_json([key.strip, value.to_s.strip]))
    end
    fields.empty? ? false : fields
  end

  # Parses one line matching REGEX_NGINX.
  #
  # Named captures are read directly, so an empty field (e.g. an empty
  # referer) cannot shift the position of the fields that follow it.
  #
  # @param log [String] one log line
  # @return [Hash, false] record params, or false when the line does not match
  #   or its host is not a valid IP address
  def self.convert_nginx_logs(log)
    fields = REGEX_NGINX.match(log)
    return false if fields.nil?

    {
      'host' => fields[:host],
      'time' => fields[:time],
      'method' => fields[:method],
      'uri' => fields[:path],
      'status' => fields[:code],
      'size' => fields[:size],
      'referer' => fields[:referer],
      'ua' => fields[:agent],
      'forwarded' => fields[:http_x_forwarded_for],
      # Record reads this key; 'forwarded' is kept for existing consumers.
      'forwardedfor' => fields[:http_x_forwarded_for],
      'user' => identify_user(fields[:host], fields[:agent])
    }
  rescue IPAddr::InvalidAddressError => e
    warn "PreProcess: skipping log line (#{e.message})"
    false
  end

  # Parses one line matching REGEX_APACHE. That pattern only accepts a "-"
  # referer, so the referer is reported as "-".
  #
  # @param log [String] one log line
  # @return [Hash, false] record params, or false when the line does not match
  #   or its host is not a valid IP address
  def self.convert_apache_logs(log)
    fields = REGEX_APACHE.match(log)
    return false if fields.nil?

    {
      'host' => fields[1],
      'time' => fields[3],
      'method' => fields[4],
      'uri' => fields[5],
      'status' => fields[7],
      'size' => fields[8],
      'referer' => '-',
      'ua' => fields[9],
      'user' => identify_user(fields[1], fields[9])
    }
  rescue IPAddr::InvalidAddressError => e
    warn "PreProcess: skipping log line (#{e.message})"
    false
  end

  # @param pair [Array(String, String)] a key (possibly ending with ":") and
  #   its value
  # @return [Hash{String => String}] single-entry Hash with the colon removed
  #   from the key
  def self.to_json(pair)
    { pair.first.delete(':') => pair.last }
  end

  # Returns the UserIdentity for a parsed log line, creating it on first use.
  #
  # @param log [Array] fields of a line; index 0 is the host and index 9 the
  #   user agent
  # @return [UserIdentity]
  def self.save_user(log)
    identify_user(log[0], log[9])
  end

  # Looks the user up by the raw host and user-agent strings, so repeated
  # lines from the same client share one UserIdentity.
  def self.identify_user(host, agent)
    @users ||= {}
    @users[[host, agent]] ||= UserIdentity.new(host: IPAddr.new(host), user_agent: UserAgent.parse(agent))
  end

  # True when the record passes every cleaning filter.
  def self.keep?(record)
    record && record.status_200? && record.method_get? && record.uri_without_data && !record.robot?
  end

  # Clears the terminal and prints the progress as "<percent>/100".
  def self.report_progress(percent)
    system('clear')
    puts "#{percent}/100"
  end

  private_class_method :identify_user, :keep?, :report_progress
end
@@ -0,0 +1,59 @@
1
+ require 'time'
2
+ require 'log_analysis/version'
3
+
4
# Mines frequent sequences with the bundled SPMF jar (SPADE algorithm) and
# derives rules between a sequence and its sub-sequences.
module RuleGeneration
  JAR_FILE_PATH = File.join(File.dirname(__FILE__), './files/spmf.jar')
  # File names are fixed when this file is loaded (date of load time).
  TRANSFORM_DATA_PATH = "transform_data_#{Time.now.strftime('%Y%m%d')}.txt".freeze
  RULE_FILE_PATH = "output_#{Time.now.strftime('%Y%m%d')}.txt".freeze
  MAP_URI_FILE_PATH = "map_uri_#{Time.now.strftime('%Y%m%d')}.txt".freeze

  class Error < StandardError; end

  # Writes the sequences, runs SPADE and returns the rules with item ids
  # replaced by the URIs listed in MAP_URI_FILE_PATH.
  #
  # @param transform_data [Hash] each value is an Array whose elements are an
  #   item id or an Array of item ids (one itemset)
  # @param min_conf [Numeric] minimum confidence a rule must reach
  # @param min_sup [Numeric] minimum support, passed to SPADE as a percentage
  # @return [Array<Array>] triples [sequence, sub_sequence, confidence]; each
  #   itemset is a URI, or an Array of URIs when it holds several items
  # @raise [RuleGeneration::Error] when the java command fails
  def self.execute(transform_data, min_conf, min_sup)
    write_sequences(transform_data)
    run_spade(min_sup)
    map_uri = File.read(MAP_URI_FILE_PATH).split(' ')

    rule_gen(get_seq(File.read(RULE_FILE_PATH)), min_conf).map do |seq, sub, confidence|
      [to_uris(seq, map_uri), to_uris(sub, map_uri), confidence]
    end
  end

  # Pairs every sequence with each of its proper sub-sequences and keeps the
  # pairs whose confidence, support(sequence) / support(sub_sequence), is at
  # least +min_conf+.
  #
  # @param seqs [Array<Array>] pairs [itemsets, support] as built by get_seq
  # @param min_conf [Numeric]
  # @return [Array<Array>] triples [itemsets, sub_itemsets, confidence]
  def self.rule_gen(seqs, min_conf)
    seqs.each_with_object([]) do |(seq, seq_support), rules|
      seqs.each do |sub, sub_support|
        next if sub == seq || !sub_seq?(sub, seq)

        confidence = seq_support / sub_support
        rules.push([seq, sub, confidence]) if confidence >= min_conf
      end
    end
  end

  # @param first [Array<String>] candidate sub-sequence (itemsets)
  # @param second [Array<String>] sequence to search in
  # @return [Boolean] true when every itemset of +first+ is contained, in
  #   order, in a distinct itemset of +second+
  def self.sub_seq?(first, second)
    position = 0
    first.each do |itemset|
      found = (position...second.size).find { |n| sub?(second[n], itemset) }
      # An itemset that is never found means +first+ is not a sub-sequence.
      return false if found.nil?

      position = found + 1
    end
    true
  end

  # @param str [String] itemset as space-separated item ids
  # @param sub [String] itemset as space-separated item ids
  # @return [Boolean] true when every item of +sub+ is an item of +str+;
  #   whole ids are compared, so "1" is not found in "12"
  def self.sub?(str, sub)
    (sub.split(' ') - str.split(' ')).empty?
  end

  # Parses result lines of the form "1 -1 2 3 -1 #SUP: 12": itemsets are
  # terminated by -1 and the support follows "#SUP:". Lines without a
  # "#SUP:" marker are skipped.
  #
  # @param seq_str [String] content of the result file
  # @return [Array<Array>] pairs [itemsets (Array<String>), support (Float)]
  def self.get_seq(seq_str)
    seq_str.split("\n").each_with_object([]) do |line, arr|
      support = line[/#SUP:\s*(\d+)/, 1]
      next if support.nil?

      itemsets = line.split('#SUP:').first.split(/(?<!\S)-1(?!\S)/).map(&:strip).reject(&:empty?)
      arr.push([itemsets, support.to_f])
    end
  end

  # Writes one sequence per line: itemsets separated by -1, line ended by -2.
  def self.write_sequences(transform_data)
    File.open(TRANSFORM_DATA_PATH, 'w+') do |f|
      transform_data.each_value do |sequence|
        itemsets = sequence.map { |i| i.is_a?(Array) ? i.join(' ') : i }
        f.puts(itemsets.join(' -1 ').concat(' -1 -2'))
      end
    end
  end

  # Runs SPADE. Arguments are passed as a list so that no shell is involved
  # and paths containing spaces are handled.
  def self.run_spade(min_sup)
    success = system('java', '-jar', JAR_FILE_PATH, 'run', 'SPADE', TRANSFORM_DATA_PATH, RULE_FILE_PATH, "#{min_sup}%")
    raise Error, 'SPADE execution failed (is java installed?)' unless success
  end

  # Replaces the item ids of each itemset by the URI stored at that index.
  def self.to_uris(itemsets, map_uri)
    itemsets.map do |itemset|
      uris = itemset.split(' ').map { |id| map_uri[id.to_i] }
      uris.size == 1 ? uris.first : uris
    end
  end

  private_class_method :write_sequences, :run_spade, :to_uris
end
@@ -0,0 +1,29 @@
1
+ require 'active_support/core_ext/numeric/time'
2
+ require 'log_analysis/model/session_identity'
3
+
4
# Groups cleaned records into sessions: consecutive records of the same user
# stay in one session while they are less than MINUTE_THRESHOLD minutes apart.
module SessionIdentification
  MINUTE_THRESHOLD = 10

  class Error < StandardError; end

  # @param cleaned_data [Enumerable] records responding to #user and #time
  # @return [Array<SessionIdentity>] sessions in order of creation
  def self.execute(cleaned_data)
    # Most recent session of each user, so no scan over all sessions is needed.
    latest_session = {}

    cleaned_data.each_with_object([]) do |record, sessions|
      session = latest_session[record.user]

      if session && validate_time_session(session.records.last.time, record.time)
        session.records << record
      else
        session = SessionIdentity.new(session_identity_params(record))
        latest_session[record.user] = session
        sessions << session
      end
    end
  end

  # NOTE: both helpers below remain publicly callable. The `private` keyword
  # that used to precede them had no effect on methods defined with `def self.`.

  # @param record [#user] first record of a new session
  # @return [Hash] constructor arguments for SessionIdentity
  def self.session_identity_params(record)
    { user: record.user, records: [record] }
  end

  # @param last_session_record_time [Time] time of the session's last record
  # @param time [Time] time of the record being considered
  # @return [Boolean] true when the gap between both is below MINUTE_THRESHOLD
  #   minutes
  def self.validate_time_session(last_session_record_time, time)
    (time - last_session_record_time) / 60 < MINUTE_THRESHOLD
  end
end
@@ -0,0 +1,26 @@
1
+ require 'log_analysis/model/session_identity'
2
+ require 'log_analysis/model/user_identity'
3
+ require 'log_analysis/version'
4
+
5
# Turns identified sessions into per-host sequences of URI indices and writes
# the index-to-URI mapping to MAP_URI_FILE_PATH.
module Transformation
  # File name is fixed when this file is loaded (date of load time).
  MAP_URI_FILE_PATH = "map_uri_#{Time.now.strftime('%Y%m%d')}.txt".freeze

  class Error < StandardError; end

  # @param identified_session [Enumerable] sessions responding to #user (with
  #   a #host) and #records (each with a #uri)
  # @return [Hash{String => Array}] host => sequence of URI indices. The first
  #   session of a host contributes its indices one by one; a later session
  #   contributes a single index when it has one record and a nested Array of
  #   indices otherwise.
  #   NOTE(review): first and later sessions are encoded differently; confirm
  #   this asymmetry is intended.
  def self.execute(identified_session)
    # URI => index, in order of first appearance. A Hash keeps the lookup
    # constant-time where an Array needed a scan per record.
    uri_index = {}

    transform = identified_session.each_with_object({}) do |session, hash|
      indices = session.records.map do |record|
        uri_index.fetch(record.uri) { uri_index[record.uri] = uri_index.size }
      end
      host = session.user.host.to_s

      if !hash.key?(host)
        hash[host] = indices
      elsif indices.size == 1
        hash[host] += indices
      else
        hash[host].push(indices)
      end
    end

    File.write(MAP_URI_FILE_PATH, uri_index.keys.join(' '))
    transform
  end
end
@@ -0,0 +1,11 @@
1
+ require 'ipaddr'
2
+ require 'log_analysis/model/user_identity'
3
+
4
# Extracts the distinct users found in the cleaned records.
module UserIdentification
  class Error < StandardError; end

  # @param cleaned_data [Array] records responding to #user
  # @return [Array] each distinct user once, in order of first appearance
  def self.execute(cleaned_data)
    cleaned_data.uniq(&:user).map(&:user)
  end
end
@@ -0,0 +1,3 @@
1
class LogAnalysis
  # Version of the gem, read by the gemspec. Frozen like the other constants
  # of the library so it cannot be modified in place.
  VERSION = '0.1.4'.freeze
end
@@ -0,0 +1,31 @@
1
require_relative 'lib/log_analysis/version'

# Gem specification for log_analysis.
Gem::Specification.new do |spec|
  spec.name    = 'log_analysis'
  spec.version = LogAnalysis::VERSION
  spec.authors = ['Michael Tran']
  spec.email   = ['mictran205@gmail.com']

  spec.summary     = 'Log Analysis for thesis Huflit'
  spec.description = 'Preprocess step of web mining'
  spec.homepage    = 'https://github.com/michaelt0520/log_analysis_thesis'
  spec.license     = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  spec.metadata['homepage_uri']    = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/michaelt0520/log_analysis_thesis'

  spec.add_dependency 'useragent', '~> 0.16.10'
  spec.add_dependency 'activesupport', '~> 6.0.3.1'

  # The released gem ships every file tracked by git (listed by
  # `git ls-files -z` from the gemspec's directory) except those under
  # test/, spec/ or features/.
  gem_root = File.expand_path('..', __FILE__)
  spec.files = Dir.chdir(gem_root) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |file| file.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |file| File.basename(file) }
  spec.require_paths = ['lib']
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log_analysis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.4
5
+ platform: ruby
6
+ authors:
7
+ - Michael Tran
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: useragent
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.16.10
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.16.10
27
+ - !ruby/object:Gem::Dependency
28
+ name: activesupport
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 6.0.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 6.0.3.1
41
+ description: Preprocess step of web mining
42
+ email:
43
+ - mictran205@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".ruby-version"
50
+ - CODE_OF_CONDUCT.md
51
+ - Gemfile
52
+ - Gemfile.lock
53
+ - LICENSE.txt
54
+ - README.md
55
+ - Rakefile
56
+ - access.log
57
+ - bin/console
58
+ - bin/setup
59
+ - lib/log_analysis.rb
60
+ - lib/log_analysis/files/spmf.jar
61
+ - lib/log_analysis/model/record.rb
62
+ - lib/log_analysis/model/session_identity.rb
63
+ - lib/log_analysis/model/user_identity.rb
64
+ - lib/log_analysis/preprocess.rb
65
+ - lib/log_analysis/rule_generation.rb
66
+ - lib/log_analysis/session_identification.rb
67
+ - lib/log_analysis/transformation.rb
68
+ - lib/log_analysis/user_identification.rb
69
+ - lib/log_analysis/version.rb
70
+ - log_analysis.gemspec
71
+ homepage: https://github.com/michaelt0520/log_analysis_thesis
72
+ licenses:
73
+ - MIT
74
+ metadata:
75
+ homepage_uri: https://github.com/michaelt0520/log_analysis_thesis
76
+ source_code_uri: https://github.com/michaelt0520/log_analysis_thesis
77
+ post_install_message:
78
+ rdoc_options: []
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 2.3.0
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.1.2
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Log Analysis for thesis Huflit
96
+ test_files: []