log_analysis 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler, then opens an IRB
# session so the library can be explored interactively.
require 'bundler/setup'
require 'log_analysis'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# A different console can be used instead of IRB. For Pry, add it to the
# Gemfile and swap the IRB lines below for:
# require 'pry'
# Pry.start

require 'irb'
IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development setup script: installs the gem's dependencies with Bundler.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only (not on spaces).
IFS=$'\n\t'
# Print every script line as it is read and every command as it is executed.
set -o verbose -o xtrace

bundle install

# Add any other automated setup steps here.
@@ -0,0 +1,52 @@
1
+ require 'log_analysis/version'
2
+ require 'log_analysis/preprocess'
3
+ require 'log_analysis/user_identification'
4
+ require 'log_analysis/session_identification'
5
+ require 'log_analysis/transformation'
6
+ require 'log_analysis/rule_generation'
7
+ require 'time'
8
+
9
# Facade of the gem: cleans an access log on construction and exposes the
# later mining steps (user identification, session identification,
# transformation and rule generation) as instance methods.
class LogAnalysis
  class Error < StandardError; end

  # Directory the generated files are moved into when it exists.
  DEFAULT_OUTPUT_DIR = '/home/app/data/waazabag/'.freeze

  attr_reader :path, :type, :cleaned_data

  # Reads and cleans the log immediately.
  #
  # @param path [String] path of the access log to analyse
  # @param type [String, nil] log format key passed through to PreProcess.input
  # @yield [record] optional filter; a record is kept only when the block
  #   returns a truthy value
  def initialize(path, type = nil)
    @path = path
    @type = type
    @cleaned_data = PreProcess.input(path, type) do |record|
      next record unless block_given?

      yield(record) ? record : nil
    end
  end

  # @return [Array] the distinct users found in the cleaned records
  def identified_user
    UserIdentification.execute(@cleaned_data)
  end

  # @return [Array] the cleaned records grouped into sessions
  def identified_session
    SessionIdentification.execute(@cleaned_data)
  end

  # @return [Hash] the sessions converted by Transformation.execute
  def transformation
    Transformation.execute(identified_session)
  end

  # Runs rule generation on the transformed sessions, then moves the
  # generated files out of the working directory (see #move_data).
  #
  # @param conf [Numeric] minimum confidence passed to RuleGeneration.execute
  # @param sup [Numeric] minimum support passed to RuleGeneration.execute
  # @return [Array] the rules returned by RuleGeneration.execute
  def rule_generation(conf = 0.5, sup = 60)
    RuleGeneration.execute(transformation, conf, sup).tap { move_data }
  end

  # Moves the files written by the transformation and rule-generation steps
  # into +destination+. Does nothing when that directory does not exist.
  #
  # The file names are taken from the constants of the modules that write
  # them, so this method always looks for exactly the names that were used,
  # even when the process runs past midnight.
  #
  # @param destination [String] target directory
  # @return [Boolean, nil] nil when the directory is missing, otherwise
  #   whether every `mv` succeeded
  def move_data(destination = DEFAULT_OUTPUT_DIR)
    return unless File.directory?(destination)

    generated_files = [
      RuleGeneration::TRANSFORM_DATA_PATH,
      RuleGeneration::RULE_FILE_PATH,
      Transformation::MAP_URI_FILE_PATH
    ]
    generated_files.map { |file| system('mv', file, destination) }.all?
  end
end
@@ -0,0 +1,84 @@
1
+ require 'ipaddr'
2
+ require 'useragent'
3
+ require 'log_analysis/model/user_identity'
4
+ require 'active_support/core_ext/numeric/time'
5
+
6
# One access-log entry. Built from a Hash of raw string fields (kept in
# #params); every field is converted once and stored in the matching
# attribute.
#
# NOTE(review): the +method+ accessor shadows Object#method on instances of
# this class; it is kept because it is part of the public interface.
class Record
  attr_accessor :time,
                :host,
                :status,
                :size,
                :request_length,
                :req,
                :method,
                :uri,
                :referer,
                :ua,
                :reqtime,
                :runtime,
                :apptime,
                :cache,
                :vhost,
                :user,
                :forwardedfor,
                :forwardedproto

  attr_reader :params

  # URI suffixes treated as static resources by #uri_without_data.
  DATA_TYPE = %w[.txt .json .js .css .jpg .jpeg .gif .woff2 .ico .png .bmp .mp3 .wav .avi .mpeg .vmw .mpg .map .pdf .doc .svg .otf].freeze
  # User-agent markers treated as robots. The lone "-" alternative is anchored
  # so it only matches an empty ("-") user agent, not every user agent that
  # happens to contain a hyphen (e.g. "SM-G973F", "en-US").
  REGEX_BOT = /facebookexternalhit|Mediapartners-Google|AWS|\A-\z|Crawler|spider|Detection/.freeze

  # @param params [Hash] raw log fields keyed by string names ('time',
  #   'host', 'status', ...)
  def initialize(params)
    @params = params
    # Convert the raw fields once and copy each result into its attribute.
    record_params.each { |key, value| instance_variable_set("@#{key}", value) }
  end

  # @return [Boolean] whether the response status is 200
  def status_200?
    status == 200
  end

  # @return [Boolean] whether the request method is GET
  def method_get?
    method == 'GET'
  end

  # @return [Boolean] false when the URI is missing or ends with one of the
  #   DATA_TYPE suffixes, true otherwise
  def uri_without_data
    !(uri.nil? || uri.end_with?(*DATA_TYPE))
  end

  # @return [Boolean] whether the user agent reports itself as a bot or its
  #   string form matches REGEX_BOT
  def robot?
    ua.bot? || ua.to_s.match?(REGEX_BOT)
  end

  private

  # Converts the raw fields into typed values.
  #
  # @return [Hash] attribute name => converted value
  def record_params
    {
      'time' => parse_time(@params['time']),
      'host' => parse_host(@params['host']),
      # nil.to_i is 0 and nil.to_f is 0.0, so missing numbers default to zero.
      'status' => @params['status'].to_i,
      'size' => @params['size'].to_i,
      'request_length' => @params['request_length'].to_i,
      'req' => validate_string(@params['req']),
      'method' => validate_string(@params['method']),
      'uri' => validate_string(@params['uri']),
      'referer' => validate_string(@params['referer']),
      'ua' => UserAgent.parse(@params['ua']),
      'reqtime' => @params['reqtime'].to_f,
      'runtime' => @params['runtime'].to_f,
      'apptime' => @params['apptime'].to_f,
      'cache' => validate_string(@params['cache']),
      'vhost' => validate_string(@params['vhost']),
      'user' => @params['user'],
      'forwardedfor' => validate_string(@params['forwardedfor']),
      'forwardedproto' => validate_string(@params['forwardedproto'])
    }
  end

  # Parses a timestamp such as "14/Jul/2020:10:00:00 +0700". Only the part
  # before the first whitespace is used, with its first ':' turned into a
  # space so Time.parse sees "14/Jul/2020 10:00:00".
  #
  # @return [Time, nil] nil when the field is missing or blank
  def parse_time(raw)
    stamp = raw.to_s.split.first
    return if stamp.nil?

    Time.parse(stamp.sub(/:/, ' '))
  end

  # @return [IPAddr, nil] nil (with a warning) when the host is not a valid
  #   address, so one bad line does not abort the whole record
  def parse_host(raw)
    IPAddr.new(raw)
  rescue IPAddr::Error => e
    warn "Record: ignoring invalid host #{raw.inspect} (#{e.message})"
    nil
  end

  # Maps the "-" placeholder to nil and returns any other value unchanged.
  def validate_string(value)
    value == '-' ? nil : value
  end
end
@@ -0,0 +1,8 @@
1
# A session: the user it belongs to and the records collected for it.
class SessionIdentity
  attr_accessor :user, :records

  # @param params [Hash] read through its :user and :records keys; a missing
  #   key leaves the attribute nil
  def initialize(params)
    @user, @records = params.values_at(:user, :records)
  end
end
@@ -0,0 +1,13 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'useragent'
3
+
4
# A visitor, identified by host and parsed user agent.
class UserIdentity
  attr_accessor :host, :user_agent

  # Forward the browser-related readers to the user_agent object
  # (ActiveSupport's Module#delegate).
  delegate :browser,
           :version,
           :os,
           :platform,
           :mobile?,
           :application,
           :localization,
           to: :user_agent

  # @param params [Hash] read through its :host and :user_agent keys
  def initialize(params)
    @host, @user_agent = params.values_at(:host, :user_agent)
  end
end
@@ -0,0 +1,72 @@
1
+ require 'log_analysis/model/record'
2
+ require 'log_analysis/model/user_identity'
3
+ require 'json'
4
+
5
# Reads an access log, converts every line into a Record and keeps only the
# records that pass the cleaning filter (status 200, GET, not a static
# resource, not a robot).
module PreProcess
  class Error < StandardError; end

  REGEX_KEYS = /(time:| host:| status:| size:| request_length:| req:| method:| uri:| referer:| ua:| reqtime:| runtime:| apptime:| cache:| vhost:| server:| user:| forwardedfor:| forwardedproto:)/.freeze
  REGEX_NGINX = /\A^(?<host>\S*) (?<identity>\S*) (?<user>\S*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?:\s+\S*)?)?" (?<code>\S*) (?<size>\S*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)"(?:\s+(?<http_x_forwarded_for>\S+))?)?$/.freeze
  REGEX_APACHE = %r{(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (.{0})- \[([^\]]+?)\] "(GET|POST|PUT|DELETE) ([^\s]+?) (HTTP\/1\.1)" (\d+) (\d+) "-" "(.*)"}.freeze

  # Log type => name of the module method that turns one line into a Hash of
  # raw fields. Every name listed here is defined below.
  CONVERT_RECORD = { 'nginx' => 'convert_nginx_logs', 'apache' => 'convert_apache_logs', 'default' => 'to_record' }.freeze

  # Reads +file_path+ and returns the cleaned records. After each line the
  # terminal is cleared and a "<percent>/100" progress line is printed.
  #
  # @param file_path [String] path of the log file
  # @param type [String, nil] key of CONVERT_RECORD; nil means 'nginx'
  # @yield [record] optional hook; its return value replaces the record, so
  #   returning nil/false drops it
  # @return [Array<Record>]
  # @raise [PreProcess::Error] when +type+ is not a key of CONVERT_RECORD
  def self.input(file_path, type)
    @users = []
    lines = File.readlines(file_path)
    converter = CONVERT_RECORD.fetch(type || 'nginx') do
      raise Error, "unsupported log type #{type.inspect} (expected one of: #{CONVERT_RECORD.keys.join(', ')})"
    end

    lines.each_with_index.each_with_object([]) do |(line, index), records|
      record = build_record(line, converter)
      record = yield(record) if block_given? && record
      records.push(record) if keep?(record)

      system('clear')
      puts "#{((index.to_f / lines.size) * 100).round}/100"
    end
  end

  # Parses one "key:value<TAB>key:value" line.
  #
  # @param log [String] raw line; it is not modified
  # @return [Hash{String=>String}] field name (without the colon) => value
  def self.to_record(log)
    # "\t" in double quotes is a real tab; REGEX_KEYS expects keys to be
    # preceded by a space.
    fields = log.tr("\t", ' ').split(REGEX_KEYS).map(&:strip)
    fields.delete('')
    fields.each_slice(2).each_with_object({}) { |pair, parsed| parsed.merge!(to_json(pair)) }
  end

  # Parses one line in the format described by REGEX_NGINX. Fields are read
  # by capture name, so an empty or absent field never shifts the others.
  #
  # @return [Hash, false] raw fields, or false when the line does not match
  def self.convert_nginx_logs(log)
    match = REGEX_NGINX.match(log)
    return false unless match

    {
      'host' => match[:host],
      'time' => match[:time],
      'method' => match[:method],
      'uri' => match[:path],
      'status' => match[:code],
      'size' => match[:size],
      'referer' => match[:referer],
      'ua' => match[:agent],
      'forwarded' => match[:http_x_forwarded_for],
      # Record reads this field under the 'forwardedfor' key.
      'forwardedfor' => match[:http_x_forwarded_for],
      'user' => find_or_create_user(match[:host], match[:agent])
    }
  end

  # Parses one line in the format described by REGEX_APACHE.
  #
  # @return [Hash, false] raw fields, or false when the line does not match
  def self.convert_apache_logs(log)
    match = REGEX_APACHE.match(log)
    return false unless match

    host, _identity, time, verb, uri, _protocol, status, size, agent = match.captures
    {
      'host' => host,
      'time' => time,
      'method' => verb,
      'uri' => uri,
      'status' => status,
      'size' => size,
      'ua' => agent,
      'user' => find_or_create_user(host, agent)
    }
  end

  # @param pair [Array<String>] a key such as "host:" followed by its value
  # @return [Hash] single-entry hash with the colon removed from the key
  def self.to_json(pair)
    { pair.first.delete(':') => pair.last }
  end

  # Returns the user for a positional field array (index 0 is the host,
  # index 9 the user agent), creating it on first sight.
  #
  # @return [UserIdentity]
  def self.save_user(log)
    find_or_create_user(log[0], log[9])
  end

  # Converts one raw line into a Record, or nil when the line cannot be used.
  def self.build_record(line, converter)
    # Non-destructive chomp: a final line without a newline stays usable.
    params = public_send(converter, line.tr("\t", ' ').chomp)
    return if !params || params.empty?

    Record.new(params)
  rescue IPAddr::Error => e
    warn "PreProcess: skipping line with invalid host (#{e.message})"
    nil
  end
  private_class_method :build_record

  # Cleaning filter applied to every record before it is kept.
  def self.keep?(record)
    record && record.status_200? && record.method_get? && record.uri_without_data && !record.robot?
  end
  private_class_method :keep?

  # Looks up the user with the same host and user-agent string, or registers
  # a new one.
  def self.find_or_create_user(host, agent)
    @users ||= []
    known = @users.find { |user| user.host == host && user.user_agent.to_s == agent }
    return known if known

    UserIdentity.new(host: IPAddr.new(host), user_agent: UserAgent.parse(agent)).tap { |user| @users.push(user) }
  end
  private_class_method :find_or_create_user
end
@@ -0,0 +1,59 @@
1
+ require 'time'
2
+ require 'log_analysis/version'
3
+
4
# Mines frequent sequences with the bundled SPMF jar (SPADE algorithm) and
# derives rules from them.
module RuleGeneration
  JAR_FILE_PATH = File.join(File.dirname(__FILE__), './files/spmf.jar')
  # File names carry the date on which this file was loaded.
  TRANSFORM_DATA_PATH = "transform_data_#{Time.now.strftime('%Y%m%d')}.txt".freeze
  RULE_FILE_PATH = "output_#{Time.now.strftime('%Y%m%d')}.txt".freeze
  MAP_URI_FILE_PATH = "map_uri_#{Time.now.strftime('%Y%m%d')}.txt".freeze
  # Support count of an output line, e.g. "1 -1 2 -1 #SUP: 12".
  SUPPORT_PATTERN = /#SUP:\s*(\d+)/.freeze

  class Error < StandardError; end

  # Writes the sequences to TRANSFORM_DATA_PATH, runs SPADE on them and turns
  # the mined sequences into rules whose item ids are replaced by the entries
  # of the MAP_URI_FILE_PATH file.
  #
  # NOTE(review): each itemset is mapped through String#to_i, which only reads
  # its first item id; itemsets with several ids lose the others. Confirm
  # whether that is intended.
  #
  # @param transform_data [Hash] key => sequence (Array whose elements are item
  #   ids or Arrays of item ids)
  # @param min_conf [Numeric] minimum confidence of a rule
  # @param min_sup [Numeric] minimum support, passed to SPADE as a percentage
  # @return [Array<Array>] [sequence, sub_sequence, confidence] triples
  # @raise [RuleGeneration::Error] when the java command cannot be run or
  #   exits with a failure status
  def self.execute(transform_data, min_conf, min_sup)
    write_sequences(transform_data)
    run_spade(min_sup)

    rules = rule_gen(get_seq(File.read(RULE_FILE_PATH)), min_conf)
    map_uri = File.read(MAP_URI_FILE_PATH).split(' ')

    rules.map do |seq, sub, confidence|
      [seq.map { |i| map_uri[i.to_i] }, sub.map { |i| map_uri[i.to_i] }, confidence]
    end
  end

  # Pairs every mined sequence with each of its proper sub-sequences and keeps
  # the pairs whose confidence, support(sequence) / support(sub_sequence),
  # reaches +min_conf+.
  #
  # @param seqs [Array<Array>] [itemsets, support] pairs as built by get_seq
  # @return [Array<Array>] [itemsets, sub_itemsets, confidence] triples
  def self.rule_gen(seqs, min_conf)
    seqs.each_with_object([]) do |(sequence, support), rules|
      seqs.each do |(candidate, candidate_support)|
        next if candidate == sequence || !sub_seq?(candidate, sequence)

        confidence = support / candidate_support
        rules.push([sequence, candidate, confidence]) if confidence >= min_conf
      end
    end
  end

  # @return [Boolean] whether every itemset of +first+ is contained, in order,
  #   in a distinct itemset of +second+
  def self.sub_seq?(first, second)
    cursor = 0
    first.all? do |itemset|
      # Search only the part of +second+ after the previous match.
      hit = (cursor...second.size).find { |n| sub?(second[n], itemset) }
      cursor = hit + 1 if hit
      hit
    end
  end

  # @param str [String] space-separated item ids
  # @param sub [String] space-separated item ids
  # @return [Boolean] whether every id of +sub+ is one of the ids of +str+
  #   (ids are compared as whole tokens, so "1" is not found in "12")
  def self.sub?(str, sub)
    (sub.split(' ') - str.split(' ')).empty?
  end

  # Parses SPADE output. Each usable line holds itemsets separated by "-1"
  # followed by "#SUP: <count>"; lines without a support count are skipped.
  #
  # @param seq_str [String] content of the output file
  # @return [Array<Array>] [itemsets, support] pairs; itemsets keep their
  #   surrounding spaces, support is a Float
  def self.get_seq(seq_str)
    seq_str.split("\n").each_with_object([]) do |line, sequences|
      support = line[SUPPORT_PATTERN, 1]
      next unless support

      sequences.push([line.split('-1')[0..-2], support.to_f])
    end
  end

  # Writes one line per sequence: itemsets separated by " -1 " and terminated
  # by " -1 -2".
  def self.write_sequences(transform_data)
    File.open(TRANSFORM_DATA_PATH, 'w+') do |file|
      transform_data.each_value do |sequence|
        itemsets = sequence.map { |i| i.is_a?(Array) ? i.join(' ') : i }
        file.puts("#{itemsets.join(' -1 ')} -1 -2")
      end
    end
  end
  private_class_method :write_sequences

  # Runs SPADE without going through a shell, so paths containing spaces are
  # passed intact.
  def self.run_spade(min_sup)
    finished = system('java', '-jar', JAR_FILE_PATH, 'run', 'SPADE', TRANSFORM_DATA_PATH, RULE_FILE_PATH, "#{min_sup}%")
    raise Error, 'SPADE run failed: the java command could not be executed or exited with an error' unless finished
  end
  private_class_method :run_spade
end
@@ -0,0 +1,29 @@
1
+ require 'active_support/core_ext/numeric/time'
2
+ require 'log_analysis/model/session_identity'
3
+
4
# Groups cleaned records into per-user sessions.
module SessionIdentification
  # A record joins the user's latest session only when it is less than this
  # many minutes after that session's last record.
  MINUTE_THRESHOLD = 10

  class Error < StandardError; end

  # Walks the records in the given order. A record is appended to the most
  # recent session of its user when it falls within MINUTE_THRESHOLD of that
  # session's last record; otherwise it opens a new session.
  #
  # @param cleaned_data [Array] records responding to #user and #time
  # @return [Array<SessionIdentity>] sessions in the order they were opened
  def self.execute(cleaned_data)
    # user => most recently opened session of that user
    latest = {}

    cleaned_data.each_with_object([]) do |record, sessions|
      current = latest[record.user]

      if current && validate_time_session(current.records.last.time, record.time)
        current.records << record
      else
        current = SessionIdentity.new(session_identity_params(record))
        latest[record.user] = current
        sessions << current
      end
    end
  end

  # @return [Hash] constructor arguments of a session opened by +record+
  def self.session_identity_params(record)
    { user: record.user, records: [record] }
  end

  # @param last_session_record_time [Time] time of the session's last record
  # @param time [Time] time of the candidate record
  # @return [Boolean] whether the gap is below MINUTE_THRESHOLD minutes
  def self.validate_time_session(last_session_record_time, time)
    (time - last_session_record_time) / 60 < MINUTE_THRESHOLD
  end
end
@@ -0,0 +1,26 @@
1
+ require 'log_analysis/model/session_identity'
2
+ require 'log_analysis/model/user_identity'
3
+ require 'log_analysis/version'
4
+
5
# Replaces the URIs of every session by integer ids and groups the result by
# host.
module Transformation
  # File receiving the id => URI mapping; the name carries the date on which
  # this file was loaded.
  MAP_URI_FILE_PATH = "map_uri_#{Time.now.strftime('%Y%m%d')}.txt".freeze

  class Error < StandardError; end

  # Numbers each distinct URI in order of first appearance (starting at 0),
  # writes the URIs, space-separated and in id order, to MAP_URI_FILE_PATH and
  # returns the id sequences per host.
  #
  # For a host, the ids of its first session are stored as a flat list. A
  # later session with a single record appends its id to that list; a later
  # session with several records is pushed as one nested Array.
  # NOTE(review): the first session is flattened while later multi-record
  # sessions are nested — confirm this asymmetry is intended.
  #
  # @param identified_session [Array] sessions responding to #user (with a
  #   #host) and #records (each with a #uri)
  # @return [Hash{String=>Array}] host string => ids / nested id Arrays
  def self.execute(identified_session)
    # URI => id. Hash lookup replaces a linear Array search per URI, and
    # insertion order keeps the ids in first-appearance order.
    uri_ids = {}

    transform = identified_session.each_with_object({}) do |session, sequences|
      ids = session.records.map { |record| uri_ids[record.uri] ||= uri_ids.size }
      host = session.user.host.to_s

      if !sequences.key?(host)
        sequences[host] = ids
      elsif ids.size == 1
        sequences[host] += ids
      else
        sequences[host].push(ids)
      end
    end

    File.open(MAP_URI_FILE_PATH, 'w+') { |f| f.write(uri_ids.keys.join(' ')) }
    transform
  end
end
@@ -0,0 +1,11 @@
1
+ require 'ipaddr'
2
+ require 'log_analysis/model/user_identity'
3
+
4
# Extracts the distinct users from the cleaned records.
module UserIdentification
  class Error < StandardError; end

  # @param cleaned_data [Array] records responding to #user
  # @return [Array] each distinct user once, in order of first appearance
  def self.execute(cleaned_data)
    cleaned_data.uniq(&:user).map(&:user)
  end
end
@@ -0,0 +1,3 @@
1
# Reopens the gem's main class to record the released version.
class LogAnalysis
  # Version string of this release; the gemspec reads it for spec.version.
  VERSION = '0.1.4'
end
@@ -0,0 +1,31 @@
1
+ require_relative 'lib/log_analysis/version'
2
+
3
# Gem specification for log_analysis.
Gem::Specification.new do |spec|
  spec.name          = 'log_analysis'
  spec.version       = LogAnalysis::VERSION
  spec.authors       = ['Michael Tran']
  spec.email         = ['mictran205@gmail.com']

  spec.summary       = 'Log Analysis for thesis Huflit'
  spec.description   = 'Preprocess step of web mining'
  spec.homepage      = 'https://github.com/michaelt0520/log_analysis_thesis'
  spec.license       = 'MIT'
  # The activesupport ~> 6.0 dependency below needs Ruby 2.5 or newer, and the
  # library code calls String#match? (Ruby 2.4+), so 2.3 cannot actually run
  # this gem.
  spec.required_ruby_version = Gem::Requirement.new('>= 2.5.0')

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/michaelt0520/log_analysis_thesis'

  spec.add_dependency 'useragent', '~> 0.16.10'
  spec.add_dependency 'activesupport', '~> 6.0.3.1'

  # Package every file tracked by git (`git ls-files -z` lists them
  # NUL-separated), except those under test/, spec/ and features/.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = 'exe'
  # Executables are the packaged files located under exe/.
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log_analysis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.4
5
+ platform: ruby
6
+ authors:
7
+ - Michael Tran
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: useragent
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.16.10
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.16.10
27
+ - !ruby/object:Gem::Dependency
28
+ name: activesupport
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 6.0.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 6.0.3.1
41
+ description: Preprocess step of web mining
42
+ email:
43
+ - mictran205@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".ruby-version"
50
+ - CODE_OF_CONDUCT.md
51
+ - Gemfile
52
+ - Gemfile.lock
53
+ - LICENSE.txt
54
+ - README.md
55
+ - Rakefile
56
+ - access.log
57
+ - bin/console
58
+ - bin/setup
59
+ - lib/log_analysis.rb
60
+ - lib/log_analysis/files/spmf.jar
61
+ - lib/log_analysis/model/record.rb
62
+ - lib/log_analysis/model/session_identity.rb
63
+ - lib/log_analysis/model/user_identity.rb
64
+ - lib/log_analysis/preprocess.rb
65
+ - lib/log_analysis/rule_generation.rb
66
+ - lib/log_analysis/session_identification.rb
67
+ - lib/log_analysis/transformation.rb
68
+ - lib/log_analysis/user_identification.rb
69
+ - lib/log_analysis/version.rb
70
+ - log_analysis.gemspec
71
+ homepage: https://github.com/michaelt0520/log_analysis_thesis
72
+ licenses:
73
+ - MIT
74
+ metadata:
75
+ homepage_uri: https://github.com/michaelt0520/log_analysis_thesis
76
+ source_code_uri: https://github.com/michaelt0520/log_analysis_thesis
77
+ post_install_message:
78
+ rdoc_options: []
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 2.3.0
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.1.2
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Log Analysis for thesis Huflit
96
+ test_files: []