log2counter 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,220 @@
1
+ # This program parses weblogs in the NCSA Common (access log) format or
2
+ # NCSA Combined log format
3
+ #
4
+ # One line consists of
5
+ # host rfc931 username date:time request statuscode bytes
6
+ # For example
7
+ # 1.2.3.4 - dsmith [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 12
8
+ # [dd/MMM/yyyy:hh:mm:ss +-hhmm]
9
+ # Where
10
+ # dd is the day of the month
11
+ # MMM is the month
12
+ # yyyy is the year
13
+ # :hh is the hour
14
+ # :mm is the minute
15
+ # :ss is the seconds
16
+ # +-hhmm is the time zone
17
+ #
18
+ # In practice, the day is typically logged in two-digit format even for
19
+ # single-digit days.
20
+ # For example, the second day of the month would be represented as 02.
21
+ # However, some HTTP servers do log a single digit day as a single digit.
22
+ # When parsing log records, you should be aware of both possible day
23
+ # representations.
24
+ #
25
+ # Author:: Jan Wikholm [jw@jw.fi]
26
+ # License:: MIT
27
+ #
28
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
29
+ #
30
+ # Modified by Jens Wille <jens.wille@uni-koeln.de>; see log_parser.rb.orig for the
31
+ # original file.
32
+
33
+ require 'logger'
34
+
35
# Describes a single access-log format.
#
# An Apache-style format string (for example '%h %l %u %t \"%r\" %>s %b') is
# compiled into
#
# * +format_symbols+: one symbol per directive, in order of appearance, and
# * +format_regex+:   a regex with one capture group per directive that is
#   anchored at the start of the string.
class LogFormat

  attr_reader :name, :format, :format_symbols, :format_regex

  # Add more format directives here.
  DIRECTIVES = {
    # format string char => [:symbol to use, /regex to use when matching against log/]
    'h' => [:ip,           /(?:\d+\.\d+\.\d+\.\d+)|(?:[\w.-]+)/],
    'l' => [:auth,         /.*?/],
    'u' => [:username,     /.*?/],
    't' => [:datetime,     /\[.*?\]/],
    'r' => [:request,      /.*?/],
    # Non-capturing group: every directive must contribute exactly one capture,
    # otherwise captures and format symbols no longer line up.
    'R' => [:request,      /.*?(?:\"|\z)/],
    's' => [:status,       /\d+/],
    'b' => [:bytecount,    /-|\d+/],
    'v' => [:domain,       /.*?/],
    'i' => [:header_lines, /.*?/]
  }

  # name::   identifier of the format (stored as is).
  # format:: the format string to compile; see #parse_format.
  def initialize(name, format)
    @name, @format = name, format
    parse_format(format)
  end

  # The symbols are used to map the log to the env variables.
  # The regex is used when checking what format the log is and to extract data.
  #
  # Sets @format_symbols and @format_regex from +format+.
  def parse_format(format)
    # Build the character class from the bare directive characters. (Joining
    # them with '|' would make '|' itself a member of the class.)
    directive_chars  = Regexp.escape(DIRECTIVES.keys.join)
    format_directive = /%(.*?)(\{.*?\})?([#{directive_chars}])([\s\\"]*)/

    symbols   = []
    fragments = []

    format.scan(format_directive) { |condition, subdirective, directive_char, ignored|
      symbol, match_regex = process_directive(directive_char, subdirective, condition)

      # Whitespace between directives is matched by \s in the resulting regex;
      # backslashes and quotes are carried over verbatim.
      separator = ignored.gsub(/\s/, '\\s')

      symbols   << symbol
      fragments << "(#{match_regex})#{separator}"
    }

    @format_symbols = symbols
    @format_regex   = /\A#{fragments.join}/
  end

  # Returns <tt>[symbol, regex_source]</tt> for a single directive.
  #
  # For the 'i' directive the symbol is derived from the subdirective
  # ('{User-agent}' becomes :user_agent); without a subdirective the default
  # symbol from DIRECTIVES is used. +condition+ is currently not evaluated.
  def process_directive(directive_char, subdirective, condition)
    symbol, regex = DIRECTIVES[directive_char]

    if directive_char == 'i' && subdirective
      symbol = subdirective[1...-1].downcase.tr('-', '_').to_sym
    end

    [symbol, regex.source]
  end

end
93
+
94
# Parses web server access logs line by line.
#
# The log format is either given explicitly (a key of LOG_FORMATS) or detected
# from the first line that matches one of the known formats.
class LogParser

  # Raised by #parse_line for lines that do not match the log format.
  class FormatError < ArgumentError; end

  LOG_FORMATS = {
    :common   => '%h %l %u %t \"%r\" %>s %b',
    :combined => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :minimal  => '%h %l %u %t \"%R'
  }

  # Add any values that you may return here.
  STAT_ENV_MAP = {
    :ip         => 'REMOTE_ADDR',
    :page       => 'PATH_INFO',
    :datetime   => 'DATETIME',
    :status     => 'STATUS',
    :domain     => 'HTTP_HOST',
    :referer    => 'HTTP_REFERER',
    :user_agent => 'HTTP_USER_AGENT'
  }

  attr_reader :constraint, :known_formats, :log_format

  # format::     optional key of LOG_FORMATS; when omitted (or unknown) the
  #              format is detected while parsing.
  # constraint:: optional pattern; lines not matching it are skipped by
  #              #parse_io_stream.
  def initialize(format = nil, constraint = nil)
    @format     = format
    @constraint = constraint

    initialize_known_formats

    @log_format = known_formats[@format] if @format
  end

  # Processes the format string into symbols and test regex
  # and saves using LogFormat class.
  def initialize_known_formats
    @known_formats = {}

    LOG_FORMATS.each { |name, format|
      @known_formats[name] = LogFormat.new(name, format)
    }
  end

  # Checks which standard the log file (well one line) is.
  # Automatically checks for most complex (longest) regex first.
  #
  # Returns the key of the first matching format, or :unknown.
  def check_format(line)
    by_complexity = @known_formats.sort_by { |_key, log_format|
      log_format.format_regex.source.size
    }

    by_complexity.reverse_each { |key, log_format|
      return key if line =~ log_format.format_regex
    }

    :unknown
  end

  # This is where the magic happens.
  # This is the end-to-end business logic of the class.
  #
  # Call with a block that will be called with each line, as a hash.
  # Without a block, an enumerator over those hashes is returned.
  #
  # Lines that do not match the log format are reported on stderr and
  # skipped. Any other error is re-raised with the line number, the line
  # and the parsed data added to its message.
  def parse_io_stream(stream)
    return enum_for(:parse_io_stream, stream) unless block_given?

    lines_parsed = 0

    stream.each { |raw_line|
      # Work on a copy so the caller's lines are left untouched.
      line = raw_line.chomp

      lines_parsed += 1
      warn "##{lines_parsed}" if (lines_parsed % 10000).zero?  # progress indicator

      next if constraint && line !~ constraint

      begin
        parsed_data = parse_line(line)
        yield generate_stats(parsed_data)
      rescue FormatError
        warn "Corrupt line [#{lines_parsed}]: #{line.inspect}"
      rescue => err
        # Keep the original backtrace so the failure can still be located.
        raise err.class, "#{err.class} [#{lines_parsed}]: #{line.inspect}\n\n" \
          "#{parsed_data.inspect}\n\n#{err}", err.backtrace
      end
    }
  end

  # Populate a stats hash one line at a time.
  # Add extra fields into the STAT_ENV_MAP hash at the top of this file.
  def generate_stats(parsed_data)
    stats = { 'PATH_INFO' => get_page(parsed_data[:request]) }

    STAT_ENV_MAP.each { |stat_name, env_name|
      stats[env_name] = parsed_data[stat_name] if parsed_data.has_key?(stat_name)
    }

    stats
  end

  # Extracts the requested path from a request string such as
  # "GET /index.html HTTP/1.0"; falls back to the whole (stripped) request
  # if it contains no slash.
  def get_page(request)
    (request[/\/.*?(?:\s|\z)/] || request).strip
  end

  # Parses a single line into a hash keyed by the format's symbols, plus
  # :domain (defaulting to the IP) and :format.
  #
  # Raises FormatError if the line does not match the log format, no matter
  # whether the format was given explicitly or detected earlier.
  def parse_line(line)
    unless @format && @log_format
      @format     = check_format(line)
      @log_format = known_formats[@format]
    end

    match = log_format && log_format.format_regex.match(line)
    raise FormatError, line unless match

    parsed_data = {}
    log_format.format_symbols.zip(match.captures) { |format_symbol, value|
      parsed_data[format_symbol] = value
    }

    # Remove [] from time.
    parsed_data[:datetime] &&= parsed_data[:datetime][1...-1]

    # Add IP as domain if we don't have a domain (virtual host).
    # Assumes we always have an IP.
    parsed_data[:domain] ||= parsed_data[:ip]

    parsed_data[:format] = @format

    parsed_data
  end

end
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/ruby
2
+ # This program parses weblogs in the NCSA Common (access log) format or
3
+ # NCSA Combined log format
4
+ #
5
+ # One line consists of
6
+ # host rfc931 username date:time request statuscode bytes
7
+ # For example
8
+ # 1.2.3.4 - dsmith [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 12
9
+ # [dd/MMM/yyyy:hh:mm:ss +-hhmm]
10
+ # Where
11
+ # dd is the day of the month
12
+ # MMM is the month
13
+ # yyyy is the year
14
+ # :hh is the hour
15
+ # :mm is the minute
16
+ # :ss is the seconds
17
+ # +-hhmm is the time zone
18
+ #
19
+ # In practice, the day is typically logged in two-digit format even for
20
+ # single-digit days.
21
+ # For example, the second day of the month would be represented as 02.
22
+ # However, some HTTP servers do log a single digit day as a single digit.
23
+ # When parsing log records, you should be aware of both possible day
24
+ # representations.
25
+ #
26
+ # Author:: Jan Wikholm [jw@jw.fi]
27
+ # License:: MIT
28
+
29
+ require 'logger'
30
+
31
# Describes a single access-log format.
#
# An Apache-style format string is compiled into a list of symbols (one per
# directive) and a regex with one capture group per directive.
class LogFormat
  attr_reader :name, :format, :format_symbols, :format_regex

  # add more format directives here..
  DIRECTIVES = {
    # format string char => [:symbol to use, /regex to use when matching against log/]
    'h' => [:ip,           /\d+\.\d+\.\d+\.\d+/],
    'l' => [:auth,         /.*?/],
    'u' => [:username,     /.*?/],
    't' => [:datetime,     /\[.*?\]/],
    'r' => [:request,      /.*?/],
    's' => [:status,       /\d+/],
    'b' => [:bytecount,    /-|\d+/],
    'v' => [:domain,       /.*?/],
    'i' => [:header_lines, /.*?/]
  }

  # name::   identifier of the format (stored as is).
  # format:: the format string to compile; see #parse_format.
  def initialize(name, format)
    @name, @format = name, format
    parse_format(format)
  end

  # The symbols are used to map the log to the env variables
  # The regex is used when checking what format the log is and to extract data
  #
  # Sets @format_symbols and @format_regex from +format+.
  def parse_format(format)
    # Use the bare directive characters for the character class; joining them
    # with '|' would make '|' itself a member of the class.
    directive_chars  = Regexp.escape(DIRECTIVES.keys.join)
    format_directive = /%(.*?)(\{.*?\})?([#{directive_chars}])([\s\\"]*)/

    symbols   = []
    fragments = []

    format.scan(format_directive) do |condition, subdirective, directive_char, ignored|
      symbol, match_regex = process_directive(directive_char, subdirective, condition)

      symbols   << symbol
      fragments << "(#{match_regex})#{ignored.gsub(/\s/, '\\s')}"
    end

    @format_symbols = symbols
    # \A anchors at the start of the string; ^ would also match after any
    # newline inside it.
    @format_regex   = /\A#{fragments.join}/
  end

  # Returns <tt>[symbol, regex_source]</tt> for a single directive.
  #
  # For the 'i' directive the symbol is derived from the subdirective
  # ('{User-agent}' becomes :user_agent); without a subdirective the default
  # symbol from DIRECTIVES is used. +condition+ is currently not evaluated.
  def process_directive(directive_char, subdirective, condition)
    symbol, regex = DIRECTIVES[directive_char]

    if directive_char == 'i' && subdirective
      symbol = subdirective[1...-1].downcase.tr('-', '_').to_sym
    end

    [symbol, regex.source]
  end
end
81
+
82
# Parses web server access logs line by line, detecting the format of every
# line from the list of known formats.
class LogParser

  LOG_FORMATS = {
    :common                => '%h %l %u %t \"%r\" %>s %b',
    :common_with_virtual   => '%v %h %l %u %t \"%r\" %>s %b',
    :combined              => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :combined_with_virtual => '%v %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :combined_with_cookies => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%{Cookies}i\"'
  }

  # add any values that you may return here
  STAT_ENV_MAP = {
    :referer    => "HTTP_REFERER",
    :user_agent => "HTTP_USER_AGENT",
    :ip         => "REMOTE_ADDR",
    :page       => "PATH_INFO",
    :domain     => "HTTP_HOST",
    :datetime   => 'DATETIME',
    :status     => 'STATUS'
  }

  attr_reader :known_formats

  # Use the ActiveRecord logger when it is available; otherwise fall back to
  # a plain Logger on stderr that suppresses debug output, so the class can be
  # loaded without ActiveRecord.
  @@log = (defined?(ActiveRecord::Base) && ActiveRecord::Base.logger) || begin
    fallback = Logger.new($stderr)
    fallback.level = Logger::WARN
    fallback
  end

  def initialize
    @log_format = []
    initialize_known_formats
  end

  # processes the format string into symbols and test regex
  # and saves using LogFormat class
  def initialize_known_formats
    @known_formats = {}
    LOG_FORMATS.each do |name, format|
      @known_formats[name] = LogFormat.new(name, format)
    end
  end

  # Checks which standard the log file (well one line) is
  # Automatically checks for most complex (longest) regex first..
  #
  # Returns the key of the first matching format, or :unknown.
  def check_format(line)
    by_complexity = @known_formats.sort_by { |_key, log_format| log_format.format_regex.source.size }

    by_complexity.reverse_each do |key, log_format|
      return key if line =~ log_format.format_regex
    end

    :unknown
  end

  # This is where the magic happens
  # This is the end-to-end business logic of the class
  #
  # Call with a block that will be called with each line, as a hash
  def parse_io_stream(stream)
    stream.each do |line|
      yield generate_stats(parse_line(line))
    end
  end

  # Generate_stats will populate a stats hash one line at a time
  # Add extra fields into the STAT_ENV_MAP hash at the top of this file.
  def generate_stats(raw_data)
    stats = { "PATH_INFO" => get_page(raw_data[:request]) }

    STAT_ENV_MAP.each do |stat_name, env_name|
      stats[env_name] = raw_data[stat_name] if raw_data.has_key?(stat_name)
    end

    stats
  end

  # Extracts the requested path from a request string such as
  # "GET /index.html HTTP/1.0". The path may also end the string; if the
  # request contains no slash at all, the request itself is returned.
  def get_page(request)
    @@log.debug "get_page: #{request}"
    (request[/\/.*?(?:\s|\z)/] || request).rstrip
  end

  # Parses a single line into a hash keyed by the format's symbols, plus
  # :domain (defaulting to the IP) and :format.
  #
  # Raises ArgumentError if the line matches none of the known formats.
  def parse_line(line)
    @format    = check_format(line)
    log_format = @known_formats[@format]
    match      = log_format && log_format.format_regex.match(line)

    raise ArgumentError, "unrecognized log line: #{line.inspect}" unless match

    parsed_data = {}
    log_format.format_symbols.zip(match.captures) do |format_symbol, value|
      parsed_data[format_symbol] = value
    end

    # remove [] from time if present
    parsed_data[:datetime] = parsed_data[:datetime][1...-1] if parsed_data[:datetime]

    # Add ip as domain if we don't have a domain (virtual host)
    # Assumes we always have an ip
    parsed_data[:domain] = parsed_data[:ip] unless parsed_data[:domain]

    parsed_data[:format] = @format

    parsed_data
  end
end
@@ -0,0 +1,27 @@
1
module Log2COUNTER

  # Version number, split into its numeric components.
  module Version

    MAJOR = 0
    MINOR = 0
    TINY  = 3

    # Returns array representation.
    def self.to_a
      [MAJOR, MINOR, TINY]
    end

    # Short-cut for version string.
    def self.to_s
      to_a * '.'
    end

  end

  # The version as a dotted string.
  VERSION = Version.to_s

end
@@ -0,0 +1,53 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # log2counter -- Convert (analyse) Apache log files to COUNTER CSV. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
# Declares the top-level namespace ahead of the requires that follow.
module Log2COUNTER; end
31
+
32
+ require 'log2counter/parser'
33
+ require 'log2counter/printer'
34
+
35
+ require 'log2counter/version'
36
+
37
# Convenience entry points that delegate to Parser and Printer.
module Log2COUNTER

  # Makes the methods below callable directly on the module.
  extend self

  # Passes all arguments on to Parser.load and returns its result.
  def load(*arguments)
    Parser.load(*arguments)
  end

  # Creates a Parser from the given arguments and returns the result of
  # its +parse+ method.
  def parse(*arguments)
    parser = Parser.new(*arguments)
    parser.parse
  end

  # Creates a Printer from the remaining arguments and lets it print +stats+.
  def print(stats, *arguments)
    printer = Printer.new(*arguments)
    printer.print(stats)
  end

end
@@ -0,0 +1,8 @@
1
+ ---
2
+ #"Somewhere, Inst.":
3
+ # :name: "Inst_Somewhere"
4
+ # :ip:
5
+ # - "12.34.56"
6
+ # :id:
7
+ # - "inst.somewhere"
8
+ # :export: true
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log2counter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Jens Wille
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-23 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fastercsv
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.3
24
+ version:
25
+ description: Convert (analyse) Apache log files to COUNTER CSV.
26
+ email: jens.wille@uni-koeln.de
27
+ executables:
28
+ - log2counter
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - COPYING
33
+ - ChangeLog
34
+ - README
35
+ files:
36
+ - lib/log2counter.rb
37
+ - lib/log2counter/vendor/log_parser.rb
38
+ - lib/log2counter/printer.rb
39
+ - lib/log2counter/core_ext/compare_strings_and_fixnums.rb
40
+ - lib/log2counter/core_ext/sort_by_ip_or_host.rb
41
+ - lib/log2counter/version.rb
42
+ - lib/log2counter/parser.rb
43
+ - bin/log2counter
44
+ - Rakefile
45
+ - COPYING
46
+ - ChangeLog
47
+ - README
48
+ - sample/licensees.yaml
49
+ - lib/log2counter/vendor/log_parser.rb.orig
50
+ has_rdoc: true
51
+ homepage: http://prometheus.rubyforge.org/log2counter
52
+ licenses: []
53
+
54
+ post_install_message:
55
+ rdoc_options:
56
+ - --charset
57
+ - UTF-8
58
+ - --main
59
+ - README
60
+ - --line-numbers
61
+ - --inline-source
62
+ - --all
63
+ - --title
64
+ - log2counter Application documentation
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: "0"
78
+ version:
79
+ requirements: []
80
+
81
+ rubyforge_project: prometheus
82
+ rubygems_version: 1.3.5
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Convert (analyse) Apache log files to COUNTER CSV.
86
+ test_files: []
87
+