log2counter 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
+ # This program parses weblogs in the NCSA Common (access log) format or
2
+ # NCSA Combined log format
3
+ #
4
+ # One line consists of
5
+ # host rfc931 username date:time request statuscode bytes
6
+ # For example
7
+ # 1.2.3.4 - dsmith [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 12
8
+ # [dd/MMM/yyyy:hh:mm:ss +-hhmm]
9
+ # Where
10
+ # dd is the day of the month
11
+ # MMM is the month
12
+ # yyyy is the year
13
+ # :hh is the hour
14
+ # :mm is the minute
15
+ # :ss is the seconds
16
+ # +-hhmm is the time zone
17
+ #
18
+ # In practice, the day is typically logged in two-digit format even for
19
+ # single-digit days.
20
+ # For example, the second day of the month would be represented as 02.
21
+ # However, some HTTP servers do log a single digit day as a single digit.
22
+ # When parsing log records, you should be aware of both possible day
23
+ # representations.
24
+ #
25
+ # Author:: Jan Wikholm [jw@jw.fi]
26
+ # License:: MIT
27
+ #
28
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
29
+ #
30
+ # Modified by Jens Wille <jens.wille@uni-koeln.de>; see log_parser.rb.orig for the
31
+ # original file.
32
+
33
+ require 'logger'
34
+
35
# Compiles an Apache-style log format string (for example
# '%h %l %u %t \"%r\" %>s %b') into the two things LogParser reads:
#
# * format_symbols -- one symbol per directive, in order of appearance
# * format_regex   -- a regex anchored at the start of the string (\A) with
#                     exactly one capture group per directive
class LogFormat

  attr_reader :name, :format, :format_symbols, :format_regex

  # Add more format directives here.
  DIRECTIVES = {
    # format string char => [:symbol to use, /regex to use when matching against log/]
    'h' => [:ip,           /(?:\d+\.\d+\.\d+\.\d+)|(?:[\w.-]+)/],
    'l' => [:auth,         /.*?/],
    'u' => [:username,     /.*?/],
    't' => [:datetime,     /\[.*?\]/],
    'r' => [:request,      /.*?/],
    # Non-capturing group on purpose: every directive must contribute exactly
    # one capture so that captures line up with format_symbols.
    'R' => [:request,      /.*?(?:\"|\z)/],
    's' => [:status,       /\d+/],
    'b' => [:bytecount,    /-|\d+/],
    'v' => [:domain,       /.*?/],
    'i' => [:header_lines, /.*?/],
  }

  # name::   identifier for this format, stored as given
  # format:: format string built from the directives in DIRECTIVES
  #
  # Raises ArgumentError if the format uses %i without a {Header-Name}.
  def initialize(name, format)
    @name, @format = name, format
    parse_format(format)
  end

  # The symbols are used to map the log to the env variables.
  # The regex is used when checking what format the log is and to extract data.
  #
  # Sets @format_symbols and @format_regex.
  def parse_format(format)
    # Build the character class from the directive characters themselves.
    # Interpolating an Array here would add quote, pipe and bracket
    # characters to the class on Ruby 1.9+.
    directive_chars  = Regexp.escape(DIRECTIVES.keys.join)
    format_directive = /%(.*?)(\{.*?\})?([#{directive_chars}])([\s\\"]*)/

    symbols  = []
    patterns = []

    format.scan(format_directive) { |condition, subdirective, directive_char, ignored|
      symbol, match_regex = process_directive(directive_char, subdirective, condition)

      # Literal whitespace between directives becomes \s in the final regex.
      separator = ignored.to_s.gsub(/\s/, '\\s')

      symbols  << symbol
      patterns << "(#{match_regex})#{separator}"
    }

    @format_symbols = symbols
    @format_regex   = /\A#{patterns.join}/
  end

  # Returns [symbol, regex_source] for a single directive.
  #
  # directive_char:: one of the keys of DIRECTIVES
  # subdirective::   the "{...}" part of the directive, or nil
  # condition::      text between "%" and the directive (currently unused)
  #
  # Raises ArgumentError for an unknown directive, or for "i" without a
  # subdirective (the header name is what the symbol is derived from).
  def process_directive(directive_char, subdirective, condition)
    symbol, regex = DIRECTIVES[directive_char]
    raise ArgumentError, "unknown format directive: %#{directive_char}" unless regex

    if directive_char == 'i'
      unless subdirective
        raise ArgumentError, 'the %i directive needs a header name, e.g. %{Referer}i'
      end

      # "{User-agent}" => :user_agent
      symbol = subdirective[1...-1].downcase.tr('-', '_').to_sym
    end

    [symbol, regex.source]
  end

end
93
+
94
# Parses web server access logs line by line, auto-detecting the log format
# from LOG_FORMATS unless one is given, and turns each line into a hash keyed
# by the env-style names in STAT_ENV_MAP.
class LogParser

  # Raised by parse_line when a line matches none of the known formats or
  # does not match the format currently in use.
  class FormatError < ArgumentError; end

  LOG_FORMATS = {
    :common   => '%h %l %u %t \"%r\" %>s %b',
    :combined => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :minimal  => '%h %l %u %t \"%R'
  }

  # Add any values that you may return here.
  STAT_ENV_MAP = {
    :ip         => 'REMOTE_ADDR',
    :page       => 'PATH_INFO',
    :datetime   => 'DATETIME',
    :status     => 'STATUS',
    :domain     => 'HTTP_HOST',
    :referer    => 'HTTP_REFERER',
    :user_agent => 'HTTP_USER_AGENT'
  }

  attr_reader :constraint, :known_formats, :log_format

  # format::     key of LOG_FORMATS to use; nil means detect it from the
  #              first line that is parsed
  # constraint:: optional pattern; parse_io_stream skips lines not matching it
  def initialize(format = nil, constraint = nil)
    @format     = format
    @constraint = constraint

    initialize_known_formats

    @log_format = known_formats[@format] if @format
  end

  # Processes the format strings into symbols and test regexes
  # and saves them using the LogFormat class.
  def initialize_known_formats
    @known_formats = {}

    LOG_FORMATS.each { |name, format|
      @known_formats[name] = LogFormat.new(name, format)
    }
  end

  # Checks which standard the log file (well, one line) is.
  # Automatically checks for most complex (longest) regex first.
  #
  # Returns the format key, or :unknown if nothing matches.
  def check_format(line)
    by_length = @known_formats.sort_by { |_, candidate|
      candidate.format_regex.source.size
    }

    by_length.reverse.each { |key, candidate|
      return key if line.match(candidate.format_regex)
    }

    :unknown
  end

  # This is the end-to-end business logic of the class.
  #
  # Call with a block that will be called with each line, as a hash (see
  # generate_stats). Without a block an Enumerator over those hashes is
  # returned.
  #
  # Lines that raise FormatError are reported on stderr and skipped; any other
  # error is re-raised with the same class, annotated with the line number and
  # line content.
  def parse_io_stream(stream)
    return enum_for(:parse_io_stream, stream) unless block_given?

    lines_parsed = 0

    stream.each { |raw_line|
      # Non-destructive: leaves the caller's strings alone and also works
      # for frozen lines.
      line = raw_line.chomp
      lines_parsed += 1
      warn "##{lines_parsed}" if (lines_parsed % 10000).zero?

      next if constraint && line !~ constraint

      begin
        parsed_data = parse_line(line)
        yield generate_stats(parsed_data)
      rescue FormatError
        warn "Corrupt line [#{lines_parsed}]: #{line.inspect}"
      rescue => err
        message = "#{err.class} [#{lines_parsed}]: #{line.inspect}\n\n" \
                  "#{parsed_data.inspect}\n\n#{err}"

        # Keep the original backtrace so the real failure site stays visible.
        raise err.class, message, err.backtrace
      end
    }
  end

  # Populate a stats hash one line at a time.
  # Add extra fields into the STAT_ENV_MAP hash at the top of this file.
  def generate_stats(parsed_data)
    stats = { 'PATH_INFO' => get_page(parsed_data[:request]) }

    STAT_ENV_MAP.each { |stat_name, env_name|
      stats[env_name] = parsed_data[stat_name] if parsed_data.has_key?(stat_name)
    }

    stats
  end

  # Extracts the path from a request string such as "GET /index.html HTTP/1.0";
  # falls back to the whole (stripped) request when it contains no "/".
  def get_page(request)
    (request[/\/.*?(?:\s|\z)/] || request).strip
  end

  # Parses a single line into a hash keyed by the format's symbols, plus
  # :format and (if absent) :domain.
  #
  # Raises FormatError if the line matches no known format, or does not match
  # the format that is already in use.
  def parse_line(line)
    unless @format && @log_format
      @format     = check_format(line)
      @log_format = known_formats[@format]
    end

    # Validate every line, not only the one used for detection; otherwise a
    # corrupt line would produce a hash full of nils and fail much later.
    match = log_format && log_format.format_regex.match(line)
    raise FormatError, line unless match

    parsed_data = {}
    log_format.format_symbols.zip(match.captures) { |format_symbol, value|
      parsed_data[format_symbol] = value
    }

    # Remove [] from time.
    parsed_data[:datetime] &&= parsed_data[:datetime][1...-1]

    # Add IP as domain if we don't have a domain (virtual host).
    # Assumes we always have an IP.
    parsed_data[:domain] ||= parsed_data[:ip]

    parsed_data[:format] = @format

    parsed_data
  end

end
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/ruby
2
+ # This program parses weblogs in the NCSA Common (access log) format or
3
+ # NCSA Combined log format
4
+ #
5
+ # One line consists of
6
+ # host rfc931 username date:time request statuscode bytes
7
+ # For example
8
+ # 1.2.3.4 - dsmith [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 12
9
+ # [dd/MMM/yyyy:hh:mm:ss +-hhmm]
10
+ # Where
11
+ # dd is the day of the month
12
+ # MMM is the month
13
+ # yyy is the year
14
+ # :hh is the hour
15
+ # :mm is the minute
16
+ # :ss is the seconds
17
+ # +-hhmm is the time zone
18
+ #
19
+ # In practice, the day is typically logged in two-digit format even for
20
+ # single-digit days.
21
+ # For example, the second day of the month would be represented as 02.
22
+ # However, some HTTP servers do log a single digit day as a single digit.
23
+ # When parsing log records, you should be aware of both possible day
24
+ # representations.
25
+ #
26
+ # Author:: Jan Wikholm [jw@jw.fi]
27
+ # License:: MIT
28
+
29
+ require 'logger'
30
+
31
# Compiles an Apache-style log format string into a list of symbols
# (format_symbols) and a regex with one capture group per directive
# (format_regex).
class LogFormat
  attr_reader :name, :format, :format_symbols, :format_regex

  # Add more format directives here.
  DIRECTIVES = {
    # format string char => [:symbol to use, /regex to use when matching against log/]
    'h' => [:ip, /\d+\.\d+\.\d+\.\d+/],
    'l' => [:auth, /.*?/],
    'u' => [:username, /.*?/],
    't' => [:datetime, /\[.*?\]/],
    'r' => [:request, /.*?/],
    's' => [:status, /\d+/],
    'b' => [:bytecount, /-|\d+/],
    'v' => [:domain, /.*?/],
    'i' => [:header_lines, /.*?/],
  }

  # name::   identifier for this format, stored as given
  # format:: format string built from the directives in DIRECTIVES
  #
  # Raises ArgumentError if the format uses %i without a {Header-Name}.
  def initialize(name, format)
    @name, @format = name, format
    parse_format(format)
  end

  # The symbols are used to map the log to the env variables.
  # The regex is used when checking what format the log is and to extract data.
  #
  # Sets @format_symbols and @format_regex.
  def parse_format(format)
    # Build the character class from the directive characters themselves;
    # interpolating an Array would add quote, pipe and bracket characters to
    # the class on Ruby 1.9+.
    directive_chars  = Regexp.escape(DIRECTIVES.keys.join)
    format_directive = /%(.*?)(\{.*?\})?([#{directive_chars}])([\s\\"]*)/

    log_format_symbols = []
    pieces = []

    format.scan(format_directive) do |condition, subdirective, directive_char, ignored|
      log_format, match_regex = process_directive(directive_char, subdirective, condition)

      # Literal whitespace between directives becomes \s in the final regex.
      separator = ignored.to_s.gsub(/\s/, '\\s')

      log_format_symbols << log_format
      pieces << "(#{match_regex})#{separator}"
    end

    @format_symbols = log_format_symbols
    # \A rather than ^: the pattern must match at the start of the string,
    # not at the start of any line inside it.
    @format_regex = /\A#{pieces.join}/
  end

  # Returns [symbol, regex_source] for a single directive.
  #
  # directive_char:: one of the keys of DIRECTIVES
  # subdirective::   the "{...}" part of the directive, or nil
  # condition::      text between "%" and the directive (currently unused)
  #
  # Raises ArgumentError for an unknown directive, or for "i" without a
  # subdirective.
  def process_directive(directive_char, subdirective, condition)
    directive = DIRECTIVES[directive_char]
    raise ArgumentError, "unknown format directive: %#{directive_char}" unless directive

    case directive_char
    when 'i'
      unless subdirective
        raise ArgumentError, 'the %i directive needs a header name, e.g. %{Referer}i'
      end

      # "{User-agent}" => :user_agent
      log_format = subdirective[1...-1].downcase.tr('-', '_').to_sym
      [log_format, directive[1].source]
    else
      [directive[0], directive[1].source]
    end
  end
end
81
+
82
# Parses web server access logs line by line, detecting the format of every
# line from LOG_FORMATS, and turns each line into a hash keyed by the
# env-style names in STAT_ENV_MAP.
class LogParser

  LOG_FORMATS = {
    :common => '%h %l %u %t \"%r\" %>s %b',
    :common_with_virtual => '%v %h %l %u %t \"%r\" %>s %b',
    :combined => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :combined_with_virtual => '%v %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"',
    :combined_with_cookies => '%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%{Cookies}i\"',
  }

  # Add any values that you may return here.
  STAT_ENV_MAP = {
    :referer => "HTTP_REFERER",
    :user_agent => "HTTP_USER_AGENT",
    :ip => "REMOTE_ADDR",
    :page => "PATH_INFO",
    :domain => "HTTP_HOST",
    :datetime => 'DATETIME',
    :status => 'STATUS'
  }

  attr_reader :known_formats

  # Use the ActiveRecord logger when ActiveRecord is loaded; otherwise fall
  # back to a stderr Logger that only reports warnings and above, so the
  # class can be loaded outside of Rails.
  @@log = if defined?(ActiveRecord::Base)
    ActiveRecord::Base.logger
  else
    fallback = Logger.new($stderr)
    fallback.level = Logger::WARN
    fallback
  end

  def initialize
    @log_format = []
    initialize_known_formats
  end

  # Processes the format strings into symbols and test regexes
  # and saves them using the LogFormat class.
  def initialize_known_formats
    @known_formats = {}
    LOG_FORMATS.each do |name, format|
      @known_formats[name] = LogFormat.new(name, format)
    end
  end

  # Checks which standard the log file (well, one line) is.
  # Automatically checks for most complex (longest) regex first.
  #
  # Returns the format key, or :unknown if nothing matches.
  def check_format(line)
    by_length = @known_formats.sort_by { |_, candidate| candidate.format_regex.source.size }

    by_length.reverse.each { |key, candidate|
      return key if line.match(candidate.format_regex)
    }

    :unknown
  end

  # This is the end-to-end business logic of the class.
  #
  # Call with a block that will be called with each line, as a hash (see
  # generate_stats). Without a block an Enumerator over those hashes is
  # returned. Errors raised by parse_line propagate to the caller.
  def parse_io_stream(stream)
    return enum_for(:parse_io_stream, stream) unless block_given?

    stream.each do |line|
      yield generate_stats(parse_line(line))
    end
  end

  # generate_stats populates a stats hash one line at a time.
  # Add extra fields into the STAT_ENV_MAP hash at the top of this file.
  def generate_stats(raw_data)
    stats = { "PATH_INFO" => get_page(raw_data[:request]) }
    STAT_ENV_MAP.each do |stat_name, env_name|
      stats[env_name] = raw_data[stat_name] if raw_data.has_key?(stat_name)
    end
    stats
  end

  # Extracts the path from a request string such as "GET /index.html HTTP/1.0".
  # Also handles requests where nothing follows the path; falls back to the
  # whole (stripped) request when it contains no "/".
  def get_page(request)
    # The logger may be nil (e.g. ActiveRecord without a configured logger).
    @@log.debug "get_page: #{request}" if @@log
    (request[/\/.*?(?:\s|\z)/] || request).strip
  end

  # Parses a single line into a hash keyed by the detected format's symbols,
  # plus :format and (if absent) :domain.
  #
  # Raises ArgumentError if the line matches none of the known formats.
  def parse_line(line)
    @format = check_format(line)
    log_format = @known_formats[@format]

    match = log_format && log_format.format_regex.match(line)
    raise ArgumentError, "unrecognized log line: #{line.inspect}" unless match

    parsed_data = {}
    log_format.format_symbols.zip(match.captures) do |format_symbol, value|
      parsed_data[format_symbol] = value
    end

    # Remove [] from time if present.
    parsed_data[:datetime] = parsed_data[:datetime][1...-1] if parsed_data[:datetime]
    # Add ip as domain if we don't have a domain (virtual host).
    # Assumes we always have an ip.
    parsed_data[:domain] = parsed_data[:ip] unless parsed_data[:domain]
    parsed_data[:format] = @format
    parsed_data
  end
end
@@ -0,0 +1,27 @@
1
module Log2COUNTER

  # Version number of log2counter, split into its components.
  module Version

    MAJOR = 0
    MINOR = 0
    TINY  = 3

    # The version components as [MAJOR, MINOR, TINY].
    def self.to_a
      [MAJOR, MINOR, TINY]
    end

    # The version components joined with dots.
    def self.to_s
      to_a.join('.')
    end

  end

  VERSION = Version.to_s

end
@@ -0,0 +1,53 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # log2counter -- Convert (analyse) Apache log files to COUNTER CSV. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ module Log2COUNTER
30
+ end
31
+
32
+ require 'log2counter/parser'
33
+ require 'log2counter/printer'
34
+
35
+ require 'log2counter/version'
36
+
37
module Log2COUNTER

  extend self

  # Forwards all arguments to Parser.load and returns its result.
  def load(*args)
    Parser.load(*args)
  end

  # Builds a Parser from the given arguments and returns the result of
  # calling parse on it.
  def parse(*args)
    parser = Parser.new(*args)
    parser.parse
  end

  # Builds a Printer from the remaining arguments and has it print +stats+.
  def print(stats, *args)
    printer = Printer.new(*args)
    printer.print(stats)
  end

end
@@ -0,0 +1,8 @@
1
+ ---
2
+ #"Somewhere, Inst.":
3
+ # :name: "Inst_Somewhere"
4
+ # :ip:
5
+ # - "12.34.56"
6
+ # :id:
7
+ # - "inst.somewhere"
8
+ # :export: true
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log2counter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Jens Wille
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-23 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fastercsv
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.3
24
+ version:
25
+ description: Convert (analyse) Apache log files to COUNTER CSV.
26
+ email: jens.wille@uni-koeln.de
27
+ executables:
28
+ - log2counter
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - COPYING
33
+ - ChangeLog
34
+ - README
35
+ files:
36
+ - lib/log2counter.rb
37
+ - lib/log2counter/vendor/log_parser.rb
38
+ - lib/log2counter/printer.rb
39
+ - lib/log2counter/core_ext/compare_strings_and_fixnums.rb
40
+ - lib/log2counter/core_ext/sort_by_ip_or_host.rb
41
+ - lib/log2counter/version.rb
42
+ - lib/log2counter/parser.rb
43
+ - bin/log2counter
44
+ - Rakefile
45
+ - COPYING
46
+ - ChangeLog
47
+ - README
48
+ - sample/licensees.yaml
49
+ - lib/log2counter/vendor/log_parser.rb.orig
50
+ has_rdoc: true
51
+ homepage: http://prometheus.rubyforge.org/log2counter
52
+ licenses: []
53
+
54
+ post_install_message:
55
+ rdoc_options:
56
+ - --charset
57
+ - UTF-8
58
+ - --main
59
+ - README
60
+ - --line-numbers
61
+ - --inline-source
62
+ - --all
63
+ - --title
64
+ - log2counter Application documentation
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: "0"
78
+ version:
79
+ requirements: []
80
+
81
+ rubyforge_project: prometheus
82
+ rubygems_version: 1.3.5
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Convert (analyse) Apache log files to COUNTER CSV.
86
+ test_files: []
87
+