log2counter 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ = log2counter - Convert (analyse) Apache log files to COUNTER CSV.
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to log2counter version 0.0.3
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ Convert (analyse) Apache log files to COUNTER[http://www.projectcounter.org/] CSV.
11
+
12
+
13
+ == AUTHORS
14
+
15
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
+
17
+
18
+ == LICENSE AND COPYRIGHT
19
+
20
+ Copyright (C) 2007-2009 University of Cologne,
21
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
22
+
23
+ log2counter is free software: you can redistribute it and/or modify it under
24
+ the terms of the GNU General Public License as published by the Free Software
25
+ Foundation, either version 3 of the License, or (at your option) any later
26
+ version.
27
+
28
+ log2counter is distributed in the hope that it will be useful, but WITHOUT
29
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
31
+
32
+ You should have received a copy of the GNU General Public License along with
33
+ log2counter. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
# Load the version constant relative to this Rakefile. A bare
# <tt>require 'lib/...'</tt> only works while "." is part of $LOAD_PATH,
# which is no longer the case since Ruby 1.9.2.
require File.expand_path('lib/log2counter/version', File.dirname(__FILE__))

begin
  require 'hen'

  # Hen expects a block that returns the project configuration hash.
  Hen.lay! {{
    :rubyforge => {
      :project => %q{prometheus},
      :package => %q{log2counter}
    },

    :gem => {
      :version      => Log2COUNTER::VERSION,
      :summary      => %q{Convert (analyse) Apache log files to COUNTER CSV.},
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'sample/*', 'lib/**/vendor/*'].to_a,
      :dependencies => [['fastercsv', '>= 1.2.3']]
    }
  }}
rescue LoadError
  abort "Please install the 'hen' gem first."
end
23
+
24
+ ### Place your custom Rake tasks here.
data/bin/log2counter ADDED
@@ -0,0 +1,143 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # log2counter -- Convert (analyse) Apache log files to COUNTER CSV. #
7
+ # #
8
+ # Copyright (C) 2007-2009 University of Cologne, #
9
+ # Albertus-Magnus-Platz, #
10
+ # 50932 Cologne, Germany #
11
+ # #
12
+ # Authors: #
13
+ # Jens Wille <jens.wille@uni-koeln.de> #
14
+ # #
15
+ # log2counter is free software; you can redistribute it and/or modify it #
16
+ # under the terms of the GNU General Public License as published by the Free #
17
+ # Software Foundation; either version 3 of the License, or (at your option) #
18
+ # any later version. #
19
+ # #
20
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
21
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
22
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
23
+ # more details. #
24
+ # #
25
+ # You should have received a copy of the GNU General Public License along #
26
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
27
+ # #
28
+ ###############################################################################
29
+ #++
30
+
31
+ require 'optparse'
32
+ require 'yaml'
33
+ require 'zlib'
34
+
35
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
36
+
37
+ require 'log2counter'
38
+
39
# Usage banner; also printed when the script is called without any arguments.
USAGE = "Usage: #{$PROGRAM_NAME} [-h|--help] [options]"

# Nothing to do without arguments: print the banner to STDERR and exit non-zero.
abort(USAGE) if ARGV.empty?
41
+
42
# Open +file+ and return the IO-like object, transparently handling Gzip.
#
# A file is treated as Gzipped only if its name *ends* in '.gz' (case
# insensitive), as promised by the help text. A '.gz' somewhere in the middle
# of the path (e.g. 'archive.gz.txt' or '/logs.gz.d/access.log') does not count.
#
# file:: Path of the file to open.
# mode:: 'r' or 'w' for Gzipped files; any File.open mode for plain files.
#
# Raises ArgumentError if a Gzipped file is requested with a mode other than
# 'r' or 'w'. The caller is responsible for closing the returned object.
def open_file(file, mode = 'r')
  return File.open(file, mode) unless file =~ /\.gz\z/i

  case mode
  when 'r' then Zlib::GzipReader.open(file)
  when 'w' then Zlib::GzipWriter.open(file)
  else raise ArgumentError, "invalid mode #{mode}"
  end
end
53
+
54
# Default settings; each entry can be overridden from the command line.
options = {
  :licensees      => 'licensees.yaml',
  :log_file       => STDIN,
  :csv_file       => STDOUT,
  :months         => nil,
  :regexp         => nil,
  :summarize      => false,
  :summarize_file => nil
}

parser = OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ' '
  opts.separator 'Options:'

  opts.on('-i', '--log-file LOG', 'Input log file [Default: STDIN]') { |f|
    abort "Can't find log file #{f}" unless File.readable?(f)
    options[:log_file] = open_file(f)
  }

  opts.on('-o', '--csv-file CSV', 'Output CSV file [Default: STDOUT]') { |f|
    options[:csv_file] = open_file(f, 'w')
  }

  opts.separator ' '

  opts.on('-l', '--licensees YAML', "Licensees list [Default: #{options[:licensees]}]") { |f|
    abort "Can't find licensee file #{f}" unless File.readable?(f)
    options[:licensees] = f
  }

  opts.separator ' '

  opts.on('-m', '--months LIST', 'Comma-separated list of month(s) [YYYY_MM]') { |m|
    options[:months] = m.split(',')
  }

  opts.on('-r', '--regexp REGEXP', 'Comma-separated list of regular expressions', '[id=...,login=...,search=...,download=...]') { |r|
    r.split(',').each { |str|
      key, pat = str.split('=', 2)

      # Fail with a readable message instead of a NoMethodError/TypeError
      # backtrace when a pair is not of the form KEY=PATTERN.
      if key.to_s.empty? || pat.nil?
        abort "Illegal regexp specification #{str.inspect}; must be KEY=PATTERN"
      end

      begin
        (options[:regexp] ||= {})[key.to_sym] = Regexp.new(pat)
      rescue RegexpError => err
        abort "Illegal regular expression for #{key}: #{err.message}"
      end
    }
  }

  opts.separator ' '

  opts.on('-s', '--summarize', "Summarize stats per licensee per month; don't", 'print client details') {
    options[:summarize] = true
  }

  opts.on('-S', '--summarize-file CSV', 'Summarize existing CSV file (in full format);', "use '-' for STDIN") { |f|
    options[:summarize] = true

    if f == '-'
      options[:summarize_file] = STDIN
    else
      abort "Can't find CSV file #{f}" unless File.readable?(f)
      options[:summarize_file] = open_file(f)
    end
  }

  opts.separator ' '
  opts.separator 'Generic options:'

  opts.on('-h', '--help', 'Print this help message and exit') {
    puts opts
    exit
  }

  opts.on('--version', 'Print program version and exit') {
    puts "#{File.basename($0)} v#{Log2COUNTER::VERSION}"
    exit
  }

  opts.separator ' '
  opts.separator "If LOG or CSV ends in '.gz', it's treated as a Gzipped file."
}

# Report unknown options and missing arguments as a short message rather
# than an uncaught exception with a backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => err
  abort "#{err.message}\n#{parser.banner}"
end

begin
  stats = if options[:summarize_file]
    Log2COUNTER.load(options[:summarize_file])
  else
    # The default licensee file is never checked by the '-l' handler, so
    # verify it here before handing it to YAML.
    unless File.readable?(options[:licensees])
      abort "Can't find licensee file #{options[:licensees]}"
    end

    Log2COUNTER.parse(options[:log_file], YAML.load_file(options[:licensees]), options[:months], options[:regexp])
  end

  Log2COUNTER.print(stats, options[:csv_file], options[:summarize])
ensure
  # Close every stream we hold (this also finishes Gzip output). The same
  # stream may be stored under two keys (e.g. STDIN with '-S -'), so skip
  # anything that is already closed.
  options.each_value { |v|
    next unless v.respond_to?(:close)
    next if v.respond_to?(:closed?) && v.closed?

    v.close
  }
end
@@ -0,0 +1,56 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
# Allow comparing strings and integers; integers always sort before strings.
#
# Fixnum and Bignum were unified into Integer in Ruby 2.4 and the +Fixnum+
# constant was removed in Ruby 3.2, so the integer classes are looked up from
# literal values instead of being referenced by name. On old Rubies this
# yields Fixnum and Bignum, on current ones just Integer.
#
# Both patches are guarded so that loading this file twice (e.g. through two
# different load paths) does not alias the patched method onto itself, which
# would recurse endlessly.

class String

  unless method_defined?(:_log2counter_original_cmp)
    alias_method :_log2counter_original_cmp, :<=>

    # Returns 1 when +other+ is an integer; otherwise behaves like the
    # original String#<=>.
    def <=>(other)
      case other
        when Integer then 1  # Integers always sort before us.
        else _log2counter_original_cmp(other)
      end
    end
  end

end

[0.class, (2 ** 64).class].uniq.each { |integer_class|
  integer_class.class_eval {
    unless method_defined?(:_log2counter_original_cmp)
      alias_method :_log2counter_original_cmp, :<=>

      # Returns -1 when +other+ is a string; otherwise behaves like the
      # original integer <=>.
      def <=>(other)
        case other
          when String then -1  # Strings always sort after us.
          else _log2counter_original_cmp(other)
        end
      end
    end
  }
}
@@ -0,0 +1,43 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # We need strings and fixnums to be comparable.
30
+ require 'log2counter/core_ext/compare_strings_and_fixnums'
31
+
32
class Hash

  # Sort the entries by key, treating each key as a dotted IP address or
  # host name. Returns an array of <tt>[key, value]</tt> pairs.
  #
  # Keys are split on '.'; purely numeric parts are compared numerically and
  # sort before non-numeric parts, which are compared as strings. A +nil+ (or
  # +false+) key gets an empty sort key and therefore sorts first.
  #
  # Each part is wrapped in a <tt>[rank, value]</tt> pair (rank 0 for numbers,
  # 1 for strings) so that numbers and strings are never compared directly.
  # This produces the same ordering as comparing the raw parts with a
  # "numbers before strings" <=>, but does not depend on core classes being
  # patched to allow that comparison.
  def sort_by_ip_or_host
    sort_by { |key, _|
      key ? key.split('.').map { |part|
        part =~ /\A\d+\z/ ? [0, part.to_i] : [1, part]
      } : []
    }
  end

end
@@ -0,0 +1,213 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'resolv'
30
+ require 'uri'
31
+
32
+ # Get us some help for the hard part... -- um, it's not that hard, is it? ;-)
33
+ require 'log2counter/vendor/log_parser'
34
+
35
# Date::ABBR_MONTHNAMES is needed below; load it explicitly rather than
# relying on some other library having required 'date' already.
require 'date'

# Parses an Apache log file into per-month, per-licensee usage statistics.
class Log2COUNTER::Parser

  # Map month abbreviations to their two-digit number equivalent.
  ABBR2MONTH = {}
  Date::ABBR_MONTHNAMES.each_with_index { |abbr, index|
    ABBR2MONTH[abbr] = '%02d' % index
  }

  year = Time.now.year

  # By default we will consider the current year.
  DEFAULT_MONTHS = (1..12).map { |month| '%d_%02d' % [year, month] }

  # This is what we start with -- all zero. Always hand out a copy (+dup+),
  # never the constant itself.
  DEFAULT_STATS = {
    :sessions  => 0,
    :searches  => 0,
    :downloads => 0
  }

  # NOTE: <tt>:id</tt> should contain capture group for ID
  DEFAULT_REGEXP = {
    :id       => //,
    :login    => //,
    :search   => //,
    :download => //
  }

  class << self

    # Rebuild a stats hash (same layout as returned by #parse) from a CSV
    # file in full format. +csv_file+ is passed to FasterCSV.new as is; the
    # first row is treated as the header.
    def load(csv_file)
      FasterCSV.new(csv_file, :headers => true).inject({}) { |stats, row|
        month, licensee, name, address, sessions, searches, downloads = row.fields

        (((stats[month] ||= {})[licensee] ||= {})[name] ||= {})[address] = {
          :sessions  => sessions.to_i,
          :searches  => searches.to_i,
          :downloads => downloads.to_i
        }

        stats
      }
    end

  end

  attr_reader :log_file, :licensees, :months, :licensees_by_ip, :licensees_by_id, :regexp, :constraint

  # log_file::  The log source, handed to LogParser#parse_io_stream.
  # licensees:: Hash of licensee => settings; entries without a truthy
  #             <tt>:export</tt> setting are ignored.
  # months::    List of months to consider, each in the form 'YYYY_MM'
  #             (default: all months of the current year).
  # regexp::    Hash overriding entries of DEFAULT_REGEXP.
  #
  # Raises ArgumentError if any month is not of the form 'YYYY_MM'.
  def initialize(log_file, licensees, months = nil, regexp = nil)
    @log_file = log_file

    @months = months || DEFAULT_MONTHS
    raise ArgumentError, "illegal format for month; must be YYYY_MM" if @months.any? { |month|
      month !~ /\A\d\d\d\d_\d\d\z/
    }

    @regexp     = DEFAULT_REGEXP.merge(regexp || {})
    @constraint = Regexp.union(*@regexp.values)

    @licensees = licensees.reject { |_, hash| !hash[:export] }
    initialize_licensees
  end

  # Now here's the method you want to call. Returns a hash:
  #
  #   stats = {
  #     '2007_06' => {
  #       'Somewhere, Inst.' => {
  #         'Some name' => {
  #           '12.34.56.78' => {
  #             :sessions  => 12,
  #             :searches  => 34,
  #             :downloads => 56
  #           },
  #           ...
  #         },
  #         ...
  #       },
  #       ...
  #     },
  #     ...
  #   }
  #
  # The levels are: month, licensee, the licensee's <tt>:name</tt> setting,
  # client address. Every requested month/licensee combination is present;
  # those without any sessions get a single entry with a +nil+ address and
  # all-zero counts.
  def parse
    # Cache resolved host names.
    addr2addr = Hash.new { |hash, addr|
      hash[addr] = begin
        Resolv.getaddress(addr)
      rescue Resolv::ResolvError
        addr
      end
    }

    # Cache licensees.
    addr2lcee = Hash.new { |hash, addr|
      hash[addr] = licensees_by_ip.get(addr)
    }

    # Our result hash
    stats = {}

    # Create a new LogParser and send our log file. Yields a hash per line.
    LogParser.new(:minimal, constraint).parse_io_stream(log_file) { |stat|
      path = stat['PATH_INFO']

      # Skip lines that don't have any useful information for us anyway.
      next unless path =~ constraint

      # Maybe we already captured the licensee ID? (see DEFAULT_REGEXP above)
      id = $1

      m, y  = stat['DATETIME'][/\/(.*?):/, 1].split('/')  # Extract month and year
      month = [y, ABBR2MONTH[m]].join('_')                # Target format is 'YYYY_MM'

      # Skip lines that fall out of the range we're interested in.
      next unless months.include?(month)

      address  = addr2addr[stat['REMOTE_ADDR']]
      licensee = addr2lcee[address] || licensees_by_id[
        unescape(id || path[regexp[:id], 1] || '')
      ]

      # Couldn't find a matching licensee? Skip it!
      next unless licensee

      name     = licensee[:name]
      licensee = licensee[:licensee]

      (((stats[month] ||= {})[licensee] ||= {})[name] ||= {})[address] ||= DEFAULT_STATS.dup
      _address = stats[month][licensee][name][address]

      # Increment our counts, since that's what we're here for...
      _address[:sessions]  += 1 if path =~ regexp[:login]
      _address[:searches]  += 1 if path =~ regexp[:search]
      _address[:downloads] += 1 if path =~ regexp[:download]
    }

    # Now we need to fill in any months and licensees we didn't come across before.
    months.each { |month|
      stats[month] ||= {}

      licensees.each { |licensee, hash|
        stats[month][licensee] ||= {}
        addresses = stats[month][licensee][hash[:name]]

        if addresses
          # Drop entries with zero sessions -- how come they occur, anyway?
          addresses.delete_if { |_, stat| stat[:sessions].zero? }
        end

        # Add a default "empty" entry for completeness' sake. Use a copy so
        # that the placeholder entries don't all share (and possibly modify)
        # the DEFAULT_STATS constant.
        if addresses.nil? || addresses.empty?
          stats[month][licensee][hash[:name]] = { nil => DEFAULT_STATS.dup }
        end
      }
    }

    # That's it, return what we've got.
    stats
  end

  protected

  # Create additional mappings for our licensees.
  def initialize_licensees
    @licensees_by_ip = {}
    @licensees_by_id = {}

    licensees.each { |licensee, hash|
      _hash = { :licensee => licensee, :name => hash[:name] }

      # Array() tolerates a missing (nil) or scalar setting.
      Array(hash[:ip]).each { |ip| licensees_by_ip[ip] = _hash }
      Array(hash[:id]).each { |id| licensees_by_id[id] = _hash }
    }

    # Convenience method to get a licensee from an address. Note that
    # +licensees_by_ip+ usually has subnets instead of full IPs, so a key
    # matches if it is a prefix of +ip+. Returns +nil+ if nothing matches.
    def licensees_by_ip.get(ip)
      find(lambda { [] }) { |key, _| ip[0, key.length] == key }.last
    end
  end

  # Decode percent-escapes in +str+. URI.decode was removed in Ruby 3.0;
  # fall back to it only where URI::DEFAULT_PARSER is not available.
  def unescape(str)
    if defined?(URI::DEFAULT_PARSER)
      URI::DEFAULT_PARSER.unescape(str)
    else
      URI.decode(str)
    end
  end

end
@@ -0,0 +1,100 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'rubygems'
30
+ require 'fastercsv' # >= 1.2.3 for Gzip support
31
+
32
+ # We'd like to sort hashes by keys of IPs and host names.
33
+ require 'log2counter/core_ext/sort_by_ip_or_host'
34
+
35
# Writes a stats hash (month => licensee => name => address => counts) as CSV.
class Log2COUNTER::Printer

  # Header row of the generated CSV.
  COLUMNS = %w[
    Jahr_Monat
    Konsortial-Mitglied
    Nutzer
    Einzelkunden-Subidentifier
    Sessions
    Searches
    Downloads
  ]

  attr_reader :csv, :summarize

  # csv_file::  Output target, handed to FasterCSV.new as is.
  # summarize:: If true, print one totals row per licensee name instead of
  #             one row per client address.
  def initialize(csv_file, summarize = false)
    @csv = FasterCSV.new(csv_file)

    @summarize = summarize
  end

  # Print the header followed by all of +stats+, sorted by month, licensee
  # and name, then flush the CSV.
  def print(stats)
    counters = [:sessions, :searches, :downloads]

    # Output CSV header.
    csv << COLUMNS

    stats.sort.each { |month, licensees|
      licensees.sort.each { |licensee, names|
        names.sort.each { |name, addresses|
          if summarize
            # Add up the counts of all addresses into a single row; the
            # address column stays empty.
            total = Log2COUNTER::Parser::DEFAULT_STATS.dup

            addresses.each_value { |stat|
              counters.each { |counter| total[counter] += stat[counter] }
            }

            csv << [month, licensee, name, nil, *total.values_at(*counters)]
          else
            # One row per address, in IP/host name order.
            addresses.sort_by_ip_or_host.each { |address, stat|
              csv << [month, licensee, name, address, *stat.values_at(*counters)]
            }
          end
        }
      }
    }

    csv.flush
  end

end