log2counter 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ = log2counter - Convert (analyse) Apache log files to COUNTER CSV.
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to log2counter version 0.0.3
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ Convert (analyse) Apache log files to COUNTER[http://www.projectcounter.org/] CSV.
11
+
12
+
13
+ == AUTHORS
14
+
15
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
+
17
+
18
+ == LICENSE AND COPYRIGHT
19
+
20
+ Copyright (C) 2007-2009 University of Cologne,
21
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
22
+
23
+ log2counter is free software: you can redistribute it and/or modify it under
24
+ the terms of the GNU General Public License as published by the Free Software
25
+ Foundation, either version 3 of the License, or (at your option) any later
26
+ version.
27
+
28
+ log2counter is distributed in the hope that it will be useful, but WITHOUT
29
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
31
+
32
+ You should have received a copy of the GNU General Public License along with
33
+ log2counter. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
# Load the version constant relative to this Rakefile. A bare
# 'lib/log2counter/version' only resolves when the current directory is on
# the load path, which is no longer the case as of Ruby 1.9.2.
require File.expand_path('lib/log2counter/version', File.dirname(__FILE__))

begin
  require 'hen'

  # Project and gem settings handed to Hen.
  Hen.lay! {{
    :rubyforge => {
      :project => %q{prometheus},
      :package => %q{log2counter}
    },

    :gem => {
      :version      => Log2COUNTER::VERSION,
      :summary      => %q{Convert (analyse) Apache log files to COUNTER CSV.},
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'sample/*', 'lib/**/vendor/*'].to_a,
      :dependencies => [['fastercsv', '>= 1.2.3']]
    }
  }}
rescue LoadError
  # Raised by <tt>require 'hen'</tt> when the gem is not installed.
  abort "Please install the 'hen' gem first."
end
23
+
24
+ ### Place your custom Rake tasks here.
data/bin/log2counter ADDED
@@ -0,0 +1,143 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # log2counter -- Convert (analyse) Apache log files to COUNTER CSV. #
7
+ # #
8
+ # Copyright (C) 2007-2009 University of Cologne, #
9
+ # Albertus-Magnus-Platz, #
10
+ # 50932 Cologne, Germany #
11
+ # #
12
+ # Authors: #
13
+ # Jens Wille <jens.wille@uni-koeln.de> #
14
+ # #
15
+ # log2counter is free software; you can redistribute it and/or modify it #
16
+ # under the terms of the GNU General Public License as published by the Free #
17
+ # Software Foundation; either version 3 of the License, or (at your option) #
18
+ # any later version. #
19
+ # #
20
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
21
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
22
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
23
+ # more details. #
24
+ # #
25
+ # You should have received a copy of the GNU General Public License along #
26
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
27
+ # #
28
+ ###############################################################################
29
+ #++
30
+
31
+ require 'optparse'
32
+ require 'yaml'
33
+ require 'zlib'
34
+
35
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
36
+
37
+ require 'log2counter'
38
+
39
+ USAGE = "Usage: #{$0} [-h|--help] [options]"
40
+ abort USAGE if ARGV.empty?
41
+
42
# Open +file+ and return the IO-like object, transparently handling Gzip.
#
# A file is treated as Gzipped only if its name *ends* in '.gz'
# (case-insensitive), as promised by the help text. Names that merely contain
# '.gz' somewhere (e.g. 'access.gz.log') are opened as plain files.
#
# file:: path of the file to open
# mode:: 'r' or 'w' for Gzipped files; any File.open mode for plain files
#
# Raises ArgumentError if a Gzipped file is requested with a mode other than
# 'r' or 'w'.
def open_file(file, mode = 'r')
  return File.open(file, mode) unless file =~ /\.gz\z/i

  case mode
    when 'r' then Zlib::GzipReader.open(file)
    when 'w' then Zlib::GzipWriter.open(file)
    else raise ArgumentError, "invalid mode #{mode}"
  end
end
53
+
54
# Default settings; each of them can be overridden on the command line.
options = {
  :licensees      => 'licensees.yaml',
  :log_file       => STDIN,
  :csv_file       => STDOUT,
  :months         => nil,
  :regexp         => nil,
  :summarize      => false,
  :summarize_file => nil
}

OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ' '
  opts.separator 'Options:'

  opts.on('-i', '--log-file LOG', 'Input log file [Default: STDIN]') { |f|
    abort "Can't find log file #{f}" unless File.readable?(f)
    options[:log_file] = open_file(f)
  }

  opts.on('-o', '--csv-file CSV', 'Output CSV file [Default: STDOUT]') { |f|
    options[:csv_file] = open_file(f, 'w')
  }

  opts.separator ' '

  opts.on('-l', '--licensees YAML', "Licensees list [Default: #{options[:licensees]}]") { |f|
    abort "Can't find licensee file #{f}" unless File.readable?(f)
    options[:licensees] = f
  }

  opts.separator ' '

  opts.on('-m', '--months LIST', 'Comma-separated list of month(s) [YYYY_MM]') { |m|
    options[:months] = m.split(',')
  }

  opts.on('-r', '--regexp REGEXP', 'Comma-separated list of regular expressions', '[id=...,login=...,search=...,download=...]') { |r|
    r.split(',').each { |str|
      key, pat = str.split('=', 2)

      # Report malformed specifications (missing key or missing '=PATTERN')
      # as a usage error instead of crashing with a backtrace.
      if key.nil? || key.empty? || pat.nil?
        abort "Invalid regexp specification #{str.inspect}; expected KEY=PATTERN"
      end

      begin
        (options[:regexp] ||= {})[key.to_sym] = Regexp.new(pat)
      rescue RegexpError => err
        abort "Invalid regular expression for #{key}: #{err.message}"
      end
    }
  }

  opts.separator ' '

  opts.on('-s', '--summarize', "Summarize stats per licensee per month; don't", 'print client details') {
    options[:summarize] = true
  }

  opts.on('-S', '--summarize-file CSV', 'Summarize existing CSV file (in full format);', "use '-' for STDIN") { |f|
    options[:summarize] = true

    if f == '-'
      options[:summarize_file] = STDIN
    else
      abort "Can't find CSV file #{f}" unless File.readable?(f)
      options[:summarize_file] = open_file(f)
    end
  }

  opts.separator ' '
  opts.separator 'Generic options:'

  opts.on('-h', '--help', 'Print this help message and exit') {
    puts opts
    exit
  }

  opts.on('--version', 'Print program version and exit') {
    puts "#{File.basename($0)} v#{Log2COUNTER::VERSION}"
    exit
  }

  opts.separator ' '
  opts.separator "If LOG or CSV ends in '.gz', it's treated as a Gzipped file."
}.parse!

begin
  # Either re-read an existing full-format CSV file, or analyse the log file.
  stats = if options[:summarize_file]
    Log2COUNTER.load(options[:summarize_file])
  else
    licensees = options[:licensees]

    # The default licensees file is never checked by the '-l' handler, so
    # verify it here to fail with the same message rather than a backtrace.
    abort "Can't find licensee file #{licensees}" unless File.readable?(licensees)

    Log2COUNTER.parse(options[:log_file], YAML.load_file(licensees), options[:months], options[:regexp])
  end

  Log2COUNTER.print(stats, options[:csv_file], options[:summarize])
ensure
  # Close every stream we hold (this includes STDIN/STDOUT when left at their
  # defaults); plain option values don't respond to +close+ and are skipped.
  options.each { |k, v| v.close if v.respond_to?(:close) }
end
@@ -0,0 +1,56 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # Allow strings and fixnums to be compared with each other; fixnums always sort before strings.
30
+
31
class String

  # Keep the built-in comparison reachable under a private-ish name. Only
  # alias once: aliasing a second time (e.g. if this file gets loaded twice)
  # would make the alias point at our own override and recurse endlessly.
  unless method_defined?(:_log2counter_original_cmp)
    alias_method :_log2counter_original_cmp, :<=>
  end

  # Compare with +other+. A string is always greater than any integer;
  # everything else is delegated to the original String#<=>.
  #
  # Integer is used rather than Fixnum, since Fixnum has been deprecated in
  # Ruby 2.4 and removed in Ruby 3.2 (Fixnum always was a kind of Integer).
  def <=>(other)
    case other
      when Integer then 1  # Integers always sort before us.
      else _log2counter_original_cmp(other)
    end
  end

end
43
+
44
+
45
# Reopen the class of small integers without naming it: +1.class+ is Fixnum
# on old Rubies and Integer as of Ruby 2.4. Referring to Fixnum directly is
# deprecated since 2.4, and on 3.2+ (where the constant is gone) a literal
# "class Fixnum" would merely create a new, unrelated class.
1.class.class_eval do

  # Keep the built-in comparison reachable. Only alias once: aliasing a
  # second time (e.g. if this file gets loaded twice) would make the alias
  # point at our own override and recurse endlessly.
  unless method_defined?(:_log2counter_original_cmp)
    alias_method :_log2counter_original_cmp, :<=>
  end

  # Compare with +other+. An integer is always less than any string;
  # everything else is delegated to the original <tt><=></tt>.
  def <=>(other)
    case other
      when String then -1  # Strings always sort after us.
      else _log2counter_original_cmp(other)
    end
  end

end
@@ -0,0 +1,43 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # We need strings and fixnums to be comparable.
30
+ require 'log2counter/core_ext/compare_strings_and_fixnums'
31
+
32
class Hash

  # Sort IP addresses numerically by net part, and host names just as usual.
  # Returns an array of <tt>[key, value]</tt> pairs.
  #
  # Each key is split at the dots; every part is turned into a tagged pair:
  # <tt>[0, number]</tt> for purely numeric parts, <tt>[1, string]</tt> for
  # anything else. The tag makes numeric parts sort before textual ones and
  # guarantees that only values of the same kind are ever compared with each
  # other, so sorting does not depend on integers and strings being mutually
  # comparable. Falsy keys (+nil+) get an empty sort key and thus come first.
  def sort_by_ip_or_host
    sort_by { |key, _|
      key ? key.split('.').map { |part|
        part =~ /\A\d+\z/ ? [0, part.to_i] : [1, part]
      } : []
    }
  end

end
@@ -0,0 +1,213 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'resolv'
30
+ require 'uri'
31
+
32
+ # Get us some help for the hard part... -- um, it's not that hard, is it? ;-)
33
+ require 'log2counter/vendor/log_parser'
34
+
35
class Log2COUNTER::Parser

  # Map month abbreviations to their two-digit number equivalent.
  ABBR2MONTH = {}
  Date::ABBR_MONTHNAMES.each_with_index { |abbr, index|
    ABBR2MONTH[abbr] = '%02d' % index
  }

  year = Time.now.year

  # By default we will consider the current year (as of load time).
  DEFAULT_MONTHS = (1..12).map { |month| '%d_%02d' % [year, month] }

  # This is what we start with -- all zero. Always hand out a copy (+dup+),
  # never the constant itself.
  DEFAULT_STATS = {
    :sessions  => 0,
    :searches  => 0,
    :downloads => 0
  }

  # NOTE: <tt>:id</tt> should contain a capture group for the licensee ID.
  DEFAULT_REGEXP = {
    :id       => //,
    :login    => //,
    :search   => //,
    :download => //
  }

  class << self

    # Read previously printed statistics back from +csv_file+ (expected to
    # have a header row). The fields of each row are taken, in order, as
    # month, licensee, name, address, sessions, searches and downloads.
    # Returns a hash of the same shape as the one returned by #parse.
    def load(csv_file)
      FasterCSV.new(csv_file, :headers => true).inject({}) { |stats, row|
        month, licensee, name, address, sessions, searches, downloads = row.fields

        (((stats[month] ||= {})[licensee] ||= {})[name] ||= {})[address] = {
          :sessions  => sessions.to_i,
          :searches  => searches.to_i,
          :downloads => downloads.to_i
        }

        stats
      }
    end

  end

  attr_reader :log_file, :licensees, :months, :licensees_by_ip, :licensees_by_id, :regexp, :constraint

  # log_file::  IO to read the log from
  # licensees:: hash of licensee => settings; only entries whose settings
  #             have a truthy <tt>:export</tt> are considered
  # months::    list of months in 'YYYY_MM' format [Default: DEFAULT_MONTHS]
  # regexp::    hash of patterns overriding DEFAULT_REGEXP
  #
  # Raises ArgumentError if any month is not of the form 'YYYY_MM'.
  def initialize(log_file, licensees, months = nil, regexp = nil)
    @log_file = log_file

    @months = months || DEFAULT_MONTHS
    raise ArgumentError, "illegal format for month; must be YYYY_MM" if @months.any? { |month|
      month !~ /\A\d\d\d\d_\d\d\z/
    }

    @regexp     = DEFAULT_REGEXP.merge(regexp || {})
    @constraint = Regexp.union(*@regexp.values)

    @licensees = licensees.reject { |_, hash| !hash[:export] }
    initialize_licensees
  end

  # Now here's the method you want to call. Returns a hash:
  #
  #   stats = {
  #     '2007_06' => {
  #       'Somewhere, Inst.' => {
  #         'Some name' => {
  #           '12.34.56.78' => {
  #             :sessions  => 12,
  #             :searches  => 34,
  #             :downloads => 56
  #           },
  #           ...
  #         },
  #         ...
  #       },
  #       ...
  #     },
  #     ...
  #   }
  def parse
    # Cache resolved host names.
    addr2addr = Hash.new { |hash, addr|
      hash[addr] = begin
        Resolv.getaddress(addr)
      rescue Resolv::ResolvError
        addr
      end
    }

    # Cache licensees.
    addr2lcee = Hash.new { |hash, addr|
      hash[addr] = licensees_by_ip.get(addr)
    }

    # Our result hash
    stats = {}

    # Create a new LogParser and send our log file. Yields a hash per line.
    LogParser.new(:minimal, constraint).parse_io_stream(log_file) { |stat|
      path = stat['PATH_INFO']

      # Skip lines that don't have any useful information for us anyway.
      next unless path =~ constraint

      # Maybe we already captured the licensee ID? (see DEFAULT_REGEXP above)
      # Must be read right away, before any other match overwrites it.
      id = $1

      m, y  = stat['DATETIME'][/\/(.*?):/, 1].split('/')  # Extract month and year
      month = [y, ABBR2MONTH[m]].join('_')                # Target format is 'YYYY_MM'

      # Skip lines that fall out of the range we're interested in.
      next unless months.include?(month)

      address  = addr2addr[stat['REMOTE_ADDR']]
      licensee = addr2lcee[address] || licensees_by_id[
        unescape(id || path[regexp[:id], 1] || '')
      ]

      # Couldn't find a matching licensee? Skip it!
      next unless licensee

      name     = licensee[:name]
      licensee = licensee[:licensee]

      (((stats[month] ||= {})[licensee] ||= {})[name] ||= {})[address] ||= DEFAULT_STATS.dup
      _address = stats[month][licensee][name][address]

      # Increment our counts, since that's what we're here for...
      _address[:sessions]  += 1 if path =~ regexp[:login]
      _address[:searches]  += 1 if path =~ regexp[:search]
      _address[:downloads] += 1 if path =~ regexp[:download]
    }

    # Now we need to fill in any months and licensees we didn't come across before.
    months.each { |month|
      stats[month] ||= {}

      licensees.each { |licensee, hash|
        stats[month][licensee] ||= {}
        addresses = stats[month][licensee][hash[:name]]

        if addresses
          # Drop entries with zero sessions -- how come they occur, anyway?
          addresses.delete_if { |_, stat| stat[:sessions].zero? }
        end

        # Add a default "empty" entry for completeness' sake. Use a copy, so
        # the result never shares (and callers can never modify) the constant.
        if addresses.nil? || addresses.empty?
          stats[month][licensee][hash[:name]] = { nil => DEFAULT_STATS.dup }
        end
      }
    }

    # That's it, return what we've got.
    stats
  end

  protected

  # Create additional mappings for our licensees.
  def initialize_licensees
    @licensees_by_ip = {}
    @licensees_by_id = {}

    licensees.each { |licensee, hash|
      _hash = { :licensee => licensee, :name => hash[:name] }

      hash[:ip].each { |ip| licensees_by_ip[ip] = _hash }
      hash[:id].each { |id| licensees_by_id[id] = _hash }
    }

    # Convenience method to get a licensee from an address. Note that
    # +licensees_by_ip+ usually has subnets instead of full IPs; an entry
    # matches if its key is a plain string prefix of +ip+. Returns +nil+
    # if there's no match.
    def licensees_by_ip.get(ip)
      find(lambda { [] }) { |key, _| ip[0, key.length] == key }.last
    end
  end

  # Decode percent-escapes in +str+. URI.decode has been removed in Ruby 3.0;
  # the default parser's +unescape+ is what it used to delegate to. Fall back
  # to URI.decode where there's no default parser yet.
  def unescape(str)
    if defined?(URI::DEFAULT_PARSER)
      URI::DEFAULT_PARSER.unescape(str)
    else
      URI.decode(str)
    end
  end

end
@@ -0,0 +1,100 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of log2counter, the Apache log to COUNTER CSV converter. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # log2counter is free software; you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # log2counter is distributed in the hope that it will be useful, but WITHOUT #
19
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
20
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
21
+ # more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with log2counter. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'rubygems'
30
+ require 'fastercsv' # >= 1.2.3 for Gzip support
31
+
32
+ # We'd like to sort hashes by keys of IPs and host names.
33
+ require 'log2counter/core_ext/sort_by_ip_or_host'
34
+
35
class Log2COUNTER::Printer

  # Header row of the CSV output.
  COLUMNS = %w[
    Jahr_Monat
    Konsortial-Mitglied
    Nutzer
    Einzelkunden-Subidentifier
    Sessions
    Searches
    Downloads
  ]

  attr_reader :csv, :summarize

  # csv_file::  IO the CSV gets written to
  # summarize:: whether to print one total per name instead of one row per
  #             address
  def initialize(csv_file, summarize = false)
    @summarize = summarize
    @csv       = FasterCSV.new(csv_file)
  end

  # Write +stats+ (month => licensee => name => address => counts) as CSV,
  # preceded by the header row; months, licensees and names are sorted.
  def print(stats)
    csv << COLUMNS

    stats.sort.each { |month, licensees|
      licensees.sort.each { |licensee, names|
        names.sort.each { |name, addresses|
          # Either a single row holding the totals (without an address), or
          # one row per address, sorted by IP or host name.
          entries = summarize ? [[nil, sum_up(addresses)]] : addresses.sort_by_ip_or_host

          entries.each { |address, counts|
            csv << [
              month,
              licensee,
              name,
              address,
              counts[:sessions],
              counts[:searches],
              counts[:downloads]
            ]
          }
        }
      }
    }

    csv.flush
  end

  private

  # Add up the counts of all +addresses+, starting from a fresh copy of the
  # parser's all-zero stats.
  def sum_up(addresses)
    total = Log2COUNTER::Parser::DEFAULT_STATS.dup

    addresses.each { |_, counts|
      total[:sessions]  += counts[:sessions]
      total[:searches]  += counts[:searches]
      total[:downloads] += counts[:downloads]
    }

    total
  end

end