apache_log_report 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,505 @@
1
- require 'apache_log_report/log_parser_sqlite3.rb'
2
- require 'apache_log_report/option_parser.rb'
3
- require 'apache_log_report/version.rb'
1
+ module ApacheLogReport
2
+
3
+ #
4
+ # parse command line options
5
+ #
6
+ require 'optparse'
7
+ require 'optparse/date'
8
+
9
#
# Parse the command line switches understood by the log analyzer.
#
# options:: array of command line arguments (e.g. ARGV). It is modified
#           in place by OptionParser#parse!: recognized switches are
#           removed, positional arguments are left behind.
#
# Returns a Hash in which :limit, :ignore_crawlers, :no_selfpoll,
# :only_crawlers, :prefix and :suffix are always set (to the value given
# on the command line or to a default); :from_date and :to_date are
# present only when the corresponding switch was given.
#
# -h/--help prints the usage text and exits the process. Errors raised
# by OptionParser#parse! are not rescued here.
#
def self.options_parse options
  # Single source of truth for default values.
  defaults = {
    limit: 30,
    ignore_crawlers: false,
    no_selfpoll: false,
    only_crawlers: false,
    prefix: "",
    suffix: ""
  }
  args = {}

  opt_parser = OptionParser.new do |opts|
    opts.banner = "Usage: log-analyzer.rb [options] logfile"

    opts.on("-lN", "--limit=N", Integer, "Number of entries to show (defaults to #{defaults[:limit]})") do |n|
      args[:limit] = n
    end

    opts.on("-bDATE", "--from-date=DATE", DateTime, "Consider entries after or on DATE") do |n|
      args[:from_date] = n
    end

    opts.on("-eDATE", "--to-date=DATE", DateTime, "Consider entries before or on DATE") do |n|
      args[:to_date] = n
    end

    opts.on("-i", "--ignore-crawlers", "Ignore crawlers") do
      args[:ignore_crawlers] = true
    end

    opts.on("-p", "--ignore-selfpoll", "Ignore apaches self poll entries (from ::1)") do
      args[:no_selfpoll] = true
    end

    opts.on("-c", "--only-crawlers", "Perform analysis on crawlers only") do
      args[:only_crawlers] = true
    end

    opts.on("-u", "--prefix=PREFIX", String, "Prefix to add to all plots (used to run multiple analyses in the same dir)") do |n|
      args[:prefix] = n
    end

    opts.on("-w", "--suffix=SUFFIX", String, "Suffix to add to all plots (used to run multiple analyses in the same dir)") do |n|
      args[:suffix] = n
    end

    opts.on("-h", "--help", "Prints this help") do
      puts opts
      exit
    end
  end

  opt_parser.parse!(options)

  # Values given on the command line win over the defaults. The previous
  # code wrote the suffix default to the misspelled key :suffic, so
  # :suffix was never defaulted.
  defaults.merge(args)
end
65
+
66
+
67
+
68
+ #
69
+ # parse an Apache log file and return a SQLite3 DB
70
+ #
71
+ require 'apache_log/parser'
72
+ require 'sqlite3'
73
+ require 'browser'
74
+
75
#
# Parse an Apache log and load it into an in-memory SQLite3 database.
#
# filename:: path of the log file; when nil (or false) the lines are
#            read from ARGF instead.
# options::  Hash; options[:format] is passed to ApacheLog::Parser and
#            defaults to 'combined'.
#
# Returns the SQLite3::Database, containing a single LogLine table with
# one row per log line for which the parser returned a non-empty hash.
#
def self.parse filename, options = {}
  content = filename ? File.readlines(filename) : ARGF.readlines

  db = SQLite3::Database.new ":memory:"
  db.execute "CREATE TABLE IF NOT EXISTS LogLine(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    datetime TEXT,
    ip TEXT,
    user TEXT,
    unique_visitor TEXT,
    method TEXT,
    path TEXT,
    extension TEXT,
    status TEXT,
    size INTEGER,
    referer TEXT,
    user_agent TEXT,
    bot INTEGER,
    browser TEXT,
    browser_version TEXT,
    platform TEXT,
    platform_version TEXT)"

  ins = db.prepare('insert into LogLine (
    datetime,
    ip,
    user,
    unique_visitor,
    method,
    path,
    extension,
    status,
    size,
    referer,
    user_agent,
    bot,
    browser,
    browser_version,
    platform,
    platform_version)
    values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')

  parser = ApacheLog::Parser.new(options[:format] || 'combined')

  begin
    # each, not collect: the loop is run for its inserts only.
    content.each do |line|
      hash = parser.parse line
      next if hash == {}

      # The user agent may be missing, depending on the log format
      # selected with options[:format]; use "" so that the string
      # operations below do not fail on nil.
      agent = hash[:user_agent].to_s
      ua = Browser.new(agent, accept_language: "en-us")
      path = hash[:request][:path]

      ins.execute(
        hash[:datetime].iso8601,
        hash[:remote_host],
        hash[:user],
        # a visitor is identified by the pair (host, user agent)
        "#{hash[:remote_host]}#{agent}",
        hash[:request][:method],
        path,
        (path ? File.extname(path) : ""),
        hash[:status],
        hash[:size].to_i,
        hash[:referer],
        hash[:user_agent],
        ua.bot? ? 1 : 0,
        (ua.name || ""),
        (ua.version || ""),
        (ua.platform.name || ""),
        (ua.platform.version || "")
      )
    end
  ensure
    # release the prepared statement even when a line fails to load
    ins.close
  end

  db
end
148
+
149
+
150
#
# Placeholder: not implemented yet.
#
# ext:: ignored.
#
# Always returns nil.
#
def self.reasonable_response_type ext
  nil
end
153
+
154
+ #
155
+ # take a sqlite3 database and analyze data
156
+ #
157
#
# Run all the report queries against the LogLine table and store the
# results in instance variables, which are later read when the report
# is emitted.
#
# db::      object responding to execute(sql) and returning an array of
#           rows (each row an array of column values).
# options:: Hash of filters:
#           :from_date / :to_date  keep entries on or after / on or
#                                  before the given date
#           :only_crawlers         keep rows with bot == 1
#           :ignore_crawlers       keep rows with bot == 0
#           :no_selfpoll           drop rows whose ip is '::1'
#                                  (:no_selfpolls is also accepted)
#           :limit                 row limit of the "top N" queries
#                                  (30 when missing)
#
# Returns the rows of the referers query (the last value computed).
#
def self.analyze_data db, options = {}
  # A nil limit would be interpolated as an empty string and produce
  # invalid SQL; to_i also guarantees that only a number is interpolated.
  limit = (options[:limit] || 30).to_i

  # date(datetime) yields 'YYYY-MM-DD'. Comparing it with a full
  # timestamp string (DateTime#to_s) is a string comparison in which the
  # shorter date sorts first, so entries on the start day itself were
  # dropped. Reduce bounds to the same 'YYYY-MM-DD' form.
  sql_date = lambda { |value|
    value.respond_to?(:strftime) ? value.strftime("%Y-%m-%d") : value.to_s
  }

  @first_day = db.execute "SELECT datetime from LogLine order by datetime limit 1"
  @last_day = db.execute "SELECT datetime from LogLine order by datetime desc limit 1"
  @log_size = db.execute "SELECT count(datetime) from LogLine"
  @crawlers_size = db.execute "SELECT count(datetime) from LogLine where bot == 1"
  @selfpolls_size = db.execute "SELECT count(datetime) from LogLine where ip == '::1'"

  #
  # generate the where clause corresponding to the command line options to filter data
  #
  # The option parser stores the self-poll switch under :no_selfpoll;
  # the misspelled :no_selfpolls is still honored for existing callers.
  skip_selfpolls = options[:no_selfpoll] || options[:no_selfpolls]

  @filter = [
    (options[:from_date] ? "date(datetime) >= '#{sql_date.call(options[:from_date])}'" : nil),
    (options[:to_date] ? "date(datetime) <= '#{sql_date.call(options[:to_date])}'" : nil),
    (options[:only_crawlers] ? "bot == 1" : nil),
    (options[:ignore_crawlers] ? "bot == 0" : nil),
    (skip_selfpolls ? "ip != '::1'" : nil),
    "true" # keeps the clause valid when no filter is active
  ].compact.join " and "

  @total_hits = db.execute "SELECT count(datetime) from LogLine where #{@filter}"
  @total_unique_visitors = db.execute "SELECT count(distinct(unique_visitor)) from LogLine where #{@filter}"
  @total_size = db.execute "SELECT sum(size) from LogLine where #{@filter}"

  # An empty log has no first/last row: report zero days instead of
  # failing on nil.
  @total_days =
    if @first_day.empty? || @last_day.empty?
      0
    else
      (Date.parse(@last_day[0][0]) - Date.parse(@first_day[0][0])).to_i
    end

  @daily_distribution = db.execute "SELECT date(datetime), count(datetime), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by date(datetime)"

  @time_distribution = db.execute "SELECT strftime('%H', datetime), count(datetime), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by strftime('%H', datetime)"

  @most_requested_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), sum(size) from LogLine where extension == '.html' and #{@filter} group by path order by count(path) desc limit #{limit}"

  @most_requested_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by path order by count(path) desc limit #{limit}"

  @missed_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and extension == '.html' and #{@filter} group by path order by count(path) desc limit #{limit}"

  @missed_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{@filter} group by path order by count(path) desc limit #{limit}"

  @reasonable_requests_exts = [ ".html", ".css", ".js", ".jpg", ".svg", ".png", ".woff", ".xml", ".ttf", ".ico", ".pdf", ".htm", ".txt", ".org" ].map { |x|
    "extension == '#{x}'"
  }.join " or "

  # Possible attacks are the 404s on resources whose extension is NOT one
  # of the reasonable ones. The condition used to select the reasonable
  # extensions, which only repeated a subset of the missed resources.
  # NOTE(review): inversion inferred from the report heading and the
  # variable name - confirm this is the intended meaning.
  @attacks = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{@filter} and not (#{@reasonable_requests_exts}) group by path order by count(path) desc limit #{limit}"

  @statuses = db.execute "SELECT status, count(status) from LogLine where #{@filter} group by status order by status"

  @by_day_4xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '4' and #{@filter} group by date(datetime)"
  @by_day_3xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '3' and #{@filter} group by date(datetime)"
  @by_day_2xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '2' and #{@filter} group by date(datetime)"

  # One row per day: [day, 2xx, 3xx, 4xx]. A day without hits in a status
  # class gets 0 in that column; previously the column was missing and
  # the remaining counts shifted under the wrong heading.
  hits_2xx = @by_day_2xx.to_h
  hits_3xx = @by_day_3xx.to_h
  hits_4xx = @by_day_4xx.to_h
  days = (hits_2xx.keys + hits_3xx.keys + hits_4xx.keys).uniq.sort_by { |day| day.to_s }

  @statuses_by_day = days.map { |day|
    [day, hits_2xx[day] || 0, hits_3xx[day] || 0, hits_4xx[day] || 0]
  }

  @browsers = db.execute "SELECT browser, count(browser), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by browser order by count(browser) desc"

  @platforms = db.execute "SELECT platform, count(platform), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by platform order by count(platform) desc"

  @ips = db.execute "SELECT ip, count(ip), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by ip order by count(ip) desc limit #{limit}"

  @referers = db.execute "SELECT referer, count(referer), count(distinct(unique_visitor)), sum(size) from LogLine where #{@filter} group by referer order by count(referer) desc limit #{limit}"
end
218
+
219
+
220
+ #
221
+ # Emit Data
222
+ #
223
+
224
+ require 'terminal-table'
225
+
226
#
# Render a table preceded by an org-mode "#+NAME:" line, so that the
# table can be referenced by name from the source blocks of the report.
#
# name::     name given to the table
# headings:: array of column headings
# rows::     array of rows
#
# Returns a String: the "#+NAME:" line, a newline, and the table as
# rendered by Terminal::Table.
#
def self.output_table name, headings, rows
  # Column alignment (right-aligning the numeric columns) is
  # intentionally not applied.
  border_style = { border_x: "-", border_i: "|" }
  rendered = Terminal::Table.new(headings: headings, rows: rows, style: border_style).to_s

  "#+NAME: #{name}" + "\n" + rendered
end
236
+
237
#
# Build the org-mode report from the instance variables set by the
# analysis step.
#
# options::    Hash of the options used for the run; :prefix and :suffix
#              are added to the file name of every plot, the other keys
#              are only echoed in the "Command Invocation" section
# command::    command line used to start the analysis (echoed)
# log_file::   name of the analyzed file; nil is shown as "stdin" in the
#              invocation table
# started_at:: start time of the analysis (shown with to_s)
# ended_at::   end time of the analysis (shown with to_s)
# duration::   duration of the analysis, in seconds
#
# Returns the report as a String. Tables are rendered by output_table.
#
def self.emit options = {}, command, log_file, started_at, ended_at, duration
  # This used to assign @prefx, leaving @prefix nil, so that the prefix
  # never made it into the plot file names.
  @prefix = options[:prefix]
  @suffix = options[:suffix]

  # Avoid dividing by zero when the analysis took no measurable time.
  seconds = duration.to_f
  lines_per_sec = seconds.zero? ? 0.0 : @log_size[0][0] / seconds

  # Notes on the summary table below:
  # - sum(size) is NULL when no row matches, hence the to_i on Tx
  # - @total_days is a plain Integer: indexing it with [0][0] reads bits
  #   of the number instead of the number itself
  <<EOS
#+TITLE: Apache Log Analysis: #{log_file}
#+DATE: <#{Date.today}>
#+STARTUP: showall
#+OPTIONS: ^:{}
#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="ala-style.css" />
#+OPTIONS: html-style:nil

* Summary

| Hits | #{"%10d" % @total_hits[0][0]} |
| Unique Visitors | #{"%10d" % @total_unique_visitors[0][0] } |
| Tx | #{"%10d" % @total_size[0][0].to_i } |
| Days | #{"%10d" % @total_days } |

* Daily Distribution

#{ output_table "daily_distribution", ["Day", "Hits", "Visits", "Size"], @daily_distribution }

#+BEGIN_SRC gnuplot :var data = daily_distribution :results output :exports both :file #{@prefix}daily#{@suffix}.svg
reset
set grid ytics linestyle 0
set grid xtics linestyle 0
set terminal svg size 1200,800 fname 'Arial'

set xdata time
set timefmt "%Y-%m-%d"
set format x "%a, %b %d"
set xtics rotate by 60 right

set title "Hits and Visitors"
set xlabel "Date"
set ylabel "Hits"
set ylabel2 "Visits"

set style fill transparent solid 0.2 noborder

plot data using 1:2 with linespoints lw 3 lc rgb "#0000AA" pointtype 5 title "Hits" axes x1y2, \\
data using 1:2 with filledcurves below x1 linecolor rgb "#0000AA" notitle axes x1y2, \\
data using 1:3 with linespoints lw 3 lc rgb "#AA0000" pointtype 7 title "Visitors", \\
data using 1:3 with filledcurves below x1 notitle linecolor rgb "#AA0000", \\
data using 1:($3+10):3 with labels notitle textcolor rgb "#AA0000", \\
data using 1:($2+100):2 with labels notitle textcolor rgb "#0000AA" axes x1y2
#+END_SRC


* Time Distribution

#{ output_table "time_distribution", ["Hour", "Hits", "Visits", "Size"], @time_distribution }


#+BEGIN_SRC gnuplot :var data = time_distribution :results output :exports both :file #{@prefix}time#{@suffix}.svg
reset
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Hits and Visitors"
set xlabel "Date"
set ylabel "Hits and Visits"

set style fill solid 0.25
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#0000AA" title "Hits", \\
data using 3 lc rgb "#AA0000" title "Visitors" axes x1y2, \\
data using ($0 - 0.2):($2 + 10):2 with labels title "" textcolor rgb("#0000AA"), \\
data using ($0 + 0.2):($3 + 10):3 with labels title "" textcolor rgb("#AA0000") axes x1y2
#+END_SRC

#+BEGIN_SRC gnuplot :var data = time_distribution :results output :exports both :file #{@prefix}time-traffic#{@suffix}.svg
reset
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Traffic"
set xlabel "Date"
set ylabel "Traffic"

set style fill solid 0.50
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#00AA00" title "Traffic", \\
data using ($0):($2 + 10):2 with labels title "" textcolor rgb("#00AA00")
#+END_SRC

* Most Requested Pages

#{ output_table "most_requested_pages", ["Path", "Hits", "Visits", "Size"], @most_requested_pages }

* Most Requested URIs

#{ output_table "most_requested_resources", ["Path", "Hits", "Visits", "Size"], @most_requested_resources }

* 404s on HTML files

#{ output_table "pages_404", ["Path", "Hits", "Visitors"], @missed_pages }

* 404s on other resources

#{ output_table "resources_404", ["Path", "Hits", "Visitors"], @missed_resources }

* Possible Attacks

#{ output_table "Attacks", ["Path", "Hits", "Visitors"], @attacks }

* Statuses

#{ output_table "statuses", ["Status", "Count"], @statuses }

#+BEGIN_SRC gnuplot :var data = statuses :results output :exports both :file #{@prefix}statuses#{@suffix}.svg
reset
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+100):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* Daily Statuses

#{ output_table "daily_statuses", ["Status", "2xx", "3xx", "4xx"], @statuses_by_day }

#+BEGIN_SRC gnuplot :var data = daily_statuses :results output :exports both :file #{@prefix}daily-statuses#{@suffix}.svg
reset
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Daily Statuses"
set xlabel "Date"
set ylabel "Number of Hits"
set xtics rotate by 60 right

set style fill solid 0.25
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#CC0000" title "4xx", \\
data using 3 lc rgb "#0000CC" title "3xx", \\
data using 4 lc rgb "#00AA00" title "2xx", \\
data using ($0 - 1. / 4):($2 + 0.5):2 with labels title "" textcolor rgb("#CC0000"), \\
data using ($0):($3 + 0.5):3 with labels title "" textcolor rgb("#0000CC"), \\
data using ($0 + 1. / 4):($4 + 0.5):4 with labels title "" textcolor rgb("#00AA00")
#+END_SRC

* Browsers

#{ output_table "browsers", ["Browser", "Hits", "Visitors", "Size"], @browsers }

#+BEGIN_SRC gnuplot :var data = browsers :results output :exports both :file #{@prefix}browser#{@suffix}.svg
reset
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+100):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* Platforms

#{ output_table "platforms", ["Platform", "Hits", "Visitors", "Size"], @platforms }

#+BEGIN_SRC gnuplot :var data = platforms :results output :exports both :file #{@prefix}platforms#{@suffix}.svg
reset
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+100):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* IPs

#{ output_table "ips", ["IPs", "Hits", "Visitors", "Size"], @ips }


* Referers

#{ output_table "referers", ["Referers", "Hits", "Visitors", "Size"], @referers }

#+BEGIN_SRC gnuplot :var data = referers :results output :exports both :file #{@prefix}referers#{@suffix}.svg
reset
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0
set grid xtics linestyle 0

set title "Referers"
set xlabel "Date"
set xtics rotate by 60 right
set ylabel "Hits and Visits"

set style fill solid 0.45
set boxwidth 0.7

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#AA00AA" title "Hits", \\
data using 3 lc rgb "#0AAAA0" title "Visits", \\
data using ($0 - 1. / 3):($2 + 50):2 with labels title "" textcolor rgb("#AA00AA"), \\
data using ($0 + 1. / 3):($3 + 50):3 with labels title "" textcolor rgb("#0AAAA0")
#+END_SRC

* Command Invocation and Performance

** Command Invocation

#+BEGIN_EXAMPLE shell
#{command}
#+END_EXAMPLE

| Input file | #{"%-50s" % (log_file || "stdin")} |
| Ignore crawlers | #{"%-50s" % options[:ignore_crawlers]} |
| Only crawlers | #{"%-50s" % options[:only_crawlers]} |
| No selfpoll | #{"%-50s" % options[:no_selfpoll]} |
| Filter by date | #{"%-50s" % (!options[:from_date].nil? || !options[:to_date].nil?)} |
| Prefix | #{"%-50s" % @prefix} |
| Suffix | #{"%-50s" % @suffix} |

** Log Structure

| Log size | #{"%10d" % @log_size[0][0]} |
| Self poll entries | #{"%10d" % @selfpolls_size[0][0]} |
| Crawlers | #{"%10d" % @crawlers_size[0][0]} |
| Entries considered | #{"%10d" % @total_hits[0][0]} |

** Performance

| Analysis started at | #{started_at.to_s} |
| Analysis ended at | #{ended_at.to_s} |
| Duration (sec) | #{"%5.3d" % duration } |
| Duration (min) | #{"%5.3d" % (duration / 60 )} |
| Log size | #{"%9d" % @log_size[0][0]} |
| Lines/sec | #{"%6.2f" % lines_per_sec} |

* Local Variables :noexport:
# Local Variables:
# org-confirm-babel-evaluate: nil
# org-display-inline-images: t
# end:
EOS
end
503
+ end
504
+
4
505