log_sense 1.3.5 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30e8194103003ca9861272072bbf9ef199d7d80b67a0b73fb38d510f23adacee
4
- data.tar.gz: 00b36829dd41b27e79cd6a41cacb412479631c0be9a9e2b79d585f81e0c7efa8
3
+ metadata.gz: '0128717b2ba709bc5dfbb7b755762a757c575ad4307159910cc894aaa3b88f42'
4
+ data.tar.gz: e05054b8eee79a439f5b077e60bc0d95a3e7706c550853333d7d631c458abf91
5
5
  SHA512:
6
- metadata.gz: 41b392e7d5ec01052dbb645d50b6dc6736a82c04ea231ac33f3fa2c3679cc27a959164a8f5f9524f1604f1b495c032978a2e4e14ca01a706448fc6a8d7556185
7
- data.tar.gz: fea27d4e0765fec9b090101a3efec3eb5ebd162af1c7389162d34c876754980b5dbccb22141b3aa4a0ec3eeb44bfd3936e17cc5463dc93148c3718bb51a1dcec
6
+ metadata.gz: 9d9e3dc495f7479292ae96d1bf6298f531258cae74df47ff705aea9613880b0d50aa6a19328f70685484ad1c606bd12a8fb7c87632fe5c9cbefefe4893d9bb4d
7
+ data.tar.gz: b417049bcc119ed82ab4c33d007e15804ba85b2485308ca28448fba512814fc5e8b8b215310bfb5c8108fcdc87292322eefc6826b18718fcbc7fced29eea77cb
data/CHANGELOG.org CHANGED
@@ -2,6 +2,52 @@
2
2
  #+AUTHOR: Adolfo Villafiorita
3
3
  #+STARTUP: showall
4
4
 
5
+ * 1.5.0
6
+
7
+ - [User] Present Unique Visits / day as integer
8
+ - [User] Added Country and Streaks report for rails
9
+ - [User] Changed Streak report in Apache
10
+
11
+ - [Gem] Updated DBIP
12
+ - [Gem] Updated Bundle
13
+
14
+ - [Code] Refactored all reports, so that they are specified
15
+ in the same way
16
+ - [Code] Refactor warning message in textual reports
17
+ - [Code] Build HTML menu for report specification
18
+ - [Code] Various refactoring passes on the code
19
+
20
+ * 1.4.1
21
+
22
+ - [User] New textual report for Apache
23
+ - [User] New option -w sets maximum width of URL, Path, and
24
+ Description columns in textual reports
25
+ - [User] Removed option -i, since input filenames are now taken
26
+ as direct arguments
27
+ - [User] Allow multiple files in input
28
+ - [Fixed] Complain if input format is not supported
29
+ - [Code] Refactoring of reports to manage better output to
30
+ multiple formats
31
+
32
+ * 1.4.0
33
+
34
+ - [User] The Apache Log report now organizes page requests in four
35
+ tables:
36
+ - success on HTML pages
37
+ - success on other resources
38
+ - failures on HTML pages
39
+ - failures on other resources
40
+ - [User] Increased the default limit of pages in reports to 900
41
+ - [User] The return status is now included in the page and resources
42
+ reports
43
+ - [User] The "Attack" table has been removed, since the data can be
44
+ gotten from the previous tables
45
+ - [Fixed] HTML pages are those with extension ".html" and ".htm"
46
+ - [Fixed] Wrong data on summary table of the apache report has
47
+ been fixed
48
+ - [Fixed] Better JavaScript escaping to avoid log poisoning
49
+ - [Fixed] Strengthened the Apache log parser
50
+
5
51
  * 1.3.3 and 1.3.4
6
52
 
7
53
  - [Gem] Moved repository to Github and fixes to gemspec
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- log_sense (1.3.1)
4
+ log_sense (1.4.2)
5
5
  browser
6
6
  ipaddr
7
7
  iso_country_codes
@@ -13,9 +13,9 @@ GEM
13
13
  specs:
14
14
  browser (5.3.1)
15
15
  byebug (11.1.3)
16
- ipaddr (1.2.3)
16
+ ipaddr (1.2.4)
17
17
  iso_country_codes (0.7.8)
18
- minitest (5.14.4)
18
+ minitest (5.15.0)
19
19
  rake (12.3.3)
20
20
  sqlite3 (1.4.2)
21
21
  terminal-table (3.0.2)
@@ -32,4 +32,4 @@ DEPENDENCIES
32
32
  rake (~> 12.0)
33
33
 
34
34
  BUNDLED WITH
35
- 2.2.32
35
+ 2.3.3
data/README.org CHANGED
@@ -19,8 +19,6 @@ LogSense reports the following data:
19
19
  - OS, browsers, and devices
20
20
  - IP Country location, thanks to the DPIP lite country DB
21
21
  - Streaks: resources accessed by a given IP over time
22
- - Potential attacks: access to resources which are not meant to be
23
- served by a web server serving static websites
24
22
  - Performance of Rails requests
25
23
 
26
24
  Filters from the command line allow to analyze specific periods and
@@ -33,6 +31,18 @@ And, of course, the compulsory screenshot:
33
31
  #+ATTR_HTML: :width 80%
34
32
  [[file:./apache-screenshot.png]]
35
33
 
34
+
35
+ * An important word of warning
36
+
37
+ [[https://owasp.org/www-community/attacks/Log_Injection][Log poisoning]] is a technique whereby attackers send requests with unvalidated
38
+ user input to forge log entries or inject malicious content into the logs.
39
+
40
+ log_sense sanitizes entries of HTML reports, to try and protect from log
41
+ poisoning. *Log entries and URLs in SQLite3, however, are not sanitized*:
42
+ they are stored and read from the log. This is not, in general, an issue,
43
+ unless you use the data from SQLite in environments in which URLs can be
44
+ opened or code executed.
45
+
36
46
  * Motivation
37
47
 
38
48
  LogSense moves along the lines of tools such as [[https://goaccess.io/][GoAccess]] (which
@@ -54,6 +64,7 @@ generated files are then made available on a private area on the web.
54
64
  gem install log_sense
55
65
  #+end_src
56
66
 
67
+
57
68
  * Usage
58
69
 
59
70
  #+begin_src bash :results raw output :wrap example
@@ -62,21 +73,22 @@ generated files are then made available on a private area on the web.
62
73
 
63
74
  #+RESULTS:
64
75
  #+begin_example
65
- Usage: log_sense [options] [logfile]
76
+ Usage: log_sense [options] [logfile ...]
66
77
  --title=TITLE Title to use in the report
67
78
  -f, --input-format=FORMAT Input format (either rails or apache)
68
- -i, --input-file=INPUT_FILE Input file
69
79
  -t, --output-format=FORMAT Output format: html, org, txt, sqlite. See below for available formats
70
80
  -o, --output-file=OUTPUT_FILE Output file
71
81
  -b, --begin=DATE Consider entries after or on DATE
72
82
  -e, --end=DATE Consider entries before or on DATE
73
- -l, --limit=N Number of entries to show (defaults to 30)
83
+ -l, --limit=N Limit to the N most requested resources (defaults to 900)
84
+ -w, --width=WIDTH Maximum width of URL and description columns in text reports
74
85
  -c, --crawlers=POLICY Decide what to do with crawlers (applies to Apache Logs)
75
86
  -n, --no-selfpolls Ignore self poll entries (requests from ::1; applies to Apache Logs)
87
+ --verbose Inform about progress (prints to STDERR)
76
88
  -v, --version Prints version information
77
89
  -h, --help Prints this help
78
90
 
79
- This is version 1.3.1
91
+ This is version 1.5.0
80
92
 
81
93
  Output formats
82
94
  rails parsing can produce the following outputs:
@@ -85,6 +97,7 @@ generated files are then made available on a private area on the web.
85
97
  - html
86
98
  apache parsing can produce the following outputs:
87
99
  - sqlite
100
+ - txt
88
101
  - html
89
102
  #+end_example
90
103
 
@@ -95,6 +108,7 @@ log_sense -f apache -i access.log -t txt > access-data.txt
95
108
  log_sense -f rails -i production.log -t html -o performance.txt
96
109
  #+end_example
97
110
 
111
+
98
112
  * Change Log
99
113
 
100
114
  See the [[file:CHANGELOG.org][CHANGELOG]] file.
@@ -109,8 +123,8 @@ Concerning the outputs:
109
123
  - HTML reports use [[https://get.foundation/][Zurb Foundation]], [[https://www.datatables.net/][Data Tables]], and [[https://vega.github.io/vega-lite/][Vega Light]], which
110
124
  are all downloaded from a CDN
111
125
  - The textual format is compatible with [[https://orgmode.org/][Org Mode]] and can be further
112
- processed to any format [[https://orgmode.org/][Org Mode]] can be exported to (including HTML
113
- and PDF)
126
+ processed to any format [[https://orgmode.org/][Org Mode]] can be exported to, including HTML
127
+ and PDF, with the word of warning in the section above.
114
128
 
115
129
  * Author and Contributors
116
130
 
@@ -118,8 +132,8 @@ Concerning the outputs:
118
132
 
119
133
  * Known Bugs
120
134
 
121
- No known bugs; an unknown number of unknown bugs.
122
- (See the open issues for the known bugs.)
135
+ No known bugs; an unknown number of unknown bugs. (See the open issues for
136
+ the known bugs.)
123
137
 
124
138
  * License
125
139
 
data/Rakefile CHANGED
@@ -9,7 +9,21 @@ end
9
9
  require_relative './lib/log_sense/ip_locator.rb'
10
10
 
11
11
  desc "Convert Geolocation DB to sqlite"
12
- task :dbip_to_sqlite3, [:filename] do |tasks, args|
13
- filename = args[:filename]
14
- ApacheLogReport::IpLocator::dbip_to_sqlite filename
12
+ task :dbip_to_sqlite3, [:year_month] do |tasks, args|
13
+ filename = "./ip_locations/dbip-country-lite-#{args[:year_month]}.csv"
14
+
15
+ if !File.exist? filename
16
+ puts "Error. Could not find: #{filename}"
17
+ puts
18
+ puts 'I see the following files:'
19
+ puts Dir.glob("ip_locations/dbip-country-lite*").map { |x| "- #{x}\n" }
20
+ puts ''
21
+ puts '1. Download (if necessary) a more recent version from: https://db-ip.com/db/download/ip-to-country-lite'
22
+ puts '2. Save downloaded file to ip_locations/'
23
+ puts '3. Relaunch with YYYY-MM'
24
+
25
+ exit
26
+ else
27
+ LogSense::IpLocator::dbip_to_sqlite filename
28
+ end
15
29
  end
data/exe/log_sense CHANGED
@@ -7,21 +7,15 @@ require 'log_sense.rb'
7
7
  #
8
8
 
9
9
  # this better be here... OptionsParser consumes ARGV
10
- @command_line = ARGV.join(" ")
11
-
10
+ @command_line = ARGV.join(' ')
12
11
  @options = LogSense::OptionsParser.parse ARGV
13
- @input_file = @options[:input_file] || ARGV[0]
14
12
  @output_file = @options[:output_file]
15
13
 
16
- if not @input_file
17
- puts "Error: no input file specified."
18
- exit
19
- end
20
-
21
- if not File.exist? @input_file
22
- puts "Error: input file '#{@input_file}' does not exist"
14
+ if ARGV.map { |x| File.exist?(x) }.include?(false)
15
+ $stderr.puts "Error: input file(s) '#{ARGV.reject { |x| File.exist(x) }.join(', ')}' do not exist"
23
16
  exit 1
24
17
  end
18
+ @input_files = ARGV.empty? ? [$stdin] : ARGV.map { |x| File.open(x, 'r') }
25
19
 
26
20
  #
27
21
  # Parse Log and Track Statistics
@@ -36,32 +30,46 @@ when 'apache'
36
30
  when 'rails'
37
31
  parser_klass = LogSense::RailsLogParser
38
32
  cruncher_klass = LogSense::RailsDataCruncher
33
+ else
34
+ $stderr.puts "Error: input format #{@options[:input_format]} not understood."
35
+ exit 1
39
36
  end
40
37
 
41
- @db = parser_klass.parse @input_file
38
+ $stderr.puts "Parsing input files..." if @options[:verbose]
39
+ @db = parser_klass.parse @input_files
42
40
 
43
- if @options[:output_format] == "sqlite"
44
- ddb = SQLite3::Database.new(@output_file || "db.sqlite3")
41
+ if @options[:output_format] == 'sqlite'
42
+ $stderr.puts "Saving to SQLite3..." if @options[:verbose]
43
+ ddb = SQLite3::Database.new(@output_file || 'db.sqlite3')
45
44
  b = SQLite3::Backup.new(ddb, 'main', @db, 'main')
46
45
  b.step(-1) #=> DONE
47
46
  b.finish
48
47
  else
48
+ $stderr.puts "Aggregating data..." if @options[:verbose]
49
49
  @data = cruncher_klass.crunch @db, @options
50
+
51
+ $stderr.puts "Geolocating..." if @options[:verbose]
50
52
  @data = LogSense::IpLocator.geolocate @data
51
53
 
54
+ $stderr.puts "Grouping by country..." if @options[:verbose]
55
+ country_col = @data[:ips][0].size - 1
56
+ @data[:countries] = @data[:ips].group_by { |x| x[country_col] }
57
+
52
58
  @ended_at = Time.now
53
59
  @duration = @ended_at - @started_at
54
60
 
55
61
  @data = @data.merge({
56
62
  command: @command_line,
57
- log_file: @input_file,
63
+ filenames: ARGV,
64
+ log_files: @input_files,
58
65
  started_at: @started_at,
59
66
  ended_at: @ended_at,
60
- duration: @duration
67
+ duration: @duration,
68
+ width: @options[:width]
61
69
  })
62
-
63
70
  #
64
71
  # Emit Output
65
72
  #
73
+ $stderr.puts "Emitting..." if @options[:verbose]
66
74
  puts LogSense::Emitter.emit @data, @options
67
75
  end
Binary file
@@ -6,7 +6,7 @@ module LogSense
6
6
  # @ variables are automatically put in the returned data
7
7
  #
8
8
 
9
- def self.crunch db, options = { limit: 30 }
9
+ def self.crunch db, options = { limit: 900 }
10
10
  first_day_s = db.execute "SELECT datetime from LogLine order by datetime limit 1"
11
11
  last_day_s = db.execute "SELECT datetime from LogLine order by datetime desc limit 1"
12
12
 
@@ -15,17 +15,17 @@ module LogSense
15
15
  @last_day = last_day_s&.first&.first ? Date.parse(last_day_s[0][0]) : nil
16
16
 
17
17
  @total_days = 0
18
- if @first_day and @last_day
19
- @total_days = (@last_day - @first_day).to_i
20
- end
18
+ @total_days = (@last_day - @first_day).to_i if @first_day && @last_day
19
+
20
+ @source_files = db.execute 'SELECT distinct(source_file) from LogLine'
21
21
 
22
- @log_size = db.execute "SELECT count(datetime) from LogLine"
22
+ @log_size = db.execute 'SELECT count(datetime) from LogLine'
23
23
  @log_size = @log_size[0][0]
24
24
 
25
25
  @selfpolls_size = db.execute "SELECT count(datetime) from LogLine where ip == '::1'"
26
26
  @selfpolls_size = @selfpolls_size[0][0]
27
27
 
28
- @crawlers_size = db.execute "SELECT count(datetime) from LogLine where bot == 1"
28
+ @crawlers_size = db.execute 'SELECT count(datetime) from LogLine where bot == 1'
29
29
  @crawlers_size = @crawlers_size[0][0]
30
30
 
31
31
  @first_day_requested = options[:from_date]
@@ -35,7 +35,7 @@ module LogSense
35
35
  @last_day_in_analysis = date_intersect options[:to_date], @last_day, :min
36
36
 
37
37
  @total_days_in_analysis = 0
38
- if @first_day_in_analysis and @last_day_in_analysis
38
+ if @first_day_in_analysis && @last_day_in_analysis
39
39
  @total_days_in_analysis = (@last_day_in_analysis - @first_day_in_analysis).to_i
40
40
  end
41
41
 
@@ -45,24 +45,24 @@ module LogSense
45
45
  filter = [
46
46
  (options[:from_date] ? "date(datetime) >= '#{options[:from_date]}'" : nil),
47
47
  (options[:to_date] ? "date(datetime) <= '#{options[:to_date]}'" : nil),
48
- (options[:only_crawlers] ? "bot == 1" : nil),
49
- (options[:ignore_crawlers] ? "bot == 0" : nil),
48
+ (options[:only_crawlers] ? 'bot == 1' : nil),
49
+ (options[:ignore_crawlers] ? 'bot == 0' : nil),
50
50
  (options[:no_selfpolls] ? "ip != '::1'" : nil),
51
- "true"
51
+ 'true'
52
52
  ].compact.join " and "
53
53
 
54
54
  mega = 1024 * 1024
55
55
  giga = mega * 1024
56
56
  tera = giga * 1024
57
-
57
+
58
58
  # in alternative to sum(size)
59
59
  human_readable_size = <<-EOS
60
- CASE
60
+ CASE
61
61
  WHEN sum(size) < 1024 THEN sum(size) || ' B'
62
62
  WHEN sum(size) >= 1024 AND sum(size) < (#{mega}) THEN ROUND((CAST(sum(size) AS REAL) / 1024), 2) || ' KB'
63
63
  WHEN sum(size) >= (#{mega}) AND sum(size) < (#{giga}) THEN ROUND((CAST(sum(size) AS REAL) / (#{mega})), 2) || ' MB'
64
64
  WHEN sum(size) >= (#{giga}) AND sum(size) < (#{tera}) THEN ROUND((CAST(sum(size) AS REAL) / (#{giga})), 2) || ' GB'
65
- WHEN sum(size) >= (#{tera}) THEN ROUND((CAST(sum(size) AS REAL) / (#{tera})), 2) || ' TB'
65
+ WHEN sum(size) >= (#{tera}) THEN ROUND((CAST(sum(size) AS REAL) / (#{tera})), 2) || ' TB'
66
66
  END AS size
67
67
  EOS
68
68
 
@@ -89,16 +89,18 @@ module LogSense
89
89
 
90
90
  @daily_distribution = db.execute "SELECT date(datetime), #{human_readable_day}, count(datetime), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{filter} group by date(datetime)"
91
91
  @time_distribution = db.execute "SELECT strftime('%H', datetime), count(datetime), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{filter} group by strftime('%H', datetime)"
92
- @most_requested_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where extension == '.html' and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
93
- @most_requested_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{filter} group by path order by count(path) desc limit #{options[:limit]}"
94
- @missed_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and extension == '.html' and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
95
- @missed_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
96
92
 
97
- @reasonable_requests_exts = [ ".html", ".css", ".js", ".jpg", ".svg", ".png", ".woff", ".xml", ".ttf", ".ico", ".pdf", ".htm", ".txt", ".org" ].map { |x|
98
- "extension != '#{x}'"
99
- }.join " and "
93
+ good_statuses = "(status like '2%' or status like '3%')"
94
+ bad_statuses = "(status like '4%' or status like '5%')"
95
+ html_page = "(extension like '.htm%')"
96
+ non_html_page = "(extension not like '.htm%')"
97
+
98
+ @most_requested_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size}, status from LogLine where #{good_statuses} and #{html_page} and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
99
+ @most_requested_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size}, status from LogLine where #{good_statuses} and #{non_html_page} and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
100
+
101
+ @missed_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), status from LogLine where #{bad_statuses} and #{html_page} and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
102
+ @missed_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), status from LogLine where #{bad_statuses} and #{filter} group by path order by count(path) desc limit #{options[:limit]}"
100
103
 
101
- @attacks = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{filter} and (#{@reasonable_requests_exts}) group by path order by count(path) desc limit #{options[:limit]}"
102
104
  @statuses = db.execute "SELECT status, count(status) from LogLine where #{filter} group by status order by status"
103
105
 
104
106
  @by_day_4xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '4' and #{filter} group by date(datetime)"
@@ -115,20 +117,19 @@ module LogSense
115
117
 
116
118
  @ips = db.execute "SELECT ip, count(ip), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{filter} group by ip order by count(ip) desc limit #{options[:limit]}"
117
119
 
118
- @streaks = db.execute "SELECT ip, substr(datetime, 1, 10), path from LogLine order by ip, datetime"
120
+ @streaks = db.execute 'SELECT ip, substr(datetime, 1, 10), path from LogLine order by ip, datetime'
119
121
  data = {}
120
122
 
121
- self.instance_variables.each do |variable|
122
- var_as_symbol = variable.to_s[1..-1].to_sym
123
- data[var_as_symbol] = eval(variable.to_s)
123
+ instance_variables.each do |variable|
124
+ var_as_symbol = variable.to_s[1..].to_sym
125
+ data[var_as_symbol] = instance_variable_get(variable)
124
126
  end
127
+
125
128
  data
126
129
  end
127
130
 
128
- private
129
-
130
- def self.date_intersect date1, date2, method
131
- if date1 and date2
131
+ def self.date_intersect(date1, date2, method)
132
+ if date1 && date2
132
133
  [date1, date2].send(method)
133
134
  elsif date1
134
135
  date1
@@ -138,4 +139,3 @@ module LogSense
138
139
  end
139
140
  end
140
141
  end
141
-
@@ -31,22 +31,21 @@ module LogSense
31
31
 
32
32
  TIMESTAMP = /(?<date>#{DAY}\/#{MONTH}\/#{YEAR}):(?<time>#{TIMEC}:#{TIMEC}:#{TIMEC} #{TIMEZONE})/
33
33
 
34
- HTTP_METHODS=/GET|HEAD|POST|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH/
35
- WEBDAV_METHODS=/COPY|LOCK|MKCOL|MOVE|PROPFIND|PROPPATCH|UNLOCK/
36
- OTHER_METHODS=/SEARCH|REPORT/
37
- METHOD=/(?<method>#{HTTP_METHODS}|#{WEBDAV_METHODS}|#{OTHER_METHODS})/
38
- PROTOCOL=/(?<protocol>HTTP\/[0-9]\.[0-9])/
39
- URL=/(?<url>[^ ]+)/
40
- REFERER=/(?<referer>[^ ]+)/
41
- RETURN_CODE=/(?<status>[1-5][0-9][0-9])/
42
- SIZE=/(?<size>[0-9]+|-)/
43
-
44
- USER_AGENT = /(?<user_agent>[^"]+)/
34
+ HTTP_METHODS = /GET|HEAD|POST|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH/
35
+ WEBDAV_METHODS = /COPY|LOCK|MKCOL|MOVE|PROPFIND|PROPPATCH|UNLOCK/
36
+ OTHER_METHODS = /SEARCH|REPORT|PRI|HEAD\/robots.txt/
37
+ METHOD = /(?<method>#{HTTP_METHODS}|#{WEBDAV_METHODS}|#{OTHER_METHODS})/
38
+ PROTOCOL = /(?<protocol>HTTP\/[0-9]\.[0-9]|-|.*)/
39
+ URL = /(?<url>[^ ]+)/
40
+ REFERER = /(?<referer>[^"]*)/
41
+ RETURN_CODE = /(?<status>[1-5][0-9][0-9])/
42
+ SIZE = /(?<size>[0-9]+|-)/
43
+ USER_AGENT = /(?<user_agent>[^"]*)/
45
44
 
46
45
  attr_reader :format
47
46
 
48
- def initialize
49
- @format = /#{IP} #{IDENT} #{USERID} \[#{TIMESTAMP}\] "#{METHOD} #{URL} #{PROTOCOL}" #{RETURN_CODE} #{SIZE} "#{REFERER}" "#{USER_AGENT}"/
47
+ def initialize
48
+ @format = /#{IP} #{IDENT} #{USERID} \[#{TIMESTAMP}\] "(#{METHOD} #{URL} #{PROTOCOL}|-|.+)" #{RETURN_CODE} #{SIZE} "#{REFERER}" "#{USER_AGENT}"/
50
49
  end
51
50
 
52
51
  def parse line
@@ -7,10 +7,9 @@ module LogSense
7
7
  # parse an Apache log file and return a SQLite3 DB
8
8
  #
9
9
 
10
- def self.parse filename, options = {}
11
- content = filename ? File.readlines(filename) : ARGF.readlines
10
+ def self.parse(streams, options = {})
11
+ db = SQLite3::Database.new ':memory:'
12
12
 
13
- db = SQLite3::Database.new ":memory:"
14
13
  db.execute "CREATE TABLE IF NOT EXISTS LogLine(
15
14
  id INTEGER PRIMARY KEY AUTOINCREMENT,
16
15
  datetime TEXT,
@@ -28,15 +27,18 @@ module LogSense
28
27
  browser TEXT,
29
28
  browser_version TEXT,
30
29
  platform TEXT,
31
- platform_version TEXT)"
30
+ platform_version TEXT,
31
+ source_file TEXT,
32
+ line_number INTEGER
33
+ )"
32
34
 
33
- ins = db.prepare('insert into LogLine (
34
- datetime,
35
+ ins = db.prepare("insert into LogLine (
36
+ datetime,
35
37
  ip,
36
38
  user,
37
39
  unique_visitor,
38
40
  method,
39
- path,
41
+ path,
40
42
  extension,
41
43
  status,
42
44
  size,
@@ -46,44 +48,50 @@ module LogSense
46
48
  browser,
47
49
  browser_version,
48
50
  platform,
49
- platform_version)
50
- values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
51
+ platform_version,
52
+ source_file,
53
+ line_number
54
+ )
55
+ values (#{Array.new(18, '?').join(', ')})")
51
56
 
52
57
  parser = ApacheLogLineParser.new
53
-
54
- content.each do |line|
55
- begin
56
- hash = parser.parse line
57
- ua = Browser.new(hash[:user_agent], accept_language: "en-us")
58
- ins.execute(
59
- DateTime.parse("#{hash[:date]}T#{hash[:time]}").iso8601,
60
- hash[:ip],
61
- hash[:userid],
62
- unique_visitor_id(hash),
63
- hash[:method],
64
- hash[:url],
65
- (hash[:url] ? File.extname(hash[:url]) : ""),
66
- hash[:status],
67
- hash[:size].to_i,
68
- hash[:referer],
69
- hash[:user_agent],
70
- ua.bot? ? 1 : 0,
71
- (ua.name || ""),
72
- (ua.version || ""),
73
- (ua.platform.name || ""),
74
- (ua.platform.version || "")
75
- )
76
- rescue StandardError => e
77
- STDERR.puts e.message
58
+
59
+ streams.each do |stream|
60
+ stream.readlines.each_with_index do |line, line_number|
61
+ begin
62
+ hash = parser.parse line
63
+ ua = Browser.new(hash[:user_agent], accept_language: 'en-us')
64
+ ins.execute(
65
+ DateTime.parse("#{hash[:date]}T#{hash[:time]}").iso8601,
66
+ hash[:ip],
67
+ hash[:userid],
68
+ unique_visitor_id(hash),
69
+ hash[:method],
70
+ hash[:url],
71
+ (hash[:url] ? File.extname(hash[:url]) : ''),
72
+ hash[:status],
73
+ hash[:size].to_i,
74
+ hash[:referer],
75
+ hash[:user_agent],
76
+ ua.bot? ? 1 : 0,
77
+ (ua.name || ''),
78
+ (ua.version || ''),
79
+ (ua.platform.name || ''),
80
+ (ua.platform.version || ''),
81
+ stream == $stdin ? "stdin" : stream.path,
82
+ line_number
83
+ )
84
+ rescue StandardError => e
85
+ $stderr.puts e.message
86
+ end
78
87
  end
79
88
  end
80
-
89
+
81
90
  db
82
91
  end
83
92
 
84
93
  def self.unique_visitor_id hash
85
94
  "#{hash[:date]} #{hash[:ip]} #{hash[:user_agent]}"
86
95
  end
87
-
88
96
  end
89
97
  end