log_line_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 86f95f21fb4df9bd1a358a43e988c4c78bdfada6
4
+ data.tar.gz: a14d81562c43a2f80525a436138d263816d60617
5
+ SHA512:
6
+ metadata.gz: a59e7d1a346527f9a7cb32760c80d65b7a58d6b24b5b7ca28a9205028856ece88f717983261862c65c54b7afca16e02832357d846a1f9e272c576972ad7799b5
7
+ data.tar.gz: b6153aaec48fb5340a7445b488c73c24cb776ffb7c0f1ab937f8d48fb47b9aa87fce661b31b0c17eab4590f33bb4b48ed1e34d6d1a37499d85bf228218e47eab
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .ruby-version
11
+ *~
12
+ \#*\#
13
+ /*.gem
14
+ /*.zip
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in log_line_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 HASHIMOTO, Naoki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,216 @@
1
+ # LogLineParser
2
+
3
+ LogLineParser is a simple parser of Apache access logs. It parses a line of Apache access log and turns it into an array of strings or a Hash object.
4
+ And from the command line, you can use it as a conversion tool of file formats or as a filtering tool of access records.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'log_line_parser'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install log_line_parser
21
+
22
+ ## Usage
23
+
24
+ ### As a converter
25
+
26
+
27
+ ```ruby
28
+ require 'log_line_parser'
29
+
30
+ line = '192.168.3.4 - - [07/Feb/2016: ... ] ...'
31
+ LogLineParser.parse(line).to_a
32
+ # => ["192.168.3.4", "-", "-", "07/Feb/2016: ... ", ... ]
33
+ ```
34
+
35
+ Or in limited cases, parsers corresponding to certain LogFormats are available:
36
+
37
+ ```ruby
38
+ require 'log_line_parser'
39
+
40
+ line = '192.168.3.4 - quidam [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.org/start.html" "Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.7.5) Gecko/20041108 Firefox/1.0"'
41
+ LogLineParser::CombinedLogParser.to_hash(line)
42
+ # => {
43
+ # "%h" => "192.168.3.4",
44
+ # "%l" => "-",
45
+ # "%u" => "quidam",
46
+ # "%t" => "07/Feb/2016:07:39:42 +0900",
47
+ # "%r" => "GET /index.html HTTP/1.1",
48
+ # "%>s" => "200",
49
+ # "%b" => "432",
50
+ # "%{Referer}i" => "http://www.example.org/start.html",
51
+ # "%{User-agent}i" => "Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.7.5) Gecko/20041108 Firefox/1.0",
52
+ # "%m" => "GET",
53
+ # "%H" => "HTTP/1.1",
54
+ # "%U%q" => "/index.html"
55
+ # }
56
+ ```
57
+
58
+ Three parsers are predefined for such cases:
59
+
60
+ <dl>
61
+ <dt>LogLineParser::CommonLogParser</dt>
62
+ <dd>For Common Log Format (CLF)</dd>
63
+ <dt>LogLineParser::CommonLogWithVHParser</dt>
64
+ <dd>For Common Log Format with Virtual Host</dd>
65
+ <dt>LogLineParser::CombinedLogParser</dt>
66
+ <dd>NCSA extended/combined log format</dd>
67
+ </dl>
68
+
69
+ #### Defining a parser
70
+
71
+ You can define your own parser as in the following example:
72
+
73
+ ```ruby
74
+ require 'log_line_parser'
75
+
76
+ RefererLogParser = LogLineParser.parser('"%r" %>s %b %{Referer}i -> %U')
77
+
78
+ line = '"GET /index.html HTTP/1.1" 200 432 http://www.example.org/start.html -> /index.html'
79
+
80
+ RefererLogParser.to_hash(line)
81
+ # => {
82
+ # "%r" => "GET /index.html HTTP/1.1",
83
+ # "%>s" => "200",
84
+ # "%b" => "432",
85
+ # "%{Referer}i" => "http://www.example.org/start.html",
86
+ # "->" => "->",
87
+ # "%U" => "/index.html",
88
+ # "%m" => "GET",
89
+ # "%H" => "HTTP/1.1",
90
+ # "%U%q" => "/index.html"
91
+ # }
92
+ ```
93
+
94
+ #### Limitations
95
+
96
+ * Currently, you should include at least `%r`, `%>s` and `%b` in the format strings passed to `LogLineParser.parser`.
97
+ * If the value of a field is expected to contain a space, that field should be enclosed in double quotes (which means you have to change the access log settings).
98
+
99
+ ### As a command-line application
100
+
101
+ The command line tool `log_line_parser` can be used for two purposes:
102
+
103
+ 1. For converting file formats
104
+ 2. For picking up log records that satisfy certain criteria
105
+
106
+ For the first purpose, the tool supports conversion from an Apache log format to CSV or TSV format.
107
+ And for the second purpose, criteria such as :not_found?(= :status_code_404?) or :access_by_bots? are defined, and you can combine them by writing a configuration file.
108
+
109
+ #### For converting file formats
110
+
111
+ Suppose you have an Apache log file [example_combined_log.log](./test/data/example_combined_log.log), and run the following command in your terminal:
112
+
113
+ $ log_line_parser example_combined_log.log > expected_combined_log.csv
114
+
115
+ Then you will get [expected_combined_log.csv](./test/data/expected_combined_log.csv).
116
+
117
+ To convert into TSV format:
118
+
119
+ $ log_line_parser --to=tsv example_combined_log.log > expected_combined_log.tsv
120
+
121
+ And you will get [expected_combined_log.tsv](./test/data/expected_combined_log.tsv).
122
+
123
+ #### For picking up log records
124
+
125
+ First, you have to prepare a configuration file in YAML format. [samples/sample_config.yml](./samples/sample_config.yml) is an example.
126
+
127
+ Second, run the following command if you want to pick up from [samples/sample_combined_log.log](./samples/sample_combined_log.log) the log records that meet the definitions in the configuration file:
128
+
129
+ $ log_line_parser --filter-mode --log-format combined --config=samples/sample_config.yml --output-dir=samples/output samples/sample_combined_log.log
130
+
131
+ The results are then written to the [samples/output](https://github.com/nico-hn/LogLineParser/tree/master/samples/output/) directory.
132
+
133
+ ##### Format of configuration
134
+
135
+ An example configuration file is shown below:
136
+
137
+ ```yaml
138
+ ---
139
+ host_name: www.example.org
140
+ resources:
141
+ - /end.html
142
+ - /subdir/big.pdf
143
+ match:
144
+ - :access_to_resources?
145
+ match_type: any
146
+ output_log_name: access-to-two-specific-files
147
+ ---
148
+ host_name: www.example.org
149
+ resources:
150
+ - /
151
+ match:
152
+ - :access_to_under_resources?
153
+ match_type: any
154
+ ignore_match:
155
+ - :access_by_bots?
156
+ - :not_found?
157
+ output_log_name: all-but-bots-and-not-found
158
+ ---
159
+ host_name: www.example.org
160
+ resources:
161
+ - /index.html
162
+ match:
163
+ - :access_to_resources?
164
+ - :access_by_bots?
165
+ match_type: all
166
+ output_log_name: index-page-accessed-by-bot
167
+ ```
168
+ It contains three configurations, and each of them consists of parameters in the following table:
169
+
170
+ |Parameters |Note |
171
+ |------------------------|-----------------------------------------------------------------------------------------------------------|
172
+ |host_name (optional) |Currently, the specified value is compared with the host part of the value of "%{Referer}i". |
173
+ |resources |The values will be compared with the value of "%U%q" field or the path part of the value of "%{Referer}i". |
174
+ |match |The criteria that a log record should satisfy. |
175
+ |ignore_match (optional) |If a log record satisfies any of the criteria listed under this parameter, the record is ignored. |
176
+ |match_type (optional) |The value is "any" (default) or "all". "any" means a log record is picked up if any of the criteria listed under the "match" parameter is satisfied. "all" means all of the criteria must be satisfied for the picking up. |
177
+ |output_log_name |Log records picked up are written in the file specified by this parameter. |
178
+
179
+
180
+ ##### Criteria for "match" and "ignore_match" parameters
181
+
182
+ |Available criteria |Note |
183
+ |----------------------------------------|------------------------------------------------------------------------------------------|
184
+ |:access_by_bots? |Access by major web crawlers such as Googlebot or Bingbot. |
185
+ |:referred_from_resources? |The path part of the value of "%{Referer}i" matches any of the values of "resources". |
186
+ |:referred_from_under_resources? |The path part of the value of "%{Referer}i" begins with any of the values of "resources". |
187
+ |:access_to_resources? |The value of "%U%q" matches any of the values of "resources". |
188
+ |:access_to_under_resources? |The value of "%U%q" begins with any of the values of "resources". |
189
+ |:partial_content? / :status_code_206? |The value of "%>s" is 206. |
190
+ |:moved_permanently? / :status_code_301? |The value of "%>s" is 301. |
191
+ |:not_modified? / :status_code_304? |The value of "%>s" is 304. |
192
+ |:not_found? / :status_code_404? |The value of "%>s" is 404. |
193
+ |:options_method? |The value of "%m" is OPTIONS. |
194
+ |:get_method? |The value of "%m" is GET. |
195
+ |:head_method? |The value of "%m" is HEAD. |
196
+ |:post_method? |The value of "%m" is POST. |
197
+ |:put_method? |The value of "%m" is PUT. |
198
+ |:delete_method? |The value of "%m" is DELETE. |
199
+ |:trace_method? |The value of "%m" is TRACE. |
200
+ |:connect_method? |The value of "%m" is CONNECT. |
201
+ |:patch_method? |The value of "%m" is PATCH. |
202
+
203
+
204
+ ## Development
205
+
206
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment. Run `bundle exec log_line_parser` to use the code located in this directory, ignoring other installed copies of this gem.
207
+
208
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
209
+
210
+ ## Contributing
211
+
212
+ 1. Fork it ( https://github.com/nico-hn/LogLineParser/fork )
213
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
214
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
215
+ 4. Push to the branch (`git push origin my-new-feature`)
216
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Define the test task: every file matching test/test_*.rb is run with
# "lib" and "test" on the load path, Ruby warnings on and verbose output.
Rake::TestTask.new do |test_task|
  test_task.test_files = FileList['test/test_*.rb']
  test_task.libs = %w[lib test]
  test_task.verbose = true
  test_task.warning = true
end

# Invoking `rake` with no task name runs the test task.
task default: :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "log_line_parser"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "log_line_parser"
4
+ require "log_line_parser/command_line_interface"
5
+ require "log_line_parser/moe"
6
+
7
+ LogLineParser::CommandLineInterface.execute
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+
3
module LogLineParser
  # Constants and helpers for working with Apache LogFormat strings.
  #
  # All of the format strings listed in
  # http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats:
  #
  #   %% %a %{c}a %A %B %b %{VARNAME}C %D %{VARNAME}e %f %h %H %{VARNAME}i
  #   %k %l %L %m %{VARNAME}n %{VARNAME}o %p %{format}p %P %{format}P %q %r
  #   %R %s %t %{format}t %T %{UNIT}T %u %U %v %V %X %I %O %S
  #   %{VARNAME}^ti %{VARNAME}^to
  #
  # As explained in http://httpd.apache.org/docs/current/logs.html:
  #
  #   "%r" = "%m %U%q %H"
  module Apache
    # Predefined LogFormat strings.
    module LogFormat
      COMMON = "%h %l %u %t \"%r\" %>s %b".freeze
      COMMON_WITH_VH = "%v %h %l %u %t \"%r\" %>s %b".freeze
      COMBINED = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"".freeze
    end

    # Maps a format string to the symbol used as its field name.
    # Entries that are commented out have no symbol assigned yet.
    # The table is frozen so that it cannot be modified by accident.
    FORMAT_STRING_SYMBOLE_TABLE = {
      "%%" => :percent,
      "%a" => :remote_ip,
      "%{c}a" => :underlying_peer_ip,
      "%A" => :local_ip,
      "%B" => :response_bytes,
      "%b" => :response_bytes,
      # "%{VARNAME}C" => :cookie,
      "%D" => :time_taken_us,
      # "%{VARNAME}e" => :,
      "%f" => :filename,
      "%h" => :remote_host,
      "%H" => :protocol,
      # "%{VARNAME}i" => :,
      "%{Referer}i" => :referer,
      "%{User-agent}i" => :user_agent,
      "%{X-Forwarded-For}i" => :x_forwarded_for,
      "%k" => :keepalive_number,
      "%l" => :remote_logname,
      "%L" => :error_log_request_id,
      "%m" => :method,
      # "%{VARNAME}n" => :,
      # "%{VARNAME}o" => :,
      "%p" => :server_port,
      # "%{format}p" => :,
      "%P" => :pid,
      # "%{format}P" => :,
      "%q" => :query_string,
      "%r" => :first_line_of_request,
      # "%R" => :handler,
      "%s" => :original_request_status,
      "%>s" => :last_request_status, # final status
      "%t" => :time, # Time the request was received
      # "%{format}t" => :,
      "%T" => :time_taken_s,
      # "%{UNIT}T" => :,
      "%u" => :remote_user,
      "%U" => :url_path,
      "%U%q" => :resource,
      "%v" => :virtual_host,
      "%V" => :server_name2,
      "%X" => :connection_status,
      "%I" => :received_bytes_including_headers,
      "%O" => :sent_bytes_including_headers,
      "%S" => :bytes_transferred,
      # "%{VARNAME}^ti" => :,
      # "%{VARNAME}^to" => :,
    }.freeze

    # Splits a LogFormat string on single spaces and removes one double
    # quote from the very beginning and the very end of each piece.
    #
    # @param log_format [String] e.g. LogFormat::COMBINED
    # @return [Array<String>] the individual format strings
    def self.parse_log_format(log_format)
      log_format.split(/ /).map do |string|
        # \A and \z anchor to the whole string, so only enclosing quotes
        # are stripped, never a quote that merely sits next to a newline.
        string.sub(/\A"/, "".freeze).sub(/"\z/, "".freeze)
      end
    end

    # Converts format strings to symbols using FORMAT_STRING_SYMBOLE_TABLE.
    # A string that is not in the table is converted with String#to_sym.
    #
    # @param format_strings [Array<String>]
    # @return [Array<Symbol>]
    def self.format_strings_to_symbols(format_strings)
      format_strings.map do |string|
        FORMAT_STRING_SYMBOLE_TABLE[string] || string.to_sym
      end
    end
  end
end
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'optparse'
5
+ require 'log_line_parser'
6
+ require 'log_line_parser/query'
7
+ require 'log_line_parser/utils'
8
+
9
module LogLineParser
  # Implementation of the `log_line_parser` command. It either converts
  # log lines to CSV/TSV (the default) or, in filter mode, writes the
  # records selected by a YAML configuration into separate log files.
  module CommandLineInterface
    # Raised by execute_as_converter for an output format other than
    # "csv" or "tsv".
    class UnsupportedFormatError < StandardError; end

    # Output format used when the --to option is not given.
    DEFAULT_FORMAT = "csv".freeze

    # Parses a YAML string that may hold several documents.
    #
    # NOTE(review): YAML.load_stream is not a "safe" loader, as far as the
    # Psych documentation indicates; only feed it configuration you trust.
    #
    # @param config [String] YAML source
    # @return [Array] one element per YAML document
    def self.read_configs(config)
      YAML.load_stream(config).to_a
    end

    # Parses the command-line options in ARGV, removing them from ARGV so
    # that only the non-option arguments remain.
    #
    # @return [Hash] with any of the keys :config_file, :filter_mode,
    #   :log_format, :output_dir and :format that were given
    def self.parse_options
      options = {}

      OptionParser.new("USAGE: #{File.basename($0)} [OPTION]... [LOG_FILE]...") do |opt|
        opt.on("-c [config_file]", "--config [=config_file]",
               "Give a configuration file in yaml format") do |config_file|
          options[:config_file] = config_file
        end

        opt.on("-f", "--filter-mode",
               "Mode for choosing log records that satisfy certain criteria") do
          options[:filter_mode] = true
        end

        opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
               "Specify LogFormat") do |log_format|
          options[:log_format] = log_format
        end

        opt.on("-o [output_dir]", "--output-dir [=output_dir]",
               "Specify the output directory for log files") do |output_dir|
          options[:output_dir] = output_dir
        end

        opt.on("-t [format]", "--to [=format]",
               "Specify a format") do |format|
          options[:format] = format
        end

        opt.parse!
      end

      options
    end

    # Reads the YAML configuration file at +config_file+.
    #
    # @param config_file [String] path of the configuration file
    # @return [Array] the configurations returned by read_configs
    # @raise [ArgumentError] if +config_file+ is nil, which happens when
    #   -c/--config is omitted or given without a value
    def self.load_config_file(config_file)
      unless config_file
        raise ArgumentError, "a configuration file must be given with -c/--config"
      end

      # File.read is used rather than Kernel#open, which is not limited
      # to opening plain files.
      read_configs(File.read(File.expand_path(config_file)))
    end

    # Chooses the parser for +log_format+: CombinedLogParser when nil, a
    # predefined parser when +log_format+ is a key of PREDEFINED_FORMATS,
    # otherwise a parser built by LogLineParser.parser.
    #
    # @param log_format [String, nil]
    def self.choose_log_parser(log_format)
      return LogLineParser::CombinedLogParser unless log_format
      parser = LogLineParser::PREDEFINED_FORMATS[log_format]
      parser || LogLineParser.parser(log_format)
    end

    # Entry point of the command: parses the options and runs in filter
    # mode when --filter-mode was given, otherwise as a converter.
    def self.execute
      options = parse_options
      if options[:filter_mode]
        execute_as_filter(options)
      else
        execute_as_converter(options)
      end
    end

    # Runs every query defined in the configuration file against each
    # record yielded by LogLineParser.each_record, with one output file
    # opened per configured output log name.
    #
    # @param options [Hash] as returned by parse_options
    # @raise [ArgumentError] if options[:config_file] is missing
    def self.execute_as_filter(options)
      configs = load_config_file(options[:config_file])
      parser = choose_log_parser(options[:log_format])
      output_dir = options[:output_dir]
      output_log_names = collect_output_log_names(configs)
      Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
        queries = setup_queries_from_configs(configs, logs)
        LogLineParser.each_record(record_type: parser) do |line, record|
          queries.each {|query| query.call(line, record) }
        end
      end
    end

    # Converts each line of +input+ to the format named by
    # options[:format] (DEFAULT_FORMAT when absent), writing to +output+.
    #
    # @param options [Hash] as returned by parse_options
    # @param output [IO] destination, STDOUT by default
    # @param input [#each_line] source of log lines, ARGF by default
    # @raise [UnsupportedFormatError] if the format is not "csv" or "tsv"
    def self.execute_as_converter(options, output=STDOUT, input=ARGF)
      output_format = options[:format] || DEFAULT_FORMAT
      case output_format
      when DEFAULT_FORMAT
        convert_to_csv(input, output)
      when "tsv"
        convert_to_tsv(input, output)
      else
        raise UnsupportedFormatError, output_format
      end
    end

    # The methods below are internal helpers. They remain publicly
    # callable: a bare `private` has no effect on methods defined with
    # `def self.`, so it is left out here rather than kept as a no-op.

    # Collects the output log name of every configuration.
    def self.collect_output_log_names(configs)
      configs.map do |config|
        config[Query::ConfigFields::OUTPUT_LOG_NAME]
      end
    end

    # Builds one query per configuration with Query.register_query_to_log.
    def self.setup_queries_from_configs(configs, logs)
      configs.map do |config|
        Query.register_query_to_log(config, logs)
      end
    end

    # Writes each chomped input line to +output+ via Utils.to_csv.
    # `print` is used, so no line terminator is added here.
    def self.convert_to_csv(input, output)
      input.each_line do |line|
        output.print Utils.to_csv(line.chomp)
      end
    end

    # Writes each chomped input line to +output+ via Utils.to_tsv,
    # one line per record (`puts`).
    def self.convert_to_tsv(input, output)
      input.each_line do |line|
        output.puts Utils.to_tsv(line.chomp)
      end
    end
  end
end