log_line_parser 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6ce0b6e321f311f85bc375942205567ab4d60743
4
- data.tar.gz: ee2d0b41508c833d39cade577581426dbb06cb23
3
+ metadata.gz: 658d4493353541baf8aec1350bcd803b4b11b9e9
4
+ data.tar.gz: 1158015fed2a075f93da9c94bcd8921734b83226
5
5
  SHA512:
6
- metadata.gz: c2b363c6be57677fda905270e36f0ee23c39f9cb0915199eaad63a5fabc566a814437e92ad1706040e874e171107c3c901e90d195c1d530ea4e0282edac0d2b6
7
- data.tar.gz: ea2ae4f5b9c0eb962c221e605927f0cdf977c194d612fff620a0cc2bb113825620762b67c4e377abab4654dcc1877dc3134f73625d07119d7cfa70610eb9d2a7
6
+ metadata.gz: 3bda73a0a7fa68a464b37e0b2287ce82206c5ba7fd973888b90935f7b6835c93f9c7aff93b7282f42f4311dbb801145353d306a5ae5ca5e047de6a01a7f19fd0
7
+ data.tar.gz: a89dbdf52206cfa8ca0e5752dfbac7041b689def7b9d448a98701cdd577cbd21bd7023140f31b66f85fdb33a23c8269aa3838afbcf3ce914453d605a6759c78a
data/.gitignore CHANGED
@@ -12,3 +12,4 @@
12
12
  \#*\#
13
13
  /*.gem
14
14
  /*.zip
15
+ /test/data/tmp/
data/.travis.yml CHANGED
@@ -1,3 +1,12 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.1.6
3
+ - 2.2.0
4
+ - 2.3.3
5
+ - 2.4.0
6
+ - jruby-9.1.8.0
7
+ sudo: required
8
+ dist: trusty
9
+ addons:
10
+ apt:
11
+ packages:
12
+ - haveged
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # LogLineParser
2
2
 
3
- LogLineParser is a simple parser of Apache access logs. It parses a line of Apache access log and turns it into an array of strings or a Hash object.
4
- And from the command line, you can use it as a conversion tool of file formats or as a filtering tool of access records.
3
+ LogLineParser is a simple parser for Apache access logs. It parses a line of an Apache access log and turns it into an array of strings or a Hash object.
4
+ From the command line, you can also use it as a tool for converting file formats (to CSV or TSV) or for filtering access records.
5
5
 
6
6
  ## Installation
7
7
 
@@ -19,6 +19,21 @@ Or install it yourself as:
19
19
 
20
20
  $ gem install log_line_parser
21
21
 
22
+ ### If you have millions of records to be parsed
23
+
24
+ I recommend using [JRuby](http://jruby.org/) (version >= 9.1.8), because it is significantly faster than MRI.
25
+
26
+ When you use JRuby, you have to set the RUBYOPT environment variable first. For example:
27
+
28
+ # For Bash or Zsh
29
+ $ RUBYOPT='-Xcompile.invokedynamic=true'
30
+ $ export RUBYOPT
31
+
32
+ Or
33
+
34
+ # For MS Windows
35
+ set RUBYOPT=-Xcompile.invokedynamic=true
36
+
22
37
  ## Usage
23
38
 
24
39
  ### As a converter
@@ -63,7 +78,7 @@ Three parsers are predefined for such cases:
63
78
  <dt>LogLineParser::CommonLogWithVHParser</dt>
64
79
  <dd>For Common Log Format with Virtual Host</dd>
65
80
  <dt>LogLineParser::CombinedLogParser</dt>
66
- <dd>NCSA extended/combined log format</dd>
81
+ <dd>For NCSA extended/combined log format</dd>
67
82
  </dl>
68
83
 
69
84
  #### Defining a parser
@@ -249,6 +264,21 @@ The values given by the configuration file are compiled into a regular expressio
249
264
 
250
265
  $ log_line_parser --show-current-settings --bots-config=bots_config.yml
251
266
 
267
+ #### Command line options
268
+
269
+ The following command options are available.
270
+
271
+ |Short |Long |Description |
272
+ |----------------------|----------------------------------|----------------------------------------------------------------------------------------------------------------------|
273
+ |-c [config_file] |--config [=config_file] |Give a configuration file in yaml format |
274
+ |-b [bots_config_file] |--bots-config [=bots_config_file] |Give a configuration file in yaml format. Default bots: Googlebot, Googlebot-Mobile, Mediapartners-Google, Bingbot, Slurp, Baiduspider, BaiduImagespider, BaiduMobaider, YetiBot, Applebot |
275
+ |-s |--show-current-settings |Show the detail of the current settings |
276
+ |-f |--filter-mode |Mode for choosing log records that satisfy certain criteria |
277
+ |-l [LogFormat] |--log-format [=LogFormat] |Specify LogFormat by giving a LogFormat or one of formats predefined as "common", "common_with_vh", "combined", "moe" |
278
+ |-o [output_dir] |--output-dir [=output_dir] |Specify the output directory for log files |
279
+ |-t [format] |--to [=format] |Specify a format: csv, tsv or ltsv |
280
+ |-e [error_log_file] |--error-log [=error_log_file] |Specify a file for error logging |
281
+
252
282
 
253
283
  ## Development
254
284
 
@@ -268,8 +268,18 @@ module LogLineParser
268
268
  begin
269
269
  yield line, parser.parse(line)
270
270
  rescue MalFormedRecordError => e
271
- error_output.print e.message
271
+ error_output.print error_message(input, e)
272
272
  end
273
273
  end
274
274
  end
275
+
276
+ def self.error_message(input, e)
277
+ if input == ARGF
278
+ "#{ARGF.filename}:#{ARGF.file.lineno}:#{e.message}"
279
+ else
280
+ e.message
281
+ end
282
+ end
283
+
284
+ private_class_method :error_message
275
285
  end
@@ -10,10 +10,93 @@ module LogLineParser
10
10
  module CommandLineInterface
11
11
  class UnsupportedFormatError < StandardError; end
12
12
 
13
+ class Converter
14
+ def execute(options, output=STDOUT, input=ARGF)
15
+ output_format = options[:format] || DEFAULT_FORMAT
16
+ case output_format
17
+ when DEFAULT_FORMAT
18
+ to_csv(input, output)
19
+ when "tsv"
20
+ to_tsv(input, output)
21
+ when "ltsv"
22
+ to_ltsv(input, output, options[:log_format])
23
+ else
24
+ raise UnsupportedFormatError.new(output_format)
25
+ end
26
+ end
27
+
28
+ def to_csv(input, output)
29
+ input.each_line do |line|
30
+ output.print Utils.to_csv(line.chomp)
31
+ end
32
+ end
33
+
34
+ def to_tsv(input, output)
35
+ input.each_line do |line|
36
+ output.puts Utils.to_tsv(line.chomp)
37
+ end
38
+ end
39
+
40
+ def to_ltsv(input, output, parser)
41
+ input.each_line do |line|
42
+ output.puts parser.to_ltsv(line.chomp)
43
+ end
44
+ end
45
+ end
46
+
47
+ class Filter
48
+ OptionValues = Struct.new(:configs, :bots_re, :output_log_names,
49
+ :output_dir, :log_format, :error_log)
50
+
51
+ def execute(options)
52
+ opt = option_values(options)
53
+ Utils.open_multiple_output_files(opt.output_log_names,
54
+ opt.output_dir) do |logs|
55
+ queries = setup_queries_from_configs(opt.configs, logs, opt.bots_re)
56
+ LogLineParser.each_record(error_output: opt.error_log || STDERR,
57
+ parser: opt.log_format) do |line, record|
58
+ queries.each {|query| query.call(line, record) }
59
+ end
60
+ end
61
+ ensure
62
+ opt.error_log.close if opt.error_log
63
+ end
64
+
65
+ private
66
+
67
+ def option_values(options)
68
+ configs = Utils.load_config_file(options[:config_file])
69
+ bots_re = Utils.compile_bots_re_from_config_file(options[:bots_config_file])
70
+ error_log = open_error_log(options[:error_log_file])
71
+ OptionValues.new(configs,
72
+ bots_re,
73
+ collect_output_log_names(configs),
74
+ options[:output_dir],
75
+ options[:log_format],
76
+ error_log)
77
+ end
78
+
79
+ def collect_output_log_names(configs)
80
+ configs.map do |config|
81
+ config[Query::ConfigFields::OUTPUT_LOG_NAME]
82
+ end
83
+ end
84
+
85
+ def setup_queries_from_configs(configs, logs, bots_re)
86
+ configs.map do |config|
87
+ Query.register_query_to_log(config, logs, bots_re)
88
+ end
89
+ end
90
+
91
+ def open_error_log(log_file)
92
+ open(File.expand_path(log_file), "wb") if log_file
93
+ end
94
+ end
95
+
13
96
  DEFAULT_FORMAT = "csv"
14
97
 
15
98
  def self.parse_options
16
- options = {}
99
+ options = { log_format: LogLineParser::CombinedLogParser }
17
100
 
18
101
  OptionParser.new("USAGE: #{File.basename($0)} [OPTION]... [LOG_FILE]...") do |opt|
19
102
  opt.on("-c [config_file]", "--config [=config_file]",
@@ -40,7 +123,7 @@ Default bots: #{Bots::DEFAULT_BOTS.join(', ')}") do |config_file|
40
123
  opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
41
124
  "Specify LogFormat by giving a LogFormat or one of \
42
125
  formats predefined as #{predefined_options_for_log_format}") do |log_format|
43
- options[:log_format] = log_format
126
+ options[:log_format] = choose_log_parser(log_format)
44
127
  end
45
128
 
46
129
  opt.on("-o [output_dir]", "--output-dir [=output_dir]",
@@ -53,6 +136,11 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
53
136
  options[:format] = format
54
137
  end
55
138
 
139
+ opt.on("-e [error_log_file]", "--error-log [=error_log_file]",
140
+ "Specify a file for error logging") do |error_log_file|
141
+ options[:error_log_file] = error_log_file
142
+ end
143
+
56
144
  opt.parse!
57
145
  end
58
146
 
@@ -60,7 +148,6 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
60
148
  end
61
149
 
62
150
  def self.choose_log_parser(log_format)
63
- return LogLineParser::CombinedLogParser unless log_format
64
151
  parser = LogLineParser::PREDEFINED_FORMATS[log_format]
65
152
  parser || LogLineParser.parser(log_format)
66
153
  end
@@ -77,33 +164,18 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
77
164
  end
78
165
 
79
166
  def self.show_settings(options)
80
- bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
81
- parser = choose_log_parser(options[:log_format])
167
+ bots_re = Utils.compile_bots_re_from_config_file(options[:bots_config_file])
168
+ parser = options[:log_format]
82
169
  puts "The regular expression for bots: #{bots_re}"
83
170
  puts "LogFormat: #{parser.format_strings}"
84
171
  end
85
172
 
86
173
  def self.execute_as_filter(options)
87
- configs = Utils.load_config_file(options[:config_file])
88
- parser = choose_log_parser(options[:log_format])
89
- output_dir = options[:output_dir]
90
- bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
91
- execute_queries(configs, parser, output_dir, bots_re)
174
+ Filter.new.execute(options)
92
175
  end
93
176
 
94
177
  def self.execute_as_converter(options, output=STDOUT, input=ARGF)
95
- output_format = options[:format] || DEFAULT_FORMAT
96
- case output_format
97
- when DEFAULT_FORMAT
98
- convert_to_csv(input, output)
99
- when "tsv"
100
- convert_to_tsv(input, output)
101
- when "ltsv"
102
- convert_to_ltsv(input, output,
103
- choose_log_parser(options[:log_format]))
104
- else
105
- raise UnsupportedFormatError.new(output_format)
106
- end
178
+ Converter.new.execute(options, output, input)
107
179
  end
108
180
 
109
181
  # private class methods
@@ -114,59 +186,6 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
114
186
  join(", ")
115
187
  end
116
188
 
117
- def self.compile_bots_re_from_config_file(bots_config_file)
118
- return Bots::DEFAULT_RE unless bots_config_file
119
- configs = Utils.load_config_file(bots_config_file)[0]
120
- Bots.compile_bots_re(configs)
121
- end
122
-
123
- def self.collect_output_log_names(configs)
124
- configs.map do |config|
125
- config[Query::ConfigFields::OUTPUT_LOG_NAME]
126
- end
127
- end
128
-
129
- def self.execute_queries(configs, parser, output_dir, bots_re)
130
- output_log_names = collect_output_log_names(configs)
131
- Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
132
- queries = setup_queries_from_configs(configs, logs, bots_re)
133
- LogLineParser.each_record(parser: parser) do |line, record|
134
- queries.each {|query| query.call(line, record) }
135
- end
136
- end
137
- end
138
-
139
- def self.setup_queries_from_configs(configs, logs, bots_re)
140
- configs.map do |config|
141
- Query.register_query_to_log(config, logs, bots_re)
142
- end
143
- end
144
-
145
- def self.convert_to_csv(input, output)
146
- input.each_line do |line|
147
- output.print Utils.to_csv(line.chomp)
148
- end
149
- end
150
-
151
- def self.convert_to_tsv(input, output)
152
- input.each_line do |line|
153
- output.puts Utils.to_tsv(line.chomp)
154
- end
155
- end
156
-
157
- def self.convert_to_ltsv(input, output, parser)
158
- input.each_line do |line|
159
- output.puts parser.to_ltsv(line.chomp)
160
- end
161
- end
162
-
163
- private_class_method(:predefined_options_for_log_format,
164
- :compile_bots_re_from_config_file,
165
- :collect_output_log_names,
166
- :execute_queries,
167
- :setup_queries_from_configs,
168
- :convert_to_csv,
169
- :convert_to_tsv,
170
- :convert_to_ltsv)
189
+ private_class_method :predefined_options_for_log_format
171
190
  end
172
191
  end
@@ -3,13 +3,17 @@
3
3
  module LineParser
4
4
  class Tokenizer
5
5
  class << self
6
- attr_reader :special_token_re, :non_special_token_re
6
+ attr_reader :special_token_re
7
7
 
8
8
  def tokenize(str, tokens=[])
9
9
  @scanner.string = str
10
- token = true # to start looping, you should assign a truthy value
11
- while token
12
- tokens.push token if token = scan_token
10
+ cur_pos = 0 # instead of @scanner.pos
11
+ while chunk_size = @scanner.skip_until(@special_token_re)
12
+ token = @scanner.matched
13
+ pre_match_size = chunk_size - token.bytesize
14
+ tokens.push str.byteslice(cur_pos, pre_match_size) if pre_match_size > 0
15
+ tokens.push token
16
+ cur_pos += chunk_size
13
17
  end
14
18
 
15
19
  tokens.push @scanner.rest unless @scanner.eos?
@@ -20,16 +24,11 @@ module LineParser
20
24
  @special_tokens = special_tokens
21
25
  @unescaped_special_tokens = unescaped_special_tokens
22
26
  @scanner = StringScanner.new("".freeze)
23
- @special_token_re, @non_special_token_re = compose_re(@special_tokens)
27
+ @special_token_re = compose_re(@special_tokens)
24
28
  end
25
29
 
26
30
  private
27
31
 
28
- def scan_token
29
- @scanner.scan(@special_token_re) ||
30
- @scanner.scan_until(@non_special_token_re)
31
- end
32
-
33
32
  def compose_special_tokens_str(special_tokens)
34
33
  sorted = special_tokens.sort {|x, y| y.length <=> x.length }
35
34
  escaped = sorted.map {|token| Regexp.escape(token) }
@@ -39,7 +38,7 @@ module LineParser
39
38
 
40
39
  def compose_re(special_tokens)
41
40
  tokens_str = compose_special_tokens_str(special_tokens)
42
- return Regexp.compile(tokens_str), Regexp.compile("(?=#{tokens_str})")
41
+ return Regexp.compile(tokens_str)
43
42
  end
44
43
  end
45
44
  end
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  module LogLineParser
4
-
5
4
  module Ltsv
6
5
  LABEL_SEPARATOR = ":"
7
6
  TAB = "\t"
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'log_line_parser'
4
- require 'log_line_parser/query'
5
4
  require 'csv'
6
5
  require 'yaml'
7
6
 
@@ -24,9 +23,7 @@ module LogLineParser
24
23
  end
25
24
  yield logs
26
25
  ensure
27
- logs.each do |k, v|
28
- v.close
29
- end
26
+ logs.each_value {|v| v.close }
30
27
  end
31
28
 
32
29
  def self.read_configs(config)
@@ -39,6 +36,12 @@ module LogLineParser
39
36
  end
40
37
  end
41
38
 
39
+ def self.compile_bots_re_from_config_file(bots_config_file)
40
+ return Bots::DEFAULT_RE unless bots_config_file
41
+ configs = load_config_file(bots_config_file)[0]
42
+ Bots.compile_bots_re(configs)
43
+ end
44
+
42
45
  def self.to_tsv(line, escape=true)
43
46
  LogLineParser.parse(line).to_a.map do |field|
44
47
  escape ? escape_special_chars(field) : field
@@ -1,3 +1,3 @@
1
1
  module LogLineParser
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["HASHIMOTO, Naoki"]
11
11
  spec.email = ["hashimoto.naoki@gmail.com"]
12
12
 
13
- spec.summary = %q{A simple parser of Apache access logs}
14
- spec.description = %q{A simple parser of Apache access logs: it parses a line of Apache access log and turns it into an array of strings or a Hash object. And from the command line, you can use it as a conversion tool of file formats or as a filtering tool of access records.}
13
+ spec.summary = %q{A simple parser of Apache access log}
14
+ spec.description = %q{A simple parser of Apache access log: it parses a line of Apache access log and turns it into an array of strings or a Hash object. And from the command line, you can use it as a conversion tool of file format (to CSV/TSV) or as a filtering tool of access records.}
15
15
  spec.homepage = "https://github.com/nico-hn/LogLineParser"
16
16
  spec.license = "MIT"
17
17
 
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ["lib"]
30
30
 
31
- spec.add_development_dependency "bundler", "~> 1.9"
31
+ spec.add_development_dependency "bundler", ">= 1.7"
32
32
  spec.add_development_dependency "rake", "~> 10.0"
33
+ spec.add_development_dependency "minitest"
33
34
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_line_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - HASHIMOTO, Naoki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-04-10 00:00:00.000000000 Z
11
+ date: 2017-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.9'
19
+ version: '1.7'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.9'
26
+ version: '1.7'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -38,10 +38,24 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: 'A simple parser of Apache access logs: it parses a line of Apache access
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: 'A simple parser of Apache access log: it parses a line of Apache access
42
56
  log and turns it into an array of strings or a Hash object. And from the command
43
- line, you can use it as a conversion tool of file formats or as a filtering tool
44
- of access records.'
57
+ line, you can use it as a conversion tool of file format (to CSV/TSV) or as a filtering
58
+ tool of access records.'
45
59
  email:
46
60
  - hashimoto.naoki@gmail.com
47
61
  executables:
@@ -99,5 +113,5 @@ rubyforge_project:
99
113
  rubygems_version: 2.2.3
100
114
  signing_key:
101
115
  specification_version: 4
102
- summary: A simple parser of Apache access logs
116
+ summary: A simple parser of Apache access log
103
117
  test_files: []