log_line_parser 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6ce0b6e321f311f85bc375942205567ab4d60743
4
- data.tar.gz: ee2d0b41508c833d39cade577581426dbb06cb23
3
+ metadata.gz: 658d4493353541baf8aec1350bcd803b4b11b9e9
4
+ data.tar.gz: 1158015fed2a075f93da9c94bcd8921734b83226
5
5
  SHA512:
6
- metadata.gz: c2b363c6be57677fda905270e36f0ee23c39f9cb0915199eaad63a5fabc566a814437e92ad1706040e874e171107c3c901e90d195c1d530ea4e0282edac0d2b6
7
- data.tar.gz: ea2ae4f5b9c0eb962c221e605927f0cdf977c194d612fff620a0cc2bb113825620762b67c4e377abab4654dcc1877dc3134f73625d07119d7cfa70610eb9d2a7
6
+ metadata.gz: 3bda73a0a7fa68a464b37e0b2287ce82206c5ba7fd973888b90935f7b6835c93f9c7aff93b7282f42f4311dbb801145353d306a5ae5ca5e047de6a01a7f19fd0
7
+ data.tar.gz: a89dbdf52206cfa8ca0e5752dfbac7041b689def7b9d448a98701cdd577cbd21bd7023140f31b66f85fdb33a23c8269aa3838afbcf3ce914453d605a6759c78a
data/.gitignore CHANGED
@@ -12,3 +12,4 @@
12
12
  \#*\#
13
13
  /*.gem
14
14
  /*.zip
15
+ /test/data/tmp/
data/.travis.yml CHANGED
@@ -1,3 +1,12 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.1.6
3
+ - 2.2.0
4
+ - 2.3.3
5
+ - 2.4.0
6
+ - jruby-9.1.8.0
7
+ sudo: required
8
+ dist: trusty
9
+ addons:
10
+ apt:
11
+ packages:
12
+ - haveged
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # LogLineParser
2
2
 
3
- LogLineParser is a simple parser of Apache access logs. It parses a line of Apache access log and turns it into an array of strings or a Hash object.
4
- And from the command line, you can use it as a conversion tool of file formats or as a filtering tool of access records.
3
+ LogLineParser is a simple parser for Apache access logs. It parses a line of an Apache access log and turns it into an array of strings or a Hash object.
4
+ From the command line, you can also use it as a tool for converting the file format (to CSV or TSV) or for filtering access records.
5
5
 
6
6
  ## Installation
7
7
 
@@ -19,6 +19,21 @@ Or install it yourself as:
19
19
 
20
20
  $ gem install log_line_parser
21
21
 
22
+ ### If you have millions of records to be parsed
23
+
24
+ I recommend using [JRuby](http://jruby.org/) (version >= 9.1.8), because it is significantly faster than MRI.
25
+
26
+ When you use JRuby, you first have to set the RUBYOPT environment variable. For example:
27
+
28
+ # For Bash or Zsh
29
+ $ RUBYOPT='-Xcompile.invokedynamic=true'
30
+ $ export RUBYOPT
31
+
32
+ Or
33
+
34
+ # For MS Windows
35
+ set RUBYOPT=-Xcompile.invokedynamic=true
36
+
22
37
  ## Usage
23
38
 
24
39
  ### As a converter
@@ -63,7 +78,7 @@ Three parsers are predefined for such cases:
63
78
  <dt>LogLineParser::CommonLogWithVHParser</dt>
64
79
  <dd>For Common Log Format with Virtual Host</dd>
65
80
  <dt>LogLineParser::CombinedLogParser</dt>
66
- <dd>NCSA extended/combined log format</dd>
81
+ <dd>For NCSA extended/combined log format</dd>
67
82
  </dl>
68
83
 
69
84
  #### Defining a parser
@@ -249,6 +264,21 @@ The values given by the configuration file are compiled into a regular expressio
249
264
 
250
265
  $ log_line_parser --show-current-settings --bots-config=bots_config.yml
251
266
 
267
+ #### Command line options
268
+
269
+ The following command line options are available.
270
+
271
+ |Short |Long |Description |
272
+ |----------------------|----------------------------------|----------------------------------------------------------------------------------------------------------------------|
273
+ |-c [config_file] |--config [=config_file] |Give a configuration file in yaml format |
274
+ |-b [bots_config_file] |--bots-config [=bots_config_file] |Give a configuration file in yaml format. Default bots: Googlebot, Googlebot-Mobile, Mediapartners-Google, Bingbot, Slurp, Baiduspider, BaiduImagespider, BaiduMobaider, YetiBot, Applebot |
275
+ |-s |--show-current-settings |Show the detail of the current settings |
276
+ |-f |--filter-mode |Mode for choosing log records that satisfy certain criteria |
277
+ |-l [LogFormat] |--log-format [=LogFormat] |Specify LogFormat by giving a LogFormat or one of formats predefined as "common", "common_with_vh", "combined", "moe" |
278
+ |-o [output_dir] |--output-dir [=output_dir] |Specify the output directory for log files |
279
+ |-t [format] |--to [=format] |Specify a format: csv, tsv or ltsv |
280
+ |-e [error_log_file] |--error-log [=error_log_file] |Specify a file for error logging |
281
+
252
282
 
253
283
  ## Development
254
284
 
@@ -268,8 +268,18 @@ module LogLineParser
268
268
  begin
269
269
  yield line, parser.parse(line)
270
270
  rescue MalFormedRecordError => e
271
- error_output.print e.message
271
+ error_output.print error_message(input, e)
272
272
  end
273
273
  end
274
274
  end
275
+
276
+ def self.error_message(input, e)
277
+ if input == ARGF
278
+ "#{ARGF.filename}:#{ARGF.file.lineno}:#{e.message}"
279
+ else
280
+ e.message
281
+ end
282
+ end
283
+
284
+ private_class_method :error_message
275
285
  end
@@ -10,10 +10,93 @@ module LogLineParser
10
10
  module CommandLineInterface
11
11
  class UnsupportedFormatError < StandardError; end
12
12
 
13
+ class Converter
14
+ def execute(options, output=STDOUT, input=ARGF)
15
+ output_format = options[:format] || DEFAULT_FORMAT
16
+ case output_format
17
+ when DEFAULT_FORMAT
18
+ to_csv(input, output)
19
+ when "tsv"
20
+ to_tsv(input, output)
21
+ when "ltsv"
22
+ to_ltsv(input, output, options[:log_format])
23
+ else
24
+ raise UnsupportedFormatError.new(output_format)
25
+ end
26
+ end
27
+
28
+ def to_csv(input, output)
29
+ input.each_line do |line|
30
+ output.print Utils.to_csv(line.chomp)
31
+ end
32
+ end
33
+
34
+ def to_tsv(input, output)
35
+ input.each_line do |line|
36
+ output.puts Utils.to_tsv(line.chomp)
37
+ end
38
+ end
39
+
40
+ def to_ltsv(input, output, parser)
41
+ input.each_line do |line|
42
+ output.puts parser.to_ltsv(line.chomp)
43
+ end
44
+ end
45
+ end
46
+
47
+ class Filter
48
+ OptionValues = Struct.new(:configs, :bots_re, :output_log_names,
49
+ :output_dir, :log_format, :error_log)
50
+
51
+ def execute(options)
52
+ opt = option_values(options)
53
+ Utils.open_multiple_output_files(opt.output_log_names,
54
+ opt.output_dir) do |logs|
55
+ queries = setup_queries_from_configs(opt.configs, logs, opt.bots_re)
56
+ LogLineParser.each_record(error_output: opt.error_log || STDERR,
57
+ parser: opt.log_format) do |line, record|
58
+ queries.each {|query| query.call(line, record) }
59
+ end
60
+ end
61
+ ensure
62
+ opt.error_log.close if opt.error_log
63
+ end
64
+
65
+ private
66
+
67
+ def option_values(options)
68
+ configs = Utils.load_config_file(options[:config_file])
69
+ bots_re = Utils.compile_bots_re_from_config_file(options[:bots_config_file])
70
+ error_log = open_error_log(options[:error_log_file])
71
+ OptionValues.new(configs,
72
+ bots_re,
73
+ collect_output_log_names(configs),
74
+ options[:output_dir],
75
+ options[:log_format],
76
+ error_log)
77
+ end
78
+
79
+ def collect_output_log_names(configs)
80
+ configs.map do |config|
81
+ config[Query::ConfigFields::OUTPUT_LOG_NAME]
82
+ end
83
+ end
84
+
85
+ def setup_queries_from_configs(configs, logs, bots_re)
86
+ configs.map do |config|
87
+ Query.register_query_to_log(config, logs, bots_re)
88
+ end
89
+ end
90
+
91
+ def open_error_log(log_file)
92
+ open(File.expand_path(log_file), "wb") if log_file
93
+ end
94
+ end
95
+
13
96
  DEFAULT_FORMAT = "csv"
14
97
 
15
98
  def self.parse_options
16
- options = {}
99
+ options = { log_format: LogLineParser::CombinedLogParser }
17
100
 
18
101
  OptionParser.new("USAGE: #{File.basename($0)} [OPTION]... [LOG_FILE]...") do |opt|
19
102
  opt.on("-c [config_file]", "--config [=config_file]",
@@ -40,7 +123,7 @@ Default bots: #{Bots::DEFAULT_BOTS.join(', ')}") do |config_file|
40
123
  opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
41
124
  "Specify LogFormat by giving a LogFormat or one of \
42
125
  formats predefined as #{predefined_options_for_log_format}") do |log_format|
43
- options[:log_format] = log_format
126
+ options[:log_format] = choose_log_parser(log_format)
44
127
  end
45
128
 
46
129
  opt.on("-o [output_dir]", "--output-dir [=output_dir]",
@@ -53,6 +136,11 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
53
136
  options[:format] = format
54
137
  end
55
138
 
139
+ opt.on("-e [error_log_file]", "--error-log [=error_log_file]",
140
+ "Specify a file for error logging") do |error_log_file|
141
+ options[:error_log_file] = error_log_file
142
+ end
143
+
56
144
  opt.parse!
57
145
  end
58
146
 
@@ -60,7 +148,6 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
60
148
  end
61
149
 
62
150
  def self.choose_log_parser(log_format)
63
- return LogLineParser::CombinedLogParser unless log_format
64
151
  parser = LogLineParser::PREDEFINED_FORMATS[log_format]
65
152
  parser || LogLineParser.parser(log_format)
66
153
  end
@@ -77,33 +164,18 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
77
164
  end
78
165
 
79
166
  def self.show_settings(options)
80
- bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
81
- parser = choose_log_parser(options[:log_format])
167
+ bots_re = Utils.compile_bots_re_from_config_file(options[:bots_config_file])
168
+ parser = options[:log_format]
82
169
  puts "The regular expression for bots: #{bots_re}"
83
170
  puts "LogFormat: #{parser.format_strings}"
84
171
  end
85
172
 
86
173
  def self.execute_as_filter(options)
87
- configs = Utils.load_config_file(options[:config_file])
88
- parser = choose_log_parser(options[:log_format])
89
- output_dir = options[:output_dir]
90
- bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
91
- execute_queries(configs, parser, output_dir, bots_re)
174
+ Filter.new.execute(options)
92
175
  end
93
176
 
94
177
  def self.execute_as_converter(options, output=STDOUT, input=ARGF)
95
- output_format = options[:format] || DEFAULT_FORMAT
96
- case output_format
97
- when DEFAULT_FORMAT
98
- convert_to_csv(input, output)
99
- when "tsv"
100
- convert_to_tsv(input, output)
101
- when "ltsv"
102
- convert_to_ltsv(input, output,
103
- choose_log_parser(options[:log_format]))
104
- else
105
- raise UnsupportedFormatError.new(output_format)
106
- end
178
+ Converter.new.execute(options, output, input)
107
179
  end
108
180
 
109
181
  # private class methods
@@ -114,59 +186,6 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
114
186
  join(", ")
115
187
  end
116
188
 
117
- def self.compile_bots_re_from_config_file(bots_config_file)
118
- return Bots::DEFAULT_RE unless bots_config_file
119
- configs = Utils.load_config_file(bots_config_file)[0]
120
- Bots.compile_bots_re(configs)
121
- end
122
-
123
- def self.collect_output_log_names(configs)
124
- configs.map do |config|
125
- config[Query::ConfigFields::OUTPUT_LOG_NAME]
126
- end
127
- end
128
-
129
- def self.execute_queries(configs, parser, output_dir, bots_re)
130
- output_log_names = collect_output_log_names(configs)
131
- Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
132
- queries = setup_queries_from_configs(configs, logs, bots_re)
133
- LogLineParser.each_record(parser: parser) do |line, record|
134
- queries.each {|query| query.call(line, record) }
135
- end
136
- end
137
- end
138
-
139
- def self.setup_queries_from_configs(configs, logs, bots_re)
140
- configs.map do |config|
141
- Query.register_query_to_log(config, logs, bots_re)
142
- end
143
- end
144
-
145
- def self.convert_to_csv(input, output)
146
- input.each_line do |line|
147
- output.print Utils.to_csv(line.chomp)
148
- end
149
- end
150
-
151
- def self.convert_to_tsv(input, output)
152
- input.each_line do |line|
153
- output.puts Utils.to_tsv(line.chomp)
154
- end
155
- end
156
-
157
- def self.convert_to_ltsv(input, output, parser)
158
- input.each_line do |line|
159
- output.puts parser.to_ltsv(line.chomp)
160
- end
161
- end
162
-
163
- private_class_method(:predefined_options_for_log_format,
164
- :compile_bots_re_from_config_file,
165
- :collect_output_log_names,
166
- :execute_queries,
167
- :setup_queries_from_configs,
168
- :convert_to_csv,
169
- :convert_to_tsv,
170
- :convert_to_ltsv)
189
+ private_class_method :predefined_options_for_log_format
171
190
  end
172
191
  end
@@ -3,13 +3,17 @@
3
3
  module LineParser
4
4
  class Tokenizer
5
5
  class << self
6
- attr_reader :special_token_re, :non_special_token_re
6
+ attr_reader :special_token_re
7
7
 
8
8
  def tokenize(str, tokens=[])
9
9
  @scanner.string = str
10
- token = true # to start looping, you should assign a truthy value
11
- while token
12
- tokens.push token if token = scan_token
10
+ cur_pos = 0 # instead of @scanner.pos
11
+ while chunk_size = @scanner.skip_until(@special_token_re)
12
+ token = @scanner.matched
13
+ pre_match_size = chunk_size - token.bytesize
14
+ tokens.push str.byteslice(cur_pos, pre_match_size) if pre_match_size > 0
15
+ tokens.push token
16
+ cur_pos += chunk_size
13
17
  end
14
18
 
15
19
  tokens.push @scanner.rest unless @scanner.eos?
@@ -20,16 +24,11 @@ module LineParser
20
24
  @special_tokens = special_tokens
21
25
  @unescaped_special_tokens = unescaped_special_tokens
22
26
  @scanner = StringScanner.new("".freeze)
23
- @special_token_re, @non_special_token_re = compose_re(@special_tokens)
27
+ @special_token_re = compose_re(@special_tokens)
24
28
  end
25
29
 
26
30
  private
27
31
 
28
- def scan_token
29
- @scanner.scan(@special_token_re) ||
30
- @scanner.scan_until(@non_special_token_re)
31
- end
32
-
33
32
  def compose_special_tokens_str(special_tokens)
34
33
  sorted = special_tokens.sort {|x, y| y.length <=> x.length }
35
34
  escaped = sorted.map {|token| Regexp.escape(token) }
@@ -39,7 +38,7 @@ module LineParser
39
38
 
40
39
  def compose_re(special_tokens)
41
40
  tokens_str = compose_special_tokens_str(special_tokens)
42
- return Regexp.compile(tokens_str), Regexp.compile("(?=#{tokens_str})")
41
+ return Regexp.compile(tokens_str)
43
42
  end
44
43
  end
45
44
  end
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  module LogLineParser
4
-
5
4
  module Ltsv
6
5
  LABEL_SEPARATOR = ":"
7
6
  TAB = "\t"
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'log_line_parser'
4
- require 'log_line_parser/query'
5
4
  require 'csv'
6
5
  require 'yaml'
7
6
 
@@ -24,9 +23,7 @@ module LogLineParser
24
23
  end
25
24
  yield logs
26
25
  ensure
27
- logs.each do |k, v|
28
- v.close
29
- end
26
+ logs.each_value {|v| v.close }
30
27
  end
31
28
 
32
29
  def self.read_configs(config)
@@ -39,6 +36,12 @@ module LogLineParser
39
36
  end
40
37
  end
41
38
 
39
+ def self.compile_bots_re_from_config_file(bots_config_file)
40
+ return Bots::DEFAULT_RE unless bots_config_file
41
+ configs = load_config_file(bots_config_file)[0]
42
+ Bots.compile_bots_re(configs)
43
+ end
44
+
42
45
  def self.to_tsv(line, escape=true)
43
46
  LogLineParser.parse(line).to_a.map do |field|
44
47
  escape ? escape_special_chars(field) : field
@@ -1,3 +1,3 @@
1
1
  module LogLineParser
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -10,8 +10,8 @@ Gem::Specification.new do |spec|
10
10
  spec.authors = ["HASHIMOTO, Naoki"]
11
11
  spec.email = ["hashimoto.naoki@gmail.com"]
12
12
 
13
- spec.summary = %q{A simple parser of Apache access logs}
14
- spec.description = %q{A simple parser of Apache access logs: it parses a line of Apache access log and turns it into an array of strings or a Hash object. And from the command line, you can use it as a conversion tool of file formats or as a filtering tool of access records.}
13
+ spec.summary = %q{A simple parser of Apache access log}
14
+ spec.description = %q{A simple parser of Apache access log: it parses a line of Apache access log and turns it into an array of strings or a Hash object. And from the command line, you can use it as a conversion tool of file format (to CSV/TSV) or as a filtering tool of access records.}
15
15
  spec.homepage = "https://github.com/nico-hn/LogLineParser"
16
16
  spec.license = "MIT"
17
17
 
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ["lib"]
30
30
 
31
- spec.add_development_dependency "bundler", "~> 1.9"
31
+ spec.add_development_dependency "bundler", ">= 1.7"
32
32
  spec.add_development_dependency "rake", "~> 10.0"
33
+ spec.add_development_dependency "minitest"
33
34
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_line_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - HASHIMOTO, Naoki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-04-10 00:00:00.000000000 Z
11
+ date: 2017-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.9'
19
+ version: '1.7'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.9'
26
+ version: '1.7'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -38,10 +38,24 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: 'A simple parser of Apache access logs: it parses a line of Apache access
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: 'A simple parser of Apache access log: it parses a line of Apache access
42
56
  log and turns it into an array of strings or a Hash object. And from the command
43
- line, you can use it as a conversion tool of file formats or as a filtering tool
44
- of access records.'
57
+ line, you can use it as a conversion tool of file format (to CSV/TSV) or as a filtering
58
+ tool of access records.'
45
59
  email:
46
60
  - hashimoto.naoki@gmail.com
47
61
  executables:
@@ -99,5 +113,5 @@ rubyforge_project:
99
113
  rubygems_version: 2.2.3
100
114
  signing_key:
101
115
  specification_version: 4
102
- summary: A simple parser of Apache access logs
116
+ summary: A simple parser of Apache access log
103
117
  test_files: []