teeth 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +11 -0
- data/README.rdoc +123 -0
- data/Rakefile +112 -0
- data/VERSION.yml +5 -0
- data/ext/scan_apache_logs/extconf.rb +4 -0
- data/ext/scan_apache_logs/scan_apache_logs.yy +274 -0
- data/ext/scan_apache_logs/scan_apache_logs.yy.c +9345 -0
- data/ext/scan_rails_logs/extconf.rb +4 -0
- data/ext/scan_rails_logs/scan_rails_logs.yy +378 -0
- data/ext/scan_rails_logs/scan_rails_logs.yy.c +11528 -0
- data/lib/teeth.rb +14 -0
- data/lib/teeth/rule_statement.rb +61 -0
- data/lib/teeth/scanner.rb +101 -0
- data/lib/teeth/scanner_definition.rb +117 -0
- data/lib/teeth/scanner_definitions/scan_apache_logs.rb +28 -0
- data/lib/teeth/scanner_definitions/scan_rails_logs.rb +70 -0
- data/lib/teeth/templates/tokenizer.yy.erb +168 -0
- data/spec/fixtures/rails_1x.log +59 -0
- data/spec/fixtures/rails_22.log +12 -0
- data/spec/fixtures/rails_22_cached.log +10 -0
- data/spec/fixtures/rails_unordered.log +24 -0
- data/spec/playground/scan_rails_logs.rb +56 -0
- data/spec/playground/show_apache_processing.rb +13 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/unit/rule_statement_spec.rb +60 -0
- data/spec/unit/scan_apache_spec.rb +110 -0
- data/spec/unit/scan_rails_logs_spec.rb +100 -0
- data/spec/unit/scaner_definition_spec.rb +65 -0
- data/spec/unit/scanner_spec.rb +108 -0
- data/teeth.gemspec +78 -0
- metadata +100 -0
data/LICENSE
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Copyright (c) 2009, Daniel Stephen DeLeo
|
2
|
+
Portions Copyright (c) 2004 Apple Computer, Inc. All rights reserved.
|
3
|
+
All rights reserved.
|
4
|
+
|
5
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
8
|
+
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
9
|
+
Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
10
|
+
|
11
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
= Teeth
|
2
|
+
Teeth is a library for fast parsing of log files such as Apache access and error logs. It uses C extensions generated by flex[http://flex.sourceforge.net/index.html] (as in Flex and Bison). If you only want to use the built-in scanners, you don't need flex. If you want to add support for new/different log formats, you'll need to have flex installed.
|
3
|
+
|
4
|
+
= Example
|
5
|
+
require "teeth"
|
6
|
+
|
7
|
+
access_log = %q{myhost.localdomain:80 172.16.115.1 - - [13/Dec/2008:19:26:11 -0500] "GET /favicon.ico HTTP/1.1" 404 241 "http://172.16.115.130/" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_4_11; en) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1"}
|
8
|
+
access_log.scan_apache_logs
|
9
|
+
=> {:strings=>["241"],
|
10
|
+
:apache_access_datetime=>["13/Dec/2008:19:26:11 -0500"],
|
11
|
+
:absolute_url=>["http://172.16.115.130/"],
|
12
|
+
:message=>"myhost.localdomain:80 172.16.115.1 - - [13/Dec/2008:19:26:11 -0500] \"GET /favicon.ico HTTP/1.1\" 404 241 \"http://172.16.115.130/\" \"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_4_11; en) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1\"",
|
13
|
+
:http_method=>["GET"],
|
14
|
+
:browser_string=>["Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_4_11; en) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1"],
|
15
|
+
:relative_url=>["/favicon.ico"],
|
16
|
+
:http_version=>["HTTP/1.1"],
|
17
|
+
:host=>["myhost.localdomain:80"],
|
18
|
+
:id=>"8AD5CBCC1CB011DE8CE10017F22FF48F",
|
19
|
+
:http_response=>["404"],
|
20
|
+
:ipv4_addr=>["172.16.115.1"]}
|
21
|
+
|
22
|
+
= Supported Log Formats
|
23
|
+
* Apache (access and error logs)
|
24
|
+
* Rails
|
25
|
+
|
26
|
+
Support for other web servers, app servers, and applications as well as other types of servers (e.g., SMTP, etc.) and generic syslog logs is planned for the future.
|
27
|
+
|
28
|
+
== Creating Your Own Scanners
|
29
|
+
Teeth includes a library that can generate a flex scanner definition using a simplified definition written in Ruby. This cuts down on the repetition involved in writing all the C code by hand. The included scanners for Apache and Rails logs are defined this way. You can find them in the lib/teeth/scanner_definitions directory.
|
30
|
+
|
31
|
+
Here's an example based on the definition for the Rails log scanner:
|
32
|
+
|
33
|
+
require File.dirname(__FILE__) + "/../lib/teeth"
|
34
|
+
scanner = Teeth::Scanner.new(:rails_logs, File.dirname(__FILE__) + '/../ext/scan_rails_logs/')
|
35
|
+
|
36
|
+
Flex definitions are kinda like macros for regular expressions.
|
37
|
+
We include some of the available defaults here to make writing
|
38
|
+
the scanner easier
|
39
|
+
|
40
|
+
scanner.load_default_definitions_for(:whitespace, :ip, :time, :web)
|
41
|
+
|
42
|
+
Add some more definitions
|
43
|
+
scanner.definitions do |define|
|
44
|
+
define.RAILS_TEASER '(processing|filter\ chain\ halted|rendered)'
|
45
|
+
define.CONTROLLER_ACTION '[a-z0-9]+#[a-z0-9]+'
|
46
|
+
|
47
|
+
Scanner is case insensitive
|
48
|
+
define.RAILS_ERROR_CLASS '([a-z]+\:\:)*[a-z]+error'
|
49
|
+
|
50
|
+
"start conditions" are a feature of flex that allows
|
51
|
+
us to have some regular expressions that are only active
|
52
|
+
when we tell the scanner to enter a certain state. Here
|
53
|
+
we define the ``REQUEST_COMPLETED'' state, and specify
|
54
|
+
that it is exclusive. This means that if the scanner is
|
55
|
+
in this state, it only matches rules written for this state
|
56
|
+
define.REQUEST_COMPLETED :start_condition => :exclusive
|
57
|
+
end
|
58
|
+
|
59
|
+
Define rules. These are the actions that the scanner executes when
|
60
|
+
it sees text that matches a regular expression. The default action
|
61
|
+
is to add :action_name => [matched_text] to the results Hash, or push
|
62
|
+
the matched text on the end of the array if it already exists.
|
63
|
+
scanner.rules do |r|
|
64
|
+
|
65
|
+
This will add something like :teaser => ["Processing"] to the results
|
66
|
+
r.teaser '{RAILS_TEASER}'
|
67
|
+
r.controller_action '{CONTROLLER_ACTION}'
|
68
|
+
|
69
|
+
Use some of the default definitions we added above.
|
70
|
+
r.datetime '{YEAR}"-"{MONTH_NUM}"-"{MDAY}{WS}{HOUR}":"{MINSEC}":"{MINSEC}'
|
71
|
+
r.http_method '{HTTP_VERB}'
|
72
|
+
|
73
|
+
With :skip_line => true, scanner stops processing the line immediately
|
74
|
+
r.skip_lines '{RAILS_SKIP_LINES}', :skip_line => true
|
75
|
+
r.error '{RAILS_ERROR_CLASS}'
|
76
|
+
|
77
|
+
With :strip_ends => true, the scanner removes the first and last characters from the matched text
|
78
|
+
r.error_message '\(({WS}|{NON_WS})+\)', :strip_ends => true
|
79
|
+
|
80
|
+
Puts scanner in the ``REQUEST_COMPLETED'' state we defined above.
|
81
|
+
The scanner only matches rules beginning with ``<REQUEST_COMPLETED>''
|
82
|
+
now
|
83
|
+
r.teaser 'completed\ in', :begin => "REQUEST_COMPLETED"
|
84
|
+
|
85
|
+
These rules only apply to the ``REQUEST_COMPLETED'' State
|
86
|
+
r.duration_s '<REQUEST_COMPLETED>[0-9]+\.[0-9]+'
|
87
|
+
r.duration_ms '<REQUEST_COMPLETED>[0-9]+/ms'
|
88
|
+
r.http_response '<REQUEST_COMPLETED>{HTTPCODE}'
|
89
|
+
|
90
|
+
Need a "catchall" rule -- flex scanner "jams" if there isn't a default rule (the
|
91
|
+
catchall rule for the default/INITIAL state is automatically included).
|
92
|
+
Note that :ignore => true makes the scanner ignore what it matches but doesn't
|
93
|
+
stop processing of the line.
|
94
|
+
r.ignore_others '<REQUEST_COMPLETED>{CATCHALL}', :ignore => true
|
95
|
+
|
96
|
+
The "strings" action is special. It keeps track of whether the last token
|
97
|
+
was also a string, and if it was, the new string is appended to the
|
98
|
+
last string instead of being pushed to the array. For example, when scanning
|
99
|
+
an apache error log, ``Invalid URI in request'' will be extracted as a complete
|
100
|
+
string (instead of ["Invalid", "URI", "in", "request"])
|
101
|
+
r.strings '{NON_WS}{NON_WS}*'
|
102
|
+
end
|
103
|
+
|
104
|
+
Writes the generated scanner and an extconf.rb for it to the directory
|
105
|
+
we specified when we initialized the scanner.
|
106
|
+
scanner.write!
|
107
|
+
|
108
|
+
There's not much in the way of documentation for the scanner generator, but you can
|
109
|
+
refer to the specs and the definitions for Apache and Rails logs to get a sense
|
110
|
+
of how it works. It would probably help to learn about flex's regex syntax
|
111
|
+
and other features.
|
112
|
+
|
113
|
+
= Ruby 1.9
|
114
|
+
Ruby 1.9 is supported on the master branch. Don't use the ruby1.9 branch; it is orphaned.
|
115
|
+
|
116
|
+
= Shortcomings and Known Issues
|
117
|
+
In addition to the lack of support for formats other than Apache and Rails described above:
|
118
|
+
* It's a new project, lots of API changes
|
119
|
+
* Does not convert datetimes to Ruby Time objects
|
120
|
+
* Does not always use context or knowledge of the log format to its advantage. This is improving now that the scanner can utilize start conditions.
|
121
|
+
|
122
|
+
= Performance
|
123
|
+
On my laptop, a white MacBook 2.0 GHz Intel Core Duo, teeth can process more than 30k lines of Apache access logs per second. So it's pretty fast. If modified to not create a UUID or keep the full message, this can be increased to around 45k lines/sec. One could potentially do pretty well on the wide_finder2[http://www.tbray.org/ongoing/When/200x/2008/05/01/Wide-Finder-2]...
|
data/Rakefile
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
require "spec/rake/spectask"
|
2
|
+
require "rake/clean"
|
3
|
+
require "rake/rdoctask"
|
4
|
+
|
5
|
+
# Default spec task: runs the suite with the options stored in spec/spec.opts.
# fail_on_error is disabled for this task.
desc "Run all of the specs"
Spec::Rake::SpecTask.new { |spec_task|
  spec_task.spec_opts = ['--options', "\"spec/spec.opts\""]
  spec_task.fail_on_error = false
}

# A bare `rake` invocation runs the specs.
task :default => [:spec]

namespace :spec do
  # spec:report -- formats the examples under failing_examples/ as an HTML page.
  desc "Generate HTML report for failing examples"
  Spec::Rake::SpecTask.new('report') { |report_task|
    report_task.spec_files = FileList['failing_examples/**/*.rb']
    report_task.spec_opts = [
      "--format", "html:doc/tools/reports/failing_examples.html",
      "--diff",
      '--options', '"spec/spec.opts"'
    ]
    report_task.fail_on_error = false
  }
end
|
23
|
+
|
24
|
+
# RDoc generation: output goes to doc/, README.rdoc is the main page, and the
# library sources plus the generated scanner C files are included.
Rake::RDocTask.new { |rdoc|
  rdoc.rdoc_dir = "doc"
  rdoc.main = "README.rdoc"
  rdoc.rdoc_files.include("README.rdoc", "lib/*", "ext/*/*.yy.c")
}
|
29
|
+
|
30
|
+
# Gem packaging tasks. Jeweler is optional: when it is not installed the
# LoadError is rescued and an installation hint is printed instead.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = 'teeth'
    s.summary = 'Fast log file parsing in Ruby'
    s.description = s.summary
    s.email = 'ddeleo@basecommander.net'
    s.homepage = "http://github.com/danielsdeleo/teeth"
    s.platform = Gem::Platform::RUBY
    s.has_rdoc = true
    s.extra_rdoc_files = ["README.rdoc"]
    # The value is an Array, so use the plural writer; the singular
    # require_path= is meant for a single path String.
    s.require_paths = ["lib"]
    s.authors = ["Daniel DeLeo"]
    # One extconf.rb per C extension.
    s.extensions = ["ext/scan_apache_logs/extconf.rb", "ext/scan_rails_logs/extconf.rb"]

    # ruby -rpp -e' pp `git ls-files`.split("\n") '
    # Package every tracked file whose path does not contain "git".
    s.files = `git ls-files`.split("\n").reject {|f| f =~ /git/}
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
|
51
|
+
|
52
|
+
# Pretty-prints the tracked files (minus any path containing "git") by
# shelling out to a one-line ruby script.
desc "outputs a list of files suitable for use with the gemspec"
task(:list_files) {
  sh %q{ruby -rpp -e' pp `git ls-files`.split("\n").reject {|f| f =~ /git/} '}
}
|
56
|
+
|
57
|
+
|
58
|
+
# `rake clean` removes compiled extension artifacts.
CLEAN.include("ext/*/*.bundle", "ext/*/*.so", "ext/*/*.o")
# `rake clobber` additionally removes the Makefiles and the C sources in ext/.
CLOBBER.include("ext/*/Makefile", "ext/*/*.c")
|
60
|
+
|
61
|
+
# Tasks for generating, configuring, compiling and installing the C extensions
# that live under ext/<name>/.
namespace :ext do
  desc "Installs the C extensions. Usually requires root."
  task :install => :build do
    Dir.glob("ext/*/").each do |ext_dir|
      Dir.chdir(ext_dir) {sh "make install"}
    end
  end

  # The prerequisites of :build (flex output and Makefiles) are attached by the
  # two FileList loops below.
  desc "Compiles the C extensions"
  task :build do
    Dir.glob("ext/*/").each do |ext_dir|
      cd(ext_dir) {sh "make"}
    end
  end

  # Declared here so the description attaches to it; each Makefile is added as
  # a prerequisite in the extconf loop below.
  desc "Generates Makefiles with extconf/mkmf"
  task :makefiles

  # One file task per flex definition: ext/foo/foo.yy -> ext/foo/foo.yy.c
  FileList["ext/*/*.yy"].each do |flex_file|
    flex_generated_c = flex_file.ext("yy.c")
    file flex_generated_c => flex_file do
      sh "flex -i -s -o #{flex_generated_c} #{flex_file}"
    end
    task :build => flex_generated_c
    file flex_file
  end

  # One file task per extconf.rb: ext/foo/extconf.rb -> ext/foo/Makefile
  FileList["ext/*/extconf.rb"].each do |extconf_file|
    extension_dir = extconf_file.sub("extconf.rb", '')
    makefile = extension_dir + "Makefile"
    file makefile => extconf_file do
      Dir.chdir(extension_dir) {ruby "./extconf.rb"}
    end
    file extconf_file
    task :build => makefile
    # Without this, ext:makefiles had neither an action nor prerequisites and
    # therefore generated nothing.
    task :makefiles => makefile
  end

  # NOTE(review): the gem's file list places the scanner definitions under
  # lib/teeth/scanner_definitions/ -- confirm that scanners/ still exists.
  desc "Compiles Teeth::Scanner scanner definitions into flex scanner definition"
  task :scanners do
    FileList["scanners/*"].each do |scanner|
      ruby scanner
    end
  end

end
|
107
|
+
|
108
|
+
# Removes every file matching pkg/*gem; hooked into `rake clobber` below.
desc "deletes generated gem"
task :clobber_gem do
  FileList['pkg/*gem'].each do |built_gem|
    rm built_gem
  end
end

task :clobber => [:clobber_gem]
|
data/ext/scan_apache_logs/scan_apache_logs.yy
ADDED
@@ -0,0 +1,274 @@
|
|
1
|
+
%option prefix="apache_logs_yy"
|
2
|
+
%option full
|
3
|
+
%option never-interactive
|
4
|
+
%option read
|
5
|
+
%option nounput
|
6
|
+
%option noyywrap noreject noyymore nodefault
|
7
|
+
%{
|
8
|
+
#include <ruby.h>
#include <string.h>
#include <uuid/uuid.h>
|
10
|
+
/* Data types */
|
11
|
+
typedef struct {
|
12
|
+
char *key;
|
13
|
+
char *value;
|
14
|
+
} KVPAIR;
|
15
|
+
const KVPAIR EOF_KVPAIR = {"EOF", "EOF"};
|
16
|
+
/* prototypes */
|
17
|
+
char *strip_ends(char *);
|
18
|
+
VALUE t_scan_apache_logs(VALUE);
|
19
|
+
void new_uuid(char *str_ptr);
|
20
|
+
void raise_error_for_string_too_long(VALUE string);
|
21
|
+
void include_message_in_token_hash(VALUE message, VALUE token_hash);
|
22
|
+
void add_uuid_to_token_hash(VALUE token_hash);
|
23
|
+
void push_kv_pair_to_hash(KVPAIR key_value, VALUE token_hash);
|
24
|
+
void concat_word_to_string(KVPAIR key_value, VALUE token_hash);
|
25
|
+
/* Set the scanner name, and return type */
|
26
|
+
#define YY_DECL KVPAIR scan_apache_logs(void)
|
27
|
+
#define yyterminate() return EOF_KVPAIR
|
28
|
+
/* Ruby 1.8 and 1.9 compatibility */
|
29
|
+
#if !defined(RSTRING_LEN)
|
30
|
+
# define RSTRING_LEN(x) (RSTRING(x)->len)
|
31
|
+
# define RSTRING_PTR(x) (RSTRING(x)->ptr)
|
32
|
+
#endif
|
33
|
+
|
34
|
+
%}
|
35
|
+
|
36
|
+
/* Definitions */
|
37
|
+
|
38
|
+
CATCHALL     (.|"\n")

/* Whitespace and generic word characters. */
WS           [[:space:]]
/* NOTE(review): [:punct:] is not wrapped in an outer bracket expression here,
   so it may not be read as the POSIX punctuation class. The rule set appears
   to rely on the current behaviour -- confirm before changing it. */
NON_WS       ([a-z]|[0-9]|[:punct:])

/* Network addresses. */
IP4_OCT      [0-9]|[0-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]
/* NOTE(review): the second dot in HOST is unescaped -- confirm intent. */
HOST         ([a-z0-9][a-z0-9\-]*\.[a-z0-9][a-z0-9\-]*.[a-z0-9][a-z0-9\-\.]*[a-z]+(\:[0-9]+)?)|localhost

/* Date and time pieces. */
WDAY         mon|tue|wed|thu|fri|sat|sun
MON          jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec
MONTH_NUM    0[1-9]|1[0-2]
MDAY         3[0-1]|[1-2][0-9]|0[1-9]
HOUR         2[0-3]|[0-1][0-9]
MINSEC       [0-5][0-9]|60
YEAR         [0-9][0-9][0-9][0-9]
PLUSMINUS    (\+|\-)
TIMING       [0-9]+\.[0-9]+

/* Web / HTTP pieces. */
REL_URL      (\/|\\|\.)[a-z0-9\._\~\-\/\?&;#=\%\:\+\[\]\\]*
PROTO        (http:|https:)
ERR_LVL      (emerg|alert|crit|err|error|warn|warning|notice|info|debug)
/* NOTE(review): the dots in 1.0 / 1.1 are unescaped -- confirm intent. */
HTTP_VERS    HTTP\/(1.0|1.1)
HTTP_VERB    (get|head|put|post|delete|trace|connect)
HTTPCODE     (100|101|20[0-6]|30[0-5]|307|40[0-9]|41[0-7]|50[0-5])
BROWSER_STR  \"(moz|msie|lynx|reconnoiter|pingdom)[^"]+\"
|
80
|
+
|
81
|
+
|
82
|
+
%%
|
83
|
+
/*
|
84
|
+
Actions
|
85
|
+
*/
|
86
|
+
|
87
|
+
|
88
|
+
{TIMING} {
  /* Each action hands the caller a KVPAIR: token type name + matched text. */
  KVPAIR token;
  token.key = "timing";
  token.value = yytext;
  return token;
}

{IP4_OCT}"."{IP4_OCT}"."{IP4_OCT}"."{IP4_OCT} {
  KVPAIR token;
  token.key = "ipv4_addr";
  token.value = yytext;
  return token;
}

{WDAY}{WS}{MON}{WS}{MDAY}{WS}{HOUR}":"{MINSEC}":"{MINSEC}{WS}{YEAR} {
  KVPAIR token;
  token.key = "apache_err_datetime";
  token.value = yytext;
  return token;
}

{MDAY}\/{MON}\/{YEAR}":"{HOUR}":"{MINSEC}":"{MINSEC}{WS}{PLUSMINUS}{YEAR} {
  KVPAIR token;
  token.key = "apache_access_datetime";
  token.value = yytext;
  return token;
}

{HTTP_VERS} {
  KVPAIR token;
  token.key = "http_version";
  token.value = yytext;
  return token;
}

{BROWSER_STR} {
  /* strip_ends() removes the first and last character of the match. */
  KVPAIR token;
  token.key = "browser_string";
  token.value = strip_ends(yytext);
  return token;
}

{PROTO}"\/\/"({HOST}|{IP4_OCT}"."{IP4_OCT}"."{IP4_OCT}"."{IP4_OCT})({REL_URL}|"\/")? {
  KVPAIR token;
  token.key = "absolute_url";
  token.value = yytext;
  return token;
}

{HOST} {
  KVPAIR token;
  token.key = "host";
  token.value = yytext;
  return token;
}

{REL_URL} {
  KVPAIR token;
  token.key = "relative_url";
  token.value = yytext;
  return token;
}

{ERR_LVL} {
  KVPAIR token;
  token.key = "error_level";
  token.value = yytext;
  return token;
}

{HTTPCODE} {
  KVPAIR token;
  token.key = "http_response";
  token.value = yytext;
  return token;
}

{HTTP_VERB} {
  KVPAIR token;
  token.key = "http_method";
  token.value = yytext;
  return token;
}

{NON_WS}{NON_WS}* {
  /* t_scan_apache_logs joins consecutive "strings" tokens into one string. */
  KVPAIR token;
  token.key = "strings";
  token.value = yytext;
  return token;
}

{CATCHALL} /* ignore */
|
154
|
+
%%
|
155
|
+
|
156
|
+
char *strip_ends(char *string) {
|
157
|
+
string[yyleng-1] = '\0';
|
158
|
+
++string;
|
159
|
+
return string;
|
160
|
+
}
|
161
|
+
|
162
|
+
void uuid_unparse_upper_sans_dash(const uuid_t uu, char *out)
|
163
|
+
{
|
164
|
+
sprintf(out,
|
165
|
+
"%02X%02X%02X%02X"
|
166
|
+
"%02X%02X"
|
167
|
+
"%02X%02X"
|
168
|
+
"%02X%02X"
|
169
|
+
"%02X%02X%02X%02X%02X%02X",
|
170
|
+
uu[0], uu[1], uu[2], uu[3],
|
171
|
+
uu[4], uu[5],
|
172
|
+
uu[6], uu[7],
|
173
|
+
uu[8], uu[9],
|
174
|
+
uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
|
175
|
+
}
|
176
|
+
|
177
|
+
void new_uuid(char *str_ptr){
|
178
|
+
uuid_t new_uuid;
|
179
|
+
uuid_generate_time(new_uuid);
|
180
|
+
uuid_unparse_upper_sans_dash(new_uuid, str_ptr);
|
181
|
+
}
|
182
|
+
|
183
|
+
/* Raises ArgumentError when `string` is longer than 1,000,000 bytes;
 * otherwise returns without doing anything. */
void raise_error_for_string_too_long(VALUE string){
  if (RSTRING_LEN(string) <= 1000000) {
    return;
  }
  rb_raise(rb_eArgError, "string too long for scan_apache_logs! max length is 1,000,000 chars");
}
|
188
|
+
|
189
|
+
/* Scans self, which is expected to be a single line from an Apache error or
|
190
|
+
* access log, and returns a Hash of the components of the log message. The
|
191
|
+
* following parts of the log message are returned if they are present:
|
192
|
+
* IPv4 address, datetime, HTTP Version used, the browser string given by the
|
193
|
+
* client, any absolute or relative URLs, the error level, HTTP response code,
|
194
|
+
* HTTP Method (verb), and any other uncategorized strings present. */
|
195
|
+
VALUE t_scan_apache_logs(VALUE self) {
|
196
|
+
KVPAIR kv_result;
|
197
|
+
int scan_complete = 0;
|
198
|
+
int building_words_to_string = 0;
|
199
|
+
VALUE token_hash = rb_hash_new();
|
200
|
+
|
201
|
+
BEGIN(INITIAL);
|
202
|
+
|
203
|
+
/* error out on absurdly large strings */
|
204
|
+
raise_error_for_string_too_long(self);
|
205
|
+
/* {:message => self()} */
|
206
|
+
include_message_in_token_hash(self, token_hash);
|
207
|
+
/* {:id => UUID} */
|
208
|
+
add_uuid_to_token_hash(token_hash);
|
209
|
+
yy_scan_string(RSTRING_PTR(self));
|
210
|
+
while (scan_complete == 0) {
|
211
|
+
kv_result = scan_apache_logs();
|
212
|
+
if (kv_result.key == "EOF"){
|
213
|
+
scan_complete = 1;
|
214
|
+
}
|
215
|
+
else if (kv_result.key == "strings"){
|
216
|
+
/* build a string until we get a non-word */
|
217
|
+
if (building_words_to_string == 0){
|
218
|
+
building_words_to_string = 1;
|
219
|
+
push_kv_pair_to_hash(kv_result, token_hash);
|
220
|
+
}
|
221
|
+
else{
|
222
|
+
concat_word_to_string(kv_result, token_hash);
|
223
|
+
}
|
224
|
+
}
|
225
|
+
else {
|
226
|
+
building_words_to_string = 0;
|
227
|
+
push_kv_pair_to_hash(kv_result, token_hash);
|
228
|
+
}
|
229
|
+
}
|
230
|
+
yy_delete_buffer(YY_CURRENT_BUFFER);
|
231
|
+
return rb_obj_dup(token_hash);
|
232
|
+
}
|
233
|
+
|
234
|
+
/* Stores a newly generated UUID string in `token_hash` under the :id key. */
void add_uuid_to_token_hash(VALUE token_hash) {
  char uuid_hex[33];   /* 32 hex digits + terminating NUL */
  VALUE id_key;
  VALUE id_value;

  new_uuid(uuid_hex);
  id_key = ID2SYM(rb_intern("id"));
  id_value = rb_tainted_str_new2(uuid_hex);
  rb_hash_aset(token_hash, id_key, id_value);
}
|
241
|
+
|
242
|
+
/* Stores `message` in `token_hash` under the :message key, i.e.
 * {:message => message}. */
void include_message_in_token_hash(VALUE message, VALUE token_hash) {
  rb_hash_aset(token_hash, ID2SYM(rb_intern("message")), message);
}
|
247
|
+
|
248
|
+
/* Appends a space and then the token text to the last string in the Array
 * stored under key_value.key in `token_hash`.
 *
 * The token length is read from yyleng, so this must be called before the
 * scanner produces its next token. */
void concat_word_to_string(KVPAIR key_value, VALUE token_hash) {
  VALUE token_type = ID2SYM(rb_intern(key_value.key));
  VALUE tokens = rb_hash_aref(token_hash, token_type);
  VALUE last_token = rb_ary_entry(tokens, -1);

  rb_str_cat(last_token, " ", 1);
  rb_str_cat(last_token, key_value.value, yyleng);
}
|
256
|
+
|
257
|
+
/* Records one token in `token_hash`.
 *
 * key_value.key becomes the Symbol key and key_value.value (a NUL-terminated
 * C string) is copied into a new Ruby String. If the key is not present yet,
 * a one-element Array is created and stored; if it already maps to an Array,
 * the String is appended to it. Any other existing value is left untouched. */
void push_kv_pair_to_hash(KVPAIR key_value, VALUE token_hash) {
  VALUE hsh_key = ID2SYM(rb_intern(key_value.key));
  VALUE hsh_value = rb_hash_aref(token_hash, hsh_key);
  VALUE ary_for_token_type;

  switch (TYPE(hsh_value)) {
    case T_NIL:
      /* First token of this type: allocate the Array only now, instead of on
       * every call, so repeated token types do not create garbage. */
      ary_for_token_type = rb_ary_new();
      rb_ary_push(ary_for_token_type, rb_tainted_str_new2(key_value.value));
      rb_hash_aset(token_hash, hsh_key, ary_for_token_type);
      break;
    case T_ARRAY:
      rb_ary_push(hsh_value, rb_tainted_str_new2(key_value.value));
      break;
    default:
      /* Key holds something other than an Array: leave it alone. */
      break;
  }
}
|
271
|
+
|
272
|
+
void Init_scan_apache_logs() {
|
273
|
+
rb_define_method(rb_cString, "scan_apache_logs", t_scan_apache_logs, 0);
|
274
|
+
}
|