embulk-parser-apache-log 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 586d2710f54542414362463663676f224f5287ff
4
+ data.tar.gz: 26c1f301e858a6ef15070451d601ca8a207a5628
5
+ SHA512:
6
+ metadata.gz: df4f42605d9d71b255c35553c9f406b55b2ec2467d47a8f96b4d7f6834969f3effaa9d14a1ed57ef35788d94e5af20353b7fa649577fce23a880fe3c1cc8b637
7
+ data.tar.gz: 0bb2734e31c25e7fd3c3920b07bc7456afd2968b69a2d5ab091fbb60517266e6d555780e05255f1024882fd033c27af9a3b6ce8b0e0ec759b34b67802b1ed186
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
data/CHANGES.md ADDED
@@ -0,0 +1,3 @@
1
+ 0.1.0 (2015-05-30)
2
+ ------------------
3
+ * First release.
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Apache Log parser plugin for Embulk
2
+
3
+ Embulk parser plugin for apache log (common, combined)
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: parser
8
+ * **Guess supported**: no
9
+
10
+ ## Configuration
11
+
12
+ - **format**: log format, either `common` or `combined` (string, default: combined)
13
+
14
+ ## Example
15
+
16
+ ```yaml
17
+ in:
18
+ type: any file input plugin type
19
+ parser:
20
+ type: apache-log
21
+ format: common
22
+ ```
23
+
24
+ (If guess is supported) you don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let embulk guess the `parser:` section using this command:
25
+
26
+ ```
27
+ $ embulk install embulk-parser-apache-log
28
+ $ embulk guess -g apache-log config.yml -o guessed.yml
29
+ ```
30
+
31
+ ## Build
32
+
33
+ ```
34
+ $ ./gradlew gem
35
+ ```
data/build.gradle ADDED
@@ -0,0 +1,73 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ }
6
+ import com.github.jrubygradle.JRubyExec
7
+ repositories {
8
+ mavenCentral()
9
+ jcenter()
10
+ }
11
+ configurations {
12
+ provided
13
+ }
14
+
15
+ version = "0.1.0"
16
+
17
+ dependencies {
18
+ compile "org.embulk:embulk-core:0.6.10"
19
+ provided "org.embulk:embulk-core:0.6.10"
20
+ // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
21
+ testCompile "junit:junit:4.+"
22
+ }
23
+
24
+ task classpath(type: Copy, dependsOn: ["jar"]) {
25
+ doFirst { file("classpath").deleteDir() }
26
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
27
+ into "classpath"
28
+ }
29
+ clean { delete "classpath" }
30
+
31
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
32
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
33
+ script "${project.name}.gemspec"
34
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
35
+ }
36
+
37
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
38
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
39
+ script "pkg/${project.name}-${project.version}.gem"
40
+ }
41
+
42
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
43
+ println "> Build succeeded."
44
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
45
+ }
46
+
47
+ task gemspec {
48
+ ext.gemspecFile = file("${project.name}.gemspec")
49
+ inputs.file "build.gradle"
50
+ outputs.file gemspecFile
51
+ doLast { gemspecFile.write($/
52
+ Gem::Specification.new do |spec|
53
+ spec.name = "${project.name}"
54
+ spec.version = "${project.version}"
55
+ spec.authors = ["Hiroyuki Sato"]
56
+ spec.summary = %[Apache Log parser plugin for Embulk]
57
+ spec.description = %[Parses Apache Log files read by other file input plugins.]
58
+ spec.email = ["hiroysato@gmail.com"]
59
+ spec.licenses = ["MIT"]
60
+ spec.homepage = "https://github.com/hiroyuki-sato/embulk-parser-apache-log"
61
+
62
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
63
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
64
+ spec.require_paths = ["lib"]
65
+
66
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
67
+ spec.add_development_dependency 'bundler', ['~> 1.0']
68
+ spec.add_development_dependency 'rake', ['>= 10.0']
69
+ end
70
+ /$)
71
+ }
72
+ }
73
+ clean { delete "${project.name}.gemspec" }
Binary file
@@ -0,0 +1,6 @@
1
+ #Wed Feb 04 13:46:12 PST 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.2.1-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,61 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g "apache-log" partial-config.yml
6
+ #
7
+ # Depending on the file format the plugin uses, you can choose
8
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
+ # or line guess (LineGuessPlugin).
10
+
11
+ #class ApacheLogParserGuessPlugin < GuessPlugin
12
+ # Plugin.register_guess("apache-log", self)
13
+ #
14
+ # def guess(config, sample_buffer)
15
+ # if sample_buffer[0,2] == GZIP_HEADER
16
+ # guessed = {}
17
+ # guessed["type"] = "apache-log"
18
+ # guessed["property1"] = "guessed-value"
19
+ # return {"parser" => guessed}
20
+ # else
21
+ # return {}
22
+ # end
23
+ # end
24
+ #end
25
+
26
+ #class ApacheLogParserGuessPlugin < TextGuessPlugin
27
+ # Plugin.register_guess("apache-log", self)
28
+ #
29
+ # def guess_text(config, sample_text)
30
+ # js = JSON.parse(sample_text) rescue nil
31
+ # if js && js["mykeyword"] == "keyword"
32
+ # guessed = {}
33
+ # guessed["type"] = "apache-log"
34
+ # guessed["property1"] = "guessed-value"
35
+ # return {"parser" => guessed}
36
+ # else
37
+ # return {}
38
+ # end
39
+ # end
40
+ #end
41
+
42
+ #class ApacheLogParserGuessPlugin < LineGuessPlugin
43
+ # Plugin.register_guess("apache-log", self)
44
+ #
45
+ # def guess_lines(config, sample_lines)
46
+ # all_line_matched = sample_lines.all? do |line|
47
+ # line =~ /mypattern/
48
+ # end
49
+ # if all_line_matched
50
+ # guessed = {}
51
+ # guessed["type"] = "apache-log"
52
+ # guessed["property1"] = "guessed-value"
53
+ # return {"parser" => guessed}
54
+ # else
55
+ # return {}
56
+ # end
57
+ # end
58
+ #end
59
+
60
+ end
61
+ end
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_parser(
2
+ "apache-log", "org.embulk.parser.ApacheLogParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,162 @@
1
+ package org.embulk.parser;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.ConfigDiff;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.ParserPlugin;
10
+ import org.embulk.spi.FileInput;
11
+ import org.embulk.spi.PageOutput;
12
+ import org.embulk.spi.Schema;
13
+ import org.embulk.spi.SchemaConfig;
14
+
15
+ import org.embulk.spi.Exec;
16
+ import org.embulk.spi.PageBuilder;
17
+ import org.embulk.spi.util.LineDecoder;
18
+ import org.embulk.spi.type.TimestampType;
19
+ import org.embulk.spi.time.TimestampParser;
20
+ import org.embulk.spi.time.TimestampParseException;
21
+ import org.embulk.spi.ColumnConfig;
22
+ import java.util.ArrayList;
23
+
24
+ //import static org.embulk.spi.type.Types.BOOLEAN;
25
+ //import static org.embulk.spi.type.Types.DOUBLE;
26
+ //import static org.embulk.spi.type.Types.LONG;
27
+ import static org.embulk.spi.type.Types.STRING;
28
+ import static org.embulk.spi.type.Types.TIMESTAMP;
29
+
30
+ import java.util.regex.Matcher;
31
+ import java.util.regex.Pattern;
32
+
33
+ import com.google.common.base.Throwables;
34
+
35
+ public class ApacheLogParserPlugin
36
+ implements ParserPlugin
37
+ {
38
+ public enum LogFormat
39
+ {
40
+ combined("combined"),
41
+ common("common");
42
+ private final String string;
43
+
44
+ private LogFormat(String string)
45
+ {
46
+ this.string = string;
47
+ }
48
+ public String getString()
49
+ {
50
+ return string;
51
+ }
52
+ }
53
+ public interface PluginTask
54
+ extends Task, LineDecoder.DecoderTask, TimestampParser.ParserTask
55
+ {
56
+
57
+ @Config("format")
58
+ @ConfigDefault("\"combined\"")
59
+ public LogFormat getFormat();
60
+
61
+ }
62
+
63
+ @Override
64
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
65
+ {
66
+ PluginTask task = config.loadConfig(PluginTask.class);
67
+ ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
68
+ final LogFormat format = task.getFormat();
69
+
70
+ columns.add(new ColumnConfig("remote_host",STRING ,null));
71
+ columns.add(new ColumnConfig("identity_check",STRING ,null));
72
+ columns.add(new ColumnConfig("user",STRING ,null));
73
+ columns.add(new ColumnConfig("datetime",TIMESTAMP,null));
74
+ columns.add(new ColumnConfig("method",STRING ,null));
75
+ columns.add(new ColumnConfig("path",STRING ,null));
76
+ columns.add(new ColumnConfig("protocol",STRING ,null));
77
+ columns.add(new ColumnConfig("status",STRING ,null));
78
+ columns.add(new ColumnConfig("size",STRING ,null));
79
+
80
+ // combined
81
+ if( format == LogFormat.combined ){
82
+ columns.add(new ColumnConfig("referer",STRING ,null));
83
+ columns.add(new ColumnConfig("user_agent",STRING ,null));
84
+ }
85
+
86
+ Schema schema = new SchemaConfig(columns).toSchema();
87
+ control.run(task.dump(), schema);
88
+ }
89
+
90
+ @Override
91
+ public void run(TaskSource taskSource, Schema schema,
92
+ FileInput input, PageOutput output)
93
+ {
94
+ PluginTask task = taskSource.loadTask(PluginTask.class);
95
+ LineDecoder lineDecoder = new LineDecoder(input,task);
96
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
97
+ String line = null;
98
+ final LogFormat format = task.getFormat();
99
+
100
+ Pattern accessLogPattern = Pattern.compile(getAccessLogRegex(format),
101
+ Pattern.CASE_INSENSITIVE
102
+ | Pattern.DOTALL);
103
+ Matcher accessLogEntryMatcher;
104
+ final TimestampParser time_parser = new TimestampParser("%d/%b/%Y:%T %z",task);
105
+
106
+ while( input.nextFile() ){
107
+ while(true){
108
+ line = lineDecoder.poll();
109
+
110
+ if( line == null ){
111
+ break;
112
+ }
113
+ accessLogEntryMatcher = accessLogPattern.matcher(line);
114
+
115
+ if(!accessLogEntryMatcher.matches()){
116
+ throw new RuntimeException("unmatched line" + line);
117
+ }
118
+
119
+ pageBuilder.setString(0,accessLogEntryMatcher.group(1));
120
+ pageBuilder.setString(1,accessLogEntryMatcher.group(2));
121
+ pageBuilder.setString(2,accessLogEntryMatcher.group(3));
122
+ try {
123
+ pageBuilder.setTimestamp(3,time_parser.parse(accessLogEntryMatcher.group(4)));
124
+ } catch(TimestampParseException ex) {
125
+ throw Throwables.propagate(ex);
126
+ }
127
+ pageBuilder.setString(4,accessLogEntryMatcher.group(5));
128
+ pageBuilder.setString(5,accessLogEntryMatcher.group(6));
129
+ pageBuilder.setString(6,accessLogEntryMatcher.group(7));
130
+ pageBuilder.setString(7,accessLogEntryMatcher.group(8));
131
+ pageBuilder.setString(8,accessLogEntryMatcher.group(9));
132
+ if( format == LogFormat.combined ){
133
+ pageBuilder.setString(9,accessLogEntryMatcher.group(10));
134
+ pageBuilder.setString(10,accessLogEntryMatcher.group(11));
135
+ }
136
+ pageBuilder.addRecord();
137
+ }
138
+ }
139
+ pageBuilder.finish();
140
+ }
141
+
142
+ private String getAccessLogRegex(LogFormat type)
143
+ {
144
+ final String rexa = "(\\d+(?:\\.\\d+){3})"; // an IP address
145
+ final String rexs = "(\\S+)"; // a single token (no spaces)
146
+ final String rexdt = "\\[([^\\]]+)\\]"; // something between [ and ]
147
+ final String rexstr = "\"([^\"]*?)\""; // a quoted string
148
+ final String rexi = "(\\d+)"; // unsigned integer
149
+ final String rexp = "\"(\\S+)\\s(\\S+)\\s(\\S+)\""; // method, path, protocol
150
+ String rex;
151
+
152
+ if( type == LogFormat.combined ){
153
+ rex = "^" + String.join( " ", rexa, rexs, rexs, rexdt, rexp,
154
+ rexi, rexi, rexstr, rexstr) + "$";
155
+ } else {
156
+ rex = "^" + String.join( " ", rexa, rexs, rexs, rexdt, rexp,
157
+ rexi, rexi) + "$";
158
+ }
159
+
160
+ return rex;
161
+ }
162
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.parser;
2
+
3
+ public class TestApacheLogParserPlugin
4
+ {
5
+ }
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-apache-log
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Hiroyuki Sato
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-05-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ~>
17
+ - !ruby/object:Gem::Version
18
+ version: '1.0'
19
+ name: bundler
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '10.0'
33
+ name: rake
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses Apache Log files read by other file input plugins.
42
+ email:
43
+ - hiroysato@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - CHANGES.md
50
+ - LICENSE.txt
51
+ - README.md
52
+ - build.gradle
53
+ - gradle/wrapper/gradle-wrapper.jar
54
+ - gradle/wrapper/gradle-wrapper.properties
55
+ - gradlew
56
+ - gradlew.bat
57
+ - lib/embulk/guess/apache-log.rb
58
+ - lib/embulk/parser/apache-log.rb
59
+ - src/main/java/org/embulk/parser/ApacheLogParserPlugin.java
60
+ - src/test/java/org/embulk/parser/TestApacheLogParserPlugin.java
61
+ - classpath/embulk-parser-apache-log-0.1.0.jar
62
+ homepage: https://github.com/hiroyuki-sato/embulk-parser-apache-log
63
+ licenses:
64
+ - MIT
65
+ metadata: {}
66
+ post_install_message:
67
+ rdoc_options: []
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - '>='
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project:
82
+ rubygems_version: 2.1.9
83
+ signing_key:
84
+ specification_version: 4
85
+ summary: Apache Log parser plugin for Embulk
86
+ test_files: []