embulk-parser-regex 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5334bd1ab64c181d169a2eb53c1ae9c0888ee8a2
4
+ data.tar.gz: 03e5d5c3d3183d2ce0bd26cd9dfcf1383fb5a5b3
5
+ SHA512:
6
+ metadata.gz: 3271593ae11fba49ce1b5171342492249ddfdda1f50c1ad9ca0bed0046fbebdba64efb109cd386bae3d9f3dfb1c24d309fa07d5c32b4a0abdb580810a9f30a1b
7
+ data.tar.gz: ebb736e2051d8604e4ec21f9ddcce9aa449dd36e4ac1bf4ccce4a9376220078597a1a4802204213f3c5ede5c9971f39b16df6303c1f00df37f851c634108fcb3
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2015 Ken Morishita, Powered By Yumemi Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # Regex parser plugin for Embulk
2
+
3
+ A simple parser plugin that splits each input line into columns using a regular expression.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: parser
8
+ * **Guess supported**: yes
9
+
10
+ ## Configuration
11
+
12
+ - **regex**: regular expression that must use [Named Capturing Group](https://blogs.oracle.com/xuemingshen/entry/named_capturing_group_in_jdk7) (string, required)
13
+ - **columns**: column definition (list of object)
14
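+ - **regexName**: name of the capturing group in `regex` that supplies this column's value. Java group names may only contain `[a-zA-Z0-9]`, so use this option when the column name (e.g. `remote_host`) is not a valid group name (string, default: the column's `name` value)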
15
+ - **skip_if_unmatch**: if `false`, a line that does not match the regex raises a RuntimeException. If `true`, the line is skipped. (boolean, default: `false`)
16
+
17
+ ## Example
18
+
19
+ ```yaml
20
+ in:
21
+ type: any file input plugin type
22
+ parser:
23
+ type: regex
24
+ regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
25
+ columns:
26
+ - {name: remote_host, type: string, regexName: remoteHost}
27
+ - {name: identity, type: string}
28
+ - {name: user, type: string}
29
+ - {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
30
+ - {name: method, type: string}
31
+ - {name: path, type: string}
32
+ - {name: protocol, type: string}
33
+ - {name: status, type: long}
34
+ - {name: size, type: long}
35
+ - {name: referer, type: string}
36
+ - {name: user_agent, type: string, regexName: userAgent}
37
+ - {name: in_byte, type: long, regexName: inByte}
38
+ - {name: out_byte, type: long, regexName: outByte}
39
+ ```
40
+
41
+ ### Guess
42
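+ The following Apache LogFormats can be guessed: common, combined, combinedio, and combined/combinedio prefixed with an X-Forwarded-For field.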
43
+ After writing `in:` section, you can let embulk guess `parser:` section using this command:
44
+
45
+
46
+ ```
47
+ $ embulk gem install embulk-parser-regex
48
+ $ embulk guess -g regex config.yml -o guessed.yml
49
+ ```
50
+
51
+ ## Build
52
+
53
+ ```
54
+ $ ./gradlew gem # -t to watch change of files and rebuild continuously
55
+ ```
data/build.gradle ADDED
@@ -0,0 +1,74 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ }
6
+
7
+ import com.github.jrubygradle.JRubyExec
8
+ repositories {
9
+ mavenCentral()
10
+ jcenter()
11
+ }
12
+ configurations {
13
+ provided
14
+ }
15
+
16
+ version = "0.1.0"
17
+
18
+ dependencies {
19
+ compile "org.embulk:embulk-core:0.7.4"
20
+ provided "org.embulk:embulk-core:0.7.4"
21
+ // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
22
+ testCompile "junit:junit:4.+"
23
+ }
24
+
25
+ task classpath(type: Copy, dependsOn: ["jar"]) {
26
+ doFirst { file("classpath").deleteDir() }
27
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
28
+ into "classpath"
29
+ }
30
+ clean { delete "classpath" }
31
+
32
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
33
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
34
+ script "${project.name}.gemspec"
35
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
36
+ }
37
+
38
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
39
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
40
+ script "pkg/${project.name}-${project.version}.gem"
41
+ }
42
+
43
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
44
+ println "> Build succeeded."
45
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
46
+ }
47
+
48
+ task gemspec {
49
+ ext.gemspecFile = file("${project.name}.gemspec")
50
+ inputs.file "build.gradle"
51
+ outputs.file gemspecFile
52
+ doLast { gemspecFile.write($/
53
+ Gem::Specification.new do |spec|
54
+ spec.name = "${project.name}"
55
+ spec.version = "${project.version}"
56
+ spec.authors = ["Ken Morishita"]
57
+ spec.summary = %[Regex parser plugin for Embulk]
58
+ spec.description = %[Parses lines using regular-expression in files read by other file input plugins.]
59
+ spec.email = ["mokemokechicken@gmail.com"]
60
+ spec.licenses = ["MIT"]
61
+ spec.homepage = "https://github.com/mokemokechicken/embulk-parser-regex"
62
+
63
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
64
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
65
+ spec.require_paths = ["lib"]
66
+
67
+ spec.add_development_dependency 'bundler', ['~> 1.0']
68
+ spec.add_development_dependency 'rake', ['~> 10.0']
69
+ end
70
+ /$)
71
+ }
72
+ }
73
+ clean { delete "${project.name}.gemspec" }
74
+
@@ -0,0 +1,49 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module external.linked.project.id="embulk-parser-regex" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" external.system.module.group="" external.system.module.version="0.1.0" type="JAVA_MODULE" version="4">
3
+ <component name="NewModuleRootManager" inherit-compiler-output="false">
4
+ <output url="file://$MODULE_DIR$/build/classes/main" />
5
+ <output-test url="file://$MODULE_DIR$/build/classes/test" />
6
+ <exclude-output />
7
+ <content url="file://$MODULE_DIR$">
8
+ <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
9
+ <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
10
+ <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
11
+ <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
12
+ <excludeFolder url="file://$MODULE_DIR$/.gradle" />
13
+ <excludeFolder url="file://$MODULE_DIR$/build" />
14
+ </content>
15
+ <orderEntry type="inheritedJdk" />
16
+ <orderEntry type="sourceFolder" forTests="false" />
17
+ <orderEntry type="library" name="Gradle: org.embulk:embulk-core:0.7.4" level="project" />
18
+ <orderEntry type="library" name="Gradle: com.google.guava:guava:18.0" level="project" />
19
+ <orderEntry type="library" name="Gradle: com.google.inject:guice:4.0" level="project" />
20
+ <orderEntry type="library" name="Gradle: com.google.inject.extensions:guice-multibindings:4.0" level="project" />
21
+ <orderEntry type="library" name="Gradle: javax.inject:javax.inject:1" level="project" />
22
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-annotations:2.5.3" level="project" />
23
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-core:2.5.3" level="project" />
24
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-databind:2.5.3" level="project" />
25
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-guava:2.5.3" level="project" />
26
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-joda:2.5.3" level="project" />
27
+ <orderEntry type="library" name="Gradle: com.fasterxml.jackson.module:jackson-module-guice:2.5.3" level="project" />
28
+ <orderEntry type="library" name="Gradle: ch.qos.logback:logback-classic:1.1.3" level="project" />
29
+ <orderEntry type="library" name="Gradle: org.slf4j:slf4j-api:1.7.12" level="project" />
30
+ <orderEntry type="library" name="Gradle: org.jruby:jruby-complete:9.0.0.0" level="project" />
31
+ <orderEntry type="library" name="Gradle: com.google.code.findbugs:annotations:3.0.0" level="project" />
32
+ <orderEntry type="library" name="Gradle: org.yaml:snakeyaml:1.14" level="project" />
33
+ <orderEntry type="library" name="Gradle: javax.validation:validation-api:1.1.0.Final" level="project" />
34
+ <orderEntry type="library" name="Gradle: org.apache.bval:bval-jsr303:0.5" level="project" />
35
+ <orderEntry type="library" name="Gradle: io.airlift:slice:0.9" level="project" />
36
+ <orderEntry type="library" name="Gradle: joda-time:joda-time:2.8.1" level="project" />
37
+ <orderEntry type="library" name="Gradle: io.netty:netty-buffer:5.0.0.Alpha1" level="project" />
38
+ <orderEntry type="library" name="Gradle: org.fusesource.jansi:jansi:1.11" level="project" />
39
+ <orderEntry type="library" name="Gradle: com.ibm.icu:icu4j:54.1.1" level="project" />
40
+ <orderEntry type="library" name="Gradle: aopalliance:aopalliance:1.0" level="project" />
41
+ <orderEntry type="library" name="Gradle: ch.qos.logback:logback-core:1.1.3" level="project" />
42
+ <orderEntry type="library" name="Gradle: org.apache.bval:bval-core:0.5" level="project" />
43
+ <orderEntry type="library" name="Gradle: org.apache.commons:commons-lang3:3.1" level="project" />
44
+ <orderEntry type="library" name="Gradle: io.netty:netty-common:5.0.0.Alpha1" level="project" />
45
+ <orderEntry type="library" name="Gradle: commons-beanutils:commons-beanutils-core:1.8.3" level="project" />
46
+ <orderEntry type="library" scope="TEST" name="Gradle: junit:junit:4.12" level="project" />
47
+ <orderEntry type="library" scope="TEST" name="Gradle: org.hamcrest:hamcrest-core:1.3" level="project" />
48
+ </component>
49
+ </module>
Binary file
@@ -0,0 +1,6 @@
1
+ #Sun Aug 30 14:56:03 JST 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-all.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,141 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # Guess plugin for the regex parser. It makes this command work:
5
+ # $ embulk guess -g "regex" partial-config.yml
6
+ #
7
+ # This is a line guess (LineGuessPlugin): the sample lines are matched
8
+ # against regexes for several Apache access-log formats, and the first
9
+ # format that matches every line is returned as the parser configuration.
10
+
11
# Line-based guess plugin registered under the name "regex".
#
# It builds one RegexApacheLogGuesser per supported Apache log layout and
# reports the first one whose regex matches every sample line.
class Regex < LineGuessPlugin
  Plugin.register_guess("regex", self)

  # Returns {"parser" => <guessed parser config>} for the first candidate
  # layout that matches all of +sample_lines+, or an empty hash when none
  # of the candidates matches.
  def guess_lines(config, sample_lines)
    # Candidates are tried in this order; the first full match wins.
    candidates = [
      apache_common(config, sample_lines),
      apache_combined(config, sample_lines),
      apache_combinedio(config, sample_lines),
      apache_x_forwarded_for + apache_combined(config, sample_lines),
      apache_x_forwarded_for + apache_combinedio(config, sample_lines),
    ]
    winner = candidates.find {|candidate| candidate.match_all?(sample_lines)}
    winner ? {"parser" => winner.guessed} : {}
  end

  # Leading field holding an IP address or "-"; meant to be prepended to
  # another layout with RegexApacheLogGuesser#+.
  def apache_x_forwarded_for
    guesser = RegexApacheLogGuesser.new
    guesser.ip_or_minus(:x_forwarded_for, regexName: 'forwardedFor')
  end

  # Layout: host, identity, user, [datetime], "request", status, size.
  def apache_common(config, sample_lines)
    guesser = RegexApacheLogGuesser.new
    guesser.ip(:remote_host, regexName: 'remoteHost')
    guesser.token(:identity)
    guesser.token(:user)
    guesser.kakko(:datetime, format: '%d/%b/%Y:%H:%M:%S %z', type: 'timestamp')
    guesser.method_path_protocol
    guesser.integer(:status)
    guesser.integer_or_minus(:size)
  end

  # apache_common followed by the quoted referer and user-agent fields.
  def apache_combined(config, sample_lines)
    guesser = apache_common(config, sample_lines)
    guesser.string(:referer)
    guesser.string(:user_agent, regexName: 'userAgent')
  end

  # apache_combined followed by the two integer byte-count fields.
  def apache_combinedio(config, sample_lines)
    guesser = apache_combined(config, sample_lines)
    guesser.integer(:in_byte, regexName: 'inByte')
    guesser.integer(:out_byte, regexName: 'outByte')
  end
end
50
+
51
# Builds a regular expression, and the matching list of column definitions,
# for a space-separated access-log layout one field at a time.
#
# Every field method appends one regex fragment plus its column
# definition(s) and returns +self+, so calls can be chained. Fragments are
# joined with a single space and anchored with ^ and $ (see #pattern_str).
class RegexApacheLogGuesser
  attr_reader :columns, :patterns

  # patterns:: initial list of regex fragments (defaults to an empty list)
  # columns::  initial list of column definition hashes (defaults to empty)
  def initialize(patterns=nil, columns=nil)
    @patterns = (patterns || [])
    @columns = (columns || [])
  end

  # Returns a new guesser whose fields are this guesser's fields followed by
  # +guesser+'s fields. Neither operand is modified.
  def +(guesser)
    RegexApacheLogGuesser.new(@patterns + guesser.patterns, @columns + guesser.columns)
  end

  # True when every line in +lines+ matches the built regex.
  #
  # An empty (or nil) sample is never a match: Enumerable#all? is vacuously
  # true on an empty list, which would otherwise report a format as guessed
  # without any evidence.
  def match_all?(lines)
    return false if lines.nil? || lines.empty?
    ptn = compile
    lines.all? {|line| ptn.match(line)}
  end

  # Parser configuration hash with the keys "type", "regex" and "columns".
  def guessed
    ret = {}
    ret["type"] = "regex"
    ret["regex"] = pattern_str
    ret["columns"] = columns
    ret
  end

  # The built pattern compiled as a Ruby Regexp.
  def compile
    Regexp.compile pattern_str
  end

  # The full regex source: all fragments joined by one space, wrapped in ^ $.
  def pattern_str
    '^' + @patterns.join(' ') + '$'
  end

  # Field made of digits, dots and colons (IPv4/IPv6-looking address).
  def ip(name, opts={})
    append_field(name, 'string', opts) {|group| "(?<#{group}>[.:0-9]+)"}
  end

  # Like #ip, but a lone "-" is accepted as well.
  def ip_or_minus(name, opts={})
    append_field(name, 'string', opts) {|group| "(?<#{group}>[.:0-9]+|-)"}
  end

  # Field made of one or more non-whitespace characters.
  def token(name, opts={})
    append_field(name, 'string', opts) {|group| "(?<#{group}>\\S+)"}
  end

  # Double-quoted field; the captured value excludes the quotes.
  def string(name, opts={})
    append_field(name, 'string', opts) {|group| "\"(?<#{group}>[^\"]*)\""}
  end

  # Double-quoted field whose content may also be a lone "-".
  def string_or_minus(name, opts={})
    append_field(name, 'string', opts) {|group| "\"(?<#{group}>[^\"]*|-)\""}
  end

  # Unsigned integer field; the column type defaults to 'long'.
  def integer(name, opts={})
    append_field(name, 'long', opts) {|group| "(?<#{group}>[0-9]+)"}
  end

  # Like #integer, but a lone "-" is accepted as well.
  def integer_or_minus(name, opts={})
    append_field(name, 'long', opts) {|group| "(?<#{group}>[0-9]+|-)"}
  end

  # Field wrapped in square brackets ("kakko" is Japanese for brackets);
  # the captured value excludes the brackets.
  def kakko(name, opts={})
    append_field(name, 'string', opts) {|group| "\\[(?<#{group}>[^\\]]*)\\]"}
  end

  # Quoted request line: either `METHOD PATH HTTP/x.y` or a lone "-".
  # Adds the three string columns 'method', 'path' and 'protocol'.
  def method_path_protocol
    @patterns << '"((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)"'
    @columns << {:name => 'method', :type => 'string'}
    @columns << {:name => 'path', :type => 'string'}
    @columns << {:name => 'protocol', :type => 'string'}
    self
  end

  private

  # Appends one field. The capturing group is named opts[:regexName] when
  # given, otherwise +name+; the block receives that group name and returns
  # the regex fragment. +opts+ is merged over the default column definition,
  # so it can override :type and add keys such as :format or :regexName.
  def append_field(name, default_type, opts)
    group = opts[:regexName] || name
    @patterns << yield(group)
    @columns << {:name => name, :type => default_type}.merge(opts)
    self
  end
end
140
+ end
141
+ end
@@ -0,0 +1,3 @@
1
# Register the Java-implemented parser under the type name "regex".
# The third argument is the gem's classpath directory, resolved relative
# to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)
Embulk::JavaPlugin.register_parser("regex", "org.embulk.parser.regex.RegexParserPlugin", classpath_dir)
@@ -0,0 +1,23 @@
1
+ in:
2
+ type: file
3
+ path_prefix: sample/apache_1/data_apache_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: LF
7
+ type: regex
8
+ regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
9
+ columns:
10
+ - {name: remote_host, type: string, regexName: remoteHost}
11
+ - {name: identity, type: string}
12
+ - {name: user, type: string}
13
+ - {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
14
+ - {name: method, type: string}
15
+ - {name: path, type: string}
16
+ - {name: protocol, type: string}
17
+ - {name: status, type: long}
18
+ - {name: size, type: long}
19
+ - {name: referer, type: string}
20
+ - {name: user_agent, type: string, regexName: userAgent}
21
+ - {name: in_byte, type: long, regexName: inByte}
22
+ - {name: out_byte, type: long, regexName: outByte}
23
+ out: {type: stdout}
@@ -0,0 +1,2 @@
1
+ 172.16.111.11 - - [30/Aug/2015:00:01:27 +0000] "GET /news/checkNews HTTP/1.1" 200 89 "-" "check_http/v1.4.16 (nagios-plugins 1.4.16)" 185 300
2
+ 172.31.222.22 - - [30/Aug/2015:00:01:27 +0000] "-" 408 - "-" "-" 0 0
@@ -0,0 +1,9 @@
1
+ in:
2
+ type: file
3
+ path_prefix: sample/apache_1/data_apache_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: LF
7
+ out:
8
+ type: stdout
9
+
@@ -0,0 +1,22 @@
1
+ in:
2
+ type: file
3
+ path_prefix: sample/apache_2/data_apache_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: LF
7
+ type: regex
8
+ regex: ^(?<forwardedFor>[.:0-9]+|-) (?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)"$
9
+ columns:
10
+ - {name: x_forwarded_for, type: string, regexName: forwardedFor}
11
+ - {name: remote_host, type: string, regexName: remoteHost}
12
+ - {name: identity, type: string}
13
+ - {name: user, type: string}
14
+ - {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
15
+ - {name: method, type: string}
16
+ - {name: path, type: string}
17
+ - {name: protocol, type: string}
18
+ - {name: status, type: long}
19
+ - {name: size, type: long}
20
+ - {name: referer, type: string}
21
+ - {name: user_agent, type: string, regexName: userAgent}
22
+ out: {type: stdout}
@@ -0,0 +1,2 @@
1
+ - 172.16.111.11 - - [30/Aug/2015:00:01:27 +0000] "GET /news/checkNews HTTP/1.1" 200 89 "-" "check_http/v1.4.16 (nagios-plugins 1.4.16)"
2
+ 172.31.55.99 172.31.222.22 - - [30/Aug/2015:00:01:27 +0000] "-" 408 - "-" "-"
@@ -0,0 +1,9 @@
1
+ in:
2
+ type: file
3
+ path_prefix: sample/apache_2/data_apache_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: LF
7
+ out:
8
+ type: stdout
9
+
@@ -0,0 +1,16 @@
1
+ in:
2
+ type: file
3
+ path_prefix: sample/simple/data_simple_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: LF
7
+ type: regex
8
+ regex: '^(?<name>[^|]+)\|(?<age>[0-9]+)\|(?<time>[^|]+)$'
9
+ skip_if_unmatch: true
10
+ columns:
11
+ - {name: name, type: string}
12
+ - {name: age, type: long}
13
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S %z'}
14
+ out:
15
+ type: stdout
16
+
@@ -0,0 +1,3 @@
1
+ Yamada Taro|20|2015-08-31 23:59:59 +0900
2
+ Error Case|88|
3
+ Suzuki Hanako|19|2015-08-31 00:00:00 +0000
@@ -0,0 +1,139 @@
1
+ package org.embulk.parser.regex;
2
+
3
+ // Many Copies from
4
+ // https://github.com/frsyuki/embulk-parser-msgpack/blob/master/src/main/java/org/embulk/parser/msgpack/MsgpackParserPlugin.java
5
+
6
+ import com.google.common.base.Optional;
7
+ import org.embulk.EmbulkEmbed;
8
+ import org.embulk.config.*;
9
+ import org.embulk.spi.*;
10
+ import org.embulk.spi.time.TimestampFormatter;
11
+ import org.embulk.spi.time.TimestampParser;
12
+ import org.embulk.spi.type.*;
13
+ import org.embulk.spi.util.DynamicColumnSetter;
14
+ import org.embulk.spi.util.LineDecoder;
15
+ import org.embulk.spi.util.Timestamps;
16
+ import org.embulk.spi.util.dynamic.*;
17
+
18
+ import java.util.HashMap;
19
+ import java.util.Map;
20
+ import java.util.regex.Matcher;
21
+ import java.util.regex.Pattern;
22
+
23
+ public class RegexParserPlugin implements ParserPlugin {
24
+
25
+ public interface PluginTask extends Task, LineDecoder.DecoderTask, TimestampParser.Task {
26
+ @Config("regex")
27
+ public String getRegex();
28
+
29
+ @Config("columns")
30
+ public SchemaConfig getSchemaConfig();
31
+
32
+ @Config("skip_if_unmatch")
33
+ @ConfigDefault("false")
34
+ public boolean getSkipIfUnmatch();
35
+ }
36
+
37
+ public interface PluginTaskFormatter
38
+ extends Task, TimestampFormatter.Task {
39
+ }
40
+
41
+ private interface TimestampColumnOption
42
+ extends Task, TimestampFormatter.TimestampColumnOption {
43
+ }
44
+
45
+ public void transaction(ConfigSource config, ParserPlugin.Control control) {
46
+ PluginTask task = config.loadConfig(PluginTask.class);
47
+ control.run(task.dump(), task.getSchemaConfig().toSchema());
48
+ }
49
+
50
+ @Override
51
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
52
+ PluginTask task = taskSource.loadTask(PluginTask.class);
53
+ LineDecoder lineDecoder = new LineDecoder(input, task);
54
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
55
+ TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
56
+
57
+ Pattern pattern = Pattern.compile(task.getRegex());
58
+ Map<String, DynamicColumnSetter> setterMap = setupSetters(pageBuilder, task.getSchemaConfig(),
59
+ timestampParsers, taskSource.loadTask(PluginTaskFormatter.class));
60
+
61
+ while (input.nextFile()) {
62
+ while (true) {
63
+ String line = lineDecoder.poll();
64
+ if (line == null) {
65
+ break;
66
+ }
67
+ Matcher matcher = pattern.matcher(line);
68
+ if (!matcher.matches()) {
69
+ if (task.getSkipIfUnmatch()) {
70
+ // TODO: How to Log?
71
+ continue;
72
+ } else {
73
+ throw new RuntimeException("Unmatched Line: " + line);
74
+ }
75
+ }
76
+
77
+ for (Map.Entry<String, DynamicColumnSetter> pair : setterMap.entrySet()) {
78
+ String value = matcher.group(pair.getKey());
79
+ if (value == null) {
80
+ pair.getValue().setNull();
81
+ } else {
82
+ pair.getValue().set(value);
83
+ }
84
+ }
85
+ pageBuilder.addRecord();
86
+ }
87
+ }
88
+ pageBuilder.finish();
89
+ }
90
+
91
+ private Map<String, DynamicColumnSetter> setupSetters(PageBuilder pageBuilder,
92
+ SchemaConfig schema,
93
+ TimestampParser[] timestampParsers,
94
+ TimestampFormatter.Task formatterTask) {
95
+ Map<String, DynamicColumnSetter> setterMap = new HashMap<>();
96
+
97
+ int index = -1;
98
+ for (ColumnConfig c : schema.getColumns()) {
99
+ index++;
100
+ String name = c.getName();
101
+ Type type = c.getType();
102
+ Column column = c.toColumn(index);
103
+ String regexName = c.getOption().get(String.class, "regexName", name);
104
+
105
+ DefaultValueSetter defaultValue = new NullDefaultValueSetter();
106
+ DynamicColumnSetter setter;
107
+
108
+ if (type instanceof BooleanType) {
109
+ setter = new BooleanColumnSetter(pageBuilder, column, defaultValue);
110
+
111
+ } else if (type instanceof LongType) {
112
+ setter = new LongColumnSetter(pageBuilder, column, defaultValue);
113
+
114
+ } else if (type instanceof DoubleType) {
115
+ setter = new DoubleColumnSetter(pageBuilder, column, defaultValue);
116
+
117
+ } else if (type instanceof StringType) {
118
+ TimestampFormatter formatter = new TimestampFormatter(formatterTask,
119
+ Optional.of(c.getOption().loadConfig(TimestampColumnOption.class)));
120
+ setter = new StringColumnSetter(pageBuilder, column, defaultValue, formatter);
121
+
122
+ } else if (type instanceof TimestampType) {
123
+ // TODO use flexible time format like Ruby's Time.parse
124
+ TimestampParser parser = timestampParsers[column.getIndex()];
125
+ setter = new TimestampColumnSetter(pageBuilder, column, defaultValue, parser);
126
+
127
+ } else {
128
+ throw new ConfigException("Unknown column type: " + type);
129
+ }
130
+ setterMap.put(regexName, setter);
131
+ }
132
+
133
+ return setterMap;
134
+ }
135
+
136
+ }
137
+
138
+
139
+
@@ -0,0 +1,5 @@
1
+ package org.embulk.parser.regex;
2
+
3
/** Placeholder test class for RegexParserPlugin; it declares no test cases. */
public class TestRegexParserPlugin {
}
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-regex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ken Morishita
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses lines using regular-expression in files read by other file input
42
+ plugins.
43
+ email:
44
+ - mokemokechicken@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - LICENSE.txt
51
+ - README.md
52
+ - build.gradle
53
+ - classpath/embulk-parser-regex-0.1.0.jar
54
+ - embulk-parser-regex.iml
55
+ - gradle/wrapper/gradle-wrapper.jar
56
+ - gradle/wrapper/gradle-wrapper.properties
57
+ - gradlew
58
+ - gradlew.bat
59
+ - lib/embulk/guess/regex.rb
60
+ - lib/embulk/parser/regex.rb
61
+ - sample/apache_1/config.yml
62
+ - sample/apache_1/data_apache_1.txt
63
+ - sample/apache_1/pre_config.yml
64
+ - sample/apache_2/config.yml
65
+ - sample/apache_2/data_apache_1.txt
66
+ - sample/apache_2/pre_config.yml
67
+ - sample/simple/config_simple.yml
68
+ - sample/simple/data_simple_1.txt
69
+ - src/main/java/org/embulk/parser/regex/RegexParserPlugin.java
70
+ - src/test/java/org/embulk/parser/regex/TestRegexParserPlugin.java
71
+ homepage: https://github.com/mokemokechicken/embulk-parser-regex
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.2.2
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Regex parser plugin for Embulk
95
+ test_files: []