embulk-parser-joni_regexp 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +7 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +107 -0
  6. data/build.gradle +101 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/example/apache.txt +5000 -0
  10. data/example/apache.yml +18 -0
  11. data/example/conf.yml +12 -0
  12. data/example/conf2.yml +12 -0
  13. data/example/conf3.yml +12 -0
  14. data/example/seed.yml +8 -0
  15. data/example/test.txt +8 -0
  16. data/example/test2.txt +3 -0
  17. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  18. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  19. data/gradlew +169 -0
  20. data/gradlew.bat +84 -0
  21. data/lib/embulk/guess/joni_regexp.rb +28 -0
  22. data/lib/embulk/parser/joni_regexp.rb +3 -0
  23. data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
  24. data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
  25. data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
  26. data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
  27. data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
  28. data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
  29. data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
  30. data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
  31. data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
  32. data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
  33. data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
  34. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
  35. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
  36. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
  37. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
  38. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
  39. metadata +112 -0
@@ -0,0 +1,18 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/apache.txt
4
+ parser:
5
+ type: joni_regexp
6
+ columns: # every column name below also appears as a named capture group in format
7
+ - { name: host, type: string }
8
+ - { name: user, type: string }
9
+ - { name: time, type: timestamp, format: "%d/%b/%Y:%H:%M:%S %z" }
10
+ - { name: method, type: string }
11
+ - { name: path, type: string }
12
+ - { name: code, type: string }
13
+ - { name: size, type: string }
14
+ - { name: referer, type: string }
15
+ - { name: agent, type: string }
16
+ format: '^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^ ]*) +\S*)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$' # path and the referer/agent pair sit inside optional (?: ... )? groups
17
+ out:
18
+ type: stdout
@@ -0,0 +1,12 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/test.txt
4
+ parser:
5
+ type: joni_regexp
6
+ columns:
7
+ - { name: name2, type: string } # NOTE(review): format only defines the groups name and fuga, nothing is called name2 - confirm this mismatch is intended
8
+ - { name: fuga, type: string }
9
+ # format: "(?<hoge>aaa*),(?<fuga>bbb*),(?<test>ccc)"
10
+ format: '(?<name>a*) (?<fuga>bbbb*)' # two space-separated named groups: name and fuga
11
+ out:
12
+ type: stdout
@@ -0,0 +1,12 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/test2.txt
4
+ parser:
5
+ type: joni_regexp
6
+ columns: # column names below match the capture group names in format
7
+ - { name: name, type: string }
8
+ - { name: birth, type: timestamp, format: "%Y-%m-%d" }
9
+ - { name: age, type: long }
10
+ format: '(?<name>[^,]+),(?<birth>\d{4}-\d{2}-\d{2}),(?<age>\d+)' # single-quoted YAML scalar, so each backslash is passed through literally
11
+ out:
12
+ type: stdout
@@ -0,0 +1,12 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/test2.txt
4
+ parser:
5
+ type: joni_regexp
6
+ columns:
7
+ - { name: name, type: string } # NOTE(review): the first group in format is named hoge, not name - confirm this mismatch is intended
8
+ - { name: birth, type: timestamp, format: "%Y-%m-%d" }
9
+ - { name: age, type: long }
10
+ format: "(?<hoge>[^,]+),(?<birth>\\d{4}-\\d{2}-\\d{2}),(?<age>\\d+)" # double-quoted YAML scalar, so \\ stands for one backslash
11
+ out:
12
+ type: stdout
@@ -0,0 +1,8 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/test2.txt
4
+ parser:
5
+ type: joni_regexp # no columns key in this file - presumably a seed whose columns are to be guessed from format; confirm
6
+ format: "(?<name>[^,]+),(?<birth>\\d{4}-\\d{2}-\\d{2}),(?<age>\\d+)" # named groups: name, birth, age
7
+ out:
8
+ type: stdout
@@ -0,0 +1,8 @@
1
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
2
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
3
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
4
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
5
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
6
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
7
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
8
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbb
@@ -0,0 +1,3 @@
1
+ John,2010-02-01,7
2
+ Mike,2000-12-31,27
3
+ Bob,1999-02-01,18
@@ -0,0 +1,6 @@
1
+ #Sun Jan 08 00:35:58 PST 2017
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-bin.zip
data/gradlew ADDED
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Attempt to set APP_HOME
10
+ # Resolve links: $0 may be a link
11
+ PRG="$0"
12
+ # Need this for relative symlinks.
13
+ while [ -h "$PRG" ] ; do
14
+ ls=`ls -ld "$PRG"`
15
+ link=`expr "$ls" : '.*-> \(.*\)$'`
16
+ if expr "$link" : '/.*' > /dev/null; then
17
+ PRG="$link"
18
+ else
19
+ PRG=`dirname "$PRG"`"/$link"
20
+ fi
21
+ done
22
+ SAVED="`pwd`"
23
+ cd "`dirname \"$PRG\"`/" >/dev/null
24
+ APP_HOME="`pwd -P`"
25
+ cd "$SAVED" >/dev/null
26
+
27
+ APP_NAME="Gradle"
28
+ APP_BASE_NAME=`basename "$0"`
29
+
30
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31
+ DEFAULT_JVM_OPTS=""
32
+
33
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
34
+ MAX_FD="maximum"
35
+
36
+ warn ( ) { # echo all arguments, joined into one string, to stdout; does not exit
37
+ echo "$*"
38
+ }
39
+
40
+ die ( ) { # print the message surrounded by blank lines, then terminate the script
41
+ echo
42
+ echo "$*"
43
+ echo
44
+ exit 1 # exit status 1
45
+ }
46
+
47
+ # OS specific support (must be 'true' or 'false').
48
+ cygwin=false
49
+ msys=false
50
+ darwin=false
51
+ nonstop=false
52
+ case "`uname`" in
53
+ CYGWIN* )
54
+ cygwin=true
55
+ ;;
56
+ Darwin* )
57
+ darwin=true
58
+ ;;
59
+ MINGW* )
60
+ msys=true
61
+ ;;
62
+ NONSTOP* )
63
+ nonstop=true
64
+ ;;
65
+ esac
66
+
67
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68
+
69
+ # Determine the Java command to use to start the JVM.
70
+ if [ -n "$JAVA_HOME" ] ; then
71
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72
+ # IBM's JDK on AIX uses strange locations for the executables
73
+ JAVACMD="$JAVA_HOME/jre/sh/java"
74
+ else
75
+ JAVACMD="$JAVA_HOME/bin/java"
76
+ fi
77
+ if [ ! -x "$JAVACMD" ] ; then
78
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79
+
80
+ Please set the JAVA_HOME variable in your environment to match the
81
+ location of your Java installation."
82
+ fi
83
+ else
84
+ JAVACMD="java"
85
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86
+
87
+ Please set the JAVA_HOME variable in your environment to match the
88
+ location of your Java installation."
89
+ fi
90
+
91
+ # Increase the maximum file descriptors if we can.
92
+ if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93
+ MAX_FD_LIMIT=`ulimit -H -n`
94
+ if [ $? -eq 0 ] ; then
95
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96
+ MAX_FD="$MAX_FD_LIMIT"
97
+ fi
98
+ ulimit -n $MAX_FD
99
+ if [ $? -ne 0 ] ; then
100
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
101
+ fi
102
+ else
103
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104
+ fi
105
+ fi
106
+
107
+ # For Darwin, add options to specify how the application appears in the dock
108
+ if $darwin; then
109
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110
+ fi
111
+
112
+ # For Cygwin, switch paths to Windows format before running java
113
+ if $cygwin ; then
114
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116
+ JAVACMD=`cygpath --unix "$JAVACMD"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() { # store the received arguments in the JVM_OPTS array, one element per argument
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
165
+ if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then
166
+ cd "$(dirname "$0")"
167
+ fi
168
+
169
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
@@ -0,0 +1,84 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ set DIRNAME=%~dp0
12
+ if "%DIRNAME%" == "" set DIRNAME=.
13
+ set APP_BASE_NAME=%~n0
14
+ set APP_HOME=%DIRNAME%
15
+
16
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
+ set DEFAULT_JVM_OPTS=
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windows variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+
53
+ :win9xME_args
54
+ @rem Slurp the command line arguments.
55
+ set CMD_LINE_ARGS=
56
+ set _SKIP=2
57
+
58
+ :win9xME_args_slurp
59
+ if "x%~1" == "x" goto execute
60
+
61
+ set CMD_LINE_ARGS=%*
62
+
63
+ :execute
64
+ @rem Setup the command line
65
+
66
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
+
68
+ @rem Execute Gradle
69
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
+
71
+ :end
72
+ @rem End local scope for the variables with windows NT shell
73
+ if "%ERRORLEVEL%"=="0" goto mainEnd
74
+
75
+ :fail
76
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
+ rem the _cmd.exe /c_ return code!
78
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
+ exit /b 1
80
+
81
+ :mainEnd
82
+ if "%OS%"=="Windows_NT" endlocal
83
+
84
+ :omega
@@ -0,0 +1,28 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ class JoniRegexp < GuessPlugin
5
+ Plugin.register_guess("joni_regexp", self)
6
+
7
+ def guess(config, sample_buffer)
8
+ parser_config = config["parser"]
9
+ return {} unless parser_config
10
+ format = parser_config["format"]
11
+ raise StandardError,"Guess need string `format` parameter" unless format.kind_of?(String)
12
+ # raise ConfigError,"Guess need string `format` parameter" unless format.kind_of?(String)
13
+ guessed = {}
14
+ begin
15
+ regex = Regexp.new(format)
16
+ columns = []
17
+ guessed["type"] = "joni_regexp"
18
+ guessed["format"] = format
19
+ columns = regex.names.map{ |x| {'name' => x, 'type' => 'string'} }
20
+ guessed["columns"] = columns
21
+ return {"parser" => guessed}
22
+ rescue
23
+ return {}
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_parser( # registers the Java class named below under the parser type "joni_regexp"
2
+ "joni_regexp", "org.embulk.parser.joni_regexp.JoniRegexpParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__)) # classpath location, resolved relative to this file's path
@@ -0,0 +1,97 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import org.embulk.parser.joni_regexp.cast.BooleanCast;
4
+ import org.embulk.parser.joni_regexp.cast.DoubleCast;
5
+ import org.embulk.parser.joni_regexp.cast.JsonCast;
6
+ import org.embulk.parser.joni_regexp.cast.LongCast;
7
+ import org.embulk.parser.joni_regexp.cast.StringCast;
8
+ import org.embulk.spi.DataException;
9
+ import org.embulk.spi.time.Timestamp;
10
+ import org.embulk.spi.time.TimestampParser;
11
+ import org.msgpack.value.Value;
12
+
13
+ /**
14
+  * Static helpers that convert a msgpack {@link Value} to a column type, dispatching on the value's kind.
15
+  */
16
+ class ColumnCaster
17
+ {
18
+     private ColumnCaster() {}
19
+
20
+     /** Casts to boolean: booleans are returned as-is, every other kind is delegated to its *Cast helper. */
21
+     public static boolean asBoolean(Value value) throws DataException
22
+     {
23
+         if (value.isBooleanValue()) {
24
+             return value.asBooleanValue().getBoolean();
25
+         }
26
+         if (value.isIntegerValue()) {
27
+             return LongCast.asBoolean(value.asIntegerValue().asLong());
28
+         }
29
+         if (value.isFloatValue()) {
30
+             return DoubleCast.asBoolean(value.asFloatValue().toDouble());
31
+         }
32
+         if (value.isStringValue()) {
33
+             return StringCast.asBoolean(value.asStringValue().asString());
34
+         }
35
+         return JsonCast.asBoolean(value);
36
+     }
37
+
38
+     /** Casts to long: integers are returned as-is, every other kind is delegated to its *Cast helper. */
39
+     public static long asLong(Value value) throws DataException
40
+     {
41
+         if (value.isBooleanValue()) {
42
+             return BooleanCast.asLong(value.asBooleanValue().getBoolean());
43
+         }
44
+         if (value.isIntegerValue()) {
45
+             return value.asIntegerValue().asLong();
46
+         }
47
+         if (value.isFloatValue()) {
48
+             return DoubleCast.asLong(value.asFloatValue().toDouble());
49
+         }
50
+         if (value.isStringValue()) {
51
+             return StringCast.asLong(value.asStringValue().asString());
52
+         }
53
+         return JsonCast.asLong(value);
54
+     }
55
+
56
+     /** Casts to double: floats are returned as-is, every other kind is delegated to its *Cast helper. */
57
+     public static double asDouble(Value value) throws DataException
58
+     {
59
+         if (value.isBooleanValue()) {
60
+             return BooleanCast.asDouble(value.asBooleanValue().getBoolean());
61
+         }
62
+         if (value.isIntegerValue()) {
63
+             return LongCast.asDouble(value.asIntegerValue().asLong());
64
+         }
65
+         if (value.isFloatValue()) {
66
+             return value.asFloatValue().toDouble();
67
+         }
68
+         if (value.isStringValue()) {
69
+             return StringCast.asDouble(value.asStringValue().asString());
70
+         }
71
+         return JsonCast.asDouble(value);
72
+     }
73
+
74
+     /** Returns {@code value.toString()} for any kind of value. */
75
+     public static String asString(Value value) throws DataException
76
+     {
77
+         return value.toString();
78
+     }
79
+
80
+     /** Casts to a timestamp via the *Cast helper for the value's kind; only the string branch uses {@code parser}. */
81
+     public static Timestamp asTimestamp(Value value, TimestampParser parser) throws DataException
82
+     {
83
+         if (value.isBooleanValue()) {
84
+             return BooleanCast.asTimestamp(value.asBooleanValue().getBoolean());
85
+         }
86
+         if (value.isIntegerValue()) {
87
+             return LongCast.asTimestamp(value.asIntegerValue().asLong());
88
+         }
89
+         if (value.isFloatValue()) {
90
+             return DoubleCast.asTimestamp(value.asFloatValue().toDouble());
91
+         }
92
+         if (value.isStringValue()) {
93
+             return StringCast.asTimestamp(value.asStringValue().asString(), parser);
94
+         }
95
+         return JsonCast.asTimestamp(value);
96
+     }
97
+ }