embulk-parser-msgpack 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c714192fba25bbfb20964048784d77afd5f19b34
4
+ data.tar.gz: e200b96c3d623fb750b803fc80cde5b5a5ba41ea
5
+ SHA512:
6
+ metadata.gz: f0737548867e3171551cb58e7585cda84d98d12a60518e3aa390e13f54afec0a456947c2e70b4a95a842112b50dc099e559856844185af2c19df3390a6495288
7
+ data.tar.gz: f592867d5a4216101605a800ac0c84ce5d00b127d8850e8279011476e675e7ed2573523cc90aaf132a577877f043664b294944ca21a29ddef67efa96bd6c5945
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
data/COPYING ADDED
@@ -0,0 +1,14 @@
1
+ Copyright (C) 2015 Sadayuki Furuhashi
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
data/ChangeLog ADDED
@@ -0,0 +1,4 @@
1
+ Release 0.1.0 - 2015-08-11
2
+
3
+ * The first release
4
+
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # MessagePack parser plugin for Embulk
2
+
3
+ Parses files encoded in MessagePack.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: parser
8
+ * **Guess supported**: yes
9
+
10
+ ## Configuration
11
+
12
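+ - **row_encoding**: how each row is encoded: "array" (values in column order) or "map" (column name to value; "object" is accepted as an alias of "map") (enum, default: map)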
13
+ - **file_encoding**: set "array" if the whole file is one big array whose elements are the rows; set "sequence" if the file is a sequence of rows written one after another (enum, default: sequence)
14
+ - **columns**: list of columns, each with a name and a type (schema, required)
15
+
16
+ ## Example
17
+
18
+ ```yaml
19
+ in:
20
+ type: any file input plugin type
21
+ parser:
22
+ type: msgpack
23
+ row_encoding: map
24
+ file_encoding: sequence
25
+ columns:
26
+ - {index: 0, name: a, type: long}
27
+ - {index: 1, name: b, type: string}
28
+ ```
29
+
30
+ ```
31
+ $ embulk gem install embulk-parser-msgpack
32
+ $ embulk guess -g msgpack config.yml -o guessed.yml
33
+ ```
34
+
35
+ ## Build
36
+
37
+ ```
38
+ $ ./gradlew gem
39
+ ```
data/build.gradle ADDED
@@ -0,0 +1,73 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ }
6
+ import com.github.jrubygradle.JRubyExec
7
+ repositories {
8
+ mavenCentral()
9
+ mavenLocal()
10
+ jcenter()
11
+ }
12
+ configurations {
13
+ provided
14
+ }
15
+
16
+ version = "0.1.0"
17
+
18
+ dependencies {
19
+ compile "org.embulk:embulk-core:0.6.22"
20
+ provided "org.embulk:embulk-core:0.6.22"
21
+ compile "org.msgpack:msgpack-core:0.7.0-M6"
22
+ testCompile "junit:junit:4.+"
23
+ }
24
+
25
+ task classpath(type: Copy, dependsOn: ["jar"]) {
26
+ doFirst { file("classpath").deleteDir() }
27
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
28
+ into "classpath"
29
+ }
30
+ clean { delete "classpath" }
31
+
32
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
33
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
34
+ script "${project.name}.gemspec"
35
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
36
+ }
37
+
38
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
39
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
40
+ script "pkg/${project.name}-${project.version}.gem"
41
+ }
42
+
43
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
44
+ println "> Build succeeded."
45
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
46
+ }
47
+
48
+ task gemspec {
49
+ ext.gemspecFile = file("${project.name}.gemspec")
50
+ inputs.file "build.gradle"
51
+ outputs.file gemspecFile
52
+ doLast { gemspecFile.write($/
53
+ Gem::Specification.new do |spec|
54
+ spec.name = "${project.name}"
55
+ spec.version = "${project.version}"
56
+ spec.authors = ["Sadayuki Furuhashi"]
57
+ spec.summary = %[MessagePack parser plugin for Embulk]
58
+ spec.description = %[Parses files encoded in MessagePack.]
59
+ spec.email = ["frsyuki@gmail.com"]
60
+ spec.licenses = ["Apache 2.0"]
61
+ spec.homepage = "https://github.com/frsyuki/embulk-parser-msgpack"
62
+
63
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
64
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
65
+ spec.require_paths = ["lib"]
66
+
67
+ spec.add_development_dependency 'bundler', ['~> 1.0']
68
+ spec.add_development_dependency 'rake', ['>= 10.0']
69
+ end
70
+ /$)
71
+ }
72
+ }
73
+ clean { delete "${project.name}.gemspec" }
Binary file
Binary file
@@ -0,0 +1,6 @@
1
+ #Wed Feb 04 13:46:12 PST 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.2.1-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,89 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ class Msgpack < GuessPlugin
5
+ Plugin.register_guess("msgpack", self)
6
+
7
+ def guess(config, sample_buffer)
8
+ return {} unless config.fetch("parser", {}).fetch("type", "msgpack") == "msgpack"
9
+
10
+ parser_config = config["parser"] || {}
11
+
12
+ classpath = File.expand_path('../../../../classpath', __FILE__)
13
+ Dir["#{classpath}/*.jar"].each {|jar| require jar }
14
+
15
+ file_encoding = parser_config["file_encoding"]
16
+ row_encoding = parser_config["row_encoding"]
17
+
18
+ if !file_encoding || !row_encoding
19
+ uk = new_unpacker(sample_buffer)
20
+ begin
21
+ n = uk.unpackArrayHeader
22
+ begin
23
+ n = uk.unpackArrayHeader
24
+ file_encoding = "array"
25
+ row_encoding = "array"
26
+ rescue org.msgpack.core.MessageTypeException
27
+ file_encoding = "sequence"
28
+ row_encoding = "array"
29
+ end
30
+ rescue org.msgpack.core.MessageTypeException
31
+ uk = new_unpacker(sample_buffer) # TODO unpackArrayHeader consumes buffer (unexpectedly)
32
+ begin
33
+ n = uk.unpackMapHeader
34
+ file_encoding = "sequence"
35
+ row_encoding = "map"
36
+ rescue org.msgpack.core.MessageTypeException
37
+ return {} # not a msgpack
38
+ end
39
+ end
40
+ end
41
+
42
+ uk = new_unpacker(sample_buffer)
43
+
44
+ case file_encoding
45
+ when "array"
46
+ uk.unpackArrayHeader # skip array header to convert to sequence
47
+ when "sequence"
48
+ # do nothing
49
+ end
50
+
51
+ rows = []
52
+
53
+ begin
54
+ while true
55
+ rows << JSON.parse(uk.unpackValue.toJson)
56
+ end
57
+ rescue java.io.EOFException
58
+ end
59
+
60
+ if rows.size <= 3
61
+ return {}
62
+ end
63
+
64
+ case row_encoding
65
+ when "map"
66
+ schema = Embulk::Guess::SchemaGuess.from_hash_records(rows)
67
+ when "array"
68
+ column_count = rows.map {|r| r.size }.max
69
+ column_names = column_count.times.map {|i| "c#{i}" }
70
+ schema = Embulk::Guess::SchemaGuess.from_array_records(column_names, rows)
71
+ end
72
+
73
+ parser_guessed = {"type" => "msgpack"}
74
+ parser_guessed["row_encoding"] = row_encoding
75
+ parser_guessed["file_encoding"] = file_encoding
76
+ parser_guessed["columns"] = schema
77
+
78
+ return {"parser" => parser_guessed}
79
+
80
+ rescue org.msgpack.core.MessagePackException
81
+ return {}
82
+ end
83
+
84
+ def new_unpacker(sample_buffer)
85
+ org.msgpack.core.MessagePack.newDefaultUnpacker(sample_buffer.to_java_bytes)
86
+ end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_parser(
2
+ "msgpack", "org.embulk.parser.msgpack.MsgpackParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,454 @@
1
+ package org.embulk.parser.msgpack;
2
+
3
+ import java.math.BigInteger;
4
+ import java.util.Map;
5
+ import java.util.TreeMap;
6
+ import java.util.Comparator;
7
+ import java.io.IOException;
8
+ import java.io.EOFException;
9
+ import com.google.common.base.Optional;
10
+ import com.google.common.collect.ImmutableMap;
11
+ import com.fasterxml.jackson.annotation.JsonCreator;
12
+ import com.fasterxml.jackson.annotation.JsonValue;
13
+ import org.msgpack.core.MessageFormat;
14
+ import org.msgpack.core.MessageUnpacker;
15
+ import org.msgpack.core.buffer.MessageBuffer;
16
+ import org.msgpack.core.buffer.MessageBufferInput;
17
+ import org.msgpack.value.ValueType;
18
+ import org.embulk.config.Config;
19
+ import org.embulk.config.ConfigException;
20
+ import org.embulk.config.ConfigDefault;
21
+ import org.embulk.config.ConfigDiff;
22
+ import org.embulk.config.ConfigInject;
23
+ import org.embulk.config.ConfigSource;
24
+ import org.embulk.config.Task;
25
+ import org.embulk.config.TaskSource;
26
+ import org.embulk.spi.Buffer;
27
+ import org.embulk.spi.ParserPlugin;
28
+ import org.embulk.spi.FileInput;
29
+ import org.embulk.spi.PageOutput;
30
+ import org.embulk.spi.Schema;
31
+ import org.embulk.spi.SchemaConfig;
32
+ import org.embulk.spi.Column;
33
+ import org.embulk.spi.ColumnConfig;
34
+ import org.embulk.spi.time.Timestamp;
35
+ import org.embulk.spi.time.TimestampParser;
36
+ import org.embulk.spi.time.TimestampFormatter;
37
+ import org.embulk.spi.PageBuilder;
38
+ import org.embulk.spi.BufferAllocator;
39
+ import org.embulk.spi.type.Type;
40
+ import org.embulk.spi.type.BooleanType;
41
+ import org.embulk.spi.type.LongType;
42
+ import org.embulk.spi.type.DoubleType;
43
+ import org.embulk.spi.type.StringType;
44
+ import org.embulk.spi.type.TimestampType;
45
+ import org.embulk.spi.util.Timestamps;
46
+ import org.embulk.spi.util.DynamicPageBuilder;
47
+ import org.embulk.spi.util.DynamicColumnSetter;
48
+ import org.embulk.spi.util.DynamicColumnSetterFactory;
49
+ import org.embulk.spi.util.dynamic.BooleanColumnSetter;
50
+ import org.embulk.spi.util.dynamic.LongColumnSetter;
51
+ import org.embulk.spi.util.dynamic.DoubleColumnSetter;
52
+ import org.embulk.spi.util.dynamic.StringColumnSetter;
53
+ import org.embulk.spi.util.dynamic.TimestampColumnSetter;
54
+ import org.embulk.spi.util.dynamic.DefaultValueSetter;
55
+ import org.embulk.spi.util.dynamic.NullDefaultValueSetter;
56
+
57
+ public class MsgpackParserPlugin
58
+ implements ParserPlugin
59
+ {
60
+ public interface PluginTask
61
+ extends Task, TimestampParser.Task
62
+ {
63
+ @Config("file_encoding")
64
+ @ConfigDefault("\"sequence\"")
65
+ public FileEncoding getFileEncoding();
66
+
67
+ @Config("row_encoding")
68
+ @ConfigDefault("\"map\"")
69
+ public RowEncoding getRowEncoding();
70
+
71
+ @Config("columns")
72
+ public SchemaConfig getSchemaConfig();
73
+
74
+ @ConfigInject
75
+ public BufferAllocator getBufferAllocator();
76
+ }
77
+
78
+ public static enum FileEncoding
79
+ {
80
+ SEQUENCE("sequence"),
81
+ ARRAY("array");
82
+
83
+ private final String name;
84
+
85
+ private FileEncoding(String name)
86
+ {
87
+ this.name = name;
88
+ }
89
+
90
+ @JsonCreator
91
+ public static FileEncoding of(String name)
92
+ {
93
+ for (FileEncoding enc : FileEncoding.values()) {
94
+ if (enc.toString().equals(name)) {
95
+ return enc;
96
+ }
97
+ }
98
+ throw new ConfigException(String.format("Invalid FileEncoding '%s'. Available options are sequence or array", name));
99
+ }
100
+
101
+ @JsonValue
102
+ @Override
103
+ public String toString()
104
+ {
105
+ return name;
106
+ }
107
+ }
108
+
109
+ public static enum RowEncoding
110
+ {
111
+ ARRAY("array"),
112
+ MAP("map");
113
+
114
+ private final String name;
115
+
116
+ private RowEncoding(String name)
117
+ {
118
+ this.name = name;
119
+ }
120
+
121
+ @JsonCreator
122
+ public static RowEncoding of(String name)
123
+ {
124
+ for (RowEncoding enc : RowEncoding.values()) {
125
+ if (enc.toString().equals(name)) {
126
+ return enc;
127
+ }
128
+ }
129
+ if ("object".equals(name)) {
130
+ // alias of map
131
+ return MAP;
132
+ }
133
+ throw new ConfigException(String.format("Invalid RowEncoding '%s'. Available options are array or map", name));
134
+ }
135
+
136
+ @JsonValue
137
+ @Override
138
+ public String toString()
139
+ {
140
+ return name;
141
+ }
142
+ }
143
+
144
+ public interface PluginTaskFormatter
145
+ extends Task, TimestampFormatter.Task
146
+ { }
147
+
148
+ private interface TimestampColumnOption
149
+ extends Task, TimestampFormatter.TimestampColumnOption
150
+ { }
151
+
152
+ private static class FileInputMessageBufferInput
153
+ implements MessageBufferInput
154
+ {
155
+ private final FileInput input;
156
+
157
+ public FileInputMessageBufferInput(FileInput input)
158
+ {
159
+ this.input = input;
160
+ }
161
+
162
+ @Override
163
+ public MessageBuffer next()
164
+ {
165
+ Buffer b = input.poll();
166
+ if (b == null) {
167
+ throw new EndOfBufferException();
168
+ }
169
+ return MessageBuffer.wrap(b.array()).slice(b.offset(), b.limit());
170
+ }
171
+
172
+ @Override
173
+ public void close()
174
+ {
175
+ input.close();
176
+ }
177
+ }
178
+
179
+ private static class EndOfBufferException
180
+ extends RuntimeException
181
+ {
182
+ public EndOfBufferException()
183
+ {
184
+ super("End of buffer");
185
+ }
186
+ }
187
+
188
+ @Override
189
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
190
+ {
191
+ PluginTask task = config.loadConfig(PluginTask.class);
192
+
193
+ control.run(task.dump(), task.getSchemaConfig().toSchema());
194
+ }
195
+
196
+ @Override
197
+ public void run(TaskSource taskSource, Schema schema,
198
+ FileInput input, PageOutput output)
199
+ {
200
+ PluginTask task = taskSource.loadTask(PluginTask.class);
201
+
202
+ RowEncoding rowEncoding = task.getRowEncoding();
203
+ FileEncoding fileEncoding = task.getFileEncoding();
204
+
205
+ try (MessageUnpacker unpacker = new MessageUnpacker(new FileInputMessageBufferInput(input));
206
+ PageBuilder pageBuilder = new PageBuilder(task.getBufferAllocator(), schema, output)) {
207
+
208
+ TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
209
+ Map<Column, DynamicColumnSetter> setters = newColumnSetters(pageBuilder,
210
+ task.getSchemaConfig(), timestampParsers, taskSource.loadTask(PluginTaskFormatter.class));
211
+
212
+ RowReader reader;
213
+ switch (rowEncoding) {
214
+ case ARRAY:
215
+ reader = new ArrayRowReader(setters);
216
+ break;
217
+ case MAP:
218
+ reader = new MapRowReader(setters);
219
+ break;
220
+ default:
221
+ throw new IllegalArgumentException("Unexpected row encoding");
222
+ }
223
+
224
+ while (input.nextFile()) {
225
+ switch (fileEncoding) {
226
+ case SEQUENCE:
227
+ // do nothing
228
+ break;
229
+ case ARRAY:
230
+ // skip array header to convert array to sequence
231
+ unpacker.unpackArrayHeader();
232
+ break;
233
+ }
234
+
235
+ while (reader.next(unpacker)) {
236
+ pageBuilder.addRecord();
237
+ }
238
+ }
239
+
240
+ pageBuilder.finish();
241
+
242
+ } catch (IOException ex) {
243
+ throw new RuntimeException(ex);
244
+ }
245
+ }
246
+
247
+ private Map<Column, DynamicColumnSetter> newColumnSetters(PageBuilder pageBuilder,
248
+ SchemaConfig schema, TimestampParser[] timestampParsers, TimestampFormatter.Task formatterTask)
249
+ {
250
+ ImmutableMap.Builder<Column, DynamicColumnSetter> builder = ImmutableMap.builder();
251
+ int index = 0;
252
+ for (ColumnConfig c : schema.getColumns()) {
253
+ Column column = c.toColumn(index);
254
+ Type type = column.getType();
255
+
256
+ DefaultValueSetter defaultValue = new NullDefaultValueSetter();
257
+ DynamicColumnSetter setter;
258
+
259
+ if (type instanceof BooleanType) {
260
+ setter = new BooleanColumnSetter(pageBuilder, column, defaultValue);
261
+
262
+ } else if (type instanceof LongType) {
263
+ setter = new LongColumnSetter(pageBuilder, column, defaultValue);
264
+
265
+ } else if (type instanceof DoubleType) {
266
+ setter = new DoubleColumnSetter(pageBuilder, column, defaultValue);
267
+
268
+ } else if (type instanceof StringType) {
269
+ TimestampFormatter formatter = new TimestampFormatter(formatterTask,
270
+ Optional.of(c.getOption().loadConfig(TimestampColumnOption.class)));
271
+ setter = new StringColumnSetter(pageBuilder, column, defaultValue, formatter);
272
+
273
+ } else if (type instanceof TimestampType) {
274
+ // TODO use flexible time format like Ruby's Time.parse
275
+ TimestampParser parser = timestampParsers[column.getIndex()];
276
+ setter = new TimestampColumnSetter(pageBuilder, column, defaultValue, parser);
277
+
278
+ } else {
279
+ throw new ConfigException("Unknown column type: "+type);
280
+ }
281
+
282
+ builder.put(column, setter);
283
+ index++;
284
+ }
285
+ return builder.build();
286
+ }
287
+
288
+ private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);
289
+ private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
290
+
291
+ static void unpackToSetter(MessageUnpacker unpacker, DynamicColumnSetter setter)
292
+ throws IOException
293
+ {
294
+ MessageFormat format = unpacker.getNextFormat();
295
+ switch (format.getValueType()) {
296
+ case NIL:
297
+ unpacker.unpackNil();
298
+ setter.setNull();
299
+ break;
300
+
301
+ case BOOLEAN:
302
+ setter.set(unpacker.unpackBoolean());
303
+ break;
304
+
305
+ case INTEGER:
306
+ if (format == MessageFormat.UINT64) {
307
+ BigInteger bi = unpacker.unpackBigInteger();
308
+ if (0 <= bi.compareTo(LONG_MIN) && bi.compareTo(LONG_MAX) <= 0) {
309
+ setter.set(bi.longValue());
310
+ } else {
311
+ setter.setNull(); // TODO set default value
312
+ }
313
+ } else {
314
+ setter.set(unpacker.unpackLong());
315
+ }
316
+ break;
317
+
318
+ case FLOAT:
319
+ setter.set(unpacker.unpackDouble());
320
+ break;
321
+
322
+ case STRING:
323
+ setter.set(unpacker.unpackString());
324
+ break;
325
+
326
+ case BINARY:
327
+ setter.set(unpacker.unpackString());
328
+ break;
329
+
330
+ case ARRAY:
331
+ case MAP:
332
+ // TODO set json?
333
+ //setter.set(unpacker.unpackValue().toJson());
334
+ unpacker.skipValue();
335
+ setter.setNull();
336
+ break;
337
+
338
+ case EXTENSION:
339
+ unpacker.skipValue();
340
+ setter.setNull();
341
+ break;
342
+ }
343
+ }
344
+
345
+ private interface RowReader
346
+ {
347
+ public boolean next(MessageUnpacker unpacker) throws IOException;
348
+ }
349
+
350
+ private class ArrayRowReader
351
+ implements RowReader
352
+ {
353
+ private final DynamicColumnSetter[] columnSetters;
354
+
355
+ public ArrayRowReader(Map<Column, DynamicColumnSetter> setters)
356
+ {
357
+ this.columnSetters = new DynamicColumnSetter[setters.size()];
358
+ for (Map.Entry<Column, DynamicColumnSetter> pair : setters.entrySet()) {
359
+ columnSetters[pair.getKey().getIndex()] = pair.getValue();
360
+ }
361
+ }
362
+
363
+ public boolean next(MessageUnpacker unpacker) throws IOException
364
+ {
365
+ int n;
366
+ try {
367
+ n = unpacker.unpackArrayHeader();
368
+ } catch (EndOfBufferException ex) {
369
+ // TODO EOFException?
370
+ return false;
371
+ }
372
+ for (int i = 0; i < n; i++) {
373
+ if (i < columnSetters.length) {
374
+ unpackToSetter(unpacker, columnSetters[i]);
375
+ } else {
376
+ unpacker.skipValue();
377
+ }
378
+ }
379
+ return true;
380
+ }
381
+ }
382
+
383
+ private class MapRowReader
384
+ implements RowReader
385
+ {
386
+ private final Map<String, DynamicColumnSetter> columnSetters;
387
+
388
+ public MapRowReader(Map<Column, DynamicColumnSetter> setters)
389
+ {
390
+ this.columnSetters = new TreeMap<>();
391
+ for (Map.Entry<Column, DynamicColumnSetter> pair : setters.entrySet()) {
392
+ columnSetters.put(pair.getKey().getName(), pair.getValue());
393
+ }
394
+ }
395
+
396
+ public boolean next(MessageUnpacker unpacker) throws IOException
397
+ {
398
+ int n;
399
+ try {
400
+ n = unpacker.unpackMapHeader();
401
+ } catch (EndOfBufferException ex) {
402
+ // TODO EOFException?
403
+ return false;
404
+ }
405
+ for (int i = 0; i < n; i++) {
406
+ MessageFormat format = unpacker.getNextFormat();
407
+ if (!format.getValueType().isRawType()) {
408
+ unpacker.skipValue();
409
+ continue;
410
+ }
411
+ MessageBuffer key = unpacker.readPayloadAsReference(unpacker.unpackRawStringHeader());
412
+ DynamicColumnSetter setter = columnSetters.get(key);
413
+ if (setter != null) {
414
+ unpackToSetter(unpacker, setter);
415
+ } else {
416
+ unpacker.skipValue();
417
+ }
418
+ }
419
+ return true;
420
+ }
421
+ }
422
+
423
+ private static class MessageBufferEqualComparator
424
+ implements Comparator<MessageBuffer>
425
+ {
426
+ @Override
427
+ public int compare(MessageBuffer o1, MessageBuffer o2)
428
+ {
429
+ if (o1.size() == o2.size()) {
430
+ int offset = 0;
431
+ int length = o1.size();
432
+ while (length - offset > 8) {
433
+ long a = o1.getLong(offset);
434
+ long b = o2.getLong(offset);
435
+ if (a != b) {
436
+ return (int) (a - b);
437
+ }
438
+ offset += 8;
439
+ }
440
+ while (length - offset > 0) {
441
+ byte a = o1.getByte(offset);
442
+ byte b = o2.getByte(offset);
443
+ if (a != b) {
444
+ return a - b;
445
+ }
446
+ offset += 1;
447
+ }
448
+ return 0;
449
+ } else {
450
+ return o1.size() - o2.size();
451
+ }
452
+ }
453
+ }
454
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.parser;
2
+
3
+ public class TestMsgpackParserPlugin
4
+ {
5
+ }
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-msgpack
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sadayuki Furuhashi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: '1.0'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '10.0'
39
+ prerelease: false
40
+ type: :development
41
+ description: Parses files encoded in MessagePack.
42
+ email:
43
+ - frsyuki@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - COPYING
50
+ - ChangeLog
51
+ - README.md
52
+ - build.gradle
53
+ - gradle/wrapper/gradle-wrapper.jar
54
+ - gradle/wrapper/gradle-wrapper.properties
55
+ - gradlew
56
+ - gradlew.bat
57
+ - lib/embulk/guess/msgpack.rb
58
+ - lib/embulk/parser/msgpack.rb
59
+ - src/main/java/org/embulk/parser/msgpack/MsgpackParserPlugin.java
60
+ - src/test/java/org/embulk/parser/TestMsgpackParserPlugin.java
61
+ - classpath/embulk-parser-msgpack-0.1.0.jar
62
+ - classpath/msgpack-core-0.7.0-M6.jar
63
+ homepage: https://github.com/frsyuki/embulk-parser-msgpack
64
+ licenses:
65
+ - Apache 2.0
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.1.9
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: MessagePack parser plugin for Embulk
87
+ test_files: []