embulk-parser-jsonl 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f98fb168c55e6a4c80f5d098bc72945091fdcf8d
4
- data.tar.gz: 5e85f4e3d5aece158e1b55e381d40ee6eda1d69d
3
+ metadata.gz: 973fe894b7704f01da0d8d0ba2b93eaa3804ea7d
4
+ data.tar.gz: fe6ca73d3100595bd64e95e0e08269490671314e
5
5
  SHA512:
6
- metadata.gz: 6452226f7de14018279af312af6f4105dfd98668506168f3acb4e3139519b6a93353e1bac454c0bc272bca96375c4a6fedb201b59f9b639ca0ec7b7774bd69bb
7
- data.tar.gz: 1fa70b407a8ad5d8fa37a6467e621aa7e8fd937cedb908d85f3189bcb2afd6506b476c62433bc4ef47ad241b31fc47b46dfc50c77388d504294348540014c09d
6
+ metadata.gz: 74ad34c4f29980e75f36ac88b9160fd784a975a83b906d6d5c30738edab617211c0653f22d0a9c01ad533bbcc92394b3ceccad20dcc8805b1627298695b5d6ac
7
+ data.tar.gz: dd611c8bff080d4ca1c61a8c071a5baeb2d982b89b2812457af67f9ecaacd5c3e17b42c230356087c26ed53ecb51a7a0728a1fde14c67a441974483a53fcd58b
data/.gitignore CHANGED
@@ -2,4 +2,7 @@
2
2
  /pkg/
3
3
  /tmp/
4
4
  /.bundle/
5
+ build/
6
+ /classpath/
7
+ /.gradle
5
8
  /Gemfile.lock
@@ -0,0 +1,7 @@
1
+ ## 0.1.0 - 2016-02-22
2
+
3
+ Upgrade to Embulk v0.8 and support the JSON type in Java [#3](https://github.com/shun0102/embulk-parser-jsonl/pull/3)
4
+
5
+ ## 0.0.1 - 2015-04-04
6
+
7
+ The first release!!
data/README.md CHANGED
@@ -10,7 +10,7 @@ TODO: Write short description here and embulk-parser-jsonl.gemspec file.
10
10
  ## Configuration
11
11
 
12
12
  - **type**: specify this parser as jsonl
13
- - **schema**: specify column name and type (array, required)
13
+ - **columns**: specify column name and type (array, required)
14
14
 
15
15
  ## Example
16
16
 
@@ -19,7 +19,7 @@ in:
19
19
  type: any file input plugin type
20
20
  parser:
21
21
  type: jsonl
22
- schema:
22
+ columns:
23
23
  - {name: first_name, type: string}
24
24
  - {name: last_name, type: string}
25
25
  - {name: age, type: long}
@@ -35,5 +35,5 @@ $ embulk guess -g jsonl config.yml -o guessed.yml
35
35
  ## Build
36
36
 
37
37
  ```
38
- $ rake
38
+ $ ./gradlew gem classpath
39
39
  ```
@@ -0,0 +1,79 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ id "jacoco"
6
+ }
7
+ import com.github.jrubygradle.JRubyExec
8
+ repositories {
9
+ mavenCentral()
10
+ jcenter()
11
+ }
12
+ configurations {
13
+ provided
14
+ }
15
+
16
+ version = "0.1.0"
17
+
18
+ compileJava.options.encoding = 'UTF-8' // source encoding
19
+ sourceCompatibility = 1.7
20
+ targetCompatibility = 1.7
21
+
22
+ dependencies {
23
+ compile "org.embulk:embulk-core:0.8.2"
24
+ provided "org.embulk:embulk-core:0.8.2"
25
+
26
+ testCompile "junit:junit:4.+"
27
+ testCompile "org.embulk:embulk-core:0.8.2:tests"
28
+ testCompile "org.embulk:embulk-standards:0.8.2"
29
+ }
30
+
31
+ task classpath(type: Copy, dependsOn: ["jar"]) {
32
+ doFirst { file("classpath").deleteDir() }
33
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
34
+ into "classpath"
35
+ }
36
+ clean { delete "classpath" }
37
+
38
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
39
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
40
+ script "${project.name}.gemspec"
41
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
42
+ }
43
+
44
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
45
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
46
+ script "pkg/${project.name}-${project.version}.gem"
47
+ }
48
+
49
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
50
+ println "> Build succeeded."
51
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
52
+ }
53
+
54
+ task gemspec {
55
+ ext.gemspecFile = file("${project.name}.gemspec")
56
+ inputs.file "build.gradle"
57
+ outputs.file gemspecFile
58
+ doLast { gemspecFile.write($/
59
+ Gem::Specification.new do |spec|
60
+ spec.name = "${project.name}"
61
+ spec.version = "${project.version}"
62
+ spec.authors = ["Shunsuke Mikami"]
63
+ spec.summary = "Jsonl parser plugin for Embulk"
64
+ spec.description = "Parses Jsonl files read by other file input plugins."
65
+ spec.email = ["shun0102@gmail.com"]
66
+ spec.licenses = ["MIT"]
67
+ spec.homepage = "https://github.com/shun0102/embulk-parser-jsonl"
68
+
69
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
70
+ spec.test_files = spec.files.grep(%r{^(test|spec)/})
71
+ spec.require_paths = ["lib"]
72
+
73
+ spec.add_development_dependency 'bundler', ['~> 1.0']
74
+ spec.add_development_dependency 'rake', ['~> 10.0']
75
+ end
76
+ /$)
77
+ }
78
+ }
79
+ clean { delete "${project.name}.gemspec" }
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-parser-jsonl"
4
- spec.version = "0.0.1"
4
+ spec.version = "0.1.0"
5
5
  spec.authors = ["Shunsuke Mikami"]
6
6
  spec.summary = "Jsonl parser plugin for Embulk"
7
7
  spec.description = "Parses Jsonl files read by other file input plugins."
@@ -0,0 +1,6 @@
1
+ #Tue Aug 11 00:26:20 PDT 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -1,84 +1,33 @@
1
1
  require 'json'
2
+ require "embulk/parser/jsonl.rb"
2
3
 
3
4
  module Embulk
4
5
  module Guess
6
+ # $ embulk guess -g "jsonl" partial-config.yml
5
7
 
6
- # TODO implement guess plugin to make this command work:
7
- # $ embulk guess -g "jsonl" partial-config.yml
8
- #
9
- # Depending on the file format the plugin uses, you can use choose
10
- # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
11
- # or line guess (LineGuessPlugin).
12
-
13
- require "embulk/parser/jsonl.rb"
14
-
15
- #class JsonlParserGuessPlugin < GuessPlugin
16
- # Plugin.register_guess("jsonl", self)
17
- #
18
- # def guess(config, sample_buffer)
19
- # if sample_buffer[0,2] == GZIP_HEADER
20
- # guessed = {}
21
- # guessed["type"] = "jsonl"
22
- # guessed["property1"] = "guessed-value"
23
- # return {"parser" => guessed}
24
- # else
25
- # return {}
26
- # end
27
- # end
28
- #end
29
-
30
- #class JsonlParserGuessPlugin < TextGuessPlugin
31
- # Plugin.register_guess("jsonl", self)
32
- #
33
- # def guess_text(config, sample_text)
34
- # js = JSON.parse(sample_text) rescue nil
35
- # if js && js["mykeyword"] == "keyword"
36
- # guessed = {}
37
- # guessed["type"] = "jsonl"
38
- # guessed["property1"] = "guessed-value"
39
- # return {"parser" => guessed}
40
- # else
41
- # return {}
42
- # end
43
- # end
44
- #end
45
-
46
- class JsonlParserGuessPlugin < LineGuessPlugin
8
+ class Jsonl < LineGuessPlugin # TODO should use GuessPlugin instead of LineGuessPlugin
47
9
  Plugin.register_guess("jsonl", self)
48
10
 
49
11
  def guess_lines(config, sample_lines)
12
+ #return {} unless config.fetch("parser", {}).fetch("type", "jsonl") == "jsonl"
13
+
14
+ rows = []
15
+
50
16
  columns = {}
51
17
  sample_lines.each do |line|
52
- hash = JSON.parse(line)
53
- hash.each do |k, v|
54
- columns[k] = get_embulk_type(v)
55
- end
18
+ rows << JSON.parse(line)
56
19
  end
57
- schema = []
58
- columns.each do |k,v|
59
- schema << {'name' => k, 'type' => v}
60
- end
61
- guessed = {}
62
- guessed["type"] = "jsonl"
63
- guessed["schema"] = schema
64
- return {"parser" => guessed}
65
- end
66
20
 
67
- private
21
+ return {} if rows.size <= 3
68
22
 
69
- def get_embulk_type(val)
70
- case val
71
- when TrueClass
72
- return "boolean"
73
- when FalseClass
74
- return "boolean"
75
- when Integer
76
- return "long"
77
- when Float
78
- return "double"
79
- else
80
- return "string"
23
+ columns = Embulk::Guess::SchemaGuess.from_hash_records(rows).map do |c|
24
+ column = {name: c.name, type: c.type}
25
+ column[:format] = c.format if c.format
26
+ column
81
27
  end
28
+ parser_guessed = {"type" => "jsonl"}
29
+ parser_guessed["columns"] = columns
30
+ return {"parser" => parser_guessed}
82
31
  end
83
32
  end
84
33
  end
@@ -1,67 +1,3 @@
1
- require 'json'
2
-
3
- module Embulk
4
- module Parser
5
-
6
- class JsonlParserPlugin < ParserPlugin
7
- Plugin.register_parser("jsonl", self)
8
-
9
- def self.transaction(config, &control)
10
- parser_task = config.load_config(Java::LineDecoder::DecoderTask)
11
- task = {
12
- "decoder_task" => DataSource.from_java(parser_task.dump),
13
- "schema" => config.param("schema", :array)
14
- }
15
- columns = task["schema"].each_with_index.map do |c, i|
16
- Column.new(i, c["name"], c["type"].to_sym)
17
- end
18
- yield(task, columns)
19
- end
20
-
21
- def init
22
- @decoder_task = task.param("decoder_task", :hash).load_task(Java::LineDecoder::DecoderTask)
23
- end
24
-
25
- def run(file_input)
26
- decoder = Java::LineDecoder.new(file_input.instance_eval { @java_file_input }, @decoder_task)
27
- schema = @task["schema"]
28
-
29
- while decoder.nextFile
30
- while line = decoder.poll
31
- begin
32
- hash = JSON.parse(line)
33
- @page_builder.add(make_record(schema, hash))
34
- rescue
35
- # TODO: logging
36
- end
37
- end
38
- end
39
- page_builder.finish
40
- end
41
-
42
- private
43
-
44
- def make_record(schema, e)
45
- schema.map do |c|
46
- val = e[c["name"]]
47
- v = val.nil? ? "" : val
48
- case c["type"]
49
- when "string"
50
- v
51
- when "long"
52
- v.to_i
53
- when "double"
54
- v.to_f
55
- when "boolean"
56
- ["yes", "true", "1"].include?(v.downcase)
57
- when "timestamp"
58
- v.empty? ? nil : Time.strptime(v, c["time_format"])
59
- else
60
- raise "Unsupported type #{c['type']}"
61
- end
62
- end
63
- end
64
- end
65
-
66
- end
67
- end
1
+ Embulk::JavaPlugin.register_parser(
2
+ "jsonl", "org.embulk.parser.jsonl.JsonlParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1 @@
1
+ rootProject.name = 'embulk-parser-jsonl'
@@ -0,0 +1,282 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import org.embulk.config.Config;
6
+ import org.embulk.config.ConfigDefault;
7
+ import org.embulk.config.ConfigException;
8
+ import org.embulk.config.ConfigSource;
9
+ import org.embulk.config.Task;
10
+ import org.embulk.config.TaskSource;
11
+ import org.embulk.spi.Column;
12
+ import org.embulk.spi.ColumnVisitor;
13
+ import org.embulk.spi.DataException;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.FileInput;
16
+ import org.embulk.spi.PageBuilder;
17
+ import org.embulk.spi.PageOutput;
18
+ import org.embulk.spi.ParserPlugin;
19
+ import org.embulk.spi.Schema;
20
+ import org.embulk.spi.SchemaConfig;
21
+ import org.embulk.spi.json.JsonParser;
22
+ import org.embulk.spi.time.TimestampParser;
23
+ import org.embulk.spi.util.LineDecoder;
24
+ import org.embulk.spi.util.Timestamps;
25
+ import org.msgpack.core.MessageTypeException;
26
+ import org.msgpack.value.BooleanValue;
27
+ import org.msgpack.value.FloatValue;
28
+ import org.msgpack.value.IntegerValue;
29
+ import org.msgpack.value.Value;
30
+ import org.slf4j.Logger;
31
+
32
+ import java.util.Map;
33
+
34
+ import static org.msgpack.value.ValueFactory.newString;
35
+
36
+ public class JsonlParserPlugin
37
+ implements ParserPlugin
38
+ {
39
+ public interface PluginTask
40
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
41
+ {
42
+ @Config("columns")
43
+ @ConfigDefault("null")
44
+ Optional<SchemaConfig> getSchemaConfig();
45
+
46
+ @Config("schema")
47
+ @ConfigDefault("null")
48
+ @Deprecated
49
+ Optional<SchemaConfig> getOldSchemaConfig();
50
+
51
+ @Config("stop_on_invalid_record")
52
+ @ConfigDefault("false")
53
+ boolean getStopOnInvalidRecord();
54
+ }
55
+
56
+ private final Logger log;
57
+
58
+ private String line = null;
59
+ private long lineNumber = 0;
60
+ private Map<String, Value> columnNameValues;
61
+
62
+ public JsonlParserPlugin()
63
+ {
64
+ this.log = Exec.getLogger(JsonlParserPlugin.class);
65
+ }
66
+
67
+ @Override
68
+ public void transaction(ConfigSource configSource, Control control)
69
+ {
70
+ PluginTask task = configSource.loadConfig(PluginTask.class);
71
+ control.run(task.dump(), getSchemaConfig(task).toSchema());
72
+ }
73
+
74
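+ // Keeps backward compatibility with the deprecated 'schema' option: 'columns' is preferred, 'schema' is used as a fallback, and a ConfigException is thrown if neither is set.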
75
+ private SchemaConfig getSchemaConfig(PluginTask task)
76
+ {
77
+ if (task.getOldSchemaConfig().isPresent()) {
78
+ log.warn("Please use 'columns' option instead of 'schema' because the 'schema' option is deprecated. The next version will stop 'schema' option support.");
79
+ }
80
+
81
+ if (task.getSchemaConfig().isPresent()) {
82
+ return task.getSchemaConfig().get();
83
+ }
84
+ else if (task.getOldSchemaConfig().isPresent()) {
85
+ return task.getOldSchemaConfig().get();
86
+ }
87
+ else {
88
+ throw new ConfigException("Attribute 'columns' is required but not set");
89
+ }
90
+ }
91
+
92
+ @Override
93
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output)
94
+ {
95
+ PluginTask task = taskSource.loadTask(PluginTask.class);
96
+
97
+ setColumnNameValues(schema);
98
+
99
+ final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, getSchemaConfig(task));
100
+ final LineDecoder decoder = newLineDecoder(input, task);
101
+ final JsonParser jsonParser = newJsonParser();
102
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
103
+
104
+ try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
105
+ while (decoder.nextFile()) { // TODO this implementation should be improved with new JsonParser API on Embulk v0.8.3
106
+ lineNumber = 0;
107
+
108
+ while ((line = decoder.poll()) != null) {
109
+ lineNumber++;
110
+
111
+ try {
112
+ Value value = jsonParser.parse(line);
113
+
114
+ if (!value.isMapValue()) {
115
+ throw new JsonRecordValidateException("Json string is not representing map value.");
116
+ }
117
+
118
+ final Map<Value, Value> record = value.asMapValue().map();
119
+
120
+ schema.visitColumns(new ColumnVisitor() {
121
+ @Override
122
+ public void booleanColumn(Column column)
123
+ {
124
+ Value v = record.get(getColumnNameValue(column));
125
+ if (isNil(v)) {
126
+ pageBuilder.setNull(column);
127
+ }
128
+ else {
129
+ try {
130
+ pageBuilder.setBoolean(column, ((BooleanValue) v).getBoolean());
131
+ }
132
+ catch (MessageTypeException e) {
133
+ throw new JsonRecordValidateException(e);
134
+ }
135
+ }
136
+ }
137
+
138
+ @Override
139
+ public void longColumn(Column column)
140
+ {
141
+ Value v = record.get(getColumnNameValue(column));
142
+ if (isNil(v)) {
143
+ pageBuilder.setNull(column);
144
+ }
145
+ else {
146
+ try {
147
+ pageBuilder.setLong(column, ((IntegerValue) v).asLong());
148
+ }
149
+ catch (MessageTypeException e) {
150
+ throw new JsonRecordValidateException(e);
151
+ }
152
+ }
153
+ }
154
+
155
+ @Override
156
+ public void doubleColumn(Column column)
157
+ {
158
+ Value v = record.get(getColumnNameValue(column));
159
+ if (isNil(v)) {
160
+ pageBuilder.setNull(column);
161
+ }
162
+ else {
163
+ try {
164
+ pageBuilder.setDouble(column, ((FloatValue) v).toDouble());
165
+ }
166
+ catch (MessageTypeException e) {
167
+ throw new JsonRecordValidateException(e);
168
+ }
169
+ }
170
+ }
171
+
172
+ @Override
173
+ public void stringColumn(Column column)
174
+ {
175
+ Value v = record.get(getColumnNameValue(column));
176
+ if (isNil(v)) {
177
+ pageBuilder.setNull(column);
178
+ }
179
+ else {
180
+ try {
181
+ pageBuilder.setString(column, v.toString());
182
+ }
183
+ catch (MessageTypeException e) {
184
+ throw new JsonRecordValidateException(e);
185
+ }
186
+ }
187
+ }
188
+
189
+ @Override
190
+ public void timestampColumn(Column column)
191
+ {
192
+ Value v = record.get(getColumnNameValue(column));
193
+ if (isNil(v)) {
194
+ pageBuilder.setNull(column);
195
+ }
196
+ else {
197
+ try {
198
+ pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v.toString()));
199
+ }
200
+ catch (MessageTypeException e) {
201
+ throw new JsonRecordValidateException(e);
202
+ }
203
+ }
204
+ }
205
+
206
+ @Override
207
+ public void jsonColumn(Column column)
208
+ {
209
+ Value v = record.get(getColumnNameValue(column));
210
+ if (isNil(v)) {
211
+ pageBuilder.setNull(column);
212
+ }
213
+ else {
214
+ try {
215
+ pageBuilder.setJson(column, v);
216
+ }
217
+ catch (MessageTypeException e) {
218
+ throw new JsonRecordValidateException(e);
219
+ }
220
+ }
221
+ }
222
+
223
+ private boolean isNil(Value v)
224
+ {
225
+ return v == null || v.isNilValue();
226
+ }
227
+ });
228
+
229
+ pageBuilder.addRecord();
230
+ }
231
+ catch (JsonRecordValidateException e) {
232
+ if (stopOnInvalidRecord) {
233
+ throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, line), e);
234
+ }
235
+ log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), line));
236
+ }
237
+ }
238
+ }
239
+
240
+ pageBuilder.finish();
241
+ }
242
+ }
243
+
244
+ private void setColumnNameValues(Schema schema)
245
+ {
246
+ ImmutableMap.Builder<String, Value> builder = ImmutableMap.builder();
247
+ for (Column column : schema.getColumns()) {
248
+ String name = column.getName();
249
+ builder.put(name, newString(name));
250
+ }
251
+ columnNameValues = builder.build();
252
+ }
253
+
254
+ private Value getColumnNameValue(Column column)
255
+ {
256
+ return columnNameValues.get(column.getName());
257
+ }
258
+
259
+ public LineDecoder newLineDecoder(FileInput input, PluginTask task)
260
+ {
261
+ return new LineDecoder(input, task);
262
+ }
263
+
264
+ public JsonParser newJsonParser()
265
+ {
266
+ return new JsonParser();
267
+ }
268
+
269
+ static class JsonRecordValidateException
270
+ extends DataException
271
+ {
272
+ JsonRecordValidateException(String message)
273
+ {
274
+ super(message);
275
+ }
276
+
277
+ JsonRecordValidateException(Throwable cause)
278
+ {
279
+ super(cause);
280
+ }
281
+ }
282
+ }
@@ -0,0 +1,228 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import com.google.common.collect.Lists;
5
+ import org.embulk.EmbulkTestRuntime;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.TaskSource;
8
+ import org.embulk.spi.ColumnConfig;
9
+ import org.embulk.spi.DataException;
10
+ import org.embulk.spi.FileInput;
11
+ import org.embulk.spi.ParserPlugin;
12
+ import org.embulk.spi.Schema;
13
+ import org.embulk.spi.SchemaConfig;
14
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
15
+ import org.embulk.spi.time.Timestamp;
16
+ import org.embulk.spi.type.Type;
17
+ import org.embulk.spi.util.InputStreamFileInput;
18
+ import org.embulk.spi.util.Pages;
19
+ import org.junit.Before;
20
+ import org.junit.Rule;
21
+ import org.junit.Test;
22
+
23
+ import java.io.ByteArrayInputStream;
24
+ import java.io.IOException;
25
+ import java.io.InputStream;
26
+ import java.util.List;
27
+
28
+ import static org.embulk.spi.type.Types.BOOLEAN;
29
+ import static org.embulk.spi.type.Types.DOUBLE;
30
+ import static org.embulk.spi.type.Types.JSON;
31
+ import static org.embulk.spi.type.Types.LONG;
32
+ import static org.embulk.spi.type.Types.STRING;
33
+ import static org.embulk.spi.type.Types.TIMESTAMP;
34
+ import static org.junit.Assert.assertEquals;
35
+ import static org.junit.Assert.assertNull;
36
+ import static org.junit.Assert.assertTrue;
37
+ import static org.junit.Assert.fail;
38
+ import static org.msgpack.value.ValueFactory.newArray;
39
+ import static org.msgpack.value.ValueFactory.newMap;
40
+ import static org.msgpack.value.ValueFactory.newString;
41
+
42
+ public class TestJsonlParserPlugin
43
+ {
44
+ @Rule
45
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
46
+
47
+ private ConfigSource config;
48
+ private JsonlParserPlugin plugin;
49
+ private MockPageOutput output;
50
+
51
+ @Before
52
+ public void createResource()
53
+ {
54
+ config = config().set("type", "jsonl");
55
+ plugin = new JsonlParserPlugin();
56
+ recreatePageOutput();
57
+ }
58
+
59
+ private void recreatePageOutput()
60
+ {
61
+ output = new MockPageOutput();
62
+ }
63
+
64
+ @Test
65
+ public void skipRecords()
66
+ throws Exception
67
+ {
68
+ SchemaConfig schema = schema(
69
+ column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
70
+ column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
71
+ ConfigSource config = this.config.deepCopy().set("columns", schema);
72
+
73
+ transaction(config, fileInput(
74
+ "[]",
75
+ "\"embulk\"",
76
+ "10",
77
+ "true",
78
+ "false",
79
+ "null"
80
+ ));
81
+
82
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
83
+ assertEquals(0, records.size());
84
+ }
85
+
86
+ @Test
87
+ public void throwDataException()
88
+ throws Exception
89
+ {
90
+ SchemaConfig schema = schema(
91
+ column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
92
+ column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
93
+ ConfigSource config = this.config.deepCopy().set("columns", schema).set("stop_on_invalid_record", true);
94
+
95
+ try {
96
+ transaction(config, fileInput(
97
+ "\"not_map_value\""
98
+ ));
99
+ fail();
100
+ }
101
+ catch (Throwable t) {
102
+ assertTrue(t instanceof DataException);
103
+ }
104
+ }
105
+
106
+ @Test
107
+ public void writeNils()
108
+ throws Exception
109
+ {
110
+ SchemaConfig schema = schema(
111
+ column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
112
+ column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
113
+ ConfigSource config = this.config.deepCopy().set("columns", schema);
114
+
115
+ transaction(config, fileInput(
116
+ "{}",
117
+ "{\"_c0\":null,\"_c1\":null,\"_c2\":null}",
118
+ "{\"_c3\":null,\"_c4\":null,\"_c5\":null}",
119
+ "{}"
120
+ ));
121
+
122
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
123
+ assertEquals(4, records.size());
124
+
125
+ for (Object[] record : records) {
126
+ for (int i = 0; i < 6; i++) {
127
+ assertNull(record[i]);
128
+ }
129
+ }
130
+ }
131
+
132
+ @Test
133
+ public void useNormal()
134
+ throws Exception
135
+ {
136
+ SchemaConfig schema = schema(
137
+ column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
138
+ column("_c3", STRING), column("_c4", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S %Z")), column("_c5", JSON));
139
+ List<ConfigSource> configs = Lists.newArrayList(
140
+ this.config.deepCopy().set("columns", schema),
141
+ this.config.deepCopy().set("schema", schema)
142
+ );
143
+
144
+ for (ConfigSource config : configs) {
145
+ transaction(config, fileInput(
146
+ "{\"_c0\":true,\"_c1\":10,\"_c2\":0.1,\"_c3\":\"embulk\",\"_c4\":\"2016-01-01 00:00:00 UTC\",\"_c5\":{\"k\":\"v\"}}",
147
+ "[1, 2, 3]",
148
+ "{\"_c0\":false,\"_c1\":-10,\"_c2\":1.0,\"_c3\":\"エンバルク\",\"_c4\":\"2016-01-01 00:00:00 +0000\",\"_c5\":[\"e0\",\"e1\"]}"
149
+ ));
150
+
151
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
152
+ assertEquals(2, records.size());
153
+
154
+ Object[] record;
155
+ {
156
+ record = records.get(0);
157
+ assertEquals(true, record[0]);
158
+ assertEquals(10L, record[1]);
159
+ assertEquals(0.1, (Double) record[2], 0.0001);
160
+ assertEquals("embulk", record[3]);
161
+ assertEquals(Timestamp.ofEpochSecond(1451606400L), record[4]);
162
+ assertEquals(newMap(newString("k"), newString("v")), record[5]);
163
+ }
164
+ {
165
+ record = records.get(1);
166
+ assertEquals(false, record[0]);
167
+ assertEquals(-10L, record[1]);
168
+ assertEquals(1.0, (Double) record[2], 0.0001);
169
+ assertEquals("エンバルク", record[3]);
170
+ assertEquals(Timestamp.ofEpochSecond(1451606400L), record[4]);
171
+ assertEquals(newArray(newString("e0"), newString("e1")), record[5]);
172
+ }
173
+
174
+ recreatePageOutput();
175
+ }
176
+ }
177
+
178
+ private ConfigSource config()
179
+ {
180
+ return runtime.getExec().newConfigSource();
181
+ }
182
+
183
+ private void transaction(ConfigSource config, final FileInput input)
184
+ {
185
+ plugin.transaction(config, new ParserPlugin.Control()
186
+ {
187
+ @Override
188
+ public void run(TaskSource taskSource, Schema schema)
189
+ {
190
+ plugin.run(taskSource, schema, input, output);
191
+ }
192
+ });
193
+ }
194
+
195
+ private FileInput fileInput(String... lines)
196
+ throws Exception
197
+ {
198
+ StringBuilder sb = new StringBuilder();
199
+ for (String line : lines) {
200
+ sb.append(line).append("\n");
201
+ }
202
+
203
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes());
204
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
205
+ }
206
+
207
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
208
+ throws IOException
209
+ {
210
+ return new InputStreamFileInput.IteratorProvider(
211
+ ImmutableList.copyOf(inputStreams));
212
+ }
213
+
214
+ private SchemaConfig schema(ColumnConfig... columns)
215
+ {
216
+ return new SchemaConfig(Lists.newArrayList(columns));
217
+ }
218
+
219
+ private ColumnConfig column(String name, Type type)
220
+ {
221
+ return column(name, type, config());
222
+ }
223
+
224
+ private ColumnConfig column(String name, Type type, ConfigSource option)
225
+ {
226
+ return new ColumnConfig(name, type, option);
227
+ }
228
+ }
metadata CHANGED
@@ -1,41 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-jsonl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shunsuke Mikami
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-04 00:00:00.000000000 Z
11
+ date: 2016-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
15
14
  requirement: !ruby/object:Gem::Requirement
16
15
  requirements:
17
- - - "~>"
16
+ - - ~>
18
17
  - !ruby/object:Gem::Version
19
18
  version: '1.0'
20
- type: :development
19
+ name: bundler
21
20
  prerelease: false
21
+ type: :development
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
- - - "~>"
30
+ - - ~>
32
31
  - !ruby/object:Gem::Version
33
32
  version: '10.0'
34
- type: :development
33
+ name: rake
35
34
  prerelease: false
35
+ type: :development
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  description: Parses Jsonl files read by other file input plugins.
@@ -45,36 +45,46 @@ executables: []
45
45
  extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
- - ".gitignore"
48
+ - .gitignore
49
+ - CHANGELOG.md
49
50
  - Gemfile
50
51
  - LICENSE.txt
51
52
  - README.md
52
53
  - Rakefile
54
+ - build.gradle
53
55
  - embulk-parser-jsonl.gemspec
56
+ - gradle/wrapper/gradle-wrapper.jar
57
+ - gradle/wrapper/gradle-wrapper.properties
58
+ - gradlew
59
+ - gradlew.bat
54
60
  - lib/embulk/guess/jsonl.rb
55
61
  - lib/embulk/parser/jsonl.rb
62
+ - settings.gradle
63
+ - src/main/java/org/embulk/parser/jsonl/JsonlParserPlugin.java
64
+ - src/test/java/org/embulk/parser/jsonl/TestJsonlParserPlugin.java
65
+ - classpath/embulk-parser-jsonl-0.1.0.jar
56
66
  homepage: https://github.com/shun0102/embulk-parser-jsonl
57
67
  licenses:
58
68
  - MIT
59
69
  metadata: {}
60
- post_install_message:
70
+ post_install_message:
61
71
  rdoc_options: []
62
72
  require_paths:
63
73
  - lib
64
74
  required_ruby_version: !ruby/object:Gem::Requirement
65
75
  requirements:
66
- - - ">="
76
+ - - '>='
67
77
  - !ruby/object:Gem::Version
68
78
  version: '0'
69
79
  required_rubygems_version: !ruby/object:Gem::Requirement
70
80
  requirements:
71
- - - ">="
81
+ - - '>='
72
82
  - !ruby/object:Gem::Version
73
83
  version: '0'
74
84
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.2.2
77
- signing_key:
85
+ rubyforge_project:
86
+ rubygems_version: 2.1.9
87
+ signing_key:
78
88
  specification_version: 4
79
89
  summary: Jsonl parser plugin for Embulk
80
90
  test_files: []