embulk-parser-jsonl 0.0.1 → 0.1.0

This diff shows the changes between publicly released versions of this package as they appear in its public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f98fb168c55e6a4c80f5d098bc72945091fdcf8d
-  data.tar.gz: 5e85f4e3d5aece158e1b55e381d40ee6eda1d69d
+  metadata.gz: 973fe894b7704f01da0d8d0ba2b93eaa3804ea7d
+  data.tar.gz: fe6ca73d3100595bd64e95e0e08269490671314e
 SHA512:
-  metadata.gz: 6452226f7de14018279af312af6f4105dfd98668506168f3acb4e3139519b6a93353e1bac454c0bc272bca96375c4a6fedb201b59f9b639ca0ec7b7774bd69bb
-  data.tar.gz: 1fa70b407a8ad5d8fa37a6467e621aa7e8fd937cedb908d85f3189bcb2afd6506b476c62433bc4ef47ad241b31fc47b46dfc50c77388d504294348540014c09d
+  metadata.gz: 74ad34c4f29980e75f36ac88b9160fd784a975a83b906d6d5c30738edab617211c0653f22d0a9c01ad533bbcc92394b3ceccad20dcc8805b1627298695b5d6ac
+  data.tar.gz: dd611c8bff080d4ca1c61a8c071a5baeb2d982b89b2812457af67f9ecaacd5c3e17b42c230356087c26ed53ecb51a7a0728a1fde14c67a441974483a53fcd58b
data/.gitignore CHANGED
@@ -2,4 +2,7 @@
 /pkg/
 /tmp/
 /.bundle/
+build/
+/classpath/
+/.gradle
 /Gemfile.lock
data/CHANGELOG.md ADDED
@@ -0,0 +1,7 @@
+## 0.1.0 - 2016-02-22
+
+Upgrade Embulk v0.8 and support Json type in Java [#3](https://github.com/shun0102/embulk-parser-jsonl/pull/3)
+
+## 0.0.1 - 2015-04-04
+
+The first release!!
data/README.md CHANGED
@@ -10,7 +10,7 @@ TODO: Write short description here and embulk-parser-jsonl.gemspec file.
 ## Configuration
 
 - **type**: specify this parser as jsonl
-- **schema**: specify column name and type (array, required)
+- **columns**: specify column name and type (array, required)
 
 ## Example
 
@@ -19,7 +19,7 @@ in:
   type: any file input plugin type
   parser:
     type: jsonl
-    schema:
+    columns:
     - {name: first_name, type: string}
     - {name: last_name, type: string}
     - {name: age, type: long}
@@ -35,5 +35,5 @@ $ embulk guess -g jsonl config.yml -o guessed.yml
 ## Build
 
 ```
-$ rake
+$ ./gradlew gem classpath
 ```
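For reference, the example configuration above expects newline-delimited JSON, one object per line, with keys matching the configured column names. A minimal sketch of such an input file follows; the values are made up for illustration and are not part of the package:

```
{"first_name": "Taro", "last_name": "Yamada", "age": 30}
{"first_name": "Hanako", "last_name": "Suzuki", "age": 25}
```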
data/build.gradle ADDED
@@ -0,0 +1,79 @@
+plugins {
+    id "com.jfrog.bintray" version "1.1"
+    id "com.github.jruby-gradle.base" version "0.1.5"
+    id "java"
+    id "jacoco"
+}
+import com.github.jrubygradle.JRubyExec
+repositories {
+    mavenCentral()
+    jcenter()
+}
+configurations {
+    provided
+}
+
+version = "0.1.0"
+
+compileJava.options.encoding = 'UTF-8' // source encoding
+sourceCompatibility = 1.7
+targetCompatibility = 1.7
+
+dependencies {
+    compile  "org.embulk:embulk-core:0.8.2"
+    provided "org.embulk:embulk-core:0.8.2"
+
+    testCompile "junit:junit:4.+"
+    testCompile "org.embulk:embulk-core:0.8.2:tests"
+    testCompile "org.embulk:embulk-standards:0.8.2"
+}
+
+task classpath(type: Copy, dependsOn: ["jar"]) {
+    doFirst { file("classpath").deleteDir() }
+    from (configurations.runtime - configurations.provided + files(jar.archivePath))
+    into "classpath"
+}
+clean { delete "classpath" }
+
+task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
+    jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
+    script "${project.name}.gemspec"
+    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
+}
+
+task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
+    jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
+    script "pkg/${project.name}-${project.version}.gem"
+}
+
+task "package"(dependsOn: ["gemspec", "classpath"]) << {
+    println "> Build succeeded."
+    println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
+}
+
+task gemspec {
+    ext.gemspecFile = file("${project.name}.gemspec")
+    inputs.file "build.gradle"
+    outputs.file gemspecFile
+    doLast { gemspecFile.write($/
+        Gem::Specification.new do |spec|
+          spec.name          = "${project.name}"
+          spec.version       = "${project.version}"
+          spec.authors       = ["Shunsuke Mikami"]
+          spec.summary       = "Jsonl parser plugin for Embulk"
+          spec.description   = "Parses Jsonl files read by other file input plugins."
+          spec.email         = ["shun0102@gmail.com"]
+          spec.licenses      = ["MIT"]
+          spec.homepage      = "https://github.com/shun0102/embulk-parser-jsonl"
+
+          spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
+          spec.test_files    = spec.files.grep(%r{^(test|spec)/})
+          spec.require_paths = ["lib"]
+
+          spec.add_development_dependency 'bundler', ['~> 1.0']
+          spec.add_development_dependency 'rake', ['~> 10.0']
+        end
+    /$)
+    }
+}
+clean { delete "${project.name}.gemspec" }
data/embulk-parser-jsonl.gemspec CHANGED
@@ -1,7 +1,7 @@
 
 Gem::Specification.new do |spec|
   spec.name          = "embulk-parser-jsonl"
-  spec.version       = "0.0.1"
+  spec.version       = "0.1.0"
   spec.authors       = ["Shunsuke Mikami"]
   spec.summary       = "Jsonl parser plugin for Embulk"
   spec.description   = "Parses Jsonl files read by other file input plugins."
data/gradle/wrapper/gradle-wrapper.properties ADDED
@@ -0,0 +1,6 @@
+#Tue Aug 11 00:26:20 PDT 2015
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# For Cygwin, ensure paths are in UNIX format before anything is touched.
+if $cygwin ; then
+    [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
+fi
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >&-
+APP_HOME="`pwd -P`"
+cd "$SAVED" >&-
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
data/lib/embulk/guess/jsonl.rb CHANGED
@@ -1,84 +1,33 @@
 require 'json'
+require "embulk/parser/jsonl.rb"
 
 module Embulk
   module Guess
+    # $ embulk guess -g "jsonl" partial-config.yml
 
-    # TODO implement guess plugin to make this command work:
-    # $ embulk guess -g "jsonl" partial-config.yml
-    #
-    # Depending on the file format the plugin uses, you can use choose
-    # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
-    # or line guess (LineGuessPlugin).
-
-    require "embulk/parser/jsonl.rb"
-
-    #class JsonlParserGuessPlugin < GuessPlugin
-    #  Plugin.register_guess("jsonl", self)
-    #
-    #  def guess(config, sample_buffer)
-    #    if sample_buffer[0,2] == GZIP_HEADER
-    #      guessed = {}
-    #      guessed["type"] = "jsonl"
-    #      guessed["property1"] = "guessed-value"
-    #      return {"parser" => guessed}
-    #    else
-    #      return {}
-    #    end
-    #  end
-    #end
-
-    #class JsonlParserGuessPlugin < TextGuessPlugin
-    #  Plugin.register_guess("jsonl", self)
-    #
-    #  def guess_text(config, sample_text)
-    #    js = JSON.parse(sample_text) rescue nil
-    #    if js && js["mykeyword"] == "keyword"
-    #      guessed = {}
-    #      guessed["type"] = "jsonl"
-    #      guessed["property1"] = "guessed-value"
-    #      return {"parser" => guessed}
-    #    else
-    #      return {}
-    #    end
-    #  end
-    #end
-
-    class JsonlParserGuessPlugin < LineGuessPlugin
+    class Jsonl < LineGuessPlugin # TODO should use GuessPlugin instead of LineGuessPlugin
       Plugin.register_guess("jsonl", self)
 
       def guess_lines(config, sample_lines)
+        #return {} unless config.fetch("parser", {}).fetch("type", "jsonl") == "jsonl"
+
+        rows = []
+
         columns = {}
         sample_lines.each do |line|
-          hash = JSON.parse(line)
-          hash.each do |k, v|
-            columns[k] = get_embulk_type(v)
-          end
+          rows << JSON.parse(line)
         end
-        schema = []
-        columns.each do |k,v|
-          schema << {'name' => k, 'type' => v}
-        end
-        guessed = {}
-        guessed["type"] = "jsonl"
-        guessed["schema"] = schema
-        return {"parser" => guessed}
-      end
 
-      private
+        return {} if rows.size <= 3
 
-      def get_embulk_type(val)
-        case val
-        when TrueClass
-          return "boolean"
-        when FalseClass
-          return "boolean"
-        when Integer
-          return "long"
-        when Float
-          return "double"
-        else
-          return "string"
+        columns = Embulk::Guess::SchemaGuess.from_hash_records(rows).map do |c|
+          column = {name: c.name, type: c.type}
+          column[:format] = c.format if c.format
+          column
         end
+        parser_guessed = {"type" => "jsonl"}
+        parser_guessed["columns"] = columns
+        return {"parser" => parser_guessed}
       end
     end
   end
data/lib/embulk/parser/jsonl.rb CHANGED
@@ -1,67 +1,3 @@
-require 'json'
-
-module Embulk
-  module Parser
-
-    class JsonlParserPlugin < ParserPlugin
-      Plugin.register_parser("jsonl", self)
-
-      def self.transaction(config, &control)
-        parser_task = config.load_config(Java::LineDecoder::DecoderTask)
-        task = {
-          "decoder_task" => DataSource.from_java(parser_task.dump),
-          "schema" => config.param("schema", :array)
-        }
-        columns = task["schema"].each_with_index.map do |c, i|
-          Column.new(i, c["name"], c["type"].to_sym)
-        end
-        yield(task, columns)
-      end
-
-      def init
-        @decoder_task = task.param("decoder_task", :hash).load_task(Java::LineDecoder::DecoderTask)
-      end
-
-      def run(file_input)
-        decoder = Java::LineDecoder.new(file_input.instance_eval { @java_file_input }, @decoder_task)
-        schema = @task["schema"]
-
-        while decoder.nextFile
-          while line = decoder.poll
-            begin
-              hash = JSON.parse(line)
-              @page_builder.add(make_record(schema, hash))
-            rescue
-              # TODO: logging
-            end
-          end
-        end
-        page_builder.finish
-      end
-
-      private
-
-      def make_record(schema, e)
-        schema.map do |c|
-          val = e[c["name"]]
-          v = val.nil? ? "" : val
-          case c["type"]
-          when "string"
-            v
-          when "long"
-            v.to_i
-          when "double"
-            v.to_f
-          when "boolean"
-            ["yes", "true", "1"].include?(v.downcase)
-          when "timestamp"
-            v.empty? ? nil : Time.strptime(v, c["time_format"])
-          else
-            raise "Unsupported type #{c['type']}"
-          end
-        end
-      end
-    end
-
-  end
-end
+Embulk::JavaPlugin.register_parser(
+  "jsonl", "org.embulk.parser.jsonl.JsonlParserPlugin",
+  File.expand_path('../../../../classpath', __FILE__))
data/settings.gradle ADDED
@@ -0,0 +1 @@
+rootProject.name = 'embulk-parser-jsonl'
data/src/main/java/org/embulk/parser/jsonl/JsonlParserPlugin.java ADDED
@@ -0,0 +1,282 @@
+package org.embulk.parser.jsonl;
+
+import com.google.common.base.Optional;
+import com.google.common.collect.ImmutableMap;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigException;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.Column;
+import org.embulk.spi.ColumnVisitor;
+import org.embulk.spi.DataException;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileInput;
+import org.embulk.spi.PageBuilder;
+import org.embulk.spi.PageOutput;
+import org.embulk.spi.ParserPlugin;
+import org.embulk.spi.Schema;
+import org.embulk.spi.SchemaConfig;
+import org.embulk.spi.json.JsonParser;
+import org.embulk.spi.time.TimestampParser;
+import org.embulk.spi.util.LineDecoder;
+import org.embulk.spi.util.Timestamps;
+import org.msgpack.core.MessageTypeException;
+import org.msgpack.value.BooleanValue;
+import org.msgpack.value.FloatValue;
+import org.msgpack.value.IntegerValue;
+import org.msgpack.value.Value;
+import org.slf4j.Logger;
+
+import java.util.Map;
+
+import static org.msgpack.value.ValueFactory.newString;
+
+public class JsonlParserPlugin
+        implements ParserPlugin
+{
+    public interface PluginTask
+            extends Task, LineDecoder.DecoderTask, TimestampParser.Task
+    {
+        @Config("columns")
+        @ConfigDefault("null")
+        Optional<SchemaConfig> getSchemaConfig();
+
+        @Config("schema")
+        @ConfigDefault("null")
+        @Deprecated
+        Optional<SchemaConfig> getOldSchemaConfig();
+
+        @Config("stop_on_invalid_record")
+        @ConfigDefault("false")
+        boolean getStopOnInvalidRecord();
+    }
+
+    private final Logger log;
+
+    private String line = null;
+    private long lineNumber = 0;
+    private Map<String, Value> columnNameValues;
+
+    public JsonlParserPlugin()
+    {
+        this.log = Exec.getLogger(JsonlParserPlugin.class);
+    }
+
+    @Override
+    public void transaction(ConfigSource configSource, Control control)
+    {
+        PluginTask task = configSource.loadConfig(PluginTask.class);
+        control.run(task.dump(), getSchemaConfig(task).toSchema());
+    }
+
+    // this method is to keep the backward compatibility of 'schema' option.
+    private SchemaConfig getSchemaConfig(PluginTask task)
+    {
+        if (task.getOldSchemaConfig().isPresent()) {
+            log.warn("Please use 'columns' option instead of 'schema' because the 'schema' option is deprecated. The next version will stop 'schema' option support.");
+        }
+
+        if (task.getSchemaConfig().isPresent()) {
+            return task.getSchemaConfig().get();
+        }
+        else if (task.getOldSchemaConfig().isPresent()) {
+            return task.getOldSchemaConfig().get();
+        }
+        else {
+            throw new ConfigException("Attribute 'columns' is required but not set");
+        }
+    }
+
+    @Override
+    public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output)
+    {
+        PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        setColumnNameValues(schema);
+
+        final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, getSchemaConfig(task));
+        final LineDecoder decoder = newLineDecoder(input, task);
+        final JsonParser jsonParser = newJsonParser();
+        final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
+
+        try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
+            while (decoder.nextFile()) { // TODO this implementation should be improved with new JsonParser API on Embulk v0.8.3
+                lineNumber = 0;
+
+                while ((line = decoder.poll()) != null) {
+                    lineNumber++;
+
+                    try {
+                        Value value = jsonParser.parse(line);
+
+                        if (!value.isMapValue()) {
+                            throw new JsonRecordValidateException("Json string is not representing map value.");
+                        }
+
+                        final Map<Value, Value> record = value.asMapValue().map();
+
+                        schema.visitColumns(new ColumnVisitor() {
+                            @Override
+                            public void booleanColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setBoolean(column, ((BooleanValue) v).getBoolean());
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            @Override
+                            public void longColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setLong(column, ((IntegerValue) v).asLong());
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            @Override
+                            public void doubleColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setDouble(column, ((FloatValue) v).toDouble());
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            @Override
+                            public void stringColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setString(column, v.toString());
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            @Override
+                            public void timestampColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v.toString()));
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            @Override
+                            public void jsonColumn(Column column)
+                            {
+                                Value v = record.get(getColumnNameValue(column));
+                                if (isNil(v)) {
+                                    pageBuilder.setNull(column);
+                                }
+                                else {
+                                    try {
+                                        pageBuilder.setJson(column, v);
+                                    }
+                                    catch (MessageTypeException e) {
+                                        throw new JsonRecordValidateException(e);
+                                    }
+                                }
+                            }
+
+                            private boolean isNil(Value v)
+                            {
+                                return v == null || v.isNilValue();
+                            }
+                        });
+
+                        pageBuilder.addRecord();
+                    }
+                    catch (JsonRecordValidateException e) {
+                        if (stopOnInvalidRecord) {
+                            throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, line), e);
+                        }
+                        log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), line));
+                    }
+                }
+            }
+
+            pageBuilder.finish();
+        }
+    }
+
+    private void setColumnNameValues(Schema schema)
+    {
+        ImmutableMap.Builder<String, Value> builder = ImmutableMap.builder();
+        for (Column column : schema.getColumns()) {
+            String name = column.getName();
+            builder.put(name, newString(name));
+        }
+        columnNameValues = builder.build();
+    }
+
+    private Value getColumnNameValue(Column column)
+    {
+        return columnNameValues.get(column.getName());
+    }
+
+    public LineDecoder newLineDecoder(FileInput input, PluginTask task)
+    {
+        return new LineDecoder(input, task);
+    }
+
+    public JsonParser newJsonParser()
+    {
+        return new JsonParser();
+    }
+
+    static class JsonRecordValidateException
+            extends DataException
+    {
+        JsonRecordValidateException(String message)
+        {
+            super(message);
+        }
+
+        JsonRecordValidateException(Throwable cause)
+        {
+            super(cause);
+        }
+    }
+}
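The core of the new Java parser is the step inside `run()`: each line from the `LineDecoder` is parsed with Embulk's `JsonParser` into a msgpack `Value`, which must be a map whose keys are the column names. Below is a minimal standalone sketch of just that step, using only the `JsonParser` and msgpack `Value` calls that appear in the plugin above; the sample line, class name, and printed output are illustrative, not part of the package:

```java
import org.embulk.spi.json.JsonParser;
import org.msgpack.value.Value;

import java.util.Map;

import static org.msgpack.value.ValueFactory.newString;

// Hypothetical demo class, not shipped with the plugin.
public class JsonlLineSketch
{
    public static void main(String[] args)
    {
        // One JSONL record, as the LineDecoder would hand it to the parser (made-up data).
        String line = "{\"first_name\":\"Taro\",\"age\":30}";

        // Same parsing step as JsonlParserPlugin.run(): text -> msgpack Value.
        Value value = new JsonParser().parse(line);
        if (!value.isMapValue()) {
            // The plugin raises JsonRecordValidateException here and either skips the
            // line or rethrows as DataException when stop_on_invalid_record is true.
            throw new IllegalArgumentException("Json string is not representing map value.");
        }

        // Column lookup mirrors getColumnNameValue(): map keys are msgpack string values.
        Map<Value, Value> record = value.asMapValue().map();
        Value age = record.get(newString("age"));
        System.out.println(age == null || age.isNilValue() ? "null column" : age.toString());
    }
}
```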
data/src/test/java/org/embulk/parser/jsonl/TestJsonlParserPlugin.java ADDED
@@ -0,0 +1,228 @@
+package org.embulk.parser.jsonl;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.embulk.EmbulkTestRuntime;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.ColumnConfig;
+import org.embulk.spi.DataException;
+import org.embulk.spi.FileInput;
+import org.embulk.spi.ParserPlugin;
+import org.embulk.spi.Schema;
+import org.embulk.spi.SchemaConfig;
+import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
+import org.embulk.spi.time.Timestamp;
+import org.embulk.spi.type.Type;
+import org.embulk.spi.util.InputStreamFileInput;
+import org.embulk.spi.util.Pages;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import static org.embulk.spi.type.Types.BOOLEAN;
+import static org.embulk.spi.type.Types.DOUBLE;
+import static org.embulk.spi.type.Types.JSON;
+import static org.embulk.spi.type.Types.LONG;
+import static org.embulk.spi.type.Types.STRING;
+import static org.embulk.spi.type.Types.TIMESTAMP;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.msgpack.value.ValueFactory.newArray;
+import static org.msgpack.value.ValueFactory.newMap;
+import static org.msgpack.value.ValueFactory.newString;
+
+public class TestJsonlParserPlugin
+{
+    @Rule
+    public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
+
+    private ConfigSource config;
+    private JsonlParserPlugin plugin;
+    private MockPageOutput output;
+
+    @Before
+    public void createResource()
+    {
+        config = config().set("type", "jsonl");
+        plugin = new JsonlParserPlugin();
+        recreatePageOutput();
+    }
+
+    private void recreatePageOutput()
+    {
+        output = new MockPageOutput();
+    }
+
+    @Test
+    public void skipRecords()
+            throws Exception
+    {
+        SchemaConfig schema = schema(
+                column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
+                column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
+        ConfigSource config = this.config.deepCopy().set("columns", schema);
+
+        transaction(config, fileInput(
+                "[]",
+                "\"embulk\"",
+                "10",
+                "true",
+                "false",
+                "null"
+        ));
+
+        List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
+        assertEquals(0, records.size());
+    }
+
+    @Test
+    public void throwDataException()
+            throws Exception
+    {
+        SchemaConfig schema = schema(
+                column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
+                column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
+        ConfigSource config = this.config.deepCopy().set("columns", schema).set("stop_on_invalid_record", true);
+
+        try {
+            transaction(config, fileInput(
+                    "\"not_map_value\""
+            ));
+            fail();
+        }
+        catch (Throwable t) {
+            assertTrue(t instanceof DataException);
+        }
+    }
+
+    @Test
+    public void writeNils()
+            throws Exception
+    {
+        SchemaConfig schema = schema(
+                column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
+                column("_c3", STRING), column("_c4", TIMESTAMP), column("_c5", JSON));
+        ConfigSource config = this.config.deepCopy().set("columns", schema);
+
+        transaction(config, fileInput(
+                "{}",
+                "{\"_c0\":null,\"_c1\":null,\"_c2\":null}",
+                "{\"_c3\":null,\"_c4\":null,\"_c5\":null}",
+                "{}"
+        ));
+
+        List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
+        assertEquals(4, records.size());
+
+        for (Object[] record : records) {
+            for (int i = 0; i < 6; i++) {
+                assertNull(record[i]);
+            }
+        }
+    }
+
+    @Test
+    public void useNormal()
+            throws Exception
+    {
+        SchemaConfig schema = schema(
+                column("_c0", BOOLEAN), column("_c1", LONG), column("_c2", DOUBLE),
+                column("_c3", STRING), column("_c4", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S %Z")), column("_c5", JSON));
+        List<ConfigSource> configs = Lists.newArrayList(
+                this.config.deepCopy().set("columns", schema),
+                this.config.deepCopy().set("schema", schema)
+        );
+
+        for (ConfigSource config : configs) {
+            transaction(config, fileInput(
+                    "{\"_c0\":true,\"_c1\":10,\"_c2\":0.1,\"_c3\":\"embulk\",\"_c4\":\"2016-01-01 00:00:00 UTC\",\"_c5\":{\"k\":\"v\"}}",
+                    "[1, 2, 3]",
+                    "{\"_c0\":false,\"_c1\":-10,\"_c2\":1.0,\"_c3\":\"エンバルク\",\"_c4\":\"2016-01-01 00:00:00 +0000\",\"_c5\":[\"e0\",\"e1\"]}"
+            ));
+
+            List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
+            assertEquals(2, records.size());
+
+            Object[] record;
+            {
+                record = records.get(0);
+                assertEquals(true, record[0]);
+                assertEquals(10L, record[1]);
+                assertEquals(0.1, (Double) record[2], 0.0001);
+                assertEquals("embulk", record[3]);
+                assertEquals(Timestamp.ofEpochSecond(1451606400L), record[4]);
+                assertEquals(newMap(newString("k"), newString("v")), record[5]);
+            }
+            {
+                record = records.get(1);
+                assertEquals(false, record[0]);
+                assertEquals(-10L, record[1]);
+                assertEquals(1.0, (Double) record[2], 0.0001);
+                assertEquals("エンバルク", record[3]);
+                assertEquals(Timestamp.ofEpochSecond(1451606400L), record[4]);
+                assertEquals(newArray(newString("e0"), newString("e1")), record[5]);
+            }
+
+            recreatePageOutput();
+        }
+    }
+
+    private ConfigSource config()
+    {
+        return runtime.getExec().newConfigSource();
+    }
+
+    private void transaction(ConfigSource config, final FileInput input)
+    {
+        plugin.transaction(config, new ParserPlugin.Control()
+        {
+            @Override
+            public void run(TaskSource taskSource, Schema schema)
+            {
+                plugin.run(taskSource, schema, input, output);
+            }
+        });
+    }
+
+    private FileInput fileInput(String... lines)
+            throws Exception
+    {
+        StringBuilder sb = new StringBuilder();
+        for (String line : lines) {
+            sb.append(line).append("\n");
+        }
+
+        ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes());
+        return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
+    }
+
+    private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
+            throws IOException
+    {
+        return new InputStreamFileInput.IteratorProvider(
+                ImmutableList.copyOf(inputStreams));
+    }
+
+    private SchemaConfig schema(ColumnConfig... columns)
+    {
+        return new SchemaConfig(Lists.newArrayList(columns));
+    }
+
+    private ColumnConfig column(String name, Type type)
+    {
+        return column(name, type, config());
+    }
+
+    private ColumnConfig column(String name, Type type, ConfigSource option)
+    {
+        return new ColumnConfig(name, type, option);
+    }
+}
metadata CHANGED
@@ -1,41 +1,41 @@
 --- !ruby/object:Gem::Specification
 name: embulk-parser-jsonl
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - Shunsuke Mikami
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-04 00:00:00.000000000 Z
+date: 2016-02-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
      - !ruby/object:Gem::Version
        version: '1.0'
-  type: :development
+  name: bundler
   prerelease: false
+  type: :development
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - - "~>"
+    - - ~>
      - !ruby/object:Gem::Version
        version: '1.0'
 - !ruby/object:Gem::Dependency
-  name: rake
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - - "~>"
+    - - ~>
      - !ruby/object:Gem::Version
        version: '10.0'
-  type: :development
+  name: rake
  prerelease: false
+  type: :development
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - - "~>"
+    - - ~>
      - !ruby/object:Gem::Version
        version: '10.0'
 description: Parses Jsonl files read by other file input plugins.
@@ -45,36 +45,46 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- ".gitignore"
+- .gitignore
+- CHANGELOG.md
 - Gemfile
 - LICENSE.txt
 - README.md
 - Rakefile
+- build.gradle
 - embulk-parser-jsonl.gemspec
+- gradle/wrapper/gradle-wrapper.jar
+- gradle/wrapper/gradle-wrapper.properties
+- gradlew
+- gradlew.bat
 - lib/embulk/guess/jsonl.rb
 - lib/embulk/parser/jsonl.rb
+- settings.gradle
+- src/main/java/org/embulk/parser/jsonl/JsonlParserPlugin.java
+- src/test/java/org/embulk/parser/jsonl/TestJsonlParserPlugin.java
+- classpath/embulk-parser-jsonl-0.1.0.jar
 homepage: https://github.com/shun0102/embulk-parser-jsonl
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
   - !ruby/object:Gem::Version
     version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.2.2
-signing_key:
+rubyforge_project:
+rubygems_version: 2.1.9
+signing_key:
 specification_version: 4
 summary: Jsonl parser plugin for Embulk
 test_files: []