embulk-parser-xpath2 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e137d05fc96ea14274af6132b979b748f179f13f
4
+ data.tar.gz: b0c73e40ec2c784d6d3c74e4ad034e87c145163c
5
+ SHA512:
6
+ metadata.gz: 646bc1169d84714eee9908bf20963a625d387b1145f9fcb57c32a1257a92b977373a848b2e3657d206714458a8d17c0bb74489e6b07d103329f4a3b6eefb1810
7
+ data.tar.gz: 4e41dca4a70dd30489e689d678656911292dcf40d90cdf2a1332149ba964df11c9cedd06e3b49eb28428b5f31021f5284e28b6b64ee3e97894ee970c73d69891
data/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
9
+ /.settings/
10
+ /.metadata/
11
+ .classpath
12
+ .project
13
+ /bin/
14
+ project
15
+ *.iml
16
+ out
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 maji-KY
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # Xml parser plugin for Embulk
2
+
3
+ Embulk parser plugin for parsing xml data by XPath perfectly!
4
+
5
+ ## Features
6
+
7
+ - namespace awareness
8
+ - nullable columns
9
+
10
+ ## Overview
11
+
12
+ * **Plugin type**: parser
13
+ * **Guess supported**: no
14
+
15
+ ## Configuration
16
+
17
+ - **type**: specify this plugin as `"xpath2"` (string, required)
18
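+ - **root**: XPath expression selecting the elements to read; each matched element becomes one record (string, required)
19
+ - **schema**: list of columns, each with a `path` (XPath evaluated relative to a matched root element), a `name` and a `type`; timestamp columns also accept `format`, `timezone` and `date` (array, required)
20
+ - **namespaces**: mapping of namespace prefix to namespace URI for the prefixes used in the XPath expressions (hash, required)
21
+ - **stop_on_invalid_record**: stop the task when an input file cannot be parsed as XML instead of logging a warning and skipping it (boolean, default: `false`)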
22
+ ## Example
23
+
24
+ ```yaml
25
+ parser:
26
+ type: xpath2
27
+ root: '/ns1:root/ns2:entry'
28
+ schema:
29
+ - { path: 'ns2:id', name: id, type: long }
30
+ - { path: 'ns2:title', name: title, type: string }
31
+ - { path: 'ns2:meta/ns2:author', name: author, type: string }
32
+ - { path: 'ns2:date', name: date, type: timestamp, format: '%Y%m%d' }
33
+ namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
34
+ ```
35
+
36
+ Then you can fetch entries from the following xml:
37
+ ```xml
38
+ <?xml version="1.0"?>
39
+ <ns1:root
40
+ xmlns:ns1="http://example.com/ns1/"
41
+ xmlns:ns2="http://example.com/ns2/">
42
+ <ns2:entry>
43
+ <ns2:id>1</ns2:id>
44
+ <ns2:title>Hello!</ns2:title>
45
+ <ns2:meta>
46
+ <ns2:author>maji-KY</ns2:author>
47
+ </ns2:meta>
48
+ <ns2:date>20010101</ns2:date>
49
+ </ns2:entry>
50
+ </ns1:root>
51
+ ```
52
+ ## Build
53
+
54
+ ```
55
+ $ ./gradlew gem
56
+ ```
data/build.gradle ADDED
@@ -0,0 +1,84 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ id "scala"
6
+ }
7
+ import com.github.jrubygradle.JRubyExec
8
+ repositories {
9
+ mavenCentral()
10
+ jcenter()
11
+ }
12
+ configurations {
13
+ provided
14
+ }
15
+
16
+ version = "0.0.1"
17
+
18
+ sourceCompatibility = 1.8
19
+ targetCompatibility = 1.8
20
+
21
+ dependencies {
22
+ compile "org.embulk:embulk-core:0.8.32"
23
+ provided "org.embulk:embulk-core:0.8.32"
24
+ testCompile "org.embulk:embulk-core:0.8.32:tests"
25
+ testCompile "org.embulk:embulk-standards:0.8.32"
26
+ testCompile "junit:junit:4.+"
27
+
28
+ compile group: 'org.scala-lang', name: 'scala-library', version: '2.12.3'
29
+ testCompile group: 'org.scalatest', name: 'scalatest_2.12', version: '3.0.4'
30
+
31
+ }
32
+
33
+ task classpath(type: Copy, dependsOn: ["jar"]) {
34
+ doFirst { file("classpath").deleteDir() }
35
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
36
+ into "classpath"
37
+ }
38
+ clean { delete "classpath" }
39
+
40
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
41
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
42
+ script "${project.name}.gemspec"
43
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
44
+ }
45
+
46
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
47
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
48
+ script "pkg/${project.name}-${project.version}.gem"
49
+ }
50
+
51
+ task "package"(dependsOn: ["gemspec", "classpath"]) {
52
+ doLast {
53
+ println "> Build succeeded."
54
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
55
+ }
56
+ }
57
+
58
+ task gemspec {
59
+ ext.gemspecFile = file("${project.name}.gemspec")
60
+ inputs.file "build.gradle"
61
+ outputs.file gemspecFile
62
+ doLast { gemspecFile.write($/
63
+ Gem::Specification.new do |spec|
64
+ spec.name = "${project.name}"
65
+ spec.version = "${project.version}"
66
+ spec.authors = ["maji-KY"]
67
+ spec.summary = %[Embulk parser plugin for XML]
68
+ spec.description = %[Parses XML files read by other file input plugins.]
69
+ spec.email = ["maji-KY@neco-labo.com"]
70
+ spec.licenses = ["MIT"]
71
+ spec.homepage = "https://github.com/maji-KY/embulk-parser-xpath2"
72
+
73
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
74
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
75
+ spec.require_paths = ["lib"]
76
+
77
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
78
+ spec.add_development_dependency 'bundler', ['~> 1.0']
79
+ spec.add_development_dependency 'rake', ['>= 10.0']
80
+ end
81
+ /$)
82
+ }
83
+ }
84
+ clean { delete "${project.name}.gemspec" }
data/build.sbt ADDED
@@ -0,0 +1,23 @@
1
+ lazy val root = (project in file(".")).
2
+ settings(
3
+ inThisBuild(List(
4
+ organization := "com.github.maji-KY",
5
+ scalaVersion := "2.12.3",
6
+ version := "0.0.1-SNAPSHOT"
7
+ )),
8
+ name := "embulk-parser-xpath2",
9
+ scalacOptions ++= Seq(
10
+ "-deprecation",
11
+ "-feature",
12
+ "-unchecked",
13
+ "-Xlint",
14
+ "-Ywarn-dead-code",
15
+ "-Ywarn-numeric-widen",
16
+ "-Ywarn-unused",
17
+ "-Ywarn-value-discard"
18
+ )
19
+ )
20
+
21
+ resolvers += Resolver.jcenterRepo
22
+
23
+ libraryDependencies ++= Dependencies.value
Binary file
Binary file
@@ -0,0 +1,6 @@
1
+ #Thu Oct 05 20:56:16 JST 2017
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-3.5-rc-2-bin.zip
data/gradlew ADDED
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env sh
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Attempt to set APP_HOME
10
+ # Resolve links: $0 may be a link
11
+ PRG="$0"
12
+ # Need this for relative symlinks.
13
+ while [ -h "$PRG" ] ; do
14
+ ls=`ls -ld "$PRG"`
15
+ link=`expr "$ls" : '.*-> \(.*\)$'`
16
+ if expr "$link" : '/.*' > /dev/null; then
17
+ PRG="$link"
18
+ else
19
+ PRG=`dirname "$PRG"`"/$link"
20
+ fi
21
+ done
22
+ SAVED="`pwd`"
23
+ cd "`dirname \"$PRG\"`/" >/dev/null
24
+ APP_HOME="`pwd -P`"
25
+ cd "$SAVED" >/dev/null
26
+
27
+ APP_NAME="Gradle"
28
+ APP_BASE_NAME=`basename "$0"`
29
+
30
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31
+ DEFAULT_JVM_OPTS=""
32
+
33
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
34
+ MAX_FD="maximum"
35
+
36
+ warn ( ) {
37
+ echo "$*"
38
+ }
39
+
40
+ die ( ) {
41
+ echo
42
+ echo "$*"
43
+ echo
44
+ exit 1
45
+ }
46
+
47
+ # OS specific support (must be 'true' or 'false').
48
+ cygwin=false
49
+ msys=false
50
+ darwin=false
51
+ nonstop=false
52
+ case "`uname`" in
53
+ CYGWIN* )
54
+ cygwin=true
55
+ ;;
56
+ Darwin* )
57
+ darwin=true
58
+ ;;
59
+ MINGW* )
60
+ msys=true
61
+ ;;
62
+ NONSTOP* )
63
+ nonstop=true
64
+ ;;
65
+ esac
66
+
67
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68
+
69
+ # Determine the Java command to use to start the JVM.
70
+ if [ -n "$JAVA_HOME" ] ; then
71
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72
+ # IBM's JDK on AIX uses strange locations for the executables
73
+ JAVACMD="$JAVA_HOME/jre/sh/java"
74
+ else
75
+ JAVACMD="$JAVA_HOME/bin/java"
76
+ fi
77
+ if [ ! -x "$JAVACMD" ] ; then
78
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79
+
80
+ Please set the JAVA_HOME variable in your environment to match the
81
+ location of your Java installation."
82
+ fi
83
+ else
84
+ JAVACMD="java"
85
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86
+
87
+ Please set the JAVA_HOME variable in your environment to match the
88
+ location of your Java installation."
89
+ fi
90
+
91
+ # Increase the maximum file descriptors if we can.
92
+ if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93
+ MAX_FD_LIMIT=`ulimit -H -n`
94
+ if [ $? -eq 0 ] ; then
95
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96
+ MAX_FD="$MAX_FD_LIMIT"
97
+ fi
98
+ ulimit -n $MAX_FD
99
+ if [ $? -ne 0 ] ; then
100
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
101
+ fi
102
+ else
103
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104
+ fi
105
+ fi
106
+
107
+ # For Darwin, add options to specify how the application appears in the dock
108
+ if $darwin; then
109
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110
+ fi
111
+
112
+ # For Cygwin, switch paths to Windows format before running java
113
+ if $cygwin ; then
114
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116
+ JAVACMD=`cygpath --unix "$JAVACMD"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Escape application args
158
+ save ( ) {
159
+ for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160
+ echo " "
161
+ }
162
+ APP_ARGS=$(save "$@")
163
+
164
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
165
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166
+
167
+ # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169
+ cd "$(dirname "$0")"
170
+ fi
171
+
172
+ exec "$JAVACMD" "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,84 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ set DIRNAME=%~dp0
12
+ if "%DIRNAME%" == "" set DIRNAME=.
13
+ set APP_BASE_NAME=%~n0
14
+ set APP_HOME=%DIRNAME%
15
+
16
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17
+ set DEFAULT_JVM_OPTS=
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windows variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+
53
+ :win9xME_args
54
+ @rem Slurp the command line arguments.
55
+ set CMD_LINE_ARGS=
56
+ set _SKIP=2
57
+
58
+ :win9xME_args_slurp
59
+ if "x%~1" == "x" goto execute
60
+
61
+ set CMD_LINE_ARGS=%*
62
+
63
+ :execute
64
+ @rem Setup the command line
65
+
66
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67
+
68
+ @rem Execute Gradle
69
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70
+
71
+ :end
72
+ @rem End local scope for the variables with windows NT shell
73
+ if "%ERRORLEVEL%"=="0" goto mainEnd
74
+
75
+ :fail
76
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77
+ rem the _cmd.exe /c_ return code!
78
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79
+ exit /b 1
80
+
81
+ :mainEnd
82
+ if "%OS%"=="Windows_NT" endlocal
83
+
84
+ :omega
@@ -0,0 +1,7 @@
1
+ module Embulk
2
+ module Guess
3
+ # Placeholder namespace: no guess plugin for xpath2 is defined in this file yet.
4
+ # TODO implement this
5
+
6
+ end
7
+ end
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_parser( # registers the Java/Scala parser class under the plugin type "xpath2"
2
+ "xpath2", "org.embulk.parser.xpath2.XPath2ParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__)) # "classpath" directory resolved relative to this file's location
@@ -0,0 +1,19 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import scala.util.control.Exception.ignoring
4
+ /** Loan pattern: lends a closable resource to a function and closes it once the function has returned or thrown. */
5
+ object LoanPattern {
6
+ // Structural type: anything that exposes a `close(): Unit` method qualifies as a resource.
7
+ type Closable = { def close(): Unit }
8
+ // Returns the result of `f(resource)`; `resource.close()` is attempted in `finally`, whether or not `f` throws.
9
+ def apply[R <: Closable, A](resource: R)(f: R => A): A = {
10
+ try {
11
+ f(resource)
12
+ } finally {
13
+ ignoring(classOf[Throwable]) apply { // best-effort close: failures of close() are handed to `ignoring` rather than propagated
14
+ resource.close()
15
+ }
16
+ }
17
+ }
18
+
19
+ }
@@ -0,0 +1,24 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.embulk.config.Task
4
+ import org.embulk.config.Config
5
+ import org.embulk.config.ConfigDefault
6
+ import org.embulk.parser.xpath2.config.{NamespacesConfig, SchemaConfig}
7
+ import org.embulk.spi.time.TimestampParser
8
+ /** Configuration of the xpath2 parser plugin; each getter is bound to the config key named in its @Config annotation. */
9
+ trait PluginTask extends Task with TimestampParser.Task {
10
+ @Config("stop_on_invalid_record") // optional, defaults to false; read by XPath2ParserPlugin.run when an input file fails to parse
11
+ @ConfigDefault("false")
12
+ def getStopOnInvalidRecord: Boolean
13
+ // XPath expression; XPath2ParserPlugin.run emits one record per node it selects.
14
+ @Config("root")
15
+ def getRoot: String
16
+ // Column definitions (path, name, type, ...) parsed into a SchemaConfig.
17
+ @Config("schema")
18
+ def getSchema: SchemaConfig
19
+ // Namespace prefix -> URI bindings used by the plugin's XPath namespace context.
20
+ @Config("namespaces")
21
+ def getNamespaces: NamespacesConfig
22
+
23
+ }
24
+
@@ -0,0 +1,106 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import java.util
4
+ import javax.xml.namespace.NamespaceContext
5
+ import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}
6
+ import javax.xml.xpath.{XPathConstants, XPathExpression, XPathFactory}
7
+
8
+ import org.embulk.config._
9
+ import org.embulk.parser.xpath2.config.ColumnConfig
10
+ import org.embulk.spi._
11
+ import org.embulk.spi.`type`._
12
+ import org.embulk.spi.time.TimestampParser
13
+ import org.embulk.spi.util.FileInputInputStream
14
+ import org.slf4j.Logger
15
+ import org.w3c.dom.{Document, Node, NodeList}
16
+
17
+ import scala.collection.JavaConverters._
18
+ import scala.collection.immutable
19
+ import scala.util.control.NonFatal
20
+
21
+ class XPath2ParserPlugin extends ParserPlugin {
22
+
23
+ val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
24
+ /** Builds a namespace-aware DOM DocumentBuilder from a fresh factory on every call. NOTE(review): DTD / external-entity handling is left at the parser defaults - confirm that is acceptable for the XML being ingested. */
25
+ def docBuilder: DocumentBuilder = {
26
+ val namespaceAwareFactory = DocumentBuilderFactory.newInstance()
27
+ namespaceAwareFactory.setNamespaceAware(true)
28
+ namespaceAwareFactory.newDocumentBuilder()
29
+ }
30
+ /** Loads the plugin configuration and passes the dumped task plus the output schema (one column per `schema` entry, in configured order) to `control`. */
31
+ override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
32
+ val pluginTask = config.loadConfig(classOf[PluginTask])
33
+ val dumpedTask = pluginTask.dump()
34
+ // The column index is the position of the entry within the configured schema list.
35
+ val outputColumns = pluginTask.getSchema.columns.asScala.zipWithIndex.map { case (columnConfig, position) => new Column(position, columnConfig.name, columnConfig.`type`) }
36
+ control.run(dumpedTask, new Schema(outputColumns.asJava))
37
+ }
38
+ /** Parses every file of `input` as one XML document and emits one record per node selected by the `root` XPath; a column whose XPath matches nothing is set to null. */
39
+ override def run(taskSource: TaskSource, schema: Schema, input: FileInput, output: PageOutput): Unit = {
40
+
41
+ val task: PluginTask = taskSource.loadTask(classOf[PluginTask])
42
+ val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
43
+ val namespaces = task.getNamespaces.conf.asScala // configured prefix -> namespace URI bindings
44
+ val xPathInstance = XPathFactory.newInstance.newXPath()
45
+ xPathInstance.setNamespaceContext(new NamespaceContext {
46
+ override def getPrefix(namespaceURI: String): String = namespaces.collectFirst { case (prefix, uri) if uri == namespaceURI => prefix }.orNull // a prefix bound to the URI, or null when the URI is not configured
47
+ override def getPrefixes(namespaceURI: String): util.Iterator[_] = namespaces.iterator.collect { case (prefix, uri) if uri == namespaceURI => prefix }.asJava // only the prefixes bound to this URI
48
+ override def getNamespaceURI(prefix: String): String = namespaces.getOrElse(prefix, javax.xml.XMLConstants.NULL_NS_URI) // unbound prefix yields NULL_NS_URI rather than a NoSuchElementException
49
+ })
50
+ // XPath expressions are compiled once here and reused for every file and record.
51
+ val rootXPath: XPathExpression = xPathInstance.compile(task.getRoot)
52
+ val columnXPaths: immutable.Seq[XPathExpression] = task.getSchema.columns.asScala.map(x => xPathInstance.compile(x.path)).toList
53
+ // One TimestampParser per column that carries timestamp options, keyed by column name.
54
+ val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
55
+ .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
56
+ // LoanPattern closes the PageBuilder when this block exits, normally or with an exception.
57
+ LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
58
+ while (input.nextFile()) {
59
+ parseXML(input) match {
60
+ case Right(doc) =>
61
+ val rootNodes = rootXPath.evaluate(doc, XPathConstants.NODESET).asInstanceOf[NodeList]
62
+ (0 until rootNodes.getLength).map(rootNodes.item).foreach { node =>
63
+ columnXPaths.zipWithIndex.foreach { case (xPath, idx) =>
64
+ val value: Node = xPath.evaluate(node, XPathConstants.NODE).asInstanceOf[Node]
65
+ val column = schema.getColumn(idx)
66
+ if (value == null) { // the column's XPath matched no node under this root node
67
+ pb.setNull(column)
68
+ } else {
69
+ setColumn(pb, column, value.getTextContent, timestampParsers)
70
+ }
71
+ }
72
+ pb.addRecord()
73
+ }
74
+ case Left(e) => // the file could not be parsed as XML
75
+ if(stopOnInvalidRecord) {
76
+ throw new DataException(e)
77
+ } else {
78
+ logger.warn(s"Skipped invalid record $e")
79
+ }
80
+ }
81
+ pb.flush()
82
+ }
83
+ pb.finish()
84
+ // No explicit pb.close() here: LoanPattern already closes `pb`, so calling it as well would close the builder twice.
85
+ }
86
+ }
87
+ /** Parses the current file of `input` into a DOM Document; a non-fatal failure is returned as Left instead of being thrown. */
88
+ def parseXML(input: FileInput): Either[Throwable, Document] = {
89
+ // Expose the FileInput as an InputStream for the DOM parser.
90
+ val xmlStream = new FileInputInputStream(input)
91
+ // Try captures exactly the NonFatal throwables; fatal errors still propagate to the caller.
92
+ scala.util.Try {
93
+ docBuilder.parse(xmlStream)
94
+ }.toEither
95
+ }
96
+ /** Converts the text `value` to the type of `column` and writes it into `pb`; timestamp columns use the parser registered under the column name. */
97
+ def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
98
+ case _: TimestampType =>
99
+ pb.setTimestamp(column, timestampParsers(column.getName).parse(value))
100
+ case _: BooleanType => pb.setBoolean(column, value.toBoolean)
101
+ case _: DoubleType => pb.setDouble(column, value.toDouble)
102
+ case _: LongType => pb.setLong(column, value.toLong)
103
+ case _: StringType | _: JsonType => pb.setString(column, value) // json is treated as a plain string
104
+ }
105
+
106
+ }
@@ -0,0 +1,13 @@
1
+ package org.embulk.parser.xpath2.config
2
+
3
+ import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
4
+ import org.embulk.config.ConfigSource
5
+
6
+ import scala.collection.JavaConverters._
7
+ /** Value of the `namespaces` option: `conf` holds the prefix -> URI entries and `src` the ConfigSource they were read from. */
8
+ case class NamespacesConfig(conf: java.util.Map[String, String], src: ConfigSource) {
9
+ @JsonCreator() // builds `conf` by reading every attribute name of `src` as a String value
10
+ def this(src: ConfigSource) = this(src.getAttributeNames.asScala.map(k => (k, src.get(classOf[String], k))).toMap.asJava, src)
11
+ @JsonValue() // the value handed to Jackson is a deep copy of the original ConfigSource
12
+ def getConfigSource: ConfigSource = src.deepCopy()
13
+ }
@@ -0,0 +1,59 @@
1
+ package org.embulk.parser.xpath2.config
2
+
3
+ import java.util
4
+
5
+ import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
6
+ import com.google.common.base.Optional
7
+ import org.embulk.config.{Config, ConfigDefault, ConfigSource}
8
+ import org.embulk.spi.`type`.{TimestampType, Type}
9
+ import org.embulk.spi.time.TimestampParser
10
+ import org.embulk.spi.time.TimestampParser.{Task, TimestampColumnOption}
11
+ import org.joda.time.DateTimeZone
12
+
13
+ import scala.beans.BeanProperty
14
+ /** Value of the `schema` option: a wrapper around the list of column definitions, created from and exposed to Jackson as that list. */
15
+ case class SchemaConfig @JsonCreator()(columns: java.util.List[ColumnConfig]) {
16
+ @JsonValue()
17
+ def getColumns: util.List[ColumnConfig] = columns
18
+ }
19
+ /** One column definition: XPath `path`, column `name` and `type`, timestamp options (Some only for timestamp-typed columns when built from a ConfigSource) and the backing ConfigSource `option`. */
20
+ case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], option: ConfigSource) {
21
+ // Jackson entry point: reads "path", "name" and "type" from `src` and keeps `src` itself as `option`.
22
+ @JsonCreator()
23
+ def this(src: ConfigSource) = {
24
+ this(src.get(classOf[String], "path"), src.get(classOf[String], "name"), src.get(classOf[Type], "type"), ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")), src)
25
+ }
26
+ // The value handed to Jackson is `option`.
27
+ @JsonValue()
28
+ def getConfigSource: ConfigSource = option
29
+
30
+ }
31
+ /** TimestampColumnOption implementation that simply returns the three Optional values it was constructed with. */
32
+ private class TimestampColumnOptionImpl(timezone: Optional[DateTimeZone], format: Optional[String], date: Optional[String]) extends TimestampColumnOption {
33
+ // Reads "timezone", "format" and "date" from `src`, passing Optional.absent as the default value for each.
34
+ @JsonCreator()
35
+ def this(src: ConfigSource) = {
36
+ this(src.get(classOf[Optional[DateTimeZone]], "timezone", Optional.absent[DateTimeZone]()), src.get(classOf[Optional[String]], "format", Optional.absent[String]()), src.get(classOf[Optional[String]], "date", Optional.absent[String]()))
37
+ }
38
+
39
+ @Config("timezone")
40
+ @ConfigDefault("null")
41
+ override val getTimeZone = timezone
42
+
43
+ @Config("format")
44
+ @ConfigDefault("null")
45
+ override val getFormat = format
46
+
47
+ @Config("date")
48
+ @ConfigDefault("null")
49
+ override val getDate = date
50
+ }
51
+ /** Helpers used by ColumnConfig's ConfigSource constructor. */
52
+ object ColumnConfig {
53
+ private def getTimestampOption(src: ConfigSource, `type`: Type): Option[TimestampColumnOption] =
54
+ if (`type`.isInstanceOf[TimestampType]) Some(getOption(src).loadConfig(classOf[TimestampColumnOptionImpl])) else None
55
+ // Deep copy of the column config with the "path", "name" and "type" keys removed.
56
+ private def getOption(src: ConfigSource) = {
57
+ Seq("path", "name", "type").foldLeft(src.deepCopy())((remaining, key) => remaining.remove(key))
58
+ }
59
+ }
@@ -0,0 +1,13 @@
1
+ <?xml version="1.0"?>
2
+ <ns1:root
3
+ xmlns:ns1="http://example.com/ns1/"
4
+ xmlns:ns2="http://example.com/ns2/">
5
+ <ns2:entry>
6
+ <ns2:id>1</ns2:id>
7
+ <ns2:title>Hello!</ns2:title>
8
+ <ns2:meta>
9
+ <ns2:author>maji-KY</ns2:author>
10
+ </ns2:meta>
11
+ <ns2:date>20010101</ns2:date>
12
+ </ns2:entry>
13
+ </ns1:root>
@@ -0,0 +1,5 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.scalatest.{DiagrammedAssertions, WordSpec}
4
+ // Shared base class for unit specs: ScalaTest WordSpec with DiagrammedAssertions mixed in.
5
+ abstract class UnitSpec extends WordSpec with DiagrammedAssertions
@@ -0,0 +1,134 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import java.io.{File, FileInputStream}
4
+ import java.util
5
+
6
+ import org.embulk.EmbulkTestRuntime
7
+ import org.embulk.config.{ConfigSource, TaskSource}
8
+ import org.embulk.spi._
9
+ import org.embulk.spi.time.Timestamp
10
+ import org.embulk.spi.util.InputStreamFileInput
11
+ import org.junit.Assert._
12
+ import org.junit.{Rule, Test}
13
+
14
+ import scala.collection.JavaConverters._
15
+ import scala.collection.mutable
16
+ import scala.collection.mutable.ArrayBuffer
17
+
18
+ class XPath2ParserPluginSpec {
19
+
20
+ @Rule
21
+ def runtime = new EmbulkTestRuntime
22
+
23
+ val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("data.xml").getPath
24
+
25
+ @Test def test() {
26
+
27
+ val configSource: ConfigSource = Exec.newConfigSource()
28
+ .set("in", Map[String, String]("type" -> "file", "path_prefix" -> dataPath).asJava)
29
+ .set("root", "/ns1:root/ns2:entry")
30
+ .set("schema", List[util.Map[String, String]](
31
+ Map("path" -> "ns2:id", "name" -> "id", "type" -> "long").asJava,
32
+ Map("path" -> "ns2:title", "name" -> "title", "type" -> "string").asJava,
33
+ Map("path" -> "ns2:meta/ns2:author", "name" -> "author", "type" -> "string").asJava,
34
+ Map("path" -> "ns2:date", "name" -> "date", "type" -> "timestamp", "format" -> "%Y%m%d").asJava
35
+ ).asJava)
36
+ .set("namespaces", Map[String, String]("ns1" -> "http://example.com/ns1/", "ns2" -> "http://example.com/ns2/").asJava)
37
+ .set("out", Map[String, String]("type" -> "stdout").asJava)
38
+
39
+ val task = configSource.loadConfig(classOf[PluginTask])
40
+
41
+ var schema: Schema = null
42
+
43
+ val plugin = new XPath2ParserPlugin()
44
+ plugin.transaction(configSource, (_: TaskSource, s: Schema) => {schema = s})
45
+
46
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
47
+
48
+ plugin.run(
49
+ task.dump(),
50
+ schema,
51
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(dataPath))),
52
+ new TransactionalPageOutput() {
53
+
54
+ import org.embulk.spi.PageReader
55
+
56
+ val reader = new PageReader(schema)
57
+
58
+ override def add(page: Page) = {
59
+ reader.setPage(page)
60
+
61
+ while (reader.nextRecord()) {
62
+ val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
63
+
64
+ schema.getColumns().asScala.foreach { column =>
65
+
66
+ column.visit(new ColumnVisitor() {
67
+ override def timestampColumn(column: Column): Unit = {
68
+ if (reader.isNull(column)) {
69
+ record.put(column.getName, null)
70
+ } else {
71
+ record.put(column.getName, reader.getTimestamp(column))
72
+ }
73
+ }
74
+
75
+ override def stringColumn(column: Column): Unit = {
76
+ if (reader.isNull(column)) {
77
+ record.put(column.getName, null)
78
+ } else {
79
+ record.put(column.getName, reader.getString(column))
80
+ }
81
+ }
82
+
83
+ override def longColumn(column: Column): Unit = {
84
+ if (reader.isNull(column)) {
85
+ record.put(column.getName, null)
86
+ } else {
87
+ record.put(column.getName, reader.getLong(column))
88
+ }
89
+ }
90
+
91
+ override def doubleColumn(column: Column): Unit = {
92
+ if (reader.isNull(column)) {
93
+ record.put(column.getName, null)
94
+ } else {
95
+ record.put(column.getName, reader.getDouble(column))
96
+ }
97
+ }
98
+
99
+ override def booleanColumn(column: Column): Unit = {
100
+ if (reader.isNull(column)) {
101
+ record.put(column.getName, null)
102
+ } else {
103
+ record.put(column.getName, reader.getBoolean(column))
104
+ }
105
+ }
106
+
107
+ override def jsonColumn(column: Column): Unit = {
108
+ if (reader.isNull(column)) {
109
+ record.put(column.getName, null)
110
+ } else {
111
+ record.put(column.getName, reader.getString(column))
112
+ }
113
+ }
114
+ })
115
+
116
+
117
+ }
118
+ result += record
119
+ }
120
+ }
121
+
122
+ override def commit() = Exec.newTaskReport()
123
+ override def abort() = {}
124
+ override def finish() = {}
125
+ override def close() = {}
126
+ }
127
+ )
128
+
129
+ println(result)
130
+
131
+ assertEquals(ArrayBuffer(Map("date" -> Timestamp.ofEpochSecond(978307200L), "title" -> "Hello!", "author" -> "maji-KY", "id" -> 1L)), result)
132
+ }
133
+
134
+ }
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-xpath2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - maji-KY
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-10-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ~>
17
+ - !ruby/object:Gem::Version
18
+ version: '1.0'
19
+ name: bundler
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '10.0'
33
+ name: rake
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses XML files read by other file input plugins.
42
+ email:
43
+ - maji-KY@neco-labo.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - LICENSE
50
+ - README.md
51
+ - build.gradle
52
+ - build.sbt
53
+ - gradle/wrapper/gradle-wrapper.jar
54
+ - gradle/wrapper/gradle-wrapper.properties
55
+ - gradlew
56
+ - gradlew.bat
57
+ - lib/embulk/guess/xpath2.rb
58
+ - lib/embulk/parser/xpath2.rb
59
+ - src/main/scala/org/embulk/parser/xpath2/LoanPattern.scala
60
+ - src/main/scala/org/embulk/parser/xpath2/PluginTask.scala
61
+ - src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala
62
+ - src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
63
+ - src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
64
+ - src/test/resources/data.xml
65
+ - src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
66
+ - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
67
+ - classpath/embulk-parser-xpath2-0.0.1.jar
68
+ - classpath/scala-library-2.12.3.jar
69
+ homepage: https://github.com/maji-KY/embulk-parser-xpath2
70
+ licenses:
71
+ - MIT
72
+ metadata: {}
73
+ post_install_message:
74
+ rdoc_options: []
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - '>='
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - '>='
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ requirements: []
88
+ rubyforge_project:
89
+ rubygems_version: 2.1.9
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: Embulk parser plugin for XML
93
+ test_files: []