embulk-parser-xpath2 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/benchmark/src/main/scala/org/embulk/parser/xpath2/ParseBenchmark.scala +1 -1
- data/build.gradle +2 -1
- data/build.sbt +1 -0
- data/src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala +55 -55
- data/src/test/resources/invalid-data.xml +33 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala +62 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fcd8e55e48a3ea51f186ca9a66440db2a454c877
|
4
|
+
data.tar.gz: 8816fe3c8e734055993ecba64179c68414bccf6d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 751d5eacce78225b0a07e2925ccfca0ee09c6bfc3a01cbaa96331881f8e503170ae3974b2548deba12395aa92c1fbe909c291ae146771c11fe8690a1da137cdc
|
7
|
+
data.tar.gz: 9e5c7072613b5df19dcffbb0a7d354c8b1b7c514934134683ba7ba7e268a2f22d359710d6ff51397a978f5c8709e5ee18d28bb15e1f5903c7ab5a884d833837c
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.0.4"
|
16
|
+
version = "0.1.0"
|
17
17
|
ext {
|
18
18
|
embulkVersion = "0.8.35"
|
19
19
|
}
|
@@ -22,6 +22,7 @@ sourceCompatibility = 1.8
|
|
22
22
|
targetCompatibility = 1.8
|
23
23
|
|
24
24
|
dependencies {
|
25
|
+
compile "com.ximpleware:vtd-xml:2.13.4"
|
25
26
|
compile "org.embulk:embulk-core:${embulkVersion}"
|
26
27
|
provided "org.embulk:embulk-core:${embulkVersion}"
|
27
28
|
testCompile "org.embulk:embulk-core:${embulkVersion}:tests"
|
data/build.sbt
CHANGED
@@ -16,6 +16,7 @@ lazy val commonSettings = Seq(
|
|
16
16
|
),
|
17
17
|
resolvers += Resolver.jcenterRepo,
|
18
18
|
libraryDependencies ++= Seq(
|
19
|
+
"com.ximpleware" % "vtd-xml" % "2.13.4",
|
19
20
|
"org.embulk" % "embulk-core" % embulkVersion,
|
20
21
|
"org.embulk" % "embulk-core" % embulkVersion classifier "tests",
|
21
22
|
"junit" % "junit" % "4.+" % "test",
|
@@ -1,10 +1,7 @@
|
|
1
1
|
package org.embulk.parser.xpath2
|
2
2
|
|
3
|
-
import
|
4
|
-
import
|
5
|
-
import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}
|
6
|
-
import javax.xml.xpath.{XPathConstants, XPathExpression, XPathFactory}
|
7
|
-
|
3
|
+
import com.google.common.io.ByteStreams
|
4
|
+
import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
|
8
5
|
import org.embulk.config._
|
9
6
|
import org.embulk.parser.xpath2.config.ColumnConfig
|
10
7
|
import org.embulk.spi._
|
@@ -13,21 +10,14 @@ import org.embulk.spi.time.TimestampParser
|
|
13
10
|
import org.embulk.spi.util.FileInputInputStream
|
14
11
|
import org.msgpack.value.{Value, Variable}
|
15
12
|
import org.slf4j.Logger
|
16
|
-
import org.w3c.dom.{Document, Node, NodeList}
|
17
13
|
|
14
|
+
import scala.annotation.tailrec
|
18
15
|
import scala.collection.JavaConverters._
|
19
|
-
import scala.collection.immutable
|
20
16
|
import scala.util.control.NonFatal
|
21
17
|
|
22
18
|
class XPath2ParserPlugin extends ParserPlugin {
|
23
19
|
|
24
|
-
val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
|
25
|
-
|
26
|
-
def docBuilder: DocumentBuilder = {
|
27
|
-
val factory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance
|
28
|
-
factory.setNamespaceAware(true)
|
29
|
-
factory.newDocumentBuilder()
|
30
|
-
}
|
20
|
+
private[this] val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
|
31
21
|
|
32
22
|
override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
|
33
23
|
val task = config.loadConfig(classOf[PluginTask])
|
@@ -42,73 +32,83 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
42
32
|
val task: PluginTask = taskSource.loadTask(classOf[PluginTask])
|
43
33
|
val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
|
44
34
|
|
45
|
-
val xPathInstance = XPathFactory.newInstance.newXPath()
|
46
|
-
xPathInstance.setNamespaceContext(new NamespaceContext {
|
47
|
-
override def getPrefix(namespaceURI: String): String = task.getNamespaces.conf.asScala.collectFirst { case (_, v) if v == namespaceURI => v }.orNull
|
48
|
-
override def getPrefixes(namespaceURI: String): util.Iterator[_] = task.getNamespaces.conf.asScala.keys.asJava.iterator()
|
49
|
-
override def getNamespaceURI(prefix: String): String = task.getNamespaces.conf.asScala(prefix)
|
50
|
-
})
|
51
|
-
|
52
|
-
val rootXPath: XPathExpression = xPathInstance.compile(task.getRoot)
|
53
|
-
val columnXPaths: immutable.Seq[XPathExpression] = task.getSchema.columns.asScala.map(x => xPathInstance.compile(x.path)).toList
|
54
|
-
|
55
35
|
val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
|
56
36
|
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
37
|
+
val columnsWithIndex: Seq[(ColumnConfig, Int)] = task.getSchema.columns.asScala.zipWithIndex
|
38
|
+
|
39
|
+
val vg = new VTDGen
|
57
40
|
|
58
41
|
LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
|
59
42
|
while (input.nextFile()) {
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
LoanPattern(new FileInputInputStream(input)) { fiis =>
|
44
|
+
|
45
|
+
vg.setDoc(ByteStreams.toByteArray(fiis))
|
46
|
+
vg.parse(true)
|
47
|
+
|
48
|
+
val nav = vg.getNav
|
49
|
+
val rootElementAutoPilot = new AutoPilot(nav)
|
50
|
+
val columnElementAutoPilot = new AutoPilot(nav)
|
51
|
+
task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
|
52
|
+
rootElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
|
53
|
+
columnElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
|
54
|
+
}
|
55
|
+
|
56
|
+
@tailrec
|
57
|
+
def execEachRecord(rootAp: AutoPilot): Unit = if (rootAp.evalXPath() != -1) {
|
58
|
+
nav.push()
|
59
|
+
try {
|
60
|
+
columnsWithIndex.foreach { case (columnConfig, idx) =>
|
61
|
+
nav.push()
|
62
|
+
columnElementAutoPilot.selectXPath(columnConfig.path)
|
65
63
|
val column = schema.getColumn(idx)
|
66
|
-
handleColumn(pb,
|
64
|
+
handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
|
65
|
+
nav.pop()
|
67
66
|
}
|
68
67
|
pb.addRecord()
|
68
|
+
} catch {
|
69
|
+
case NonFatal(e) => if (stopOnInvalidRecord) {
|
70
|
+
throw new DataException(e)
|
71
|
+
} else {
|
72
|
+
logger.warn(s"Skipped invalid record $e")
|
73
|
+
}
|
69
74
|
}
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
75
|
+
nav.pop()
|
76
|
+
execEachRecord(rootAp)
|
77
|
+
}
|
78
|
+
|
79
|
+
rootElementAutoPilot.selectXPath(task.getRoot)
|
80
|
+
execEachRecord(rootElementAutoPilot)
|
76
81
|
}
|
82
|
+
|
77
83
|
pb.flush()
|
78
84
|
}
|
79
85
|
pb.finish()
|
80
|
-
pb.close()
|
81
|
-
}
|
82
|
-
}
|
83
|
-
|
84
|
-
def parseXML(input: FileInput): Either[Throwable, Document] = {
|
85
|
-
val stream = new FileInputInputStream(input)
|
86
|
-
try {
|
87
|
-
Right(docBuilder.parse(stream))
|
88
|
-
} catch {
|
89
|
-
case NonFatal(e) => Left(e)
|
90
86
|
}
|
91
87
|
}
|
92
88
|
|
93
|
-
def handleColumn(pb: PageBuilder,
|
89
|
+
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
|
94
90
|
if (column.getType.isInstanceOf[JsonType]) {
|
95
|
-
val
|
96
|
-
|
97
|
-
|
91
|
+
val list = new java.util.ArrayList[Value]()
|
92
|
+
@tailrec
|
93
|
+
def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
|
94
|
+
val index = nav.getText
|
95
|
+
if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
|
96
|
+
eachJsonValue(cAp)
|
98
97
|
}
|
99
|
-
|
98
|
+
eachJsonValue(columnAp)
|
99
|
+
val jsonValue = new Variable().setArrayValue(list).asArrayValue()
|
100
100
|
pb.setJson(column, jsonValue)
|
101
101
|
} else {
|
102
|
-
|
103
|
-
if (value == null) {
|
102
|
+
if (columnAp.evalXPath() == -1) {
|
104
103
|
pb.setNull(column)
|
105
104
|
} else {
|
106
|
-
|
105
|
+
val index = nav.getText
|
106
|
+
setColumn(pb, column, nav.toString(index), timestampParsers)
|
107
107
|
}
|
108
108
|
}
|
109
109
|
}
|
110
110
|
|
111
|
-
def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
|
111
|
+
final def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
|
112
112
|
case _: StringType => pb.setString(column, value)
|
113
113
|
case _: LongType => pb.setLong(column, value.toLong)
|
114
114
|
case _: DoubleType => pb.setDouble(column, value.toDouble)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<ns1:root
|
3
|
+
xmlns:ns1="http://example.com/ns1/"
|
4
|
+
xmlns:ns2="http://example.com/ns2/">
|
5
|
+
<ns2:entry>
|
6
|
+
<ns2:id>1</ns2:id>
|
7
|
+
<ns2:title>Hello!</ns2:title>
|
8
|
+
<ns2:meta>
|
9
|
+
<ns2:author>maji-KY</ns2:author>
|
10
|
+
</ns2:meta>
|
11
|
+
<ns2:date>20010101</ns2:date>
|
12
|
+
<ns2:dateTime>2000-12-31 15:00:00</ns2:dateTime>
|
13
|
+
<ns2:list>
|
14
|
+
<ns2:value>a</ns2:value>
|
15
|
+
<ns2:value>b</ns2:value>
|
16
|
+
<ns2:value>c</ns2:value>
|
17
|
+
</ns2:list>
|
18
|
+
<ns2:rating by="subscribers">2.5</ns2:rating>
|
19
|
+
<ns2:rating>3.5</ns2:rating>
|
20
|
+
<ns2:released>true</ns2:released>
|
21
|
+
</ns2:entry>
|
22
|
+
<ns2:entry>
|
23
|
+
<ns2:id>2.5</ns2:id>
|
24
|
+
<ns2:title>Bonjour!</ns2:title>
|
25
|
+
<ns2:meta>
|
26
|
+
<ns2:author>maji-KY</ns2:author>
|
27
|
+
</ns2:meta>
|
28
|
+
<ns2:date>20010101</ns2:date>
|
29
|
+
<ns2:list></ns2:list>
|
30
|
+
<ns2:rating>3.5</ns2:rating>
|
31
|
+
<ns2:released>false</ns2:released>
|
32
|
+
</ns2:entry>
|
33
|
+
</ns1:root>
|
@@ -22,6 +22,7 @@ class XPath2ParserPluginSpec {
|
|
22
22
|
def runtime = new EmbulkTestRuntime
|
23
23
|
|
24
24
|
val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("data.xml").getPath
|
25
|
+
val invalidDataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("invalid-data.xml").getPath
|
25
26
|
|
26
27
|
def configSource: ConfigSource = Exec.newConfigSource()
|
27
28
|
.set("in", Map[String, String]("type" -> "file", "path_prefix" -> dataPath).asJava)
|
@@ -39,14 +40,15 @@ class XPath2ParserPluginSpec {
|
|
39
40
|
.set("namespaces", Map[String, String]("ns1" -> "http://example.com/ns1/", "ns2" -> "http://example.com/ns2/").asJava)
|
40
41
|
.set("out", Map[String, String]("type" -> "stdout").asJava)
|
41
42
|
|
42
|
-
@Test def
|
43
|
+
@Test def testParseXML() {
|
43
44
|
|
44
|
-
val
|
45
|
+
val cs = configSource
|
46
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
45
47
|
|
46
48
|
var schema: Schema = null
|
47
49
|
|
48
50
|
val plugin = new XPath2ParserPlugin()
|
49
|
-
plugin.transaction(
|
51
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
50
52
|
|
51
53
|
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
52
54
|
|
@@ -83,8 +85,65 @@ class XPath2ParserPluginSpec {
|
|
83
85
|
), result)
|
84
86
|
}
|
85
87
|
|
88
|
+
@Test(expected = classOf[DataException]) def testStopOnInvalid() {
|
89
|
+
|
90
|
+
val cs = configSource.set("stop_on_invalid_record", true)
|
91
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
92
|
+
|
93
|
+
var schema: Schema = null
|
94
|
+
|
95
|
+
val plugin = new XPath2ParserPlugin()
|
96
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
97
|
+
|
98
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
99
|
+
|
100
|
+
plugin.run(
|
101
|
+
task.dump(),
|
102
|
+
schema,
|
103
|
+
new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
|
104
|
+
new TestTransactionalPageOutput(schema, result)
|
105
|
+
)
|
106
|
+
|
107
|
+
}
|
108
|
+
|
109
|
+
@Test() def testSkipOnInvalid() {
|
110
|
+
|
111
|
+
val cs = configSource
|
112
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
113
|
+
|
114
|
+
var schema: Schema = null
|
115
|
+
|
116
|
+
val plugin = new XPath2ParserPlugin()
|
117
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
118
|
+
|
119
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
120
|
+
|
121
|
+
plugin.run(
|
122
|
+
task.dump(),
|
123
|
+
schema,
|
124
|
+
new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
|
125
|
+
new TestTransactionalPageOutput(schema, result)
|
126
|
+
)
|
127
|
+
|
128
|
+
|
129
|
+
assertEquals(ArrayBuffer(
|
130
|
+
Map(
|
131
|
+
"id" -> 1L,
|
132
|
+
"title" -> "Hello!",
|
133
|
+
"author" -> "maji-KY",
|
134
|
+
"date" -> Timestamp.ofEpochSecond(978274800L),
|
135
|
+
"date_time" -> Timestamp.ofEpochSecond(978274800L),
|
136
|
+
"list" -> new JsonParser().parse("""["a","b","c"]"""),
|
137
|
+
"rating_sub" -> 2.5d,
|
138
|
+
"released" -> true,
|
139
|
+
)
|
140
|
+
), result)
|
141
|
+
}
|
142
|
+
|
86
143
|
}
|
87
144
|
|
145
|
+
|
146
|
+
|
88
147
|
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
89
148
|
extends TransactionalPageOutput {
|
90
149
|
import org.embulk.spi.PageReader
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-xpath2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maji-KY
|
@@ -66,10 +66,12 @@ files:
|
|
66
66
|
- src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
|
67
67
|
- src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
|
68
68
|
- src/test/resources/data.xml
|
69
|
+
- src/test/resources/invalid-data.xml
|
69
70
|
- src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
|
70
71
|
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
|
72
|
+
- classpath/vtd-xml-2.13.4.jar
|
71
73
|
- classpath/scala-library-2.12.4.jar
|
72
|
-
- classpath/embulk-parser-xpath2-0.0.4.jar
|
74
|
+
- classpath/embulk-parser-xpath2-0.1.0.jar
|
73
75
|
homepage: https://github.com/maji-KY/embulk-parser-xpath2
|
74
76
|
licenses:
|
75
77
|
- MIT
|