embulk-parser-xpath2 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/benchmark/src/main/scala/org/embulk/parser/xpath2/ParseBenchmark.scala +1 -1
- data/build.gradle +2 -1
- data/build.sbt +1 -0
- data/src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala +55 -55
- data/src/test/resources/invalid-data.xml +33 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala +62 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fcd8e55e48a3ea51f186ca9a66440db2a454c877
|
4
|
+
data.tar.gz: 8816fe3c8e734055993ecba64179c68414bccf6d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 751d5eacce78225b0a07e2925ccfca0ee09c6bfc3a01cbaa96331881f8e503170ae3974b2548deba12395aa92c1fbe909c291ae146771c11fe8690a1da137cdc
|
7
|
+
data.tar.gz: 9e5c7072613b5df19dcffbb0a7d354c8b1b7c514934134683ba7ba7e268a2f22d359710d6ff51397a978f5c8709e5ee18d28bb15e1f5903c7ab5a884d833837c
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.0.4"
|
16
|
+
version = "0.1.0"
|
17
17
|
ext {
|
18
18
|
embulkVersion = "0.8.35"
|
19
19
|
}
|
@@ -22,6 +22,7 @@ sourceCompatibility = 1.8
|
|
22
22
|
targetCompatibility = 1.8
|
23
23
|
|
24
24
|
dependencies {
|
25
|
+
compile "com.ximpleware:vtd-xml:2.13.4"
|
25
26
|
compile "org.embulk:embulk-core:${embulkVersion}"
|
26
27
|
provided "org.embulk:embulk-core:${embulkVersion}"
|
27
28
|
testCompile "org.embulk:embulk-core:${embulkVersion}:tests"
|
data/build.sbt
CHANGED
@@ -16,6 +16,7 @@ lazy val commonSettings = Seq(
|
|
16
16
|
),
|
17
17
|
resolvers += Resolver.jcenterRepo,
|
18
18
|
libraryDependencies ++= Seq(
|
19
|
+
"com.ximpleware" % "vtd-xml" % "2.13.4",
|
19
20
|
"org.embulk" % "embulk-core" % embulkVersion,
|
20
21
|
"org.embulk" % "embulk-core" % embulkVersion classifier "tests",
|
21
22
|
"junit" % "junit" % "4.+" % "test",
|
@@ -1,10 +1,7 @@
|
|
1
1
|
package org.embulk.parser.xpath2
|
2
2
|
|
3
|
-
import java.util
|
4
|
-
import javax.xml.namespace.NamespaceContext
|
5
|
-
import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}
|
6
|
-
import javax.xml.xpath.{XPathConstants, XPathExpression, XPathFactory}
|
7
|
-
|
3
|
+
import com.google.common.io.ByteStreams
|
4
|
+
import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
|
8
5
|
import org.embulk.config._
|
9
6
|
import org.embulk.parser.xpath2.config.ColumnConfig
|
10
7
|
import org.embulk.spi._
|
@@ -13,21 +10,14 @@ import org.embulk.spi.time.TimestampParser
|
|
13
10
|
import org.embulk.spi.util.FileInputInputStream
|
14
11
|
import org.msgpack.value.{Value, Variable}
|
15
12
|
import org.slf4j.Logger
|
16
|
-
import org.w3c.dom.{Document, Node, NodeList}
|
17
13
|
|
14
|
+
import scala.annotation.tailrec
|
18
15
|
import scala.collection.JavaConverters._
|
19
|
-
import scala.collection.immutable
|
20
16
|
import scala.util.control.NonFatal
|
21
17
|
|
22
18
|
class XPath2ParserPlugin extends ParserPlugin {
|
23
19
|
|
24
|
-
val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
|
25
|
-
|
26
|
-
def docBuilder: DocumentBuilder = {
|
27
|
-
val factory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance
|
28
|
-
factory.setNamespaceAware(true)
|
29
|
-
factory.newDocumentBuilder()
|
30
|
-
}
|
20
|
+
private[this] val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
|
31
21
|
|
32
22
|
override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
|
33
23
|
val task = config.loadConfig(classOf[PluginTask])
|
@@ -42,73 +32,83 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
42
32
|
val task: PluginTask = taskSource.loadTask(classOf[PluginTask])
|
43
33
|
val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
|
44
34
|
|
45
|
-
val xPathInstance = XPathFactory.newInstance.newXPath()
|
46
|
-
xPathInstance.setNamespaceContext(new NamespaceContext {
|
47
|
-
override def getPrefix(namespaceURI: String): String = task.getNamespaces.conf.asScala.collectFirst { case (_, v) if v == namespaceURI => v }.orNull
|
48
|
-
override def getPrefixes(namespaceURI: String): util.Iterator[_] = task.getNamespaces.conf.asScala.keys.asJava.iterator()
|
49
|
-
override def getNamespaceURI(prefix: String): String = task.getNamespaces.conf.asScala(prefix)
|
50
|
-
})
|
51
|
-
|
52
|
-
val rootXPath: XPathExpression = xPathInstance.compile(task.getRoot)
|
53
|
-
val columnXPaths: immutable.Seq[XPathExpression] = task.getSchema.columns.asScala.map(x => xPathInstance.compile(x.path)).toList
|
54
|
-
|
55
35
|
val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
|
56
36
|
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
37
|
+
val columnsWithIndex: Seq[(ColumnConfig, Int)] = task.getSchema.columns.asScala.zipWithIndex
|
38
|
+
|
39
|
+
val vg = new VTDGen
|
57
40
|
|
58
41
|
LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
|
59
42
|
while (input.nextFile()) {
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
LoanPattern(new FileInputInputStream(input)) { fiis =>
|
44
|
+
|
45
|
+
vg.setDoc(ByteStreams.toByteArray(fiis))
|
46
|
+
vg.parse(true)
|
47
|
+
|
48
|
+
val nav = vg.getNav
|
49
|
+
val rootElementAutoPilot = new AutoPilot(nav)
|
50
|
+
val columnElementAutoPilot = new AutoPilot(nav)
|
51
|
+
task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
|
52
|
+
rootElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
|
53
|
+
columnElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
|
54
|
+
}
|
55
|
+
|
56
|
+
@tailrec
|
57
|
+
def execEachRecord(rootAp: AutoPilot): Unit = if (rootAp.evalXPath() != -1) {
|
58
|
+
nav.push()
|
59
|
+
try {
|
60
|
+
columnsWithIndex.foreach { case (columnConfig, idx) =>
|
61
|
+
nav.push()
|
62
|
+
columnElementAutoPilot.selectXPath(columnConfig.path)
|
65
63
|
val column = schema.getColumn(idx)
|
66
|
-
handleColumn(pb,
|
64
|
+
handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
|
65
|
+
nav.pop()
|
67
66
|
}
|
68
67
|
pb.addRecord()
|
68
|
+
} catch {
|
69
|
+
case NonFatal(e) => if (stopOnInvalidRecord) {
|
70
|
+
throw new DataException(e)
|
71
|
+
} else {
|
72
|
+
logger.warn(s"Skipped invalid record $e")
|
73
|
+
}
|
69
74
|
}
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
75
|
+
nav.pop()
|
76
|
+
execEachRecord(rootAp)
|
77
|
+
}
|
78
|
+
|
79
|
+
rootElementAutoPilot.selectXPath(task.getRoot)
|
80
|
+
execEachRecord(rootElementAutoPilot)
|
76
81
|
}
|
82
|
+
|
77
83
|
pb.flush()
|
78
84
|
}
|
79
85
|
pb.finish()
|
80
|
-
pb.close()
|
81
|
-
}
|
82
|
-
}
|
83
|
-
|
84
|
-
def parseXML(input: FileInput): Either[Throwable, Document] = {
|
85
|
-
val stream = new FileInputInputStream(input)
|
86
|
-
try {
|
87
|
-
Right(docBuilder.parse(stream))
|
88
|
-
} catch {
|
89
|
-
case NonFatal(e) => Left(e)
|
90
86
|
}
|
91
87
|
}
|
92
88
|
|
93
|
-
def handleColumn(pb: PageBuilder,
|
89
|
+
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
|
94
90
|
if (column.getType.isInstanceOf[JsonType]) {
|
95
|
-
val
|
96
|
-
|
97
|
-
|
91
|
+
val list = new java.util.ArrayList[Value]()
|
92
|
+
@tailrec
|
93
|
+
def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
|
94
|
+
val index = nav.getText
|
95
|
+
if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
|
96
|
+
eachJsonValue(cAp)
|
98
97
|
}
|
99
|
-
|
98
|
+
eachJsonValue(columnAp)
|
99
|
+
val jsonValue = new Variable().setArrayValue(list).asArrayValue()
|
100
100
|
pb.setJson(column, jsonValue)
|
101
101
|
} else {
|
102
|
-
|
103
|
-
if (value == null) {
|
102
|
+
if (columnAp.evalXPath() == -1) {
|
104
103
|
pb.setNull(column)
|
105
104
|
} else {
|
106
|
-
|
105
|
+
val index = nav.getText
|
106
|
+
setColumn(pb, column, nav.toString(index), timestampParsers)
|
107
107
|
}
|
108
108
|
}
|
109
109
|
}
|
110
110
|
|
111
|
-
def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
|
111
|
+
final def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
|
112
112
|
case _: StringType => pb.setString(column, value)
|
113
113
|
case _: LongType => pb.setLong(column, value.toLong)
|
114
114
|
case _: DoubleType => pb.setDouble(column, value.toDouble)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<ns1:root
|
3
|
+
xmlns:ns1="http://example.com/ns1/"
|
4
|
+
xmlns:ns2="http://example.com/ns2/">
|
5
|
+
<ns2:entry>
|
6
|
+
<ns2:id>1</ns2:id>
|
7
|
+
<ns2:title>Hello!</ns2:title>
|
8
|
+
<ns2:meta>
|
9
|
+
<ns2:author>maji-KY</ns2:author>
|
10
|
+
</ns2:meta>
|
11
|
+
<ns2:date>20010101</ns2:date>
|
12
|
+
<ns2:dateTime>2000-12-31 15:00:00</ns2:dateTime>
|
13
|
+
<ns2:list>
|
14
|
+
<ns2:value>a</ns2:value>
|
15
|
+
<ns2:value>b</ns2:value>
|
16
|
+
<ns2:value>c</ns2:value>
|
17
|
+
</ns2:list>
|
18
|
+
<ns2:rating by="subscribers">2.5</ns2:rating>
|
19
|
+
<ns2:rating>3.5</ns2:rating>
|
20
|
+
<ns2:released>true</ns2:released>
|
21
|
+
</ns2:entry>
|
22
|
+
<ns2:entry>
|
23
|
+
<ns2:id>2.5</ns2:id>
|
24
|
+
<ns2:title>Bonjour!</ns2:title>
|
25
|
+
<ns2:meta>
|
26
|
+
<ns2:author>maji-KY</ns2:author>
|
27
|
+
</ns2:meta>
|
28
|
+
<ns2:date>20010101</ns2:date>
|
29
|
+
<ns2:list></ns2:list>
|
30
|
+
<ns2:rating>3.5</ns2:rating>
|
31
|
+
<ns2:released>false</ns2:released>
|
32
|
+
</ns2:entry>
|
33
|
+
</ns1:root>
|
@@ -22,6 +22,7 @@ class XPath2ParserPluginSpec {
|
|
22
22
|
def runtime = new EmbulkTestRuntime
|
23
23
|
|
24
24
|
val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("data.xml").getPath
|
25
|
+
val invalidDataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("invalid-data.xml").getPath
|
25
26
|
|
26
27
|
def configSource: ConfigSource = Exec.newConfigSource()
|
27
28
|
.set("in", Map[String, String]("type" -> "file", "path_prefix" -> dataPath).asJava)
|
@@ -39,14 +40,15 @@ class XPath2ParserPluginSpec {
|
|
39
40
|
.set("namespaces", Map[String, String]("ns1" -> "http://example.com/ns1/", "ns2" -> "http://example.com/ns2/").asJava)
|
40
41
|
.set("out", Map[String, String]("type" -> "stdout").asJava)
|
41
42
|
|
42
|
-
@Test def
|
43
|
+
@Test def testParseXML() {
|
43
44
|
|
44
|
-
val
|
45
|
+
val cs = configSource
|
46
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
45
47
|
|
46
48
|
var schema: Schema = null
|
47
49
|
|
48
50
|
val plugin = new XPath2ParserPlugin()
|
49
|
-
plugin.transaction(
|
51
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
50
52
|
|
51
53
|
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
52
54
|
|
@@ -83,8 +85,65 @@ class XPath2ParserPluginSpec {
|
|
83
85
|
), result)
|
84
86
|
}
|
85
87
|
|
88
|
+
@Test(expected = classOf[DataException]) def testStopOnInvalid() {
|
89
|
+
|
90
|
+
val cs = configSource.set("stop_on_invalid_record", true)
|
91
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
92
|
+
|
93
|
+
var schema: Schema = null
|
94
|
+
|
95
|
+
val plugin = new XPath2ParserPlugin()
|
96
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
97
|
+
|
98
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
99
|
+
|
100
|
+
plugin.run(
|
101
|
+
task.dump(),
|
102
|
+
schema,
|
103
|
+
new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
|
104
|
+
new TestTransactionalPageOutput(schema, result)
|
105
|
+
)
|
106
|
+
|
107
|
+
}
|
108
|
+
|
109
|
+
@Test() def testSkipOnInvalid() {
|
110
|
+
|
111
|
+
val cs = configSource
|
112
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
113
|
+
|
114
|
+
var schema: Schema = null
|
115
|
+
|
116
|
+
val plugin = new XPath2ParserPlugin()
|
117
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
118
|
+
|
119
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
120
|
+
|
121
|
+
plugin.run(
|
122
|
+
task.dump(),
|
123
|
+
schema,
|
124
|
+
new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
|
125
|
+
new TestTransactionalPageOutput(schema, result)
|
126
|
+
)
|
127
|
+
|
128
|
+
|
129
|
+
assertEquals(ArrayBuffer(
|
130
|
+
Map(
|
131
|
+
"id" -> 1L,
|
132
|
+
"title" -> "Hello!",
|
133
|
+
"author" -> "maji-KY",
|
134
|
+
"date" -> Timestamp.ofEpochSecond(978274800L),
|
135
|
+
"date_time" -> Timestamp.ofEpochSecond(978274800L),
|
136
|
+
"list" -> new JsonParser().parse("""["a","b","c"]"""),
|
137
|
+
"rating_sub" -> 2.5d,
|
138
|
+
"released" -> true,
|
139
|
+
)
|
140
|
+
), result)
|
141
|
+
}
|
142
|
+
|
86
143
|
}
|
87
144
|
|
145
|
+
|
146
|
+
|
88
147
|
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
89
148
|
extends TransactionalPageOutput {
|
90
149
|
import org.embulk.spi.PageReader
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-xpath2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maji-KY
|
@@ -66,10 +66,12 @@ files:
|
|
66
66
|
- src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
|
67
67
|
- src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
|
68
68
|
- src/test/resources/data.xml
|
69
|
+
- src/test/resources/invalid-data.xml
|
69
70
|
- src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
|
70
71
|
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
|
72
|
+
- classpath/vtd-xml-2.13.4.jar
|
71
73
|
- classpath/scala-library-2.12.4.jar
|
72
|
-
- classpath/embulk-parser-xpath2-0.0.4.jar
|
74
|
+
- classpath/embulk-parser-xpath2-0.1.0.jar
|
73
75
|
homepage: https://github.com/maji-KY/embulk-parser-xpath2
|
74
76
|
licenses:
|
75
77
|
- MIT
|