embulk-parser-xpath2 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2aebf3205f36802a6d5240064e69da2e4481ab32
4
- data.tar.gz: c74a3fb9310df91b0695bc83a2495bbac12860e0
3
+ metadata.gz: fcd8e55e48a3ea51f186ca9a66440db2a454c877
4
+ data.tar.gz: 8816fe3c8e734055993ecba64179c68414bccf6d
5
5
  SHA512:
6
- metadata.gz: e5d7656c75d2c8c7a82d266fa26fba9730f798551d0e573c1924d9ab40d3debc40f3aec0a9219327e23c5e8282be3e9d9b206768b6783d7d95cb137e784984ea
7
- data.tar.gz: 07d9a54720290242e298724c80743bb75997e6d1149fad566380195c282e6e8860c18a07bf810949ca5e522283d19cabcdc9a2062dffa6272d4d33f78302b40a
6
+ metadata.gz: 751d5eacce78225b0a07e2925ccfca0ee09c6bfc3a01cbaa96331881f8e503170ae3974b2548deba12395aa92c1fbe909c291ae146771c11fe8690a1da137cdc
7
+ data.tar.gz: 9e5c7072613b5df19dcffbb0a7d354c8b1b7c514934134683ba7ba7e268a2f22d359710d6ff51397a978f5c8709e5ee18d28bb15e1f5903c7ab5a884d833837c
@@ -41,7 +41,7 @@ class ParseBenchmark {
41
41
 
42
42
  object ParseBenchmark {
43
43
 
44
- val TestRecordSize = 1000
44
+ val TestRecordSize = 100 * 1000
45
45
 
46
46
  val test = new XPath2ParserPluginSpec()
47
47
  val runtime = new EmbulkTestRuntime
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.0.4"
16
+ version = "0.1.0"
17
17
  ext {
18
18
  embulkVersion = "0.8.35"
19
19
  }
@@ -22,6 +22,7 @@ sourceCompatibility = 1.8
22
22
  targetCompatibility = 1.8
23
23
 
24
24
  dependencies {
25
+ compile "com.ximpleware:vtd-xml:2.13.4"
25
26
  compile "org.embulk:embulk-core:${embulkVersion}"
26
27
  provided "org.embulk:embulk-core:${embulkVersion}"
27
28
  testCompile "org.embulk:embulk-core:${embulkVersion}:tests"
data/build.sbt CHANGED
@@ -16,6 +16,7 @@ lazy val commonSettings = Seq(
16
16
  ),
17
17
  resolvers += Resolver.jcenterRepo,
18
18
  libraryDependencies ++= Seq(
19
+ "com.ximpleware" % "vtd-xml" % "2.13.4",
19
20
  "org.embulk" % "embulk-core" % embulkVersion,
20
21
  "org.embulk" % "embulk-core" % embulkVersion classifier "tests",
21
22
  "junit" % "junit" % "4.+" % "test",
@@ -1,10 +1,7 @@
1
1
  package org.embulk.parser.xpath2
2
2
 
3
- import java.util
4
- import javax.xml.namespace.NamespaceContext
5
- import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}
6
- import javax.xml.xpath.{XPathConstants, XPathExpression, XPathFactory}
7
-
3
+ import com.google.common.io.ByteStreams
4
+ import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
8
5
  import org.embulk.config._
9
6
  import org.embulk.parser.xpath2.config.ColumnConfig
10
7
  import org.embulk.spi._
@@ -13,21 +10,14 @@ import org.embulk.spi.time.TimestampParser
13
10
  import org.embulk.spi.util.FileInputInputStream
14
11
  import org.msgpack.value.{Value, Variable}
15
12
  import org.slf4j.Logger
16
- import org.w3c.dom.{Document, Node, NodeList}
17
13
 
14
+ import scala.annotation.tailrec
18
15
  import scala.collection.JavaConverters._
19
- import scala.collection.immutable
20
16
  import scala.util.control.NonFatal
21
17
 
22
18
  class XPath2ParserPlugin extends ParserPlugin {
23
19
 
24
- val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
25
-
26
- def docBuilder: DocumentBuilder = {
27
- val factory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance
28
- factory.setNamespaceAware(true)
29
- factory.newDocumentBuilder()
30
- }
20
+ private[this] val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
31
21
 
32
22
  override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
33
23
  val task = config.loadConfig(classOf[PluginTask])
@@ -42,73 +32,83 @@ class XPath2ParserPlugin extends ParserPlugin {
42
32
  val task: PluginTask = taskSource.loadTask(classOf[PluginTask])
43
33
  val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
44
34
 
45
- val xPathInstance = XPathFactory.newInstance.newXPath()
46
- xPathInstance.setNamespaceContext(new NamespaceContext {
47
- override def getPrefix(namespaceURI: String): String = task.getNamespaces.conf.asScala.collectFirst { case (_, v) if v == namespaceURI => v }.orNull
48
- override def getPrefixes(namespaceURI: String): util.Iterator[_] = task.getNamespaces.conf.asScala.keys.asJava.iterator()
49
- override def getNamespaceURI(prefix: String): String = task.getNamespaces.conf.asScala(prefix)
50
- })
51
-
52
- val rootXPath: XPathExpression = xPathInstance.compile(task.getRoot)
53
- val columnXPaths: immutable.Seq[XPathExpression] = task.getSchema.columns.asScala.map(x => xPathInstance.compile(x.path)).toList
54
-
55
35
  val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
56
36
  .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
37
+ val columnsWithIndex: Seq[(ColumnConfig, Int)] = task.getSchema.columns.asScala.zipWithIndex
38
+
39
+ val vg = new VTDGen
57
40
 
58
41
  LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
59
42
  while (input.nextFile()) {
60
- parseXML(input) match {
61
- case Right(doc) =>
62
- val rootNodes = rootXPath.evaluate(doc, XPathConstants.NODESET).asInstanceOf[NodeList]
63
- (0 until rootNodes.getLength).map(rootNodes.item).foreach { node =>
64
- columnXPaths.zipWithIndex.foreach { case (xPath, idx) =>
43
+ LoanPattern(new FileInputInputStream(input)) { fiis =>
44
+
45
+ vg.setDoc(ByteStreams.toByteArray(fiis))
46
+ vg.parse(true)
47
+
48
+ val nav = vg.getNav
49
+ val rootElementAutoPilot = new AutoPilot(nav)
50
+ val columnElementAutoPilot = new AutoPilot(nav)
51
+ task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
52
+ rootElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
53
+ columnElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
54
+ }
55
+
56
+ @tailrec
57
+ def execEachRecord(rootAp: AutoPilot): Unit = if (rootAp.evalXPath() != -1) {
58
+ nav.push()
59
+ try {
60
+ columnsWithIndex.foreach { case (columnConfig, idx) =>
61
+ nav.push()
62
+ columnElementAutoPilot.selectXPath(columnConfig.path)
65
63
  val column = schema.getColumn(idx)
66
- handleColumn(pb, node, xPath, column, timestampParsers)
64
+ handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
65
+ nav.pop()
67
66
  }
68
67
  pb.addRecord()
68
+ } catch {
69
+ case NonFatal(e) => if (stopOnInvalidRecord) {
70
+ throw new DataException(e)
71
+ } else {
72
+ logger.warn(s"Skipped invalid record $e")
73
+ }
69
74
  }
70
- case Left(e) =>
71
- if(stopOnInvalidRecord) {
72
- throw new DataException(e)
73
- } else {
74
- logger.warn(s"Skipped invalid record $e")
75
- }
75
+ nav.pop()
76
+ execEachRecord(rootAp)
77
+ }
78
+
79
+ rootElementAutoPilot.selectXPath(task.getRoot)
80
+ execEachRecord(rootElementAutoPilot)
76
81
  }
82
+
77
83
  pb.flush()
78
84
  }
79
85
  pb.finish()
80
- pb.close()
81
- }
82
- }
83
-
84
- def parseXML(input: FileInput): Either[Throwable, Document] = {
85
- val stream = new FileInputInputStream(input)
86
- try {
87
- Right(docBuilder.parse(stream))
88
- } catch {
89
- case NonFatal(e) => Left(e)
90
86
  }
91
87
  }
92
88
 
93
- def handleColumn(pb: PageBuilder, node: Node, xPath: XPathExpression, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
89
+ final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
94
90
  if (column.getType.isInstanceOf[JsonType]) {
95
- val value: NodeList = xPath.evaluate(node, XPathConstants.NODESET).asInstanceOf[NodeList]
96
- val values: Seq[Value] = (0 until value.getLength).map(value.item).map { valueNode =>
97
- new Variable().setStringValue(valueNode.getTextContent).asStringValue()
91
+ val list = new java.util.ArrayList[Value]()
92
+ @tailrec
93
+ def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
94
+ val index = nav.getText
95
+ if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
96
+ eachJsonValue(cAp)
98
97
  }
99
- val jsonValue = new Variable().setArrayValue(values.asJava).asArrayValue()
98
+ eachJsonValue(columnAp)
99
+ val jsonValue = new Variable().setArrayValue(list).asArrayValue()
100
100
  pb.setJson(column, jsonValue)
101
101
  } else {
102
- val value: Node = xPath.evaluate(node, XPathConstants.NODE).asInstanceOf[Node]
103
- if (value == null) {
102
+ if (columnAp.evalXPath() == -1) {
104
103
  pb.setNull(column)
105
104
  } else {
106
- setColumn(pb, column, value.getTextContent, timestampParsers)
105
+ val index = nav.getText
106
+ setColumn(pb, column, nav.toString(index), timestampParsers)
107
107
  }
108
108
  }
109
109
  }
110
110
 
111
- def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
111
+ final def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
112
112
  case _: StringType => pb.setString(column, value)
113
113
  case _: LongType => pb.setLong(column, value.toLong)
114
114
  case _: DoubleType => pb.setDouble(column, value.toDouble)
@@ -0,0 +1,33 @@
1
+ <?xml version="1.0"?>
2
+ <ns1:root
3
+ xmlns:ns1="http://example.com/ns1/"
4
+ xmlns:ns2="http://example.com/ns2/">
5
+ <ns2:entry>
6
+ <ns2:id>1</ns2:id>
7
+ <ns2:title>Hello!</ns2:title>
8
+ <ns2:meta>
9
+ <ns2:author>maji-KY</ns2:author>
10
+ </ns2:meta>
11
+ <ns2:date>20010101</ns2:date>
12
+ <ns2:dateTime>2000-12-31 15:00:00</ns2:dateTime>
13
+ <ns2:list>
14
+ <ns2:value>a</ns2:value>
15
+ <ns2:value>b</ns2:value>
16
+ <ns2:value>c</ns2:value>
17
+ </ns2:list>
18
+ <ns2:rating by="subscribers">2.5</ns2:rating>
19
+ <ns2:rating>3.5</ns2:rating>
20
+ <ns2:released>true</ns2:released>
21
+ </ns2:entry>
22
+ <ns2:entry>
23
+ <ns2:id>2.5</ns2:id>
24
+ <ns2:title>Bonjour!</ns2:title>
25
+ <ns2:meta>
26
+ <ns2:author>maji-KY</ns2:author>
27
+ </ns2:meta>
28
+ <ns2:date>20010101</ns2:date>
29
+ <ns2:list></ns2:list>
30
+ <ns2:rating>3.5</ns2:rating>
31
+ <ns2:released>false</ns2:released>
32
+ </ns2:entry>
33
+ </ns1:root>
@@ -22,6 +22,7 @@ class XPath2ParserPluginSpec {
22
22
  def runtime = new EmbulkTestRuntime
23
23
 
24
24
  val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("data.xml").getPath
25
+ val invalidDataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("invalid-data.xml").getPath
25
26
 
26
27
  def configSource: ConfigSource = Exec.newConfigSource()
27
28
  .set("in", Map[String, String]("type" -> "file", "path_prefix" -> dataPath).asJava)
@@ -39,14 +40,15 @@ class XPath2ParserPluginSpec {
39
40
  .set("namespaces", Map[String, String]("ns1" -> "http://example.com/ns1/", "ns2" -> "http://example.com/ns2/").asJava)
40
41
  .set("out", Map[String, String]("type" -> "stdout").asJava)
41
42
 
42
- @Test def test() {
43
+ @Test def testParseXML() {
43
44
 
44
- val task = configSource.loadConfig(classOf[PluginTask])
45
+ val cs = configSource
46
+ val task = cs.loadConfig(classOf[PluginTask])
45
47
 
46
48
  var schema: Schema = null
47
49
 
48
50
  val plugin = new XPath2ParserPlugin()
49
- plugin.transaction(configSource, (_: TaskSource, s: Schema) => {schema = s})
51
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
50
52
 
51
53
  val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
52
54
 
@@ -83,8 +85,65 @@ class XPath2ParserPluginSpec {
83
85
  ), result)
84
86
  }
85
87
 
88
+ @Test(expected = classOf[DataException]) def testStopOnInvalid() {
89
+
90
+ val cs = configSource.set("stop_on_invalid_record", true)
91
+ val task = cs.loadConfig(classOf[PluginTask])
92
+
93
+ var schema: Schema = null
94
+
95
+ val plugin = new XPath2ParserPlugin()
96
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
97
+
98
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
99
+
100
+ plugin.run(
101
+ task.dump(),
102
+ schema,
103
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
104
+ new TestTransactionalPageOutput(schema, result)
105
+ )
106
+
107
+ }
108
+
109
+ @Test() def testSkipOnInvalid() {
110
+
111
+ val cs = configSource
112
+ val task = cs.loadConfig(classOf[PluginTask])
113
+
114
+ var schema: Schema = null
115
+
116
+ val plugin = new XPath2ParserPlugin()
117
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
118
+
119
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
120
+
121
+ plugin.run(
122
+ task.dump(),
123
+ schema,
124
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
125
+ new TestTransactionalPageOutput(schema, result)
126
+ )
127
+
128
+
129
+ assertEquals(ArrayBuffer(
130
+ Map(
131
+ "id" -> 1L,
132
+ "title" -> "Hello!",
133
+ "author" -> "maji-KY",
134
+ "date" -> Timestamp.ofEpochSecond(978274800L),
135
+ "date_time" -> Timestamp.ofEpochSecond(978274800L),
136
+ "list" -> new JsonParser().parse("""["a","b","c"]"""),
137
+ "rating_sub" -> 2.5d,
138
+ "released" -> true,
139
+ )
140
+ ), result)
141
+ }
142
+
86
143
  }
87
144
 
145
+
146
+
88
147
  class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
89
148
  extends TransactionalPageOutput {
90
149
  import org.embulk.spi.PageReader
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xpath2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - maji-KY
@@ -66,10 +66,12 @@ files:
66
66
  - src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
67
67
  - src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
68
68
  - src/test/resources/data.xml
69
+ - src/test/resources/invalid-data.xml
69
70
  - src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
70
71
  - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
72
+ - classpath/vtd-xml-2.13.4.jar
71
73
  - classpath/scala-library-2.12.4.jar
72
- - classpath/embulk-parser-xpath2-0.0.4.jar
74
+ - classpath/embulk-parser-xpath2-0.1.0.jar
73
75
  homepage: https://github.com/maji-KY/embulk-parser-xpath2
74
76
  licenses:
75
77
  - MIT