embulk-parser-xpath2 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2aebf3205f36802a6d5240064e69da2e4481ab32
4
- data.tar.gz: c74a3fb9310df91b0695bc83a2495bbac12860e0
3
+ metadata.gz: fcd8e55e48a3ea51f186ca9a66440db2a454c877
4
+ data.tar.gz: 8816fe3c8e734055993ecba64179c68414bccf6d
5
5
  SHA512:
6
- metadata.gz: e5d7656c75d2c8c7a82d266fa26fba9730f798551d0e573c1924d9ab40d3debc40f3aec0a9219327e23c5e8282be3e9d9b206768b6783d7d95cb137e784984ea
7
- data.tar.gz: 07d9a54720290242e298724c80743bb75997e6d1149fad566380195c282e6e8860c18a07bf810949ca5e522283d19cabcdc9a2062dffa6272d4d33f78302b40a
6
+ metadata.gz: 751d5eacce78225b0a07e2925ccfca0ee09c6bfc3a01cbaa96331881f8e503170ae3974b2548deba12395aa92c1fbe909c291ae146771c11fe8690a1da137cdc
7
+ data.tar.gz: 9e5c7072613b5df19dcffbb0a7d354c8b1b7c514934134683ba7ba7e268a2f22d359710d6ff51397a978f5c8709e5ee18d28bb15e1f5903c7ab5a884d833837c
@@ -41,7 +41,7 @@ class ParseBenchmark {
41
41
 
42
42
  object ParseBenchmark {
43
43
 
44
- val TestRecordSize = 1000
44
+ val TestRecordSize = 100 * 1000
45
45
 
46
46
  val test = new XPath2ParserPluginSpec()
47
47
  val runtime = new EmbulkTestRuntime
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.0.4"
16
+ version = "0.1.0"
17
17
  ext {
18
18
  embulkVersion = "0.8.35"
19
19
  }
@@ -22,6 +22,7 @@ sourceCompatibility = 1.8
22
22
  targetCompatibility = 1.8
23
23
 
24
24
  dependencies {
25
+ compile "com.ximpleware:vtd-xml:2.13.4"
25
26
  compile "org.embulk:embulk-core:${embulkVersion}"
26
27
  provided "org.embulk:embulk-core:${embulkVersion}"
27
28
  testCompile "org.embulk:embulk-core:${embulkVersion}:tests"
data/build.sbt CHANGED
@@ -16,6 +16,7 @@ lazy val commonSettings = Seq(
16
16
  ),
17
17
  resolvers += Resolver.jcenterRepo,
18
18
  libraryDependencies ++= Seq(
19
+ "com.ximpleware" % "vtd-xml" % "2.13.4",
19
20
  "org.embulk" % "embulk-core" % embulkVersion,
20
21
  "org.embulk" % "embulk-core" % embulkVersion classifier "tests",
21
22
  "junit" % "junit" % "4.+" % "test",
@@ -1,10 +1,7 @@
1
1
  package org.embulk.parser.xpath2
2
2
 
3
- import java.util
4
- import javax.xml.namespace.NamespaceContext
5
- import javax.xml.parsers.{DocumentBuilder, DocumentBuilderFactory}
6
- import javax.xml.xpath.{XPathConstants, XPathExpression, XPathFactory}
7
-
3
+ import com.google.common.io.ByteStreams
4
+ import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
8
5
  import org.embulk.config._
9
6
  import org.embulk.parser.xpath2.config.ColumnConfig
10
7
  import org.embulk.spi._
@@ -13,21 +10,14 @@ import org.embulk.spi.time.TimestampParser
13
10
  import org.embulk.spi.util.FileInputInputStream
14
11
  import org.msgpack.value.{Value, Variable}
15
12
  import org.slf4j.Logger
16
- import org.w3c.dom.{Document, Node, NodeList}
17
13
 
14
+ import scala.annotation.tailrec
18
15
  import scala.collection.JavaConverters._
19
- import scala.collection.immutable
20
16
  import scala.util.control.NonFatal
21
17
 
22
18
  class XPath2ParserPlugin extends ParserPlugin {
23
19
 
24
- val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
25
-
26
- def docBuilder: DocumentBuilder = {
27
- val factory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance
28
- factory.setNamespaceAware(true)
29
- factory.newDocumentBuilder()
30
- }
20
+ private[this] val logger: Logger = Exec.getLogger(classOf[XPath2ParserPlugin])
31
21
 
32
22
  override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
33
23
  val task = config.loadConfig(classOf[PluginTask])
@@ -42,73 +32,83 @@ class XPath2ParserPlugin extends ParserPlugin {
42
32
  val task: PluginTask = taskSource.loadTask(classOf[PluginTask])
43
33
  val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
44
34
 
45
- val xPathInstance = XPathFactory.newInstance.newXPath()
46
- xPathInstance.setNamespaceContext(new NamespaceContext {
47
- override def getPrefix(namespaceURI: String): String = task.getNamespaces.conf.asScala.collectFirst { case (_, v) if v == namespaceURI => v }.orNull
48
- override def getPrefixes(namespaceURI: String): util.Iterator[_] = task.getNamespaces.conf.asScala.keys.asJava.iterator()
49
- override def getNamespaceURI(prefix: String): String = task.getNamespaces.conf.asScala(prefix)
50
- })
51
-
52
- val rootXPath: XPathExpression = xPathInstance.compile(task.getRoot)
53
- val columnXPaths: immutable.Seq[XPathExpression] = task.getSchema.columns.asScala.map(x => xPathInstance.compile(x.path)).toList
54
-
55
35
  val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
56
36
  .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
37
+ val columnsWithIndex: Seq[(ColumnConfig, Int)] = task.getSchema.columns.asScala.zipWithIndex
38
+
39
+ val vg = new VTDGen
57
40
 
58
41
  LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
59
42
  while (input.nextFile()) {
60
- parseXML(input) match {
61
- case Right(doc) =>
62
- val rootNodes = rootXPath.evaluate(doc, XPathConstants.NODESET).asInstanceOf[NodeList]
63
- (0 until rootNodes.getLength).map(rootNodes.item).foreach { node =>
64
- columnXPaths.zipWithIndex.foreach { case (xPath, idx) =>
43
+ LoanPattern(new FileInputInputStream(input)) { fiis =>
44
+
45
+ vg.setDoc(ByteStreams.toByteArray(fiis))
46
+ vg.parse(true)
47
+
48
+ val nav = vg.getNav
49
+ val rootElementAutoPilot = new AutoPilot(nav)
50
+ val columnElementAutoPilot = new AutoPilot(nav)
51
+ task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
52
+ rootElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
53
+ columnElementAutoPilot.declareXPathNameSpace(prefix, namespaceURI)
54
+ }
55
+
56
+ @tailrec
57
+ def execEachRecord(rootAp: AutoPilot): Unit = if (rootAp.evalXPath() != -1) {
58
+ nav.push()
59
+ try {
60
+ columnsWithIndex.foreach { case (columnConfig, idx) =>
61
+ nav.push()
62
+ columnElementAutoPilot.selectXPath(columnConfig.path)
65
63
  val column = schema.getColumn(idx)
66
- handleColumn(pb, node, xPath, column, timestampParsers)
64
+ handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
65
+ nav.pop()
67
66
  }
68
67
  pb.addRecord()
68
+ } catch {
69
+ case NonFatal(e) => if (stopOnInvalidRecord) {
70
+ throw new DataException(e)
71
+ } else {
72
+ logger.warn(s"Skipped invalid record $e")
73
+ }
69
74
  }
70
- case Left(e) =>
71
- if(stopOnInvalidRecord) {
72
- throw new DataException(e)
73
- } else {
74
- logger.warn(s"Skipped invalid record $e")
75
- }
75
+ nav.pop()
76
+ execEachRecord(rootAp)
77
+ }
78
+
79
+ rootElementAutoPilot.selectXPath(task.getRoot)
80
+ execEachRecord(rootElementAutoPilot)
76
81
  }
82
+
77
83
  pb.flush()
78
84
  }
79
85
  pb.finish()
80
- pb.close()
81
- }
82
- }
83
-
84
- def parseXML(input: FileInput): Either[Throwable, Document] = {
85
- val stream = new FileInputInputStream(input)
86
- try {
87
- Right(docBuilder.parse(stream))
88
- } catch {
89
- case NonFatal(e) => Left(e)
90
86
  }
91
87
  }
92
88
 
93
- def handleColumn(pb: PageBuilder, node: Node, xPath: XPathExpression, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
89
+ final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
94
90
  if (column.getType.isInstanceOf[JsonType]) {
95
- val value: NodeList = xPath.evaluate(node, XPathConstants.NODESET).asInstanceOf[NodeList]
96
- val values: Seq[Value] = (0 until value.getLength).map(value.item).map { valueNode =>
97
- new Variable().setStringValue(valueNode.getTextContent).asStringValue()
91
+ val list = new java.util.ArrayList[Value]()
92
+ @tailrec
93
+ def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
94
+ val index = nav.getText
95
+ if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
96
+ eachJsonValue(cAp)
98
97
  }
99
- val jsonValue = new Variable().setArrayValue(values.asJava).asArrayValue()
98
+ eachJsonValue(columnAp)
99
+ val jsonValue = new Variable().setArrayValue(list).asArrayValue()
100
100
  pb.setJson(column, jsonValue)
101
101
  } else {
102
- val value: Node = xPath.evaluate(node, XPathConstants.NODE).asInstanceOf[Node]
103
- if (value == null) {
102
+ if (columnAp.evalXPath() == -1) {
104
103
  pb.setNull(column)
105
104
  } else {
106
- setColumn(pb, column, value.getTextContent, timestampParsers)
105
+ val index = nav.getText
106
+ setColumn(pb, column, nav.toString(index), timestampParsers)
107
107
  }
108
108
  }
109
109
  }
110
110
 
111
- def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
111
+ final def setColumn(pb: PageBuilder, column: Column, value: String, timestampParsers: Map[String, TimestampParser]): Unit = column.getType match {
112
112
  case _: StringType => pb.setString(column, value)
113
113
  case _: LongType => pb.setLong(column, value.toLong)
114
114
  case _: DoubleType => pb.setDouble(column, value.toDouble)
@@ -0,0 +1,33 @@
1
+ <?xml version="1.0"?>
2
+ <ns1:root
3
+ xmlns:ns1="http://example.com/ns1/"
4
+ xmlns:ns2="http://example.com/ns2/">
5
+ <ns2:entry>
6
+ <ns2:id>1</ns2:id>
7
+ <ns2:title>Hello!</ns2:title>
8
+ <ns2:meta>
9
+ <ns2:author>maji-KY</ns2:author>
10
+ </ns2:meta>
11
+ <ns2:date>20010101</ns2:date>
12
+ <ns2:dateTime>2000-12-31 15:00:00</ns2:dateTime>
13
+ <ns2:list>
14
+ <ns2:value>a</ns2:value>
15
+ <ns2:value>b</ns2:value>
16
+ <ns2:value>c</ns2:value>
17
+ </ns2:list>
18
+ <ns2:rating by="subscribers">2.5</ns2:rating>
19
+ <ns2:rating>3.5</ns2:rating>
20
+ <ns2:released>true</ns2:released>
21
+ </ns2:entry>
22
+ <ns2:entry>
23
+ <ns2:id>2.5</ns2:id>
24
+ <ns2:title>Bonjour!</ns2:title>
25
+ <ns2:meta>
26
+ <ns2:author>maji-KY</ns2:author>
27
+ </ns2:meta>
28
+ <ns2:date>20010101</ns2:date>
29
+ <ns2:list></ns2:list>
30
+ <ns2:rating>3.5</ns2:rating>
31
+ <ns2:released>false</ns2:released>
32
+ </ns2:entry>
33
+ </ns1:root>
@@ -22,6 +22,7 @@ class XPath2ParserPluginSpec {
22
22
  def runtime = new EmbulkTestRuntime
23
23
 
24
24
  val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("data.xml").getPath
25
+ val invalidDataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("invalid-data.xml").getPath
25
26
 
26
27
  def configSource: ConfigSource = Exec.newConfigSource()
27
28
  .set("in", Map[String, String]("type" -> "file", "path_prefix" -> dataPath).asJava)
@@ -39,14 +40,15 @@ class XPath2ParserPluginSpec {
39
40
  .set("namespaces", Map[String, String]("ns1" -> "http://example.com/ns1/", "ns2" -> "http://example.com/ns2/").asJava)
40
41
  .set("out", Map[String, String]("type" -> "stdout").asJava)
41
42
 
42
- @Test def test() {
43
+ @Test def testParseXML() {
43
44
 
44
- val task = configSource.loadConfig(classOf[PluginTask])
45
+ val cs = configSource
46
+ val task = cs.loadConfig(classOf[PluginTask])
45
47
 
46
48
  var schema: Schema = null
47
49
 
48
50
  val plugin = new XPath2ParserPlugin()
49
- plugin.transaction(configSource, (_: TaskSource, s: Schema) => {schema = s})
51
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
50
52
 
51
53
  val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
52
54
 
@@ -83,8 +85,65 @@ class XPath2ParserPluginSpec {
83
85
  ), result)
84
86
  }
85
87
 
88
+ @Test(expected = classOf[DataException]) def testStopOnInvalid() {
89
+
90
+ val cs = configSource.set("stop_on_invalid_record", true)
91
+ val task = cs.loadConfig(classOf[PluginTask])
92
+
93
+ var schema: Schema = null
94
+
95
+ val plugin = new XPath2ParserPlugin()
96
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
97
+
98
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
99
+
100
+ plugin.run(
101
+ task.dump(),
102
+ schema,
103
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
104
+ new TestTransactionalPageOutput(schema, result)
105
+ )
106
+
107
+ }
108
+
109
+ @Test() def testSkipOnInvalid() {
110
+
111
+ val cs = configSource
112
+ val task = cs.loadConfig(classOf[PluginTask])
113
+
114
+ var schema: Schema = null
115
+
116
+ val plugin = new XPath2ParserPlugin()
117
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
118
+
119
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
120
+
121
+ plugin.run(
122
+ task.dump(),
123
+ schema,
124
+ new InputStreamFileInput(Exec.getBufferAllocator(), new FileInputStream(new File(invalidDataPath))),
125
+ new TestTransactionalPageOutput(schema, result)
126
+ )
127
+
128
+
129
+ assertEquals(ArrayBuffer(
130
+ Map(
131
+ "id" -> 1L,
132
+ "title" -> "Hello!",
133
+ "author" -> "maji-KY",
134
+ "date" -> Timestamp.ofEpochSecond(978274800L),
135
+ "date_time" -> Timestamp.ofEpochSecond(978274800L),
136
+ "list" -> new JsonParser().parse("""["a","b","c"]"""),
137
+ "rating_sub" -> 2.5d,
138
+ "released" -> true,
139
+ )
140
+ ), result)
141
+ }
142
+
86
143
  }
87
144
 
145
+
146
+
88
147
  class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
89
148
  extends TransactionalPageOutput {
90
149
  import org.embulk.spi.PageReader
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xpath2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - maji-KY
@@ -66,10 +66,12 @@ files:
66
66
  - src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
67
67
  - src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
68
68
  - src/test/resources/data.xml
69
+ - src/test/resources/invalid-data.xml
69
70
  - src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
70
71
  - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
72
+ - classpath/vtd-xml-2.13.4.jar
71
73
  - classpath/scala-library-2.12.4.jar
72
- - classpath/embulk-parser-xpath2-0.0.4.jar
74
+ - classpath/embulk-parser-xpath2-0.1.0.jar
73
75
  homepage: https://github.com/maji-KY/embulk-parser-xpath2
74
76
  licenses:
75
77
  - MIT