embulk-parser-xpath2 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d18d3543eeb861a75f6a19ef91bb00e004aaccf5
4
- data.tar.gz: c585d26f626e4eb8f9e72deb3ece90902c1318d8
3
+ metadata.gz: d739e0975d864c0920c7af2f4bdbaac2d4978925
4
+ data.tar.gz: 07d69b8ad2cefa9e25edd06c98952e4735b45498
5
5
  SHA512:
6
- metadata.gz: e590e135e55fecbd12bd5c4ff757a2824f9ea315d40b34013dc31e61e7f4b08c5b22b465b1197fd6ac501ddb18ed8403cc2cb05207fec958d4ab03d4d81f0354
7
- data.tar.gz: 53daf517e3f70614b71469b97bfea1f12c7752a79117ee831d5943c650b95ec918eb6e34e26a806a5a764efcc9ed4bd1d490655f33eadf8b47cb8690edb31240
6
+ metadata.gz: 8edf4e7762787940f0addf22fe650e5d5912d030cac7d86d0c4bf00e70b275b641c903b6ab65088a7858a38caed425b0d0bdb99df2be51d5cfc417497ea2896e
7
+ data.tar.gz: 0ddc27b1bfab2f31e3992e008cdcedfde8c288e287409551e52a5ecbae37086582ee4b5659e8fe131f1149e97a3212d50fd38f44b05a0747e1693b85002ef8c9
data/.gitignore CHANGED
@@ -6,10 +6,9 @@
6
6
  /classpath/
7
7
  build/
8
8
  .idea
9
- /.settings/
10
- /.metadata/
11
9
  .classpath
12
10
  project/project
13
- /bin/
11
+ project/target
12
+ target
14
13
  *.iml
15
14
  out
data/README.md CHANGED
@@ -11,6 +11,7 @@ Embulk parser plugin for parsing xml data by XPath perfectly!
11
11
 
12
12
  - namespace awareness
13
13
  - nullable columns
14
+ - complex json array columns (with restrictions)
14
15
 
15
16
  ## Overview
16
17
 
@@ -61,6 +62,106 @@ Then you can fetch entries from the following xml:
61
62
  </ns2:entry>
62
63
  </ns1:root>
63
64
  ```
65
+
66
+ ## complex json array column
67
+
68
+ ### Usage
69
+
70
+ ```yaml
71
+ parser:
72
+ type: xpath2
73
+ root: '/ns1:root/ns2:entry'
74
+ schema:
75
+ - { path: 'ns2:id', name: id, type: long }
76
+ - path: 'ns2:list'
77
+ name: list
78
+ type: json
79
+ structure: # add the structure key to enable the complex json array column
80
+ - path: 'ns2:list'
81
+ name: list
82
+ type: array
83
+ - path: 'ns2:list/ns2:elements'
84
+ name: elements
85
+ type: array
86
+ - path: 'ns2:list/ns2:elements/ns2:name'
87
+ name: elementName
88
+ type: string
89
+ - path: 'ns2:list/ns2:elements/ns2:value'
90
+ name: elementValue
91
+ type: long
92
+ - path: 'ns2:list/ns2:elements/ns2:active'
93
+ name: elementActive
94
+ type: boolean
95
+ namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
96
+ ```
97
+
98
+ ### Structure configuration
99
+ - **path**: specify path from the XPath of the column (string, required)
100
+ - **name**: json key name (string, optional; defaults to the value of **path**)
101
+ - **type**: json data type (one of array, string, long, boolean; required)
102
+
103
+ Then you can fetch entries from the following xml:
104
+ ```xml
105
+ <?xml version="1.0"?>
106
+ <ns1:root
107
+ xmlns:ns1="http://example.com/ns1/"
108
+ xmlns:ns2="http://example.com/ns2/">
109
+ <ns2:entry>
110
+ <ns2:id>1</ns2:id>
111
+ <ns2:list>
112
+ <ns2:elements>
113
+ <ns2:name>foo1</ns2:name>
114
+ <ns2:value>1</ns2:value>
115
+ <ns2:active>true</ns2:active>
116
+ </ns2:elements>
117
+ <ns2:elements>
118
+ <ns2:name>foo2</ns2:name>
119
+ <ns2:value>2</ns2:value>
120
+ <ns2:active>false</ns2:active>
121
+ </ns2:elements>
122
+ </ns2:list>
123
+ <ns2:list>
124
+ <ns2:elements>
125
+ <ns2:name>bar1</ns2:name>
126
+ <ns2:value>3</ns2:value>
127
+ <ns2:active>true</ns2:active>
128
+ </ns2:elements>
129
+ </ns2:list>
130
+ </ns2:entry>
131
+ </ns1:root>
132
+ ```
133
+
134
+ Result of the `list` column:
135
+ ```json
136
+ {
137
+ "list": [
138
+ {
139
+ "elements": [
140
+ {
141
+ "elementActive": true,
142
+ "elementName": "foo1",
143
+ "elementValue": 1
144
+ },
145
+ {
146
+ "elementActive": false,
147
+ "elementName": "foo2",
148
+ "elementValue": 2
149
+ }
150
+ ]
151
+ },
152
+ {
153
+ "elements": [
154
+ {
155
+ "elementActive": true,
156
+ "elementName": "bar1",
157
+ "elementValue": 3
158
+ }
159
+ ]
160
+ }
161
+ ]
162
+ }
163
+ ```
164
+
64
165
  ## Build
65
166
 
66
167
  ```
@@ -41,7 +41,7 @@ class ParseBenchmark {
41
41
 
42
42
  object ParseBenchmark {
43
43
 
44
- val TestRecordSize = 100 * 1000
44
+ val TestRecordSize = 10 * 1000
45
45
 
46
46
  val test = new XPath2ParserPluginSpec()
47
47
  val runtime = new EmbulkTestRuntime
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.2.0"
17
17
  ext {
18
18
  embulkVersion = "0.8.39"
19
19
  }
@@ -0,0 +1,139 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import com.ximpleware.{AutoPilot, VTDNav}
4
+ import org.embulk.parser.xpath2.config.JsonStructureElement
5
+ import org.embulk.spi.Column
6
+ import org.msgpack.value.{Value, Variable}
7
+
8
+ import scala.annotation.tailrec
9
+ import scala.collection.JavaConverters._
10
+ import scala.collection.immutable.Queue
11
+
12
+ sealed trait Direction
13
+ case object Parent extends Direction
14
+ case object Sibling extends Direction
15
+ case object Child extends Direction
16
+
17
+ private case class Path(depth: Int, pathFragments: Seq[String], moveDirection: Direction) {
18
+
19
+ def next(depth: Int, elementName: String): Path = {
20
+ if (this.depth > depth) {
21
+ val (rest :+ _) = pathFragments
22
+ Path(depth, rest, Parent)
23
+ } else if (this.depth == depth) {
24
+ val (rest :+ _) = pathFragments
25
+ Path(depth, rest :+ elementName, Sibling)
26
+ } else {
27
+ Path(depth, pathFragments :+ elementName, Child)
28
+ }
29
+ }
30
+
31
+ val path: String = pathFragments.mkString("/")
32
+
33
+ }
34
+
35
+ object MsgPackEncoder {
36
+
37
+ def encode(nav: VTDNav, columnAp: AutoPilot, column: Column, maybeStructure: Option[Seq[JsonStructureElement]]): Value = maybeStructure.map { structure =>
38
+ // complex json array
39
+ val keyValues = Iterator.continually(columnAp.evalXPath()).takeWhile(_ != -1).flatMap { _ =>
40
+ VTD.withinContext(nav) {
41
+ constructJsonMap(nav, columnAp, column, structure).toSeq
42
+ }
43
+ }
44
+ val mergedMap = keyValues.toSeq.groupBy { case (k, _) => k }.map { case (k, v)=>
45
+ val mergedValues = v.flatMap {
46
+ case (_, x: Seq[Any]) => x
47
+ case _ => sys.error("Root element supports array only. Please reconsider the configuration.")
48
+ }
49
+ (k, mergedValues)
50
+ }
51
+ convertToValue(mergedMap)
52
+ } getOrElse {
53
+ // simple string[]
54
+ @tailrec
55
+ def eachJsonValue(cAp: AutoPilot, queue: Queue[Value]): Queue[Value] = if (cAp.evalXPath() != -1) {
56
+ val index = nav.getText
57
+ val nextQueue = if (index != -1) queue :+ new Variable().setStringValue(nav.toString(index)).asStringValue() else queue
58
+ eachJsonValue(cAp, nextQueue)
59
+ } else queue
60
+ asArrayValue(eachJsonValue(columnAp, Queue.empty[Value]))
61
+ }
62
+
63
+ private def constructJsonMap(nav: VTDNav, columnAp: AutoPilot, column: Column, structure: Seq[JsonStructureElement]): Map[String, Any] = {
64
+
65
+ @tailrec
66
+ def eachElement(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any]): Map[String, Any] = if (eAp.iterate()) {
67
+ val current = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
68
+ if (current.moveDirection == Parent) {
69
+ obj
70
+ } else {
71
+ val updated = structure.find(_.path == current.path).map { x =>
72
+ x.`type` match {
73
+ case "array" =>
74
+ val targetArray = obj.getOrElse(x.name, Queue[Any]()).asInstanceOf[Seq[Any]]
75
+ val childStructure = eachArrayElement(eAp, current)
76
+ obj.updated(x.name, targetArray ++ childStructure)
77
+ case "string" => obj.updated(x.name, nav.toNormalizedString(nav.getText))
78
+ case "long" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toLong)
79
+ case "boolean" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toBoolean)
80
+ case notSupported@_ => sys.error(s"type=$notSupported is notSupported")
81
+ }
82
+ }
83
+ eachElement(eAp, current, updated.getOrElse(obj))
84
+ }
85
+ } else obj
86
+
87
+ def isArrayElement(current: Path): Boolean =
88
+ structure.exists(x => x.path == current.path && x.`type` == "array")
89
+
90
+ def eachArrayElement(eAp: AutoPilot, previousPath: Path): Seq[Map[String, Any]] = {
91
+ @tailrec
92
+ def loop(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any], elements: Seq[Map[String, Any]]): Seq[Map[String, Any]] = {
93
+ val arrayContent = eachElement(eAp, previousPath, obj)
94
+ val currentPath = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
95
+ if (isArrayElement(currentPath)) {
96
+ loop(eAp, currentPath, obj, elements :+ arrayContent)
97
+ } else elements :+ arrayContent
98
+ }
99
+ loop(eAp, previousPath, Map.empty[String, Any], Queue.empty[Map[String, Any]])
100
+ }
101
+
102
+ val eachElementAp = new AutoPilot(nav)
103
+ eachElementAp.selectElement("*")
104
+
105
+ val initialPath = Path(-1, Vector.empty, Sibling)
106
+
107
+ eachElement(eachElementAp, initialPath, Map[String, Any]())
108
+ }
109
+
110
+ private def convertToValue(obj: Map[String, Any]): Value = {
111
+ val map = obj.map {
112
+ case (k, v: Seq[_]) => (asStringValue(k), convertToValue(v))
113
+ case (k, v: Map[_, _]) => (asStringValue(k), convertToValue(v.asInstanceOf[Map[String, Any]]))
114
+ case (k, v: String) => (asStringValue(k), asStringValue(v))
115
+ case (k, v: Boolean) => (asStringValue(k), asBooleanValue(v))
116
+ case (k, v: Long) => (asStringValue(k), asLongValue(v))
117
+ case (k, v) => sys.error(s"can't convert: key=$k, value=$v")
118
+ }
119
+ asMapValue(map)
120
+ }
121
+
122
+ private def convertToValue(seq: Seq[Any]): Value = {
123
+ val list = seq.map {
124
+ case v: Seq[_] => convertToValue(v)
125
+ case v: Map[_, _] => convertToValue(v.asInstanceOf[Map[String, Any]])
126
+ case v: String => asStringValue(v)
127
+ case v: Boolean => asBooleanValue(v)
128
+ case v: Long => asLongValue(v)
129
+ }
130
+ asArrayValue(list)
131
+ }
132
+
133
+ private final def asStringValue(value: String): Value = new Variable().setStringValue(value).asStringValue()
134
+ private final def asBooleanValue(value: Boolean): Value = new Variable().setBooleanValue(value).asBooleanValue()
135
+ private final def asLongValue(value: Long): Value = new Variable().setIntegerValue(value).asNumberValue()
136
+ private final def asArrayValue(value: Seq[Value]): Value = new Variable().setArrayValue(value.asJava).asArrayValue()
137
+ private final def asMapValue(value: Map[Value, Value]): Value = new Variable().setMapValue(value.asJava).asMapValue()
138
+
139
+ }
@@ -0,0 +1,12 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import com.ximpleware.VTDNav
4
+
5
+ object VTD {
6
+
7
+ final def withinContext[A](nav: VTDNav)(f: => A): A = try {
8
+ nav.push()
9
+ f
10
+ } finally nav.pop()
11
+
12
+ }
@@ -3,12 +3,11 @@ package org.embulk.parser.xpath2
3
3
  import com.google.common.io.ByteStreams
4
4
  import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
5
5
  import org.embulk.config._
6
- import org.embulk.parser.xpath2.config.ColumnConfig
6
+ import org.embulk.parser.xpath2.config.{ColumnConfig, JsonStructureElement}
7
7
  import org.embulk.spi._
8
8
  import org.embulk.spi.`type`._
9
9
  import org.embulk.spi.time.TimestampParser
10
10
  import org.embulk.spi.util.FileInputInputStream
11
- import org.msgpack.value.{Value, Variable}
12
11
  import org.slf4j.Logger
13
12
 
14
13
  import scala.annotation.tailrec
@@ -33,7 +32,10 @@ class XPath2ParserPlugin extends ParserPlugin {
33
32
  val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
34
33
 
35
34
  val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
36
- .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
35
+ .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _, _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
36
+
37
+ val jsonStructures: Map[String, Seq[JsonStructureElement]] = task.getSchema.columns.asScala
38
+ .collect { case ColumnConfig(_, name, _, _, Some(jsonColumnOption), _) => (name, jsonColumnOption.structure.asScala) }.toMap
37
39
 
38
40
  def declareXPathNS(ap: AutoPilot): Unit = {
39
41
  task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
@@ -67,11 +69,11 @@ class XPath2ParserPlugin extends ParserPlugin {
67
69
  nav.push()
68
70
  try {
69
71
  columnElementAutoPilots.zipWithIndex.foreach { case (columnElementAutoPilot, idx) =>
70
- nav.push()
71
- columnElementAutoPilot.resetXPath()
72
- val column = schema.getColumn(idx)
73
- handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
74
- nav.pop()
72
+ VTD.withinContext(nav) {
73
+ columnElementAutoPilot.resetXPath()
74
+ val column = schema.getColumn(idx)
75
+ handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers, jsonStructures)
76
+ }
75
77
  }
76
78
  pb.addRecord()
77
79
  } catch {
@@ -95,17 +97,9 @@ class XPath2ParserPlugin extends ParserPlugin {
95
97
  }
96
98
  }
97
99
 
98
- final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
100
+ final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser], jsonStructures: Map[String, Seq[JsonStructureElement]]): Unit = {
99
101
  if (column.getType.isInstanceOf[JsonType]) {
100
- val list = new java.util.ArrayList[Value]()
101
- @tailrec
102
- def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
103
- val index = nav.getText
104
- if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
105
- eachJsonValue(cAp)
106
- }
107
- eachJsonValue(columnAp)
108
- val jsonValue = new Variable().setArrayValue(list).asArrayValue()
102
+ val jsonValue = MsgPackEncoder.encode(nav, columnAp, column, jsonStructures.get(column.getName))
109
103
  pb.setJson(column, jsonValue)
110
104
  } else {
111
105
  if (columnAp.evalXPath() == -1) {
@@ -2,10 +2,10 @@ package org.embulk.parser.xpath2.config
2
2
 
3
3
  import java.util
4
4
 
5
- import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
5
+ import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty, JsonValue}
6
6
  import com.google.common.base.Optional
7
7
  import org.embulk.config.{Config, ConfigDefault, ConfigSource}
8
- import org.embulk.spi.`type`.{TimestampType, Type}
8
+ import org.embulk.spi.`type`.{JsonType, TimestampType, Type}
9
9
  import org.embulk.spi.time.TimestampParser.TimestampColumnOption
10
10
  import org.joda.time.DateTimeZone
11
11
 
@@ -14,11 +14,18 @@ case class SchemaConfig @JsonCreator()(columns: java.util.List[ColumnConfig]) {
14
14
  def getColumns: util.List[ColumnConfig] = columns
15
15
  }
16
16
 
17
- case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], option: ConfigSource) {
17
+ case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], jsonOption: Option[JsonColumnOption], option: ConfigSource) {
18
18
 
19
19
  @JsonCreator()
20
20
  def this(src: ConfigSource) = {
21
- this(src.get(classOf[String], "path"), src.get(classOf[String], "name"), src.get(classOf[Type], "type"), ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")), src)
21
+ this(
22
+ src.get(classOf[String], "path"),
23
+ src.get(classOf[String], "name"),
24
+ src.get(classOf[Type], "type"),
25
+ ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")),
26
+ ColumnConfig.getJsonOption(src, src.get(classOf[Type], "type")),
27
+ src
28
+ )
22
29
  }
23
30
 
24
31
  @JsonValue()
@@ -49,11 +56,25 @@ private class TimestampColumnOptionImpl(timezone: Optional[DateTimeZone], format
49
56
  override val getDate = date
50
57
  }
51
58
 
59
+ class JsonStructureElement(@JsonProperty("path") val path: String, @JsonProperty("type") val `type`: String) {
60
+ @JsonProperty("name")
61
+ val name: String = path
62
+ }
63
+
64
+ case class JsonColumnOption(@JsonProperty("structure") structure: java.util.List[JsonStructureElement])
65
+
52
66
  object ColumnConfig {
53
67
  private def getTimestampOption(src: ConfigSource, `type`: Type): Option[TimestampColumnOption] = `type` match {
54
68
  case _: TimestampType => Some(getOption(src).loadConfig(classOf[TimestampColumnOptionImpl]))
55
69
  case _ => None
56
70
  }
57
71
 
72
+ private def getJsonOption(src: ConfigSource, `type`: Type): Option[JsonColumnOption] = `type` match {
73
+ case _: JsonType =>
74
+ val option = getOption(src)
75
+ if (option.has("structure")) Some(option.loadConfig(classOf[JsonColumnOption])) else None
76
+ case _ => None
77
+ }
78
+
58
79
  private def getOption(src: ConfigSource) = src.deepCopy().remove("path").remove("name").remove("type")
59
80
  }
@@ -0,0 +1,30 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/data.xml
4
+ parser:
5
+ type: xpath2
6
+ root: '/ns1:root/ns2:entry'
7
+ schema:
8
+ - { path: 'ns2:id', name: id, type: long }
9
+ - path: 'ns2:list'
10
+ name: list
11
+ type: json
12
+ structure:
13
+ - path: 'ns2:list'
14
+ name: list
15
+ type: array
16
+ - path: 'ns2:list/ns2:elements'
17
+ name: elements
18
+ type: array
19
+ - path: 'ns2:list/ns2:elements/ns2:name'
20
+ name: elementName
21
+ type: string
22
+ - path: 'ns2:list/ns2:elements/ns2:value'
23
+ name: elementValue
24
+ type: long
25
+ - path: 'ns2:list/ns2:elements/ns2:active'
26
+ name: elementActive
27
+ type: boolean
28
+ namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
29
+ out:
30
+ type: stdout
@@ -0,0 +1,27 @@
1
+ <?xml version="1.0"?>
2
+ <ns1:root
3
+ xmlns:ns1="http://example.com/ns1/"
4
+ xmlns:ns2="http://example.com/ns2/">
5
+ <ns2:entry>
6
+ <ns2:id>1</ns2:id>
7
+ <ns2:list>
8
+ <ns2:elements>
9
+ <ns2:name>foo1</ns2:name>
10
+ <ns2:value>1</ns2:value>
11
+ <ns2:active>true</ns2:active>
12
+ </ns2:elements>
13
+ <ns2:elements>
14
+ <ns2:name>foo2</ns2:name>
15
+ <ns2:value>2</ns2:value>
16
+ <ns2:active>false</ns2:active>
17
+ </ns2:elements>
18
+ </ns2:list>
19
+ <ns2:list>
20
+ <ns2:elements>
21
+ <ns2:name>bar1</ns2:name>
22
+ <ns2:value>3</ns2:value>
23
+ <ns2:active>true</ns2:active>
24
+ </ns2:elements>
25
+ </ns2:list>
26
+ </ns2:entry>
27
+ </ns1:root>
@@ -0,0 +1,53 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.embulk.spi.{Column, ColumnVisitor, PageReader}
4
+
5
+ class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
6
+ override def timestampColumn(column: Column): Unit = {
7
+ if (reader.isNull(column)) {
8
+ record.put(column.getName, null)
9
+ } else {
10
+ record.put(column.getName, reader.getTimestamp(column))
11
+ }
12
+ }
13
+
14
+ override def stringColumn(column: Column): Unit = {
15
+ if (reader.isNull(column)) {
16
+ record.put(column.getName, null)
17
+ } else {
18
+ record.put(column.getName, reader.getString(column))
19
+ }
20
+ }
21
+
22
+ override def longColumn(column: Column): Unit = {
23
+ if (reader.isNull(column)) {
24
+ record.put(column.getName, null)
25
+ } else {
26
+ record.put(column.getName, reader.getLong(column))
27
+ }
28
+ }
29
+
30
+ override def doubleColumn(column: Column): Unit = {
31
+ if (reader.isNull(column)) {
32
+ record.put(column.getName, null)
33
+ } else {
34
+ record.put(column.getName, reader.getDouble(column))
35
+ }
36
+ }
37
+
38
+ override def booleanColumn(column: Column): Unit = {
39
+ if (reader.isNull(column)) {
40
+ record.put(column.getName, null)
41
+ } else {
42
+ record.put(column.getName, reader.getBoolean(column))
43
+ }
44
+ }
45
+
46
+ override def jsonColumn(column: Column): Unit = {
47
+ if (reader.isNull(column)) {
48
+ record.put(column.getName, null)
49
+ } else {
50
+ record.put(column.getName, reader.getJson(column))
51
+ }
52
+ }
53
+ }
@@ -0,0 +1,31 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.embulk.spi.{Exec, Page, Schema, TransactionalPageOutput}
4
+
5
+ import scala.collection.mutable
6
+ import scala.collection.JavaConverters._
7
+
8
+ class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
9
+ extends TransactionalPageOutput {
10
+ import org.embulk.spi.PageReader
11
+
12
+ val reader = new PageReader(schema)
13
+
14
+ override def add(page: Page) = {
15
+ reader.setPage(page)
16
+
17
+ while (reader.nextRecord()) {
18
+ val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
19
+
20
+ schema.getColumns().asScala.foreach { column =>
21
+ column.visit(new TestColumnVisitor(reader, record))
22
+ }
23
+ result += record
24
+ }
25
+ }
26
+
27
+ override def commit() = Exec.newTaskReport()
28
+ override def abort() = {}
29
+ override def finish() = {}
30
+ override def close() = {}
31
+ }
@@ -0,0 +1,86 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import java.io.{File, FileInputStream}
4
+ import java.nio.file
5
+ import java.nio.file.Paths
6
+
7
+ import org.embulk.EmbulkTestRuntime
8
+ import org.embulk.config.{ConfigLoader, ConfigSource, TaskSource}
9
+ import org.embulk.spi.json.JsonParser
10
+ import org.embulk.spi.util.InputStreamFileInput
11
+ import org.embulk.spi.{Exec, _}
12
+ import org.junit.Assert._
13
+ import org.junit.{Rule, Test}
14
+
15
+ import scala.collection.mutable
16
+ import scala.collection.mutable.ArrayBuffer
17
+
18
+ class XPath2ParserPluginJsonSpec {
19
+
20
+ @Rule
21
+ def runtime = new EmbulkTestRuntime
22
+
23
+ val yamlPath: file.Path = Paths.get(classOf[XPath2ParserPlugin].getClassLoader.getResource("json_config.yml").toURI)
24
+ val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("json_data.xml").getPath
25
+
26
+ def configSource: ConfigSource = new ConfigLoader(Exec.getModelManager).fromYamlFile(yamlPath.toFile).getNested("in").getNested("parser")
27
+
28
+ @Test def testParseJsonArrayXML() {
29
+
30
+ val cs = configSource
31
+ val task = cs.loadConfig(classOf[PluginTask])
32
+
33
+ var schema: Schema = null
34
+
35
+ val plugin = new XPath2ParserPlugin()
36
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
37
+
38
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
39
+
40
+ plugin.run(
41
+ task.dump(),
42
+ schema,
43
+ new InputStreamFileInput(Exec.getBufferAllocator, new FileInputStream(new File(dataPath))),
44
+ new TestTransactionalPageOutput(schema, result)
45
+ )
46
+
47
+ println(result)
48
+
49
+ val expectedJson =
50
+ """{
51
+ "list": [
52
+ {
53
+ "elements": [
54
+ {
55
+ "elementActive": true,
56
+ "elementName": "foo1",
57
+ "elementValue": 1
58
+ },
59
+ {
60
+ "elementActive": false,
61
+ "elementName": "foo2",
62
+ "elementValue": 2
63
+ }
64
+ ]
65
+ },
66
+ {
67
+ "elements": [
68
+ {
69
+ "elementActive": true,
70
+ "elementName": "bar1",
71
+ "elementValue": 3
72
+ }
73
+ ]
74
+ }
75
+ ]
76
+ }"""
77
+
78
+ assertEquals(ArrayBuffer(
79
+ Map(
80
+ "id" -> 1L,
81
+ "list" -> new JsonParser().parse(expectedJson)
82
+ )
83
+ ), result)
84
+ }
85
+
86
+ }
@@ -141,80 +141,3 @@ class XPath2ParserPluginSpec {
141
141
  }
142
142
 
143
143
  }
144
-
145
-
146
-
147
- class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
148
- extends TransactionalPageOutput {
149
- import org.embulk.spi.PageReader
150
-
151
- val reader = new PageReader(schema)
152
-
153
- override def add(page: Page) = {
154
- reader.setPage(page)
155
-
156
- while (reader.nextRecord()) {
157
- val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
158
-
159
- schema.getColumns().asScala.foreach { column =>
160
- column.visit(new TestColumnVisitor(reader, record))
161
- }
162
- result += record
163
- }
164
- }
165
-
166
- override def commit() = Exec.newTaskReport()
167
- override def abort() = {}
168
- override def finish() = {}
169
- override def close() = {}
170
- }
171
-
172
- class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
173
- override def timestampColumn(column: Column): Unit = {
174
- if (reader.isNull(column)) {
175
- record.put(column.getName, null)
176
- } else {
177
- record.put(column.getName, reader.getTimestamp(column))
178
- }
179
- }
180
-
181
- override def stringColumn(column: Column): Unit = {
182
- if (reader.isNull(column)) {
183
- record.put(column.getName, null)
184
- } else {
185
- record.put(column.getName, reader.getString(column))
186
- }
187
- }
188
-
189
- override def longColumn(column: Column): Unit = {
190
- if (reader.isNull(column)) {
191
- record.put(column.getName, null)
192
- } else {
193
- record.put(column.getName, reader.getLong(column))
194
- }
195
- }
196
-
197
- override def doubleColumn(column: Column): Unit = {
198
- if (reader.isNull(column)) {
199
- record.put(column.getName, null)
200
- } else {
201
- record.put(column.getName, reader.getDouble(column))
202
- }
203
- }
204
-
205
- override def booleanColumn(column: Column): Unit = {
206
- if (reader.isNull(column)) {
207
- record.put(column.getName, null)
208
- } else {
209
- record.put(column.getName, reader.getBoolean(column))
210
- }
211
- }
212
-
213
- override def jsonColumn(column: Column): Unit = {
214
- if (reader.isNull(column)) {
215
- record.put(column.getName, null)
216
- } else {
217
- record.put(column.getName, reader.getJson(column))
218
- }
219
- }
220
- }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xpath2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - maji-KY
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-12 00:00:00.000000000 Z
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -61,18 +61,25 @@ files:
61
61
  - project/build.properties
62
62
  - project/plugins.sbt
63
63
  - src/main/scala/org/embulk/parser/xpath2/LoanPattern.scala
64
+ - src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala
64
65
  - src/main/scala/org/embulk/parser/xpath2/PluginTask.scala
66
+ - src/main/scala/org/embulk/parser/xpath2/VTD.scala
65
67
  - src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala
66
68
  - src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
67
69
  - src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
68
70
  - src/test/resources/config.yml
69
71
  - src/test/resources/data.xml
70
72
  - src/test/resources/invalid-data.xml
73
+ - src/test/resources/json_config.yml
74
+ - src/test/resources/json_data.xml
75
+ - src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala
76
+ - src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala
71
77
  - src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
78
+ - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala
72
79
  - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
73
80
  - classpath/vtd-xml-2.13.4.jar
74
81
  - classpath/scala-library-2.12.4.jar
75
- - classpath/embulk-parser-xpath2-0.1.2.jar
82
+ - classpath/embulk-parser-xpath2-0.2.0.jar
76
83
  homepage: https://github.com/maji-KY/embulk-parser-xpath2
77
84
  licenses:
78
85
  - GPL-2.0