embulk-parser-xpath2 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d18d3543eeb861a75f6a19ef91bb00e004aaccf5
4
- data.tar.gz: c585d26f626e4eb8f9e72deb3ece90902c1318d8
3
+ metadata.gz: d739e0975d864c0920c7af2f4bdbaac2d4978925
4
+ data.tar.gz: 07d69b8ad2cefa9e25edd06c98952e4735b45498
5
5
  SHA512:
6
- metadata.gz: e590e135e55fecbd12bd5c4ff757a2824f9ea315d40b34013dc31e61e7f4b08c5b22b465b1197fd6ac501ddb18ed8403cc2cb05207fec958d4ab03d4d81f0354
7
- data.tar.gz: 53daf517e3f70614b71469b97bfea1f12c7752a79117ee831d5943c650b95ec918eb6e34e26a806a5a764efcc9ed4bd1d490655f33eadf8b47cb8690edb31240
6
+ metadata.gz: 8edf4e7762787940f0addf22fe650e5d5912d030cac7d86d0c4bf00e70b275b641c903b6ab65088a7858a38caed425b0d0bdb99df2be51d5cfc417497ea2896e
7
+ data.tar.gz: 0ddc27b1bfab2f31e3992e008cdcedfde8c288e287409551e52a5ecbae37086582ee4b5659e8fe131f1149e97a3212d50fd38f44b05a0747e1693b85002ef8c9
data/.gitignore CHANGED
@@ -6,10 +6,9 @@
6
6
  /classpath/
7
7
  build/
8
8
  .idea
9
- /.settings/
10
- /.metadata/
11
9
  .classpath
12
10
  project/project
13
- /bin/
11
+ project/target
12
+ target
14
13
  *.iml
15
14
  out
data/README.md CHANGED
@@ -11,6 +11,7 @@ Embulk parser plugin for parsing xml data by XPath perfectly!
11
11
 
12
12
  - namespace awareness
13
13
  - nullable columns
14
+ - complex json array columns (with restrictions)
14
15
 
15
16
  ## Overview
16
17
 
@@ -61,6 +62,106 @@ Then you can fetch entries from the following xml:
61
62
  </ns2:entry>
62
63
  </ns1:root>
63
64
  ```
65
+
66
+ ## complex json array column
67
+
68
+ ### Usage
69
+
70
+ ```yaml
71
+ parser:
72
+ type: xpath2
73
+ root: '/ns1:root/ns2:entry'
74
+ schema:
75
+ - { path: 'ns2:id', name: id, type: long }
76
+ - path: 'ns2:list'
77
+ name: list
78
+ type: json
79
+ structure: # add the structure key to enable a complex json array column
80
+ - path: 'ns2:list'
81
+ name: list
82
+ type: array
83
+ - path: 'ns2:list/ns2:elements'
84
+ name: elements
85
+ type: array
86
+ - path: 'ns2:list/ns2:elements/ns2:name'
87
+ name: elementName
88
+ type: string
89
+ - path: 'ns2:list/ns2:elements/ns2:value'
90
+ name: elementValue
91
+ type: long
92
+ - path: 'ns2:list/ns2:elements/ns2:active'
93
+ name: elementActive
94
+ type: boolean
95
+ namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
96
+ ```
97
+
98
+ ### Structure configuration
99
+ - **path**: path of the element, starting from the element selected by the column's XPath (string, required)
100
+ - **name**: json key name (string)
101
+ - **type**: json data type (string, required; one of array, string, long, boolean)
102
+
103
+ Then you can fetch entries from the following xml:
104
+ ```xml
105
+ <?xml version="1.0"?>
106
+ <ns1:root
107
+ xmlns:ns1="http://example.com/ns1/"
108
+ xmlns:ns2="http://example.com/ns2/">
109
+ <ns2:entry>
110
+ <ns2:id>1</ns2:id>
111
+ <ns2:list>
112
+ <ns2:elements>
113
+ <ns2:name>foo1</ns2:name>
114
+ <ns2:value>1</ns2:value>
115
+ <ns2:active>true</ns2:active>
116
+ </ns2:elements>
117
+ <ns2:elements>
118
+ <ns2:name>foo2</ns2:name>
119
+ <ns2:value>2</ns2:value>
120
+ <ns2:active>false</ns2:active>
121
+ </ns2:elements>
122
+ </ns2:list>
123
+ <ns2:list>
124
+ <ns2:elements>
125
+ <ns2:name>bar1</ns2:name>
126
+ <ns2:value>3</ns2:value>
127
+ <ns2:active>true</ns2:active>
128
+ </ns2:elements>
129
+ </ns2:list>
130
+ </ns2:entry>
131
+ </ns1:root>
132
+ ```
133
+
134
+ Result of the `list` column:
135
+ ```json
136
+ {
137
+ "list": [
138
+ {
139
+ "elements": [
140
+ {
141
+ "elementActive": true,
142
+ "elementName": "foo1",
143
+ "elementValue": 1
144
+ },
145
+ {
146
+ "elementActive": false,
147
+ "elementName": "foo2",
148
+ "elementValue": 2
149
+ }
150
+ ]
151
+ },
152
+ {
153
+ "elements": [
154
+ {
155
+ "elementActive": true,
156
+ "elementName": "bar1",
157
+ "elementValue": 3
158
+ }
159
+ ]
160
+ }
161
+ ]
162
+ }
163
+ ```
164
+
64
165
  ## Build
65
166
 
66
167
  ```
@@ -41,7 +41,7 @@ class ParseBenchmark {
41
41
 
42
42
  object ParseBenchmark {
43
43
 
44
- val TestRecordSize = 100 * 1000
44
+ val TestRecordSize = 10 * 1000
45
45
 
46
46
  val test = new XPath2ParserPluginSpec()
47
47
  val runtime = new EmbulkTestRuntime
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.2.0"
17
17
  ext {
18
18
  embulkVersion = "0.8.39"
19
19
  }
@@ -0,0 +1,139 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import com.ximpleware.{AutoPilot, VTDNav}
4
+ import org.embulk.parser.xpath2.config.JsonStructureElement
5
+ import org.embulk.spi.Column
6
+ import org.msgpack.value.{Value, Variable}
7
+
8
+ import scala.annotation.tailrec
9
+ import scala.collection.JavaConverters._
10
+ import scala.collection.immutable.Queue
11
+
12
+ sealed trait Direction
13
+ case object Parent extends Direction
14
+ case object Sibling extends Direction
15
+ case object Child extends Direction
16
+
17
+ private case class Path(depth: Int, pathFragments: Seq[String], moveDirection: Direction) {
18
+
19
+ def next(depth: Int, elementName: String): Path = {
20
+ if (this.depth > depth) {
21
+ val (rest :+ _) = pathFragments
22
+ Path(depth, rest, Parent)
23
+ } else if (this.depth == depth) {
24
+ val (rest :+ _) = pathFragments
25
+ Path(depth, rest :+ elementName, Sibling)
26
+ } else {
27
+ Path(depth, pathFragments :+ elementName, Child)
28
+ }
29
+ }
30
+
31
+ val path: String = pathFragments.mkString("/")
32
+
33
+ }
34
+
35
+ object MsgPackEncoder {
36
+
37
+ def encode(nav: VTDNav, columnAp: AutoPilot, column: Column, maybeStructure: Option[Seq[JsonStructureElement]]): Value = maybeStructure.map { structure =>
38
+ // complex json array
39
+ val keyValues = Iterator.continually(columnAp.evalXPath()).takeWhile(_ != -1).flatMap { _ =>
40
+ VTD.withinContext(nav) {
41
+ constructJsonMap(nav, columnAp, column, structure).toSeq
42
+ }
43
+ }
44
+ val mergedMap = keyValues.toSeq.groupBy { case (k, _) => k }.map { case (k, v)=>
45
+ val mergedValues = v.flatMap {
46
+ case (_, x: Seq[Any]) => x
47
+ case _ => sys.error("Root element supports array only. Please reconsider the configuration.")
48
+ }
49
+ (k, mergedValues)
50
+ }
51
+ convertToValue(mergedMap)
52
+ } getOrElse {
53
+ // simple string[]
54
+ @tailrec
55
+ def eachJsonValue(cAp: AutoPilot, queue: Queue[Value]): Queue[Value] = if (cAp.evalXPath() != -1) {
56
+ val index = nav.getText
57
+ val nextQueue = if (index != -1) queue :+ new Variable().setStringValue(nav.toString(index)).asStringValue() else queue
58
+ eachJsonValue(cAp, nextQueue)
59
+ } else queue
60
+ asArrayValue(eachJsonValue(columnAp, Queue.empty[Value]))
61
+ }
62
+
63
+ private def constructJsonMap(nav: VTDNav, columnAp: AutoPilot, column: Column, structure: Seq[JsonStructureElement]): Map[String, Any] = {
64
+
65
+ @tailrec
66
+ def eachElement(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any]): Map[String, Any] = if (eAp.iterate()) {
67
+ val current = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
68
+ if (current.moveDirection == Parent) {
69
+ obj
70
+ } else {
71
+ val updated = structure.find(_.path == current.path).map { x =>
72
+ x.`type` match {
73
+ case "array" =>
74
+ val targetArray = obj.getOrElse(x.name, Queue[Any]()).asInstanceOf[Seq[Any]]
75
+ val childStructure = eachArrayElement(eAp, current)
76
+ obj.updated(x.name, targetArray ++ childStructure)
77
+ case "string" => obj.updated(x.name, nav.toNormalizedString(nav.getText))
78
+ case "long" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toLong)
79
+ case "boolean" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toBoolean)
80
+ case notSupported@_ => sys.error(s"type=$notSupported is notSupported")
81
+ }
82
+ }
83
+ eachElement(eAp, current, updated.getOrElse(obj))
84
+ }
85
+ } else obj
86
+
87
+ def isArrayElement(current: Path): Boolean =
88
+ structure.exists(x => x.path == current.path && x.`type` == "array")
89
+
90
+ def eachArrayElement(eAp: AutoPilot, previousPath: Path): Seq[Map[String, Any]] = {
91
+ @tailrec
92
+ def loop(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any], elements: Seq[Map[String, Any]]): Seq[Map[String, Any]] = {
93
+ val arrayContent = eachElement(eAp, previousPath, obj)
94
+ val currentPath = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
95
+ if (isArrayElement(currentPath)) {
96
+ loop(eAp, currentPath, obj, elements :+ arrayContent)
97
+ } else elements :+ arrayContent
98
+ }
99
+ loop(eAp, previousPath, Map.empty[String, Any], Queue.empty[Map[String, Any]])
100
+ }
101
+
102
+ val eachElementAp = new AutoPilot(nav)
103
+ eachElementAp.selectElement("*")
104
+
105
+ val initialPath = Path(-1, Vector.empty, Sibling)
106
+
107
+ eachElement(eachElementAp, initialPath, Map[String, Any]())
108
+ }
109
+
110
+ private def convertToValue(obj: Map[String, Any]): Value = {
111
+ val map = obj.map {
112
+ case (k, v: Seq[_]) => (asStringValue(k), convertToValue(v))
113
+ case (k, v: Map[_, _]) => (asStringValue(k), convertToValue(v.asInstanceOf[Map[String, Any]]))
114
+ case (k, v: String) => (asStringValue(k), asStringValue(v))
115
+ case (k, v: Boolean) => (asStringValue(k), asBooleanValue(v))
116
+ case (k, v: Long) => (asStringValue(k), asLongValue(v))
117
+ case (k, v) => sys.error(s"can't convert: key=$k, value=$v")
118
+ }
119
+ asMapValue(map)
120
+ }
121
+
122
+ private def convertToValue(seq: Seq[Any]): Value = {
123
+ val list = seq.map {
124
+ case v: Seq[_] => convertToValue(v)
125
+ case v: Map[_, _] => convertToValue(v.asInstanceOf[Map[String, Any]])
126
+ case v: String => asStringValue(v)
127
+ case v: Boolean => asBooleanValue(v)
128
+ case v: Long => asLongValue(v)
129
+ }
130
+ asArrayValue(list)
131
+ }
132
+
133
+ private final def asStringValue(value: String): Value = new Variable().setStringValue(value).asStringValue()
134
+ private final def asBooleanValue(value: Boolean): Value = new Variable().setBooleanValue(value).asBooleanValue()
135
+ private final def asLongValue(value: Long): Value = new Variable().setIntegerValue(value).asNumberValue()
136
+ private final def asArrayValue(value: Seq[Value]): Value = new Variable().setArrayValue(value.asJava).asArrayValue()
137
+ private final def asMapValue(value: Map[Value, Value]): Value = new Variable().setMapValue(value.asJava).asMapValue()
138
+
139
+ }
@@ -0,0 +1,12 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import com.ximpleware.VTDNav
4
+
5
+ object VTD {
6
+
7
+ final def withinContext[A](nav: VTDNav)(f: => A): A = try {
8
+ nav.push()
9
+ f
10
+ } finally nav.pop()
11
+
12
+ }
@@ -3,12 +3,11 @@ package org.embulk.parser.xpath2
3
3
  import com.google.common.io.ByteStreams
4
4
  import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
5
5
  import org.embulk.config._
6
- import org.embulk.parser.xpath2.config.ColumnConfig
6
+ import org.embulk.parser.xpath2.config.{ColumnConfig, JsonStructureElement}
7
7
  import org.embulk.spi._
8
8
  import org.embulk.spi.`type`._
9
9
  import org.embulk.spi.time.TimestampParser
10
10
  import org.embulk.spi.util.FileInputInputStream
11
- import org.msgpack.value.{Value, Variable}
12
11
  import org.slf4j.Logger
13
12
 
14
13
  import scala.annotation.tailrec
@@ -33,7 +32,10 @@ class XPath2ParserPlugin extends ParserPlugin {
33
32
  val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
34
33
 
35
34
  val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
36
- .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
35
+ .collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _, _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
36
+
37
+ val jsonStructures: Map[String, Seq[JsonStructureElement]] = task.getSchema.columns.asScala
38
+ .collect { case ColumnConfig(_, name, _, _, Some(jsonColumnOption), _) => (name, jsonColumnOption.structure.asScala) }.toMap
37
39
 
38
40
  def declareXPathNS(ap: AutoPilot): Unit = {
39
41
  task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
@@ -67,11 +69,11 @@ class XPath2ParserPlugin extends ParserPlugin {
67
69
  nav.push()
68
70
  try {
69
71
  columnElementAutoPilots.zipWithIndex.foreach { case (columnElementAutoPilot, idx) =>
70
- nav.push()
71
- columnElementAutoPilot.resetXPath()
72
- val column = schema.getColumn(idx)
73
- handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers)
74
- nav.pop()
72
+ VTD.withinContext(nav) {
73
+ columnElementAutoPilot.resetXPath()
74
+ val column = schema.getColumn(idx)
75
+ handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers, jsonStructures)
76
+ }
75
77
  }
76
78
  pb.addRecord()
77
79
  } catch {
@@ -95,17 +97,9 @@ class XPath2ParserPlugin extends ParserPlugin {
95
97
  }
96
98
  }
97
99
 
98
- final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
100
+ final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser], jsonStructures: Map[String, Seq[JsonStructureElement]]): Unit = {
99
101
  if (column.getType.isInstanceOf[JsonType]) {
100
- val list = new java.util.ArrayList[Value]()
101
- @tailrec
102
- def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
103
- val index = nav.getText
104
- if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
105
- eachJsonValue(cAp)
106
- }
107
- eachJsonValue(columnAp)
108
- val jsonValue = new Variable().setArrayValue(list).asArrayValue()
102
+ val jsonValue = MsgPackEncoder.encode(nav, columnAp, column, jsonStructures.get(column.getName))
109
103
  pb.setJson(column, jsonValue)
110
104
  } else {
111
105
  if (columnAp.evalXPath() == -1) {
@@ -2,10 +2,10 @@ package org.embulk.parser.xpath2.config
2
2
 
3
3
  import java.util
4
4
 
5
- import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
5
+ import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty, JsonValue}
6
6
  import com.google.common.base.Optional
7
7
  import org.embulk.config.{Config, ConfigDefault, ConfigSource}
8
- import org.embulk.spi.`type`.{TimestampType, Type}
8
+ import org.embulk.spi.`type`.{JsonType, TimestampType, Type}
9
9
  import org.embulk.spi.time.TimestampParser.TimestampColumnOption
10
10
  import org.joda.time.DateTimeZone
11
11
 
@@ -14,11 +14,18 @@ case class SchemaConfig @JsonCreator()(columns: java.util.List[ColumnConfig]) {
14
14
  def getColumns: util.List[ColumnConfig] = columns
15
15
  }
16
16
 
17
- case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], option: ConfigSource) {
17
+ case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], jsonOption: Option[JsonColumnOption], option: ConfigSource) {
18
18
 
19
19
  @JsonCreator()
20
20
  def this(src: ConfigSource) = {
21
- this(src.get(classOf[String], "path"), src.get(classOf[String], "name"), src.get(classOf[Type], "type"), ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")), src)
21
+ this(
22
+ src.get(classOf[String], "path"),
23
+ src.get(classOf[String], "name"),
24
+ src.get(classOf[Type], "type"),
25
+ ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")),
26
+ ColumnConfig.getJsonOption(src, src.get(classOf[Type], "type")),
27
+ src
28
+ )
22
29
  }
23
30
 
24
31
  @JsonValue()
@@ -49,11 +56,25 @@ private class TimestampColumnOptionImpl(timezone: Optional[DateTimeZone], format
49
56
  override val getDate = date
50
57
  }
51
58
 
59
+ class JsonStructureElement(@JsonProperty("path") val path: String, @JsonProperty("type") val `type`: String) {
60
+ @JsonProperty("name")
61
+ val name: String = path
62
+ }
63
+
64
+ case class JsonColumnOption(@JsonProperty("structure") structure: java.util.List[JsonStructureElement])
65
+
52
66
  object ColumnConfig {
53
67
  private def getTimestampOption(src: ConfigSource, `type`: Type): Option[TimestampColumnOption] = `type` match {
54
68
  case _: TimestampType => Some(getOption(src).loadConfig(classOf[TimestampColumnOptionImpl]))
55
69
  case _ => None
56
70
  }
57
71
 
72
+ private def getJsonOption(src: ConfigSource, `type`: Type): Option[JsonColumnOption] = `type` match {
73
+ case _: JsonType =>
74
+ val option = getOption(src)
75
+ if (option.has("structure")) Some(option.loadConfig(classOf[JsonColumnOption])) else None
76
+ case _ => None
77
+ }
78
+
58
79
  private def getOption(src: ConfigSource) = src.deepCopy().remove("path").remove("name").remove("type")
59
80
  }
@@ -0,0 +1,30 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/data.xml
4
+ parser:
5
+ type: xpath2
6
+ root: '/ns1:root/ns2:entry'
7
+ schema:
8
+ - { path: 'ns2:id', name: id, type: long }
9
+ - path: 'ns2:list'
10
+ name: list
11
+ type: json
12
+ structure:
13
+ - path: 'ns2:list'
14
+ name: list
15
+ type: array
16
+ - path: 'ns2:list/ns2:elements'
17
+ name: elements
18
+ type: array
19
+ - path: 'ns2:list/ns2:elements/ns2:name'
20
+ name: elementName
21
+ type: string
22
+ - path: 'ns2:list/ns2:elements/ns2:value'
23
+ name: elementValue
24
+ type: long
25
+ - path: 'ns2:list/ns2:elements/ns2:active'
26
+ name: elementActive
27
+ type: boolean
28
+ namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
29
+ out:
30
+ type: stdout
@@ -0,0 +1,27 @@
1
+ <?xml version="1.0"?>
2
+ <ns1:root
3
+ xmlns:ns1="http://example.com/ns1/"
4
+ xmlns:ns2="http://example.com/ns2/">
5
+ <ns2:entry>
6
+ <ns2:id>1</ns2:id>
7
+ <ns2:list>
8
+ <ns2:elements>
9
+ <ns2:name>foo1</ns2:name>
10
+ <ns2:value>1</ns2:value>
11
+ <ns2:active>true</ns2:active>
12
+ </ns2:elements>
13
+ <ns2:elements>
14
+ <ns2:name>foo2</ns2:name>
15
+ <ns2:value>2</ns2:value>
16
+ <ns2:active>false</ns2:active>
17
+ </ns2:elements>
18
+ </ns2:list>
19
+ <ns2:list>
20
+ <ns2:elements>
21
+ <ns2:name>bar1</ns2:name>
22
+ <ns2:value>3</ns2:value>
23
+ <ns2:active>true</ns2:active>
24
+ </ns2:elements>
25
+ </ns2:list>
26
+ </ns2:entry>
27
+ </ns1:root>
@@ -0,0 +1,53 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.embulk.spi.{Column, ColumnVisitor, PageReader}
4
+
5
+ class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
6
+ override def timestampColumn(column: Column): Unit = {
7
+ if (reader.isNull(column)) {
8
+ record.put(column.getName, null)
9
+ } else {
10
+ record.put(column.getName, reader.getTimestamp(column))
11
+ }
12
+ }
13
+
14
+ override def stringColumn(column: Column): Unit = {
15
+ if (reader.isNull(column)) {
16
+ record.put(column.getName, null)
17
+ } else {
18
+ record.put(column.getName, reader.getString(column))
19
+ }
20
+ }
21
+
22
+ override def longColumn(column: Column): Unit = {
23
+ if (reader.isNull(column)) {
24
+ record.put(column.getName, null)
25
+ } else {
26
+ record.put(column.getName, reader.getLong(column))
27
+ }
28
+ }
29
+
30
+ override def doubleColumn(column: Column): Unit = {
31
+ if (reader.isNull(column)) {
32
+ record.put(column.getName, null)
33
+ } else {
34
+ record.put(column.getName, reader.getDouble(column))
35
+ }
36
+ }
37
+
38
+ override def booleanColumn(column: Column): Unit = {
39
+ if (reader.isNull(column)) {
40
+ record.put(column.getName, null)
41
+ } else {
42
+ record.put(column.getName, reader.getBoolean(column))
43
+ }
44
+ }
45
+
46
+ override def jsonColumn(column: Column): Unit = {
47
+ if (reader.isNull(column)) {
48
+ record.put(column.getName, null)
49
+ } else {
50
+ record.put(column.getName, reader.getJson(column))
51
+ }
52
+ }
53
+ }
@@ -0,0 +1,31 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import org.embulk.spi.{Exec, Page, Schema, TransactionalPageOutput}
4
+
5
+ import scala.collection.mutable
6
+ import scala.collection.JavaConverters._
7
+
8
+ class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
9
+ extends TransactionalPageOutput {
10
+ import org.embulk.spi.PageReader
11
+
12
+ val reader = new PageReader(schema)
13
+
14
+ override def add(page: Page) = {
15
+ reader.setPage(page)
16
+
17
+ while (reader.nextRecord()) {
18
+ val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
19
+
20
+ schema.getColumns().asScala.foreach { column =>
21
+ column.visit(new TestColumnVisitor(reader, record))
22
+ }
23
+ result += record
24
+ }
25
+ }
26
+
27
+ override def commit() = Exec.newTaskReport()
28
+ override def abort() = {}
29
+ override def finish() = {}
30
+ override def close() = {}
31
+ }
@@ -0,0 +1,86 @@
1
+ package org.embulk.parser.xpath2
2
+
3
+ import java.io.{File, FileInputStream}
4
+ import java.nio.file
5
+ import java.nio.file.Paths
6
+
7
+ import org.embulk.EmbulkTestRuntime
8
+ import org.embulk.config.{ConfigLoader, ConfigSource, TaskSource}
9
+ import org.embulk.spi.json.JsonParser
10
+ import org.embulk.spi.util.InputStreamFileInput
11
+ import org.embulk.spi.{Exec, _}
12
+ import org.junit.Assert._
13
+ import org.junit.{Rule, Test}
14
+
15
+ import scala.collection.mutable
16
+ import scala.collection.mutable.ArrayBuffer
17
+
18
+ class XPath2ParserPluginJsonSpec {
19
+
20
+ @Rule
21
+ def runtime = new EmbulkTestRuntime
22
+
23
+ val yamlPath: file.Path = Paths.get(classOf[XPath2ParserPlugin].getClassLoader.getResource("json_config.yml").toURI)
24
+ val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("json_data.xml").getPath
25
+
26
+ def configSource: ConfigSource = new ConfigLoader(Exec.getModelManager).fromYamlFile(yamlPath.toFile).getNested("in").getNested("parser")
27
+
28
+ @Test def testParseJsonArrayXML() {
29
+
30
+ val cs = configSource
31
+ val task = cs.loadConfig(classOf[PluginTask])
32
+
33
+ var schema: Schema = null
34
+
35
+ val plugin = new XPath2ParserPlugin()
36
+ plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
37
+
38
+ val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
39
+
40
+ plugin.run(
41
+ task.dump(),
42
+ schema,
43
+ new InputStreamFileInput(Exec.getBufferAllocator, new FileInputStream(new File(dataPath))),
44
+ new TestTransactionalPageOutput(schema, result)
45
+ )
46
+
47
+ println(result)
48
+
49
+ val expectedJson =
50
+ """{
51
+ "list": [
52
+ {
53
+ "elements": [
54
+ {
55
+ "elementActive": true,
56
+ "elementName": "foo1",
57
+ "elementValue": 1
58
+ },
59
+ {
60
+ "elementActive": false,
61
+ "elementName": "foo2",
62
+ "elementValue": 2
63
+ }
64
+ ]
65
+ },
66
+ {
67
+ "elements": [
68
+ {
69
+ "elementActive": true,
70
+ "elementName": "bar1",
71
+ "elementValue": 3
72
+ }
73
+ ]
74
+ }
75
+ ]
76
+ }"""
77
+
78
+ assertEquals(ArrayBuffer(
79
+ Map(
80
+ "id" -> 1L,
81
+ "list" -> new JsonParser().parse(expectedJson)
82
+ )
83
+ ), result)
84
+ }
85
+
86
+ }
@@ -141,80 +141,3 @@ class XPath2ParserPluginSpec {
141
141
  }
142
142
 
143
143
  }
144
-
145
-
146
-
147
- class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
148
- extends TransactionalPageOutput {
149
- import org.embulk.spi.PageReader
150
-
151
- val reader = new PageReader(schema)
152
-
153
- override def add(page: Page) = {
154
- reader.setPage(page)
155
-
156
- while (reader.nextRecord()) {
157
- val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
158
-
159
- schema.getColumns().asScala.foreach { column =>
160
- column.visit(new TestColumnVisitor(reader, record))
161
- }
162
- result += record
163
- }
164
- }
165
-
166
- override def commit() = Exec.newTaskReport()
167
- override def abort() = {}
168
- override def finish() = {}
169
- override def close() = {}
170
- }
171
-
172
- class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
173
- override def timestampColumn(column: Column): Unit = {
174
- if (reader.isNull(column)) {
175
- record.put(column.getName, null)
176
- } else {
177
- record.put(column.getName, reader.getTimestamp(column))
178
- }
179
- }
180
-
181
- override def stringColumn(column: Column): Unit = {
182
- if (reader.isNull(column)) {
183
- record.put(column.getName, null)
184
- } else {
185
- record.put(column.getName, reader.getString(column))
186
- }
187
- }
188
-
189
- override def longColumn(column: Column): Unit = {
190
- if (reader.isNull(column)) {
191
- record.put(column.getName, null)
192
- } else {
193
- record.put(column.getName, reader.getLong(column))
194
- }
195
- }
196
-
197
- override def doubleColumn(column: Column): Unit = {
198
- if (reader.isNull(column)) {
199
- record.put(column.getName, null)
200
- } else {
201
- record.put(column.getName, reader.getDouble(column))
202
- }
203
- }
204
-
205
- override def booleanColumn(column: Column): Unit = {
206
- if (reader.isNull(column)) {
207
- record.put(column.getName, null)
208
- } else {
209
- record.put(column.getName, reader.getBoolean(column))
210
- }
211
- }
212
-
213
- override def jsonColumn(column: Column): Unit = {
214
- if (reader.isNull(column)) {
215
- record.put(column.getName, null)
216
- } else {
217
- record.put(column.getName, reader.getJson(column))
218
- }
219
- }
220
- }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xpath2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - maji-KY
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-12 00:00:00.000000000 Z
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -61,18 +61,25 @@ files:
61
61
  - project/build.properties
62
62
  - project/plugins.sbt
63
63
  - src/main/scala/org/embulk/parser/xpath2/LoanPattern.scala
64
+ - src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala
64
65
  - src/main/scala/org/embulk/parser/xpath2/PluginTask.scala
66
+ - src/main/scala/org/embulk/parser/xpath2/VTD.scala
65
67
  - src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala
66
68
  - src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
67
69
  - src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
68
70
  - src/test/resources/config.yml
69
71
  - src/test/resources/data.xml
70
72
  - src/test/resources/invalid-data.xml
73
+ - src/test/resources/json_config.yml
74
+ - src/test/resources/json_data.xml
75
+ - src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala
76
+ - src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala
71
77
  - src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
78
+ - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala
72
79
  - src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
73
80
  - classpath/vtd-xml-2.13.4.jar
74
81
  - classpath/scala-library-2.12.4.jar
75
- - classpath/embulk-parser-xpath2-0.1.2.jar
82
+ - classpath/embulk-parser-xpath2-0.2.0.jar
76
83
  homepage: https://github.com/maji-KY/embulk-parser-xpath2
77
84
  licenses:
78
85
  - GPL-2.0