embulk-parser-xpath2 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -3
- data/README.md +101 -0
- data/benchmark/src/main/scala/org/embulk/parser/xpath2/ParseBenchmark.scala +1 -1
- data/build.gradle +1 -1
- data/src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala +139 -0
- data/src/main/scala/org/embulk/parser/xpath2/VTD.scala +12 -0
- data/src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala +12 -18
- data/src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala +25 -4
- data/src/test/resources/json_config.yml +30 -0
- data/src/test/resources/json_data.xml +27 -0
- data/src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala +53 -0
- data/src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala +31 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala +86 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala +0 -77
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d739e0975d864c0920c7af2f4bdbaac2d4978925
|
4
|
+
data.tar.gz: 07d69b8ad2cefa9e25edd06c98952e4735b45498
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8edf4e7762787940f0addf22fe650e5d5912d030cac7d86d0c4bf00e70b275b641c903b6ab65088a7858a38caed425b0d0bdb99df2be51d5cfc417497ea2896e
|
7
|
+
data.tar.gz: 0ddc27b1bfab2f31e3992e008cdcedfde8c288e287409551e52a5ecbae37086582ee4b5659e8fe131f1149e97a3212d50fd38f44b05a0747e1693b85002ef8c9
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -11,6 +11,7 @@ Embulk parser plugin for parsing xml data by XPath perfectly!
|
|
11
11
|
|
12
12
|
- namespace awareness
|
13
13
|
- nullable columns
|
14
|
+
- complex json array columns (with restrictions)
|
14
15
|
|
15
16
|
## Overview
|
16
17
|
|
@@ -61,6 +62,106 @@ Then you can fetch entries from the following xml:
|
|
61
62
|
</ns2:entry>
|
62
63
|
</ns1:root>
|
63
64
|
```
|
65
|
+
|
66
|
+
## complex json array column
|
67
|
+
|
68
|
+
### Usage
|
69
|
+
|
70
|
+
```yaml
|
71
|
+
parser:
|
72
|
+
type: xpath2
|
73
|
+
root: '/ns1:root/ns2:entry'
|
74
|
+
schema:
|
75
|
+
- { path: 'ns2:id', name: id, type: long }
|
76
|
+
- path: 'ns2:list'
|
77
|
+
name: list
|
78
|
+
type: json
|
79
|
+
structure: # add the structure key to enable a complex json array column
|
80
|
+
- path: 'ns2:list'
|
81
|
+
name: list
|
82
|
+
type: array
|
83
|
+
- path: 'ns2:list/ns2:elements'
|
84
|
+
name: elements
|
85
|
+
type: array
|
86
|
+
- path: 'ns2:list/ns2:elements/ns2:name'
|
87
|
+
name: elementName
|
88
|
+
type: string
|
89
|
+
- path: 'ns2:list/ns2:elements/ns2:value'
|
90
|
+
name: elementValue
|
91
|
+
type: long
|
92
|
+
- path: 'ns2:list/ns2:elements/ns2:active'
|
93
|
+
name: elementActive
|
94
|
+
type: boolean
|
95
|
+
namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
|
96
|
+
```
|
97
|
+
|
98
|
+
### Structure configuration
|
99
|
+
- **path**: element path, starting at the element selected by the column's XPath (string, required)
|
100
|
+
- **name**: json key name (string)
|
101
|
+
- **type**: json data type (one of array, string, long, boolean; required)
|
102
|
+
|
103
|
+
Then you can fetch entries from the following xml:
|
104
|
+
```xml
|
105
|
+
<?xml version="1.0"?>
|
106
|
+
<ns1:root
|
107
|
+
xmlns:ns1="http://example.com/ns1/"
|
108
|
+
xmlns:ns2="http://example.com/ns2/">
|
109
|
+
<ns2:entry>
|
110
|
+
<ns2:id>1</ns2:id>
|
111
|
+
<ns2:list>
|
112
|
+
<ns2:elements>
|
113
|
+
<ns2:name>foo1</ns2:name>
|
114
|
+
<ns2:value>1</ns2:value>
|
115
|
+
<ns2:active>true</ns2:active>
|
116
|
+
</ns2:elements>
|
117
|
+
<ns2:elements>
|
118
|
+
<ns2:name>foo2</ns2:name>
|
119
|
+
<ns2:value>2</ns2:value>
|
120
|
+
<ns2:active>false</ns2:active>
|
121
|
+
</ns2:elements>
|
122
|
+
</ns2:list>
|
123
|
+
<ns2:list>
|
124
|
+
<ns2:elements>
|
125
|
+
<ns2:name>bar1</ns2:name>
|
126
|
+
<ns2:value>3</ns2:value>
|
127
|
+
<ns2:active>true</ns2:active>
|
128
|
+
</ns2:elements>
|
129
|
+
</ns2:list>
|
130
|
+
</ns2:entry>
|
131
|
+
</ns1:root>
|
132
|
+
```
|
133
|
+
|
134
|
+
Result of the `list` column:
|
135
|
+
```json
|
136
|
+
{
|
137
|
+
"list": [
|
138
|
+
{
|
139
|
+
"elements": [
|
140
|
+
{
|
141
|
+
"elementActive": true,
|
142
|
+
"elementName": "foo1",
|
143
|
+
"elementValue": 1
|
144
|
+
},
|
145
|
+
{
|
146
|
+
"elementActive": false,
|
147
|
+
"elementName": "foo2",
|
148
|
+
"elementValue": 2
|
149
|
+
}
|
150
|
+
]
|
151
|
+
},
|
152
|
+
{
|
153
|
+
"elements": [
|
154
|
+
{
|
155
|
+
"elementActive": true,
|
156
|
+
"elementName": "bar1",
|
157
|
+
"elementValue": 3
|
158
|
+
}
|
159
|
+
]
|
160
|
+
}
|
161
|
+
]
|
162
|
+
}
|
163
|
+
```
|
164
|
+
|
64
165
|
## Build
|
65
166
|
|
66
167
|
```
|
data/build.gradle
CHANGED
@@ -0,0 +1,139 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import com.ximpleware.{AutoPilot, VTDNav}
|
4
|
+
import org.embulk.parser.xpath2.config.JsonStructureElement
|
5
|
+
import org.embulk.spi.Column
|
6
|
+
import org.msgpack.value.{Value, Variable}
|
7
|
+
|
8
|
+
import scala.annotation.tailrec
|
9
|
+
import scala.collection.JavaConverters._
|
10
|
+
import scala.collection.immutable.Queue
|
11
|
+
|
12
|
+
sealed trait Direction
|
13
|
+
case object Parent extends Direction
|
14
|
+
case object Sibling extends Direction
|
15
|
+
case object Child extends Direction
|
16
|
+
|
17
|
+
private case class Path(depth: Int, pathFragments: Seq[String], moveDirection: Direction) {
|
18
|
+
|
19
|
+
def next(depth: Int, elementName: String): Path = {
|
20
|
+
if (this.depth > depth) {
|
21
|
+
val (rest :+ _) = pathFragments
|
22
|
+
Path(depth, rest, Parent)
|
23
|
+
} else if (this.depth == depth) {
|
24
|
+
val (rest :+ _) = pathFragments
|
25
|
+
Path(depth, rest :+ elementName, Sibling)
|
26
|
+
} else {
|
27
|
+
Path(depth, pathFragments :+ elementName, Child)
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
val path: String = pathFragments.mkString("/")
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
object MsgPackEncoder {
|
36
|
+
|
37
|
+
def encode(nav: VTDNav, columnAp: AutoPilot, column: Column, maybeStructure: Option[Seq[JsonStructureElement]]): Value = maybeStructure.map { structure =>
|
38
|
+
// complex json array
|
39
|
+
val keyValues = Iterator.continually(columnAp.evalXPath()).takeWhile(_ != -1).flatMap { _ =>
|
40
|
+
VTD.withinContext(nav) {
|
41
|
+
constructJsonMap(nav, columnAp, column, structure).toSeq
|
42
|
+
}
|
43
|
+
}
|
44
|
+
val mergedMap = keyValues.toSeq.groupBy { case (k, _) => k }.map { case (k, v)=>
|
45
|
+
val mergedValues = v.flatMap {
|
46
|
+
case (_, x: Seq[Any]) => x
|
47
|
+
case _ => sys.error("Root element supports array only. Please reconsider the configuration.")
|
48
|
+
}
|
49
|
+
(k, mergedValues)
|
50
|
+
}
|
51
|
+
convertToValue(mergedMap)
|
52
|
+
} getOrElse {
|
53
|
+
// simple string[]
|
54
|
+
@tailrec
|
55
|
+
def eachJsonValue(cAp: AutoPilot, queue: Queue[Value]): Queue[Value] = if (cAp.evalXPath() != -1) {
|
56
|
+
val index = nav.getText
|
57
|
+
val nextQueue = if (index != -1) queue :+ new Variable().setStringValue(nav.toString(index)).asStringValue() else queue
|
58
|
+
eachJsonValue(cAp, nextQueue)
|
59
|
+
} else queue
|
60
|
+
asArrayValue(eachJsonValue(columnAp, Queue.empty[Value]))
|
61
|
+
}
|
62
|
+
|
63
|
+
private def constructJsonMap(nav: VTDNav, columnAp: AutoPilot, column: Column, structure: Seq[JsonStructureElement]): Map[String, Any] = {
|
64
|
+
|
65
|
+
@tailrec
|
66
|
+
def eachElement(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any]): Map[String, Any] = if (eAp.iterate()) {
|
67
|
+
val current = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
|
68
|
+
if (current.moveDirection == Parent) {
|
69
|
+
obj
|
70
|
+
} else {
|
71
|
+
val updated = structure.find(_.path == current.path).map { x =>
|
72
|
+
x.`type` match {
|
73
|
+
case "array" =>
|
74
|
+
val targetArray = obj.getOrElse(x.name, Queue[Any]()).asInstanceOf[Seq[Any]]
|
75
|
+
val childStructure = eachArrayElement(eAp, current)
|
76
|
+
obj.updated(x.name, targetArray ++ childStructure)
|
77
|
+
case "string" => obj.updated(x.name, nav.toNormalizedString(nav.getText))
|
78
|
+
case "long" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toLong)
|
79
|
+
case "boolean" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toBoolean)
|
80
|
+
case notSupported@_ => sys.error(s"type=$notSupported is notSupported")
|
81
|
+
}
|
82
|
+
}
|
83
|
+
eachElement(eAp, current, updated.getOrElse(obj))
|
84
|
+
}
|
85
|
+
} else obj
|
86
|
+
|
87
|
+
def isArrayElement(current: Path): Boolean =
|
88
|
+
structure.exists(x => x.path == current.path && x.`type` == "array")
|
89
|
+
|
90
|
+
def eachArrayElement(eAp: AutoPilot, previousPath: Path): Seq[Map[String, Any]] = {
|
91
|
+
@tailrec
|
92
|
+
def loop(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any], elements: Seq[Map[String, Any]]): Seq[Map[String, Any]] = {
|
93
|
+
val arrayContent = eachElement(eAp, previousPath, obj)
|
94
|
+
val currentPath = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
|
95
|
+
if (isArrayElement(currentPath)) {
|
96
|
+
loop(eAp, currentPath, obj, elements :+ arrayContent)
|
97
|
+
} else elements :+ arrayContent
|
98
|
+
}
|
99
|
+
loop(eAp, previousPath, Map.empty[String, Any], Queue.empty[Map[String, Any]])
|
100
|
+
}
|
101
|
+
|
102
|
+
val eachElementAp = new AutoPilot(nav)
|
103
|
+
eachElementAp.selectElement("*")
|
104
|
+
|
105
|
+
val initialPath = Path(-1, Vector.empty, Sibling)
|
106
|
+
|
107
|
+
eachElement(eachElementAp, initialPath, Map[String, Any]())
|
108
|
+
}
|
109
|
+
|
110
|
+
private def convertToValue(obj: Map[String, Any]): Value = {
|
111
|
+
val map = obj.map {
|
112
|
+
case (k, v: Seq[_]) => (asStringValue(k), convertToValue(v))
|
113
|
+
case (k, v: Map[_, _]) => (asStringValue(k), convertToValue(v.asInstanceOf[Map[String, Any]]))
|
114
|
+
case (k, v: String) => (asStringValue(k), asStringValue(v))
|
115
|
+
case (k, v: Boolean) => (asStringValue(k), asBooleanValue(v))
|
116
|
+
case (k, v: Long) => (asStringValue(k), asLongValue(v))
|
117
|
+
case (k, v) => sys.error(s"can't convert: key=$k, value=$v")
|
118
|
+
}
|
119
|
+
asMapValue(map)
|
120
|
+
}
|
121
|
+
|
122
|
+
private def convertToValue(seq: Seq[Any]): Value = {
|
123
|
+
val list = seq.map {
|
124
|
+
case v: Seq[_] => convertToValue(v)
|
125
|
+
case v: Map[_, _] => convertToValue(v.asInstanceOf[Map[String, Any]])
|
126
|
+
case v: String => asStringValue(v)
|
127
|
+
case v: Boolean => asBooleanValue(v)
|
128
|
+
case v: Long => asLongValue(v)
|
129
|
+
}
|
130
|
+
asArrayValue(list)
|
131
|
+
}
|
132
|
+
|
133
|
+
private final def asStringValue(value: String): Value = new Variable().setStringValue(value).asStringValue()
|
134
|
+
private final def asBooleanValue(value: Boolean): Value = new Variable().setBooleanValue(value).asBooleanValue()
|
135
|
+
private final def asLongValue(value: Long): Value = new Variable().setIntegerValue(value).asNumberValue()
|
136
|
+
private final def asArrayValue(value: Seq[Value]): Value = new Variable().setArrayValue(value.asJava).asArrayValue()
|
137
|
+
private final def asMapValue(value: Map[Value, Value]): Value = new Variable().setMapValue(value.asJava).asMapValue()
|
138
|
+
|
139
|
+
}
|
@@ -3,12 +3,11 @@ package org.embulk.parser.xpath2
|
|
3
3
|
import com.google.common.io.ByteStreams
|
4
4
|
import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
|
5
5
|
import org.embulk.config._
|
6
|
-
import org.embulk.parser.xpath2.config.ColumnConfig
|
6
|
+
import org.embulk.parser.xpath2.config.{ColumnConfig, JsonStructureElement}
|
7
7
|
import org.embulk.spi._
|
8
8
|
import org.embulk.spi.`type`._
|
9
9
|
import org.embulk.spi.time.TimestampParser
|
10
10
|
import org.embulk.spi.util.FileInputInputStream
|
11
|
-
import org.msgpack.value.{Value, Variable}
|
12
11
|
import org.slf4j.Logger
|
13
12
|
|
14
13
|
import scala.annotation.tailrec
|
@@ -33,7 +32,10 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
33
32
|
val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
|
34
33
|
|
35
34
|
val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
|
36
|
-
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
35
|
+
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _, _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
36
|
+
|
37
|
+
val jsonStructures: Map[String, Seq[JsonStructureElement]] = task.getSchema.columns.asScala
|
38
|
+
.collect { case ColumnConfig(_, name, _, _, Some(jsonColumnOption), _) => (name, jsonColumnOption.structure.asScala) }.toMap
|
37
39
|
|
38
40
|
def declareXPathNS(ap: AutoPilot): Unit = {
|
39
41
|
task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
|
@@ -67,11 +69,11 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
67
69
|
nav.push()
|
68
70
|
try {
|
69
71
|
columnElementAutoPilots.zipWithIndex.foreach { case (columnElementAutoPilot, idx) =>
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
VTD.withinContext(nav) {
|
73
|
+
columnElementAutoPilot.resetXPath()
|
74
|
+
val column = schema.getColumn(idx)
|
75
|
+
handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers, jsonStructures)
|
76
|
+
}
|
75
77
|
}
|
76
78
|
pb.addRecord()
|
77
79
|
} catch {
|
@@ -95,17 +97,9 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
95
97
|
}
|
96
98
|
}
|
97
99
|
|
98
|
-
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
|
100
|
+
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser], jsonStructures: Map[String, Seq[JsonStructureElement]]): Unit = {
|
99
101
|
if (column.getType.isInstanceOf[JsonType]) {
|
100
|
-
val
|
101
|
-
@tailrec
|
102
|
-
def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
|
103
|
-
val index = nav.getText
|
104
|
-
if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
|
105
|
-
eachJsonValue(cAp)
|
106
|
-
}
|
107
|
-
eachJsonValue(columnAp)
|
108
|
-
val jsonValue = new Variable().setArrayValue(list).asArrayValue()
|
102
|
+
val jsonValue = MsgPackEncoder.encode(nav, columnAp, column, jsonStructures.get(column.getName))
|
109
103
|
pb.setJson(column, jsonValue)
|
110
104
|
} else {
|
111
105
|
if (columnAp.evalXPath() == -1) {
|
@@ -2,10 +2,10 @@ package org.embulk.parser.xpath2.config
|
|
2
2
|
|
3
3
|
import java.util
|
4
4
|
|
5
|
-
import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
|
5
|
+
import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty, JsonValue}
|
6
6
|
import com.google.common.base.Optional
|
7
7
|
import org.embulk.config.{Config, ConfigDefault, ConfigSource}
|
8
|
-
import org.embulk.spi.`type`.{TimestampType, Type}
|
8
|
+
import org.embulk.spi.`type`.{JsonType, TimestampType, Type}
|
9
9
|
import org.embulk.spi.time.TimestampParser.TimestampColumnOption
|
10
10
|
import org.joda.time.DateTimeZone
|
11
11
|
|
@@ -14,11 +14,18 @@ case class SchemaConfig @JsonCreator()(columns: java.util.List[ColumnConfig]) {
|
|
14
14
|
def getColumns: util.List[ColumnConfig] = columns
|
15
15
|
}
|
16
16
|
|
17
|
-
case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], option: ConfigSource) {
|
17
|
+
case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], jsonOption: Option[JsonColumnOption], option: ConfigSource) {
|
18
18
|
|
19
19
|
@JsonCreator()
|
20
20
|
def this(src: ConfigSource) = {
|
21
|
-
this(
|
21
|
+
this(
|
22
|
+
src.get(classOf[String], "path"),
|
23
|
+
src.get(classOf[String], "name"),
|
24
|
+
src.get(classOf[Type], "type"),
|
25
|
+
ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")),
|
26
|
+
ColumnConfig.getJsonOption(src, src.get(classOf[Type], "type")),
|
27
|
+
src
|
28
|
+
)
|
22
29
|
}
|
23
30
|
|
24
31
|
@JsonValue()
|
@@ -49,11 +56,25 @@ private class TimestampColumnOptionImpl(timezone: Optional[DateTimeZone], format
|
|
49
56
|
override val getDate = date
|
50
57
|
}
|
51
58
|
|
59
|
+
class JsonStructureElement(@JsonProperty("path") val path: String, @JsonProperty("type") val `type`: String) {
|
60
|
+
@JsonProperty("name")
|
61
|
+
val name: String = path
|
62
|
+
}
|
63
|
+
|
64
|
+
case class JsonColumnOption(@JsonProperty("structure") structure: java.util.List[JsonStructureElement])
|
65
|
+
|
52
66
|
object ColumnConfig {
|
53
67
|
private def getTimestampOption(src: ConfigSource, `type`: Type): Option[TimestampColumnOption] = `type` match {
|
54
68
|
case _: TimestampType => Some(getOption(src).loadConfig(classOf[TimestampColumnOptionImpl]))
|
55
69
|
case _ => None
|
56
70
|
}
|
57
71
|
|
72
|
+
private def getJsonOption(src: ConfigSource, `type`: Type): Option[JsonColumnOption] = `type` match {
|
73
|
+
case _: JsonType =>
|
74
|
+
val option = getOption(src)
|
75
|
+
if (option.has("structure")) Some(option.loadConfig(classOf[JsonColumnOption])) else None
|
76
|
+
case _ => None
|
77
|
+
}
|
78
|
+
|
58
79
|
private def getOption(src: ConfigSource) = src.deepCopy().remove("path").remove("name").remove("type")
|
59
80
|
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/data.xml
|
4
|
+
parser:
|
5
|
+
type: xpath2
|
6
|
+
root: '/ns1:root/ns2:entry'
|
7
|
+
schema:
|
8
|
+
- { path: 'ns2:id', name: id, type: long }
|
9
|
+
- path: 'ns2:list'
|
10
|
+
name: list
|
11
|
+
type: json
|
12
|
+
structure:
|
13
|
+
- path: 'ns2:list'
|
14
|
+
name: list
|
15
|
+
type: array
|
16
|
+
- path: 'ns2:list/ns2:elements'
|
17
|
+
name: elements
|
18
|
+
type: array
|
19
|
+
- path: 'ns2:list/ns2:elements/ns2:name'
|
20
|
+
name: elementName
|
21
|
+
type: string
|
22
|
+
- path: 'ns2:list/ns2:elements/ns2:value'
|
23
|
+
name: elementValue
|
24
|
+
type: long
|
25
|
+
- path: 'ns2:list/ns2:elements/ns2:active'
|
26
|
+
name: elementActive
|
27
|
+
type: boolean
|
28
|
+
namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
|
29
|
+
out:
|
30
|
+
type: stdout
|
@@ -0,0 +1,27 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<ns1:root
|
3
|
+
xmlns:ns1="http://example.com/ns1/"
|
4
|
+
xmlns:ns2="http://example.com/ns2/">
|
5
|
+
<ns2:entry>
|
6
|
+
<ns2:id>1</ns2:id>
|
7
|
+
<ns2:list>
|
8
|
+
<ns2:elements>
|
9
|
+
<ns2:name>foo1</ns2:name>
|
10
|
+
<ns2:value>1</ns2:value>
|
11
|
+
<ns2:active>true</ns2:active>
|
12
|
+
</ns2:elements>
|
13
|
+
<ns2:elements>
|
14
|
+
<ns2:name>foo2</ns2:name>
|
15
|
+
<ns2:value>2</ns2:value>
|
16
|
+
<ns2:active>false</ns2:active>
|
17
|
+
</ns2:elements>
|
18
|
+
</ns2:list>
|
19
|
+
<ns2:list>
|
20
|
+
<ns2:elements>
|
21
|
+
<ns2:name>bar1</ns2:name>
|
22
|
+
<ns2:value>3</ns2:value>
|
23
|
+
<ns2:active>true</ns2:active>
|
24
|
+
</ns2:elements>
|
25
|
+
</ns2:list>
|
26
|
+
</ns2:entry>
|
27
|
+
</ns1:root>
|
@@ -0,0 +1,53 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import org.embulk.spi.{Column, ColumnVisitor, PageReader}
|
4
|
+
|
5
|
+
class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
|
6
|
+
override def timestampColumn(column: Column): Unit = {
|
7
|
+
if (reader.isNull(column)) {
|
8
|
+
record.put(column.getName, null)
|
9
|
+
} else {
|
10
|
+
record.put(column.getName, reader.getTimestamp(column))
|
11
|
+
}
|
12
|
+
}
|
13
|
+
|
14
|
+
override def stringColumn(column: Column): Unit = {
|
15
|
+
if (reader.isNull(column)) {
|
16
|
+
record.put(column.getName, null)
|
17
|
+
} else {
|
18
|
+
record.put(column.getName, reader.getString(column))
|
19
|
+
}
|
20
|
+
}
|
21
|
+
|
22
|
+
override def longColumn(column: Column): Unit = {
|
23
|
+
if (reader.isNull(column)) {
|
24
|
+
record.put(column.getName, null)
|
25
|
+
} else {
|
26
|
+
record.put(column.getName, reader.getLong(column))
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
30
|
+
override def doubleColumn(column: Column): Unit = {
|
31
|
+
if (reader.isNull(column)) {
|
32
|
+
record.put(column.getName, null)
|
33
|
+
} else {
|
34
|
+
record.put(column.getName, reader.getDouble(column))
|
35
|
+
}
|
36
|
+
}
|
37
|
+
|
38
|
+
override def booleanColumn(column: Column): Unit = {
|
39
|
+
if (reader.isNull(column)) {
|
40
|
+
record.put(column.getName, null)
|
41
|
+
} else {
|
42
|
+
record.put(column.getName, reader.getBoolean(column))
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
override def jsonColumn(column: Column): Unit = {
|
47
|
+
if (reader.isNull(column)) {
|
48
|
+
record.put(column.getName, null)
|
49
|
+
} else {
|
50
|
+
record.put(column.getName, reader.getJson(column))
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import org.embulk.spi.{Exec, Page, Schema, TransactionalPageOutput}
|
4
|
+
|
5
|
+
import scala.collection.mutable
|
6
|
+
import scala.collection.JavaConverters._
|
7
|
+
|
8
|
+
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
9
|
+
extends TransactionalPageOutput {
|
10
|
+
import org.embulk.spi.PageReader
|
11
|
+
|
12
|
+
val reader = new PageReader(schema)
|
13
|
+
|
14
|
+
override def add(page: Page) = {
|
15
|
+
reader.setPage(page)
|
16
|
+
|
17
|
+
while (reader.nextRecord()) {
|
18
|
+
val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
|
19
|
+
|
20
|
+
schema.getColumns().asScala.foreach { column =>
|
21
|
+
column.visit(new TestColumnVisitor(reader, record))
|
22
|
+
}
|
23
|
+
result += record
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
override def commit() = Exec.newTaskReport()
|
28
|
+
override def abort() = {}
|
29
|
+
override def finish() = {}
|
30
|
+
override def close() = {}
|
31
|
+
}
|
@@ -0,0 +1,86 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import java.io.{File, FileInputStream}
|
4
|
+
import java.nio.file
|
5
|
+
import java.nio.file.Paths
|
6
|
+
|
7
|
+
import org.embulk.EmbulkTestRuntime
|
8
|
+
import org.embulk.config.{ConfigLoader, ConfigSource, TaskSource}
|
9
|
+
import org.embulk.spi.json.JsonParser
|
10
|
+
import org.embulk.spi.util.InputStreamFileInput
|
11
|
+
import org.embulk.spi.{Exec, _}
|
12
|
+
import org.junit.Assert._
|
13
|
+
import org.junit.{Rule, Test}
|
14
|
+
|
15
|
+
import scala.collection.mutable
|
16
|
+
import scala.collection.mutable.ArrayBuffer
|
17
|
+
|
18
|
+
class XPath2ParserPluginJsonSpec {
|
19
|
+
|
20
|
+
@Rule
|
21
|
+
def runtime = new EmbulkTestRuntime
|
22
|
+
|
23
|
+
val yamlPath: file.Path = Paths.get(classOf[XPath2ParserPlugin].getClassLoader.getResource("json_config.yml").toURI)
|
24
|
+
val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("json_data.xml").getPath
|
25
|
+
|
26
|
+
def configSource: ConfigSource = new ConfigLoader(Exec.getModelManager).fromYamlFile(yamlPath.toFile).getNested("in").getNested("parser")
|
27
|
+
|
28
|
+
@Test def testParseJsonArrayXML() {
|
29
|
+
|
30
|
+
val cs = configSource
|
31
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
32
|
+
|
33
|
+
var schema: Schema = null
|
34
|
+
|
35
|
+
val plugin = new XPath2ParserPlugin()
|
36
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
37
|
+
|
38
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
39
|
+
|
40
|
+
plugin.run(
|
41
|
+
task.dump(),
|
42
|
+
schema,
|
43
|
+
new InputStreamFileInput(Exec.getBufferAllocator, new FileInputStream(new File(dataPath))),
|
44
|
+
new TestTransactionalPageOutput(schema, result)
|
45
|
+
)
|
46
|
+
|
47
|
+
println(result)
|
48
|
+
|
49
|
+
val expectedJson =
|
50
|
+
"""{
|
51
|
+
"list": [
|
52
|
+
{
|
53
|
+
"elements": [
|
54
|
+
{
|
55
|
+
"elementActive": true,
|
56
|
+
"elementName": "foo1",
|
57
|
+
"elementValue": 1
|
58
|
+
},
|
59
|
+
{
|
60
|
+
"elementActive": false,
|
61
|
+
"elementName": "foo2",
|
62
|
+
"elementValue": 2
|
63
|
+
}
|
64
|
+
]
|
65
|
+
},
|
66
|
+
{
|
67
|
+
"elements": [
|
68
|
+
{
|
69
|
+
"elementActive": true,
|
70
|
+
"elementName": "bar1",
|
71
|
+
"elementValue": 3
|
72
|
+
}
|
73
|
+
]
|
74
|
+
}
|
75
|
+
]
|
76
|
+
}"""
|
77
|
+
|
78
|
+
assertEquals(ArrayBuffer(
|
79
|
+
Map(
|
80
|
+
"id" -> 1L,
|
81
|
+
"list" -> new JsonParser().parse(expectedJson)
|
82
|
+
)
|
83
|
+
), result)
|
84
|
+
}
|
85
|
+
|
86
|
+
}
|
@@ -141,80 +141,3 @@ class XPath2ParserPluginSpec {
|
|
141
141
|
}
|
142
142
|
|
143
143
|
}
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
148
|
-
extends TransactionalPageOutput {
|
149
|
-
import org.embulk.spi.PageReader
|
150
|
-
|
151
|
-
val reader = new PageReader(schema)
|
152
|
-
|
153
|
-
override def add(page: Page) = {
|
154
|
-
reader.setPage(page)
|
155
|
-
|
156
|
-
while (reader.nextRecord()) {
|
157
|
-
val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
|
158
|
-
|
159
|
-
schema.getColumns().asScala.foreach { column =>
|
160
|
-
column.visit(new TestColumnVisitor(reader, record))
|
161
|
-
}
|
162
|
-
result += record
|
163
|
-
}
|
164
|
-
}
|
165
|
-
|
166
|
-
override def commit() = Exec.newTaskReport()
|
167
|
-
override def abort() = {}
|
168
|
-
override def finish() = {}
|
169
|
-
override def close() = {}
|
170
|
-
}
|
171
|
-
|
172
|
-
class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
|
173
|
-
override def timestampColumn(column: Column): Unit = {
|
174
|
-
if (reader.isNull(column)) {
|
175
|
-
record.put(column.getName, null)
|
176
|
-
} else {
|
177
|
-
record.put(column.getName, reader.getTimestamp(column))
|
178
|
-
}
|
179
|
-
}
|
180
|
-
|
181
|
-
override def stringColumn(column: Column): Unit = {
|
182
|
-
if (reader.isNull(column)) {
|
183
|
-
record.put(column.getName, null)
|
184
|
-
} else {
|
185
|
-
record.put(column.getName, reader.getString(column))
|
186
|
-
}
|
187
|
-
}
|
188
|
-
|
189
|
-
override def longColumn(column: Column): Unit = {
|
190
|
-
if (reader.isNull(column)) {
|
191
|
-
record.put(column.getName, null)
|
192
|
-
} else {
|
193
|
-
record.put(column.getName, reader.getLong(column))
|
194
|
-
}
|
195
|
-
}
|
196
|
-
|
197
|
-
override def doubleColumn(column: Column): Unit = {
|
198
|
-
if (reader.isNull(column)) {
|
199
|
-
record.put(column.getName, null)
|
200
|
-
} else {
|
201
|
-
record.put(column.getName, reader.getDouble(column))
|
202
|
-
}
|
203
|
-
}
|
204
|
-
|
205
|
-
override def booleanColumn(column: Column): Unit = {
|
206
|
-
if (reader.isNull(column)) {
|
207
|
-
record.put(column.getName, null)
|
208
|
-
} else {
|
209
|
-
record.put(column.getName, reader.getBoolean(column))
|
210
|
-
}
|
211
|
-
}
|
212
|
-
|
213
|
-
override def jsonColumn(column: Column): Unit = {
|
214
|
-
if (reader.isNull(column)) {
|
215
|
-
record.put(column.getName, null)
|
216
|
-
} else {
|
217
|
-
record.put(column.getName, reader.getJson(column))
|
218
|
-
}
|
219
|
-
}
|
220
|
-
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-xpath2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maji-KY
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -61,18 +61,25 @@ files:
|
|
61
61
|
- project/build.properties
|
62
62
|
- project/plugins.sbt
|
63
63
|
- src/main/scala/org/embulk/parser/xpath2/LoanPattern.scala
|
64
|
+
- src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala
|
64
65
|
- src/main/scala/org/embulk/parser/xpath2/PluginTask.scala
|
66
|
+
- src/main/scala/org/embulk/parser/xpath2/VTD.scala
|
65
67
|
- src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala
|
66
68
|
- src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
|
67
69
|
- src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
|
68
70
|
- src/test/resources/config.yml
|
69
71
|
- src/test/resources/data.xml
|
70
72
|
- src/test/resources/invalid-data.xml
|
73
|
+
- src/test/resources/json_config.yml
|
74
|
+
- src/test/resources/json_data.xml
|
75
|
+
- src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala
|
76
|
+
- src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala
|
71
77
|
- src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
|
78
|
+
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala
|
72
79
|
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
|
73
80
|
- classpath/vtd-xml-2.13.4.jar
|
74
81
|
- classpath/scala-library-2.12.4.jar
|
75
|
-
- classpath/embulk-parser-xpath2-0.1.2.jar
|
82
|
+
- classpath/embulk-parser-xpath2-0.2.0.jar
|
76
83
|
homepage: https://github.com/maji-KY/embulk-parser-xpath2
|
77
84
|
licenses:
|
78
85
|
- GPL-2.0
|