embulk-parser-xpath2 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -3
- data/README.md +101 -0
- data/benchmark/src/main/scala/org/embulk/parser/xpath2/ParseBenchmark.scala +1 -1
- data/build.gradle +1 -1
- data/src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala +139 -0
- data/src/main/scala/org/embulk/parser/xpath2/VTD.scala +12 -0
- data/src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala +12 -18
- data/src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala +25 -4
- data/src/test/resources/json_config.yml +30 -0
- data/src/test/resources/json_data.xml +27 -0
- data/src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala +53 -0
- data/src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala +31 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala +86 -0
- data/src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala +0 -77
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d739e0975d864c0920c7af2f4bdbaac2d4978925
|
4
|
+
data.tar.gz: 07d69b8ad2cefa9e25edd06c98952e4735b45498
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8edf4e7762787940f0addf22fe650e5d5912d030cac7d86d0c4bf00e70b275b641c903b6ab65088a7858a38caed425b0d0bdb99df2be51d5cfc417497ea2896e
|
7
|
+
data.tar.gz: 0ddc27b1bfab2f31e3992e008cdcedfde8c288e287409551e52a5ecbae37086582ee4b5659e8fe131f1149e97a3212d50fd38f44b05a0747e1693b85002ef8c9
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -11,6 +11,7 @@ Embulk parser plugin for parsing xml data by XPath perfectly!
|
|
11
11
|
|
12
12
|
- namespace awareness
|
13
13
|
- nullable columns
|
14
|
+
- complex json array columns (with restrictions)
|
14
15
|
|
15
16
|
## Overview
|
16
17
|
|
@@ -61,6 +62,106 @@ Then you can fetch entries from the following xml:
|
|
61
62
|
</ns2:entry>
|
62
63
|
</ns1:root>
|
63
64
|
```
|
65
|
+
|
66
|
+
## Complex json array column
|
67
|
+
|
68
|
+
### Usage
|
69
|
+
|
70
|
+
```yaml
|
71
|
+
parser:
|
72
|
+
type: xpath2
|
73
|
+
root: '/ns1:root/ns2:entry'
|
74
|
+
schema:
|
75
|
+
- { path: 'ns2:id', name: id, type: long }
|
76
|
+
- path: 'ns2:list'
|
77
|
+
name: list
|
78
|
+
type: json
|
79
|
+
structure: # add the structure key to enable a complex json array column
|
80
|
+
- path: 'ns2:list'
|
81
|
+
name: list
|
82
|
+
type: array
|
83
|
+
- path: 'ns2:list/ns2:elements'
|
84
|
+
name: elements
|
85
|
+
type: array
|
86
|
+
- path: 'ns2:list/ns2:elements/ns2:name'
|
87
|
+
name: elementName
|
88
|
+
type: string
|
89
|
+
- path: 'ns2:list/ns2:elements/ns2:value'
|
90
|
+
name: elementValue
|
91
|
+
type: long
|
92
|
+
- path: 'ns2:list/ns2:elements/ns2:active'
|
93
|
+
name: elementActive
|
94
|
+
type: boolean
|
95
|
+
namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
|
96
|
+
```
|
97
|
+
|
98
|
+
### Structure configuration
|
99
|
+
- **path**: specify path from the XPath of the column (string, required)
|
100
|
+
- **name**: json key name (string)
|
101
|
+
- **type**: json data type (string, required; one of array, string, long, boolean)
|
102
|
+
|
103
|
+
Then you can fetch entries from the following xml:
|
104
|
+
```xml
|
105
|
+
<?xml version="1.0"?>
|
106
|
+
<ns1:root
|
107
|
+
xmlns:ns1="http://example.com/ns1/"
|
108
|
+
xmlns:ns2="http://example.com/ns2/">
|
109
|
+
<ns2:entry>
|
110
|
+
<ns2:id>1</ns2:id>
|
111
|
+
<ns2:list>
|
112
|
+
<ns2:elements>
|
113
|
+
<ns2:name>foo1</ns2:name>
|
114
|
+
<ns2:value>1</ns2:value>
|
115
|
+
<ns2:active>true</ns2:active>
|
116
|
+
</ns2:elements>
|
117
|
+
<ns2:elements>
|
118
|
+
<ns2:name>foo2</ns2:name>
|
119
|
+
<ns2:value>2</ns2:value>
|
120
|
+
<ns2:active>false</ns2:active>
|
121
|
+
</ns2:elements>
|
122
|
+
</ns2:list>
|
123
|
+
<ns2:list>
|
124
|
+
<ns2:elements>
|
125
|
+
<ns2:name>bar1</ns2:name>
|
126
|
+
<ns2:value>3</ns2:value>
|
127
|
+
<ns2:active>true</ns2:active>
|
128
|
+
</ns2:elements>
|
129
|
+
</ns2:list>
|
130
|
+
</ns2:entry>
|
131
|
+
</ns1:root>
|
132
|
+
```
|
133
|
+
|
134
|
+
Result of the `list` column:
|
135
|
+
```json
|
136
|
+
{
|
137
|
+
"list": [
|
138
|
+
{
|
139
|
+
"elements": [
|
140
|
+
{
|
141
|
+
"elementActive": true,
|
142
|
+
"elementName": "foo1",
|
143
|
+
"elementValue": 1
|
144
|
+
},
|
145
|
+
{
|
146
|
+
"elementActive": false,
|
147
|
+
"elementName": "foo2",
|
148
|
+
"elementValue": 2
|
149
|
+
}
|
150
|
+
]
|
151
|
+
},
|
152
|
+
{
|
153
|
+
"elements": [
|
154
|
+
{
|
155
|
+
"elementActive": true,
|
156
|
+
"elementName": "bar1",
|
157
|
+
"elementValue": 3
|
158
|
+
}
|
159
|
+
]
|
160
|
+
}
|
161
|
+
]
|
162
|
+
}
|
163
|
+
```
|
164
|
+
|
64
165
|
## Build
|
65
166
|
|
66
167
|
```
|
data/build.gradle
CHANGED
@@ -0,0 +1,139 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import com.ximpleware.{AutoPilot, VTDNav}
|
4
|
+
import org.embulk.parser.xpath2.config.JsonStructureElement
|
5
|
+
import org.embulk.spi.Column
|
6
|
+
import org.msgpack.value.{Value, Variable}
|
7
|
+
|
8
|
+
import scala.annotation.tailrec
|
9
|
+
import scala.collection.JavaConverters._
|
10
|
+
import scala.collection.immutable.Queue
|
11
|
+
|
12
|
+
sealed trait Direction
|
13
|
+
case object Parent extends Direction
|
14
|
+
case object Sibling extends Direction
|
15
|
+
case object Child extends Direction
|
16
|
+
|
17
|
+
private case class Path(depth: Int, pathFragments: Seq[String], moveDirection: Direction) {
|
18
|
+
|
19
|
+
def next(depth: Int, elementName: String): Path = {
|
20
|
+
if (this.depth > depth) {
|
21
|
+
val (rest :+ _) = pathFragments
|
22
|
+
Path(depth, rest, Parent)
|
23
|
+
} else if (this.depth == depth) {
|
24
|
+
val (rest :+ _) = pathFragments
|
25
|
+
Path(depth, rest :+ elementName, Sibling)
|
26
|
+
} else {
|
27
|
+
Path(depth, pathFragments :+ elementName, Child)
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
val path: String = pathFragments.mkString("/")
|
32
|
+
|
33
|
+
}
|
34
|
+
|
35
|
+
object MsgPackEncoder {
|
36
|
+
|
37
|
+
def encode(nav: VTDNav, columnAp: AutoPilot, column: Column, maybeStructure: Option[Seq[JsonStructureElement]]): Value = maybeStructure.map { structure =>
|
38
|
+
// complex json array
|
39
|
+
val keyValues = Iterator.continually(columnAp.evalXPath()).takeWhile(_ != -1).flatMap { _ =>
|
40
|
+
VTD.withinContext(nav) {
|
41
|
+
constructJsonMap(nav, columnAp, column, structure).toSeq
|
42
|
+
}
|
43
|
+
}
|
44
|
+
val mergedMap = keyValues.toSeq.groupBy { case (k, _) => k }.map { case (k, v)=>
|
45
|
+
val mergedValues = v.flatMap {
|
46
|
+
case (_, x: Seq[Any]) => x
|
47
|
+
case _ => sys.error("Root element supports array only. Please reconsider the configuration.")
|
48
|
+
}
|
49
|
+
(k, mergedValues)
|
50
|
+
}
|
51
|
+
convertToValue(mergedMap)
|
52
|
+
} getOrElse {
|
53
|
+
// simple string[]
|
54
|
+
@tailrec
|
55
|
+
def eachJsonValue(cAp: AutoPilot, queue: Queue[Value]): Queue[Value] = if (cAp.evalXPath() != -1) {
|
56
|
+
val index = nav.getText
|
57
|
+
val nextQueue = if (index != -1) queue :+ new Variable().setStringValue(nav.toString(index)).asStringValue() else queue
|
58
|
+
eachJsonValue(cAp, nextQueue)
|
59
|
+
} else queue
|
60
|
+
asArrayValue(eachJsonValue(columnAp, Queue.empty[Value]))
|
61
|
+
}
|
62
|
+
|
63
|
+
private def constructJsonMap(nav: VTDNav, columnAp: AutoPilot, column: Column, structure: Seq[JsonStructureElement]): Map[String, Any] = {
|
64
|
+
|
65
|
+
@tailrec
|
66
|
+
def eachElement(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any]): Map[String, Any] = if (eAp.iterate()) {
|
67
|
+
val current = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
|
68
|
+
if (current.moveDirection == Parent) {
|
69
|
+
obj
|
70
|
+
} else {
|
71
|
+
val updated = structure.find(_.path == current.path).map { x =>
|
72
|
+
x.`type` match {
|
73
|
+
case "array" =>
|
74
|
+
val targetArray = obj.getOrElse(x.name, Queue[Any]()).asInstanceOf[Seq[Any]]
|
75
|
+
val childStructure = eachArrayElement(eAp, current)
|
76
|
+
obj.updated(x.name, targetArray ++ childStructure)
|
77
|
+
case "string" => obj.updated(x.name, nav.toNormalizedString(nav.getText))
|
78
|
+
case "long" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toLong)
|
79
|
+
case "boolean" => obj.updated(x.name, nav.toNormalizedString(nav.getText).toBoolean)
|
80
|
+
case notSupported@_ => sys.error(s"type=$notSupported is notSupported")
|
81
|
+
}
|
82
|
+
}
|
83
|
+
eachElement(eAp, current, updated.getOrElse(obj))
|
84
|
+
}
|
85
|
+
} else obj
|
86
|
+
|
87
|
+
def isArrayElement(current: Path): Boolean =
|
88
|
+
structure.exists(x => x.path == current.path && x.`type` == "array")
|
89
|
+
|
90
|
+
def eachArrayElement(eAp: AutoPilot, previousPath: Path): Seq[Map[String, Any]] = {
|
91
|
+
@tailrec
|
92
|
+
def loop(eAp: AutoPilot, previousPath: Path, obj: Map[String, Any], elements: Seq[Map[String, Any]]): Seq[Map[String, Any]] = {
|
93
|
+
val arrayContent = eachElement(eAp, previousPath, obj)
|
94
|
+
val currentPath = previousPath.next(nav.getCurrentDepth, nav.toString(nav.getCurrentIndex))
|
95
|
+
if (isArrayElement(currentPath)) {
|
96
|
+
loop(eAp, currentPath, obj, elements :+ arrayContent)
|
97
|
+
} else elements :+ arrayContent
|
98
|
+
}
|
99
|
+
loop(eAp, previousPath, Map.empty[String, Any], Queue.empty[Map[String, Any]])
|
100
|
+
}
|
101
|
+
|
102
|
+
val eachElementAp = new AutoPilot(nav)
|
103
|
+
eachElementAp.selectElement("*")
|
104
|
+
|
105
|
+
val initialPath = Path(-1, Vector.empty, Sibling)
|
106
|
+
|
107
|
+
eachElement(eachElementAp, initialPath, Map[String, Any]())
|
108
|
+
}
|
109
|
+
|
110
|
+
private def convertToValue(obj: Map[String, Any]): Value = {
|
111
|
+
val map = obj.map {
|
112
|
+
case (k, v: Seq[_]) => (asStringValue(k), convertToValue(v))
|
113
|
+
case (k, v: Map[_, _]) => (asStringValue(k), convertToValue(v.asInstanceOf[Map[String, Any]]))
|
114
|
+
case (k, v: String) => (asStringValue(k), asStringValue(v))
|
115
|
+
case (k, v: Boolean) => (asStringValue(k), asBooleanValue(v))
|
116
|
+
case (k, v: Long) => (asStringValue(k), asLongValue(v))
|
117
|
+
case (k, v) => sys.error(s"can't convert: key=$k, value=$v")
|
118
|
+
}
|
119
|
+
asMapValue(map)
|
120
|
+
}
|
121
|
+
|
122
|
+
private def convertToValue(seq: Seq[Any]): Value = {
|
123
|
+
val list = seq.map {
|
124
|
+
case v: Seq[_] => convertToValue(v)
|
125
|
+
case v: Map[_, _] => convertToValue(v.asInstanceOf[Map[String, Any]])
|
126
|
+
case v: String => asStringValue(v)
|
127
|
+
case v: Boolean => asBooleanValue(v)
|
128
|
+
case v: Long => asLongValue(v)
|
129
|
+
}
|
130
|
+
asArrayValue(list)
|
131
|
+
}
|
132
|
+
|
133
|
+
private final def asStringValue(value: String): Value = new Variable().setStringValue(value).asStringValue()
|
134
|
+
private final def asBooleanValue(value: Boolean): Value = new Variable().setBooleanValue(value).asBooleanValue()
|
135
|
+
private final def asLongValue(value: Long): Value = new Variable().setIntegerValue(value).asNumberValue()
|
136
|
+
private final def asArrayValue(value: Seq[Value]): Value = new Variable().setArrayValue(value.asJava).asArrayValue()
|
137
|
+
private final def asMapValue(value: Map[Value, Value]): Value = new Variable().setMapValue(value.asJava).asMapValue()
|
138
|
+
|
139
|
+
}
|
@@ -3,12 +3,11 @@ package org.embulk.parser.xpath2
|
|
3
3
|
import com.google.common.io.ByteStreams
|
4
4
|
import com.ximpleware.{AutoPilot, VTDGen, VTDNav}
|
5
5
|
import org.embulk.config._
|
6
|
-
import org.embulk.parser.xpath2.config.ColumnConfig
|
6
|
+
import org.embulk.parser.xpath2.config.{ColumnConfig, JsonStructureElement}
|
7
7
|
import org.embulk.spi._
|
8
8
|
import org.embulk.spi.`type`._
|
9
9
|
import org.embulk.spi.time.TimestampParser
|
10
10
|
import org.embulk.spi.util.FileInputInputStream
|
11
|
-
import org.msgpack.value.{Value, Variable}
|
12
11
|
import org.slf4j.Logger
|
13
12
|
|
14
13
|
import scala.annotation.tailrec
|
@@ -33,7 +32,10 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
33
32
|
val stopOnInvalidRecord: Boolean = task.getStopOnInvalidRecord
|
34
33
|
|
35
34
|
val timestampParsers: Map[String, TimestampParser] = task.getSchema.columns.asScala
|
36
|
-
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
35
|
+
.collect { case ColumnConfig(_, name, _, Some(timestampColumnOption), _, _) => (name, new TimestampParser(task, timestampColumnOption)) }.toMap
|
36
|
+
|
37
|
+
val jsonStructures: Map[String, Seq[JsonStructureElement]] = task.getSchema.columns.asScala
|
38
|
+
.collect { case ColumnConfig(_, name, _, _, Some(jsonColumnOption), _) => (name, jsonColumnOption.structure.asScala) }.toMap
|
37
39
|
|
38
40
|
def declareXPathNS(ap: AutoPilot): Unit = {
|
39
41
|
task.getNamespaces.conf.asScala.foreach { case (prefix, namespaceURI) =>
|
@@ -67,11 +69,11 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
67
69
|
nav.push()
|
68
70
|
try {
|
69
71
|
columnElementAutoPilots.zipWithIndex.foreach { case (columnElementAutoPilot, idx) =>
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
VTD.withinContext(nav) {
|
73
|
+
columnElementAutoPilot.resetXPath()
|
74
|
+
val column = schema.getColumn(idx)
|
75
|
+
handleColumn(pb, nav, columnElementAutoPilot, column, timestampParsers, jsonStructures)
|
76
|
+
}
|
75
77
|
}
|
76
78
|
pb.addRecord()
|
77
79
|
} catch {
|
@@ -95,17 +97,9 @@ class XPath2ParserPlugin extends ParserPlugin {
|
|
95
97
|
}
|
96
98
|
}
|
97
99
|
|
98
|
-
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser]): Unit = {
|
100
|
+
final def handleColumn(pb: PageBuilder, nav: VTDNav, columnAp: AutoPilot, column: Column, timestampParsers: Map[String, TimestampParser], jsonStructures: Map[String, Seq[JsonStructureElement]]): Unit = {
|
99
101
|
if (column.getType.isInstanceOf[JsonType]) {
|
100
|
-
val
|
101
|
-
@tailrec
|
102
|
-
def eachJsonValue(cAp: AutoPilot): Unit = if (cAp.evalXPath() != -1) {
|
103
|
-
val index = nav.getText
|
104
|
-
if (index != -1) list.add(new Variable().setStringValue(nav.toString(index)).asStringValue())
|
105
|
-
eachJsonValue(cAp)
|
106
|
-
}
|
107
|
-
eachJsonValue(columnAp)
|
108
|
-
val jsonValue = new Variable().setArrayValue(list).asArrayValue()
|
102
|
+
val jsonValue = MsgPackEncoder.encode(nav, columnAp, column, jsonStructures.get(column.getName))
|
109
103
|
pb.setJson(column, jsonValue)
|
110
104
|
} else {
|
111
105
|
if (columnAp.evalXPath() == -1) {
|
@@ -2,10 +2,10 @@ package org.embulk.parser.xpath2.config
|
|
2
2
|
|
3
3
|
import java.util
|
4
4
|
|
5
|
-
import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue}
|
5
|
+
import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty, JsonValue}
|
6
6
|
import com.google.common.base.Optional
|
7
7
|
import org.embulk.config.{Config, ConfigDefault, ConfigSource}
|
8
|
-
import org.embulk.spi.`type`.{TimestampType, Type}
|
8
|
+
import org.embulk.spi.`type`.{JsonType, TimestampType, Type}
|
9
9
|
import org.embulk.spi.time.TimestampParser.TimestampColumnOption
|
10
10
|
import org.joda.time.DateTimeZone
|
11
11
|
|
@@ -14,11 +14,18 @@ case class SchemaConfig @JsonCreator()(columns: java.util.List[ColumnConfig]) {
|
|
14
14
|
def getColumns: util.List[ColumnConfig] = columns
|
15
15
|
}
|
16
16
|
|
17
|
-
case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], option: ConfigSource) {
|
17
|
+
case class ColumnConfig(path: String, name: String, `type`: Type, timestampOption: Option[TimestampColumnOption], jsonOption: Option[JsonColumnOption], option: ConfigSource) {
|
18
18
|
|
19
19
|
@JsonCreator()
|
20
20
|
def this(src: ConfigSource) = {
|
21
|
-
this(
|
21
|
+
this(
|
22
|
+
src.get(classOf[String], "path"),
|
23
|
+
src.get(classOf[String], "name"),
|
24
|
+
src.get(classOf[Type], "type"),
|
25
|
+
ColumnConfig.getTimestampOption(src, src.get(classOf[Type], "type")),
|
26
|
+
ColumnConfig.getJsonOption(src, src.get(classOf[Type], "type")),
|
27
|
+
src
|
28
|
+
)
|
22
29
|
}
|
23
30
|
|
24
31
|
@JsonValue()
|
@@ -49,11 +56,25 @@ private class TimestampColumnOptionImpl(timezone: Optional[DateTimeZone], format
|
|
49
56
|
override val getDate = date
|
50
57
|
}
|
51
58
|
|
59
|
+
class JsonStructureElement(@JsonProperty("path") val path: String, @JsonProperty("type") val `type`: String) {
|
60
|
+
@JsonProperty("name")
|
61
|
+
val name: String = path
|
62
|
+
}
|
63
|
+
|
64
|
+
case class JsonColumnOption(@JsonProperty("structure") structure: java.util.List[JsonStructureElement])
|
65
|
+
|
52
66
|
object ColumnConfig {
|
53
67
|
private def getTimestampOption(src: ConfigSource, `type`: Type): Option[TimestampColumnOption] = `type` match {
|
54
68
|
case _: TimestampType => Some(getOption(src).loadConfig(classOf[TimestampColumnOptionImpl]))
|
55
69
|
case _ => None
|
56
70
|
}
|
57
71
|
|
72
|
+
private def getJsonOption(src: ConfigSource, `type`: Type): Option[JsonColumnOption] = `type` match {
|
73
|
+
case _: JsonType =>
|
74
|
+
val option = getOption(src)
|
75
|
+
if (option.has("structure")) Some(option.loadConfig(classOf[JsonColumnOption])) else None
|
76
|
+
case _ => None
|
77
|
+
}
|
78
|
+
|
58
79
|
private def getOption(src: ConfigSource) = src.deepCopy().remove("path").remove("name").remove("type")
|
59
80
|
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/data.xml
|
4
|
+
parser:
|
5
|
+
type: xpath2
|
6
|
+
root: '/ns1:root/ns2:entry'
|
7
|
+
schema:
|
8
|
+
- { path: 'ns2:id', name: id, type: long }
|
9
|
+
- path: 'ns2:list'
|
10
|
+
name: list
|
11
|
+
type: json
|
12
|
+
structure:
|
13
|
+
- path: 'ns2:list'
|
14
|
+
name: list
|
15
|
+
type: array
|
16
|
+
- path: 'ns2:list/ns2:elements'
|
17
|
+
name: elements
|
18
|
+
type: array
|
19
|
+
- path: 'ns2:list/ns2:elements/ns2:name'
|
20
|
+
name: elementName
|
21
|
+
type: string
|
22
|
+
- path: 'ns2:list/ns2:elements/ns2:value'
|
23
|
+
name: elementValue
|
24
|
+
type: long
|
25
|
+
- path: 'ns2:list/ns2:elements/ns2:active'
|
26
|
+
name: elementActive
|
27
|
+
type: boolean
|
28
|
+
namespaces: {ns1: 'http://example.com/ns1/', ns2: 'http://example.com/ns2/'}
|
29
|
+
out:
|
30
|
+
type: stdout
|
@@ -0,0 +1,27 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<ns1:root
|
3
|
+
xmlns:ns1="http://example.com/ns1/"
|
4
|
+
xmlns:ns2="http://example.com/ns2/">
|
5
|
+
<ns2:entry>
|
6
|
+
<ns2:id>1</ns2:id>
|
7
|
+
<ns2:list>
|
8
|
+
<ns2:elements>
|
9
|
+
<ns2:name>foo1</ns2:name>
|
10
|
+
<ns2:value>1</ns2:value>
|
11
|
+
<ns2:active>true</ns2:active>
|
12
|
+
</ns2:elements>
|
13
|
+
<ns2:elements>
|
14
|
+
<ns2:name>foo2</ns2:name>
|
15
|
+
<ns2:value>2</ns2:value>
|
16
|
+
<ns2:active>false</ns2:active>
|
17
|
+
</ns2:elements>
|
18
|
+
</ns2:list>
|
19
|
+
<ns2:list>
|
20
|
+
<ns2:elements>
|
21
|
+
<ns2:name>bar1</ns2:name>
|
22
|
+
<ns2:value>3</ns2:value>
|
23
|
+
<ns2:active>true</ns2:active>
|
24
|
+
</ns2:elements>
|
25
|
+
</ns2:list>
|
26
|
+
</ns2:entry>
|
27
|
+
</ns1:root>
|
@@ -0,0 +1,53 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import org.embulk.spi.{Column, ColumnVisitor, PageReader}
|
4
|
+
|
5
|
+
class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
|
6
|
+
override def timestampColumn(column: Column): Unit = {
|
7
|
+
if (reader.isNull(column)) {
|
8
|
+
record.put(column.getName, null)
|
9
|
+
} else {
|
10
|
+
record.put(column.getName, reader.getTimestamp(column))
|
11
|
+
}
|
12
|
+
}
|
13
|
+
|
14
|
+
override def stringColumn(column: Column): Unit = {
|
15
|
+
if (reader.isNull(column)) {
|
16
|
+
record.put(column.getName, null)
|
17
|
+
} else {
|
18
|
+
record.put(column.getName, reader.getString(column))
|
19
|
+
}
|
20
|
+
}
|
21
|
+
|
22
|
+
override def longColumn(column: Column): Unit = {
|
23
|
+
if (reader.isNull(column)) {
|
24
|
+
record.put(column.getName, null)
|
25
|
+
} else {
|
26
|
+
record.put(column.getName, reader.getLong(column))
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
30
|
+
override def doubleColumn(column: Column): Unit = {
|
31
|
+
if (reader.isNull(column)) {
|
32
|
+
record.put(column.getName, null)
|
33
|
+
} else {
|
34
|
+
record.put(column.getName, reader.getDouble(column))
|
35
|
+
}
|
36
|
+
}
|
37
|
+
|
38
|
+
override def booleanColumn(column: Column): Unit = {
|
39
|
+
if (reader.isNull(column)) {
|
40
|
+
record.put(column.getName, null)
|
41
|
+
} else {
|
42
|
+
record.put(column.getName, reader.getBoolean(column))
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
override def jsonColumn(column: Column): Unit = {
|
47
|
+
if (reader.isNull(column)) {
|
48
|
+
record.put(column.getName, null)
|
49
|
+
} else {
|
50
|
+
record.put(column.getName, reader.getJson(column))
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import org.embulk.spi.{Exec, Page, Schema, TransactionalPageOutput}
|
4
|
+
|
5
|
+
import scala.collection.mutable
|
6
|
+
import scala.collection.JavaConverters._
|
7
|
+
|
8
|
+
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
9
|
+
extends TransactionalPageOutput {
|
10
|
+
import org.embulk.spi.PageReader
|
11
|
+
|
12
|
+
val reader = new PageReader(schema)
|
13
|
+
|
14
|
+
override def add(page: Page) = {
|
15
|
+
reader.setPage(page)
|
16
|
+
|
17
|
+
while (reader.nextRecord()) {
|
18
|
+
val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
|
19
|
+
|
20
|
+
schema.getColumns().asScala.foreach { column =>
|
21
|
+
column.visit(new TestColumnVisitor(reader, record))
|
22
|
+
}
|
23
|
+
result += record
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
override def commit() = Exec.newTaskReport()
|
28
|
+
override def abort() = {}
|
29
|
+
override def finish() = {}
|
30
|
+
override def close() = {}
|
31
|
+
}
|
@@ -0,0 +1,86 @@
|
|
1
|
+
package org.embulk.parser.xpath2
|
2
|
+
|
3
|
+
import java.io.{File, FileInputStream}
|
4
|
+
import java.nio.file
|
5
|
+
import java.nio.file.Paths
|
6
|
+
|
7
|
+
import org.embulk.EmbulkTestRuntime
|
8
|
+
import org.embulk.config.{ConfigLoader, ConfigSource, TaskSource}
|
9
|
+
import org.embulk.spi.json.JsonParser
|
10
|
+
import org.embulk.spi.util.InputStreamFileInput
|
11
|
+
import org.embulk.spi.{Exec, _}
|
12
|
+
import org.junit.Assert._
|
13
|
+
import org.junit.{Rule, Test}
|
14
|
+
|
15
|
+
import scala.collection.mutable
|
16
|
+
import scala.collection.mutable.ArrayBuffer
|
17
|
+
|
18
|
+
class XPath2ParserPluginJsonSpec {
|
19
|
+
|
20
|
+
@Rule
|
21
|
+
def runtime = new EmbulkTestRuntime
|
22
|
+
|
23
|
+
val yamlPath: file.Path = Paths.get(classOf[XPath2ParserPlugin].getClassLoader.getResource("json_config.yml").toURI)
|
24
|
+
val dataPath: String = classOf[XPath2ParserPlugin].getClassLoader.getResource("json_data.xml").getPath
|
25
|
+
|
26
|
+
def configSource: ConfigSource = new ConfigLoader(Exec.getModelManager).fromYamlFile(yamlPath.toFile).getNested("in").getNested("parser")
|
27
|
+
|
28
|
+
@Test def testParseJsonArrayXML() {
|
29
|
+
|
30
|
+
val cs = configSource
|
31
|
+
val task = cs.loadConfig(classOf[PluginTask])
|
32
|
+
|
33
|
+
var schema: Schema = null
|
34
|
+
|
35
|
+
val plugin = new XPath2ParserPlugin()
|
36
|
+
plugin.transaction(cs, (_: TaskSource, s: Schema) => {schema = s})
|
37
|
+
|
38
|
+
val result: mutable.Buffer[collection.mutable.Map[String, Any]] = mutable.Buffer()
|
39
|
+
|
40
|
+
plugin.run(
|
41
|
+
task.dump(),
|
42
|
+
schema,
|
43
|
+
new InputStreamFileInput(Exec.getBufferAllocator, new FileInputStream(new File(dataPath))),
|
44
|
+
new TestTransactionalPageOutput(schema, result)
|
45
|
+
)
|
46
|
+
|
47
|
+
println(result)
|
48
|
+
|
49
|
+
val expectedJson =
|
50
|
+
"""{
|
51
|
+
"list": [
|
52
|
+
{
|
53
|
+
"elements": [
|
54
|
+
{
|
55
|
+
"elementActive": true,
|
56
|
+
"elementName": "foo1",
|
57
|
+
"elementValue": 1
|
58
|
+
},
|
59
|
+
{
|
60
|
+
"elementActive": false,
|
61
|
+
"elementName": "foo2",
|
62
|
+
"elementValue": 2
|
63
|
+
}
|
64
|
+
]
|
65
|
+
},
|
66
|
+
{
|
67
|
+
"elements": [
|
68
|
+
{
|
69
|
+
"elementActive": true,
|
70
|
+
"elementName": "bar1",
|
71
|
+
"elementValue": 3
|
72
|
+
}
|
73
|
+
]
|
74
|
+
}
|
75
|
+
]
|
76
|
+
}"""
|
77
|
+
|
78
|
+
assertEquals(ArrayBuffer(
|
79
|
+
Map(
|
80
|
+
"id" -> 1L,
|
81
|
+
"list" -> new JsonParser().parse(expectedJson)
|
82
|
+
)
|
83
|
+
), result)
|
84
|
+
}
|
85
|
+
|
86
|
+
}
|
@@ -141,80 +141,3 @@ class XPath2ParserPluginSpec {
|
|
141
141
|
}
|
142
142
|
|
143
143
|
}
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
class TestTransactionalPageOutput(schema: Schema, result: mutable.Buffer[collection.mutable.Map[String, Any]])
|
148
|
-
extends TransactionalPageOutput {
|
149
|
-
import org.embulk.spi.PageReader
|
150
|
-
|
151
|
-
val reader = new PageReader(schema)
|
152
|
-
|
153
|
-
override def add(page: Page) = {
|
154
|
-
reader.setPage(page)
|
155
|
-
|
156
|
-
while (reader.nextRecord()) {
|
157
|
-
val record: collection.mutable.Map[String, Any] = collection.mutable.Map()
|
158
|
-
|
159
|
-
schema.getColumns().asScala.foreach { column =>
|
160
|
-
column.visit(new TestColumnVisitor(reader, record))
|
161
|
-
}
|
162
|
-
result += record
|
163
|
-
}
|
164
|
-
}
|
165
|
-
|
166
|
-
override def commit() = Exec.newTaskReport()
|
167
|
-
override def abort() = {}
|
168
|
-
override def finish() = {}
|
169
|
-
override def close() = {}
|
170
|
-
}
|
171
|
-
|
172
|
-
class TestColumnVisitor(reader: PageReader, record: collection.mutable.Map[String, Any]) extends ColumnVisitor {
|
173
|
-
override def timestampColumn(column: Column): Unit = {
|
174
|
-
if (reader.isNull(column)) {
|
175
|
-
record.put(column.getName, null)
|
176
|
-
} else {
|
177
|
-
record.put(column.getName, reader.getTimestamp(column))
|
178
|
-
}
|
179
|
-
}
|
180
|
-
|
181
|
-
override def stringColumn(column: Column): Unit = {
|
182
|
-
if (reader.isNull(column)) {
|
183
|
-
record.put(column.getName, null)
|
184
|
-
} else {
|
185
|
-
record.put(column.getName, reader.getString(column))
|
186
|
-
}
|
187
|
-
}
|
188
|
-
|
189
|
-
override def longColumn(column: Column): Unit = {
|
190
|
-
if (reader.isNull(column)) {
|
191
|
-
record.put(column.getName, null)
|
192
|
-
} else {
|
193
|
-
record.put(column.getName, reader.getLong(column))
|
194
|
-
}
|
195
|
-
}
|
196
|
-
|
197
|
-
override def doubleColumn(column: Column): Unit = {
|
198
|
-
if (reader.isNull(column)) {
|
199
|
-
record.put(column.getName, null)
|
200
|
-
} else {
|
201
|
-
record.put(column.getName, reader.getDouble(column))
|
202
|
-
}
|
203
|
-
}
|
204
|
-
|
205
|
-
override def booleanColumn(column: Column): Unit = {
|
206
|
-
if (reader.isNull(column)) {
|
207
|
-
record.put(column.getName, null)
|
208
|
-
} else {
|
209
|
-
record.put(column.getName, reader.getBoolean(column))
|
210
|
-
}
|
211
|
-
}
|
212
|
-
|
213
|
-
override def jsonColumn(column: Column): Unit = {
|
214
|
-
if (reader.isNull(column)) {
|
215
|
-
record.put(column.getName, null)
|
216
|
-
} else {
|
217
|
-
record.put(column.getName, reader.getJson(column))
|
218
|
-
}
|
219
|
-
}
|
220
|
-
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-xpath2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- maji-KY
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -61,18 +61,25 @@ files:
|
|
61
61
|
- project/build.properties
|
62
62
|
- project/plugins.sbt
|
63
63
|
- src/main/scala/org/embulk/parser/xpath2/LoanPattern.scala
|
64
|
+
- src/main/scala/org/embulk/parser/xpath2/MsgPackEncoder.scala
|
64
65
|
- src/main/scala/org/embulk/parser/xpath2/PluginTask.scala
|
66
|
+
- src/main/scala/org/embulk/parser/xpath2/VTD.scala
|
65
67
|
- src/main/scala/org/embulk/parser/xpath2/XPath2ParserPlugin.scala
|
66
68
|
- src/main/scala/org/embulk/parser/xpath2/config/NamespacesConfig.scala
|
67
69
|
- src/main/scala/org/embulk/parser/xpath2/config/SchemaConfig.scala
|
68
70
|
- src/test/resources/config.yml
|
69
71
|
- src/test/resources/data.xml
|
70
72
|
- src/test/resources/invalid-data.xml
|
73
|
+
- src/test/resources/json_config.yml
|
74
|
+
- src/test/resources/json_data.xml
|
75
|
+
- src/test/scala/org/embulk/parser/xpath2/TestColumnVisitor.scala
|
76
|
+
- src/test/scala/org/embulk/parser/xpath2/TestTransactionalPageOutput.scala
|
71
77
|
- src/test/scala/org/embulk/parser/xpath2/UnitSpec.scala
|
78
|
+
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginJsonSpec.scala
|
72
79
|
- src/test/scala/org/embulk/parser/xpath2/XPath2ParserPluginSpec.scala
|
73
80
|
- classpath/vtd-xml-2.13.4.jar
|
74
81
|
- classpath/scala-library-2.12.4.jar
|
75
|
-
- classpath/embulk-parser-xpath2-0.
|
82
|
+
- classpath/embulk-parser-xpath2-0.2.0.jar
|
76
83
|
homepage: https://github.com/maji-KY/embulk-parser-xpath2
|
77
84
|
licenses:
|
78
85
|
- GPL-2.0
|