embulk-parser-twitter_ads_stats 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.scalafmt.conf +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +79 -0
- data/build.gradle +79 -0
- data/build.sbt +15 -0
- data/classpath/embulk-parser-twitter_ads_stats-0.1.0.jar +0 -0
- data/classpath/scala-library-2.12.3.jar +0 -0
- data/classpath/spray-json_2.12-1.3.3.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +5 -0
- data/gradlew +172 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/guess/twitter_ads_stats.rb +61 -0
- data/lib/embulk/parser/twitter_ads_stats.rb +3 -0
- data/project/Dependencies.scala +11 -0
- data/project/build.properties +1 -0
- data/project/plugins.sbt +2 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/Column.scala +50 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/LoanPattern.scala +19 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/MetricElementNames.scala +47 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/MetricsGroupJson.scala +19 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/ParseException.scala +27 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/PluginTask.scala +9 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/TwitterAdsStatsParserPlugin.scala +98 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/Data.scala +19 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/FieldNameUtil.scala +9 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/IDData.scala +36 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/Metrics.scala +23 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/Request.scala +42 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/Root.scala +21 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/define/RootJson.scala +107 -0
- data/src/main/scala/org/embulk/parser/twitter_ads_stats/package.scala +192 -0
- data/src/test/resources/test.json +759 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/ColumnSpec.scala +33 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/MetricElementNamesSpec.scala +14 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/MetricsGroupJsonSpec.scala +20 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/UnitSpec.scala +5 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/define/MetricsJsonSpec.scala +57 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/define/ParamsSpec.scala +26 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/define/RootJsonSpec.scala +20 -0
- data/src/test/scala/org/embulk/parser/twitter_ads_stats/define/RootSpec.scala +356 -0
- metadata +114 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
module Embulk
  module Guess
    # Placeholder for the "twitter_ads_stats" guess plugin. Nothing is
    # registered yet: every class below is a commented-out template.
    #
    # TODO implement guess plugin to make this command work:
    #      $ embulk guess -g "twitter_ads_stats" partial-config.yml
    #
    #      Depending on the file format the plugin uses, you can choose
    #      one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
    #      or line guess (LineGuessPlugin).

    # Template 1: binary guess.
    #
    # class TwitterAdsStats < GuessPlugin
    #   Plugin.register_guess("twitter_ads_stats", self)
    #
    #   def guess(config, sample_buffer)
    #     if sample_buffer[0,2] == GZIP_HEADER
    #       guessed = {}
    #       guessed["type"] = "twitter_ads_stats"
    #       guessed["property1"] = "guessed-value"
    #       return {"parser" => guessed}
    #     else
    #       return {}
    #     end
    #   end
    # end

    # Template 2: text guess.
    #
    # class TwitterAdsStats < TextGuessPlugin
    #   Plugin.register_guess("twitter_ads_stats", self)
    #
    #   def guess_text(config, sample_text)
    #     js = JSON.parse(sample_text) rescue nil
    #     if js && js["mykeyword"] == "keyword"
    #       guessed = {}
    #       guessed["type"] = "twitter_ads_stats"
    #       guessed["property1"] = "guessed-value"
    #       return {"parser" => guessed}
    #     else
    #       return {}
    #     end
    #   end
    # end

    # Template 3: line guess.
    #
    # class TwitterAdsStats < LineGuessPlugin
    #   Plugin.register_guess("twitter_ads_stats", self)
    #
    #   def guess_lines(config, sample_lines)
    #     all_line_matched = sample_lines.all? do |line|
    #       line =~ /mypattern/
    #     end
    #     if all_line_matched
    #       guessed = {}
    #       guessed["type"] = "twitter_ads_stats"
    #       guessed["property1"] = "guessed-value"
    #       return {"parser" => guessed}
    #     else
    #       return {}
    #     end
    #   end
    # end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
import sbt._
|
2
|
+
|
3
|
+
object Dependencies {

  // Single Embulk version shared by the core artifact and its "tests" classifier artifact.
  private val embulkVersion = "0.8.35"

  /** Library dependencies declared for the build. */
  val value = Seq(
    "org.scalatest" %% "scalatest"   % "3.0.1" % Test,
    "org.embulk"    %  "embulk-core" % embulkVersion,
    "org.embulk"    %  "embulk-core" % embulkVersion classifier "tests",
    "io.spray"      %% "spray-json"  % "1.3.3"
  )
}
|
@@ -0,0 +1 @@
|
|
1
|
+
sbt.version=0.13.16
|
data/project/plugins.sbt
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
import java.time.LocalDate
|
4
|
+
|
5
|
+
import org.embulk.spi.`type`.Types
|
6
|
+
import org.embulk.spi.{Column => EmbulkColumn}
|
7
|
+
|
8
|
+
object Column {

  /**
    * Builds the Embulk schema columns for the given metric element names.
    *
    * The first four columns are the fixed STRING columns "id", "date", "segment" and
    * "placement" (indices 0 to 3). They are followed by one JSON column per metrics
    * group name, in sorted order, with indices continuing from 4.
    *
    * @param metricElementNames source of the metrics group names
    * @return the fixed columns followed by the metrics group columns
    */
  def createEmbulkColumns(metricElementNames: MetricElementNames): Seq[EmbulkColumn] = {
    val fixedColumns = List("id", "date", "segment", "placement").zipWithIndex.map {
      case (columnName, index) => new EmbulkColumn(index, columnName, Types.STRING)
    }

    // Metrics group columns are numbered right after the fixed ones.
    val firstMetricsIndex = fixedColumns.size
    val metricsColumns = metricElementNames.getSortedMetricsGroupNames.zipWithIndex.map {
      case (groupName, offset) => new EmbulkColumn(firstMetricsIndex + offset, groupName, Types.JSON)
    }

    fixedColumns ++ metricsColumns
  }
}

/**
  * One output record.
  *
  * @param id           value written to the "id" column
  * @param date         value written to the "date" column
  * @param segment      value written to the "segment" column, if any
  * @param placement    value written to the "placement" column
  * @param metricsGroup metrics groups keyed by metrics group name
  */
case class Column(
    id: String,
    date: LocalDate,
    segment: Option[String],
    placement: String,
    metricsGroup: Map[String, MetricsGroup]
)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
import scala.util.control.Exception.ignoring
|
4
|
+
|
5
|
+
/**
  * Loan pattern: runs a function against a resource and always closes the resource afterwards.
  */
object LoanPattern {

  // `close()` is invoked through the structural type below, which is a reflective call.
  import scala.language.reflectiveCalls

  /** Anything that exposes a parameterless `close()` method. */
  type Closable = { def close(): Unit }

  /**
    * Applies `f` to `resource` and closes the resource whether or not `f` succeeds.
    *
    * Closing is best-effort: an `Exception` thrown by `close()` is suppressed so that it
    * cannot mask the result (or the failure) of `f`. JVM `Error`s such as
    * `OutOfMemoryError` or `LinkageError` are not suppressed and propagate to the caller.
    *
    * @param resource the resource to lend to `f`
    * @param f        the computation that uses the resource
    * @tparam R the resource type
    * @tparam A the result type of `f`
    * @return the value produced by `f`
    */
  def apply[R <: Closable, A](resource: R)(f: R => A): A = {
    try {
      f(resource)
    } finally {
      // Only ordinary exceptions are ignored; swallowing every Throwable would hide fatal JVM errors.
      ignoring(classOf[Exception]) apply {
        resource.close()
      }
    }
  }

}
|
@@ -0,0 +1,47 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
import org.embulk.parser.twitter_ads_stats.define.Metrics
|
4
|
+
import spray.json.{JsObject, JsValue}
|
5
|
+
|
6
|
+
/**
  * Names of the metric elements, grouped by metrics group.
  *
  * @param names key: metrics group name, value: the metric element names of that group
  *              (example)
  *              Map(
  *                "engagement"     -> Seq("engagements", "impressions"),
  *                "web_conversion" -> Seq("conversion_purchases.assisted", "conversion_sign_ups.assisted")
  *              )
  */
case class MetricElementNames(names: Map[String, Seq[String]]) {

  import MetricElementNames._

  /** Returns the metrics group names (the keys of `names`) in sorted order. */
  def getSortedMetricsGroupNames: List[String] = names.keys.toList.sorted

  /**
    * Builds a [[Metrics]] holding one time series per configured metric element.
    *
    * Each element name is split into path segments. The segments joined with the separator
    * form the key, and `resolveMetricTimeSeries` is called with the segments and `Some(json)`
    * to produce the value.
    *
    * @param resolveMetricTimeSeries looks up a time series from path segments and a JSON value
    * @param json                    the JSON object passed on to `resolveMetricTimeSeries`
    */
  private[twitter_ads_stats] def resolveMetrics(
      resolveMetricTimeSeries: (List[String], Option[JsValue]) => MetricTimeSeries,
      json: JsObject
  ): Metrics = {
    val seriesByElement: Map[String, MetricTimeSeries] = names.flatMap {
      case (_, elementNames) =>
        elementNames.map { elementName =>
          val path = splitSeparator(elementName)
          path.mkString(separator) -> resolveMetricTimeSeries(path, Some(json))
        }
    }
    Metrics(seriesByElement)
  }
}
|
33
|
+
|
34
|
+
object MetricElementNames {

  /** Character sequence placed between the segments of a flattened metric element name. */
  private val separator = "_"

  /** Splits a metric element name into segments at every '.' or '+' character. */
  private def splitSeparator(name: String): List[String] =
    name.split("[.+]").toList

  // @todo keep this closed inside the scope of MetricElementNames
  // @todo exception handling
  /** Returns `name` with every '.' replaced by the separator. */
  def replaceSeparator(name: String): String =
    name.replaceAll("[.]", separator)
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
import spray.json.DefaultJsonProtocol._
|
4
|
+
import spray.json._
|
5
|
+
|
6
|
+
object MetricsGroupJson {

  /** spray-json writer that renders a `MetricsGroup` as a JSON object. */
  implicit object MetricsGroupJsonWriter extends RootJsonWriter[MetricsGroup] {

    /** A present value becomes its JSON form; an absent value becomes JSON null. */
    private def toJValue(element: Option[Long]): JsValue =
      element.fold[JsValue](JsNull)(_.toJson)

    /** Writes one JSON field per entry of `obj`, keeping the entry's key as the field name. */
    override def write(obj: MetricsGroup): JsValue = {
      val fields = obj.map { case (name, value) => name -> toJValue(value) }
      JsObject(fields)
    }
  }
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
/**
  * Base type of the failures reported while parsing.
  *
  * The message and cause are forwarded to `Throwable`, so `getMessage`, `getCause` and
  * `toString` report them instead of returning null.
  *
  * @param message description of the failure
  * @param cause   the underlying failure
  */
sealed abstract class ParseException(
    message: String,
    cause: Throwable
) extends Throwable(message, cause)

/**
  * Raised when a metric time series cannot be read.
  *
  * @param message description of the failure
  * @param cause   the underlying failure
  */
case class InvalidMetricTimeSeriesException(
    message: String,
    cause: Throwable
) extends ParseException(message, cause) {

  /** Builds the exception with a message that names the index that was looked up. */
  def this(cause: Throwable, index: Int) = {
    this(s"Not Found index: $index", cause)
  }
}

/**
  * Raised when an input file cannot be parsed.
  *
  * @param message description of the failure
  * @param cause   the underlying failure
  */
case class InvalidInputFileException(
    message: String,
    cause: Throwable
) extends ParseException(message, cause) {

  /** Builds the exception with the fixed "Input file can't parse" message. */
  def this(cause: Throwable) = {
    this("Input file can't parse", cause)
  }
}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats
|
2
|
+
|
3
|
+
import org.embulk.config._
|
4
|
+
import org.embulk.spi._
|
5
|
+
import org.embulk.spi.json.JsonParser
|
6
|
+
import org.embulk.spi.util.FileInputInputStream
|
7
|
+
import org.slf4j.Logger
|
8
|
+
import spray.json.{pimpAny, pimpString}
|
9
|
+
|
10
|
+
import scala.util.control.NonFatal
|
11
|
+
import scala.collection.JavaConverters._
|
12
|
+
import MetricsGroupJson._
|
13
|
+
import org.embulk.parser.twitter_ads_stats.define.{Root, RootJson}
|
14
|
+
|
15
|
+
/**
  * Embulk parser plugin that reads each input file as one JSON document and emits the
  * records resolved from it.
  */
class TwitterAdsStatsParserPlugin extends ParserPlugin {

  import TwitterAdsStatsParserPlugin._

  /**
    * Loads the plugin task from `config` and hands the dumped task to `control`, together
    * with the schema built by `Column.createEmbulkColumns`.
    */
  override def transaction(config: ConfigSource, control: ParserPlugin.Control): Unit = {
    val task = config.loadConfig(classOf[PluginTask])
    control.run(
      task.dump(),
      new Schema(Column.createEmbulkColumns(metricElementNames).asJava)
    )
  }

  /**
    * Parses every file of `input` and writes the resolved records to `output`.
    *
    * When a file cannot be parsed or its columns cannot be resolved, a `DataException` is
    * thrown if the task's stop-on-invalid-record flag is set. Otherwise the failure is
    * logged as a warning and processing continues with the next file.
    */
  override def run(taskSource: TaskSource, schema: Schema, input: FileInput, output: PageOutput): Unit = {

    val task                = taskSource.loadTask(classOf[PluginTask])
    val stopOnInvalidRecord = task.getStopOnInvalidRecord

    LoanPattern(new PageBuilder(Exec.getBufferAllocator, schema, output)) { pb =>
      while (input.nextFile()) {
        (for {
          root    <- createRootFrom(input)
          columns <- root.resolveColumns(metricElementNames)
        } yield addRecord(pb, columns, root)) match {
          case Right(_) =>
          case Left(e) =>
            if (stopOnInvalidRecord) {
              throw new DataException(e.getMessage, e)
            } else {
              logger.warn(s"Skipped invalid record $e")
            }
        }
      }
      pb.finish()
    }
  }

  /**
    * Writes one record per element of `columns`.
    *
    * The fixed columns are filled from the matching fields of the record. Every other
    * column is treated as a metrics group name and receives that group rendered as JSON,
    * or null when the record has no such group.
    */
  private def addRecord(pb: PageBuilder, columns: Seq[Column], root: Root): Unit = {
    // The column list does not depend on the record, so it is built once instead of once per record.
    val embulkColumns = Column.createEmbulkColumns(metricElementNames)

    columns.foreach { column =>
      embulkColumns.foreach { embulkColumn =>
        embulkColumn.getName match {
          case "id" =>
            pb.setString(embulkColumn, column.id)
          case "date" =>
            pb.setString(embulkColumn, column.date.toString)
          case "segment" =>
            column.segment match {
              case Some(segment) => pb.setString(embulkColumn, segment)
              case None          => pb.setNull(embulkColumn)
            }
          case "placement" =>
            pb.setString(embulkColumn, column.placement)
          case metricsGroupName =>
            column.metricsGroup.get(metricsGroupName) match {
              case Some(m) =>
                pb.setJson(
                  embulkColumn,
                  jsonParser.parse(m.toJson.compactPrint)
                )
              case None =>
                pb.setNull(embulkColumn)
            }
        }
      }
      pb.addRecord()
    }
  }

  /**
    * Reads the current file of `input` as a JSON document and converts it into a `Root`.
    *
    * @return the parsed root, or an `InvalidInputFileException` wrapping any non-fatal
    *         failure raised while reading or converting
    */
  private def createRootFrom(input: FileInput): Either[ParseException, Root] = {
    // NOTE(review): the stream is not closed here; presumably closing it would also close
    // `input`, which the caller keeps iterating over - confirm before adding a close().
    val stream = new FileInputInputStream(input)
    try {
      // Decode explicitly as UTF-8 rather than with the platform-default charset.
      val jsValue = scala.io.Source.fromInputStream(stream, "UTF-8").mkString.parseJson
      val root    = new RootJson(metricElementNames).RootReader.read(jsValue)
      Right(root)
    } catch {
      case NonFatal(e) => Left(new InvalidInputFileException(e))
    }
  }
}

object TwitterAdsStatsParserPlugin {

  /** Logger obtained from Embulk for this plugin class. */
  val logger: Logger = Exec.getLogger(classOf[TwitterAdsStatsParserPlugin])

  /** Embulk JSON parser used to convert rendered metrics groups into Embulk JSON values. */
  val jsonParser = new JsonParser
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats.define
|
2
|
+
|
3
|
+
import org.embulk.parser.twitter_ads_stats.{Column, MetricElementNames, ParseException}
|
4
|
+
|
5
|
+
/**
  * One entry of the parsed data.
  *
  * @param id      identifier passed on to every resolved column
  * @param id_data the entries whose columns are resolved for this identifier
  */
case class Data(id: String, id_data: Seq[IDData]) {

  /**
    * Resolves the columns of every `id_data` entry and concatenates them in order.
    *
    * @return all resolved columns, or the failure of the first entry that could not be resolved
    */
  private[define] def resolveColumns(metricElementNames: MetricElementNames, request: Request): Either[ParseException, Seq[Column]] = {
    // Every entry is resolved up front; the outcome is then decided from the whole list.
    val perEntry     = id_data.map(_.resolveColumns(id, metricElementNames, request))
    val firstFailure = perEntry.collectFirst { case Left(e) => e }

    firstFailure.toLeft(perEntry.collect { case Right(cols) => cols }.flatten.toList)
  }
}

object Data {

  /** Field names of [[Data]] as returned by `FieldNameUtil.fieldList`. */
  val fieldNames: Array[String] = FieldNameUtil.fieldList[Data]
}
|
@@ -0,0 +1,36 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats.define
|
2
|
+
|
3
|
+
import java.time.LocalDate
|
4
|
+
|
5
|
+
import org.embulk.parser.twitter_ads_stats._
|
6
|
+
|
7
|
+
/**
  * Metrics of one entry together with its optional segment.
  *
  * @param metrics the metric time series of this entry
  * @param segment the segment copied into every resolved column, if any
  */
case class IDData(metrics: Metrics, segment: Option[String]) {

  /**
    * Resolves the column for a single date.
    *
    * @param date the date of the record paired with the index used to look up metric values
    * @return the column, or the failure of the first metrics group that could not be resolved
    */
  private def resolveColumn(
      id: String,
      metricElementNames: MetricElementNames,
      date: (LocalDate, Int),
      placement: String
  ): Either[ParseException, Column] = {
    val (targetDate, seriesIndex) = date

    val groups = metricElementNames.names.map {
      case (groupName, elementNames) =>
        groupName -> metrics.findMetricsGroup(seriesIndex, elementNames)
    }

    groups.collectFirst { case (_, Left(e)) => e } match {
      case Some(e) =>
        Left(e)
      case None =>
        // Accumulate from the right so entries are added in the same order as before.
        val resolved = groups.foldRight(Map.empty[String, MetricsGroup]) {
          case ((groupName, Right(group)), acc) => acc + (groupName -> group)
          case (_, acc)                         => acc
        }
        Right(Column(id, targetDate, segment, placement, resolved))
    }
  }

  /**
    * Resolves one column per target date of the request, in date order.
    *
    * @return the columns, or the failure of the first date that could not be resolved
    */
  private[define] def resolveColumns(id: String, metricElementNames: MetricElementNames, request: Request): Either[ParseException, Seq[Column]] = {
    val perDate = request.params.targetDates.zipWithIndex.map { dateWithIndex =>
      resolveColumn(id, metricElementNames, dateWithIndex, request.params.placement)
    }
    val firstFailure = perDate.collectFirst { case Left(e) => e }

    firstFailure.toLeft(perDate.collect { case Right(column) => column }.toList)
  }
}

object IDData {

  /** Field names of [[IDData]] as returned by `FieldNameUtil.fieldList`. */
  val fieldNames: Array[String] = FieldNameUtil.fieldList[IDData]
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.parser.twitter_ads_stats.define
|
2
|
+
|
3
|
+
import org.embulk.parser.twitter_ads_stats._
|
4
|
+
|
5
|
+
import scala.util.control.NonFatal
|
6
|
+
|
7
|
+
/**
  * Metric time series keyed by flattened metric element name.
  *
  * @param map key: metric element name with '.' replaced by the separator, value: its time series
  */
case class Metrics(map: Map[String, MetricTimeSeries]) {

  /**
    * Collects, for each of the given metric names, the value at position `index` of its time series.
    *
    * Each name is normalised with `MetricElementNames.replaceSeparator` and used as the key of
    * the result. A name with no series, or whose series is absent, maps to `None`.
    *
    * @param index       position inside each time series
    * @param metricNames metric element names to look up
    * @return the metrics group, or an `InvalidMetricTimeSeriesException` wrapping any non-fatal
    *         failure raised during the lookup
    */
  private[define] def findMetricsGroup(index: Int, metricNames: Seq[String]): Either[ParseException, MetricsGroup] =
    try {
      val valuesByElement = metricNames.map { rawName =>
        val key = MetricElementNames.replaceSeparator(rawName)
        val valueAtIndex = for {
          series <- map.get(key)
          values <- series
        } yield values(index)
        key -> valueAtIndex
      }
      Right(valuesByElement.toMap)
    } catch {
      case NonFatal(cause) => Left(new InvalidMetricTimeSeriesException(cause, index))
    }
}
|