embulk-filter-key_in_redis 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8c07a54c906730f2de5c0a2e32a1792cc0170877
4
- data.tar.gz: 3cf0b84f743814a7ee32a35cb9a084eca9db287d
3
+ metadata.gz: 37e4700ee70663be6a24b53e20ea86dbeea87336
4
+ data.tar.gz: 845fc7bc8b2d19f964483a43f9f860a9674eb752
5
5
  SHA512:
6
- metadata.gz: 5b51785e0dfe0a5007cea67fd60c96aaa9c805ec605b074db141310532418847cd07f4d38d5d6745ff9a5067f7f5500f0fda004212e9f88a22d9d98018d00b00
7
- data.tar.gz: d73e095906c90aef7216fcba69a5f514a7871acde3d1b94ce07918f16cdfecb5ffa05b04bfe8485baf0f3ef62c408a6a11004864fd9a24e6facb87ddc7ef5508
6
+ metadata.gz: 04400de1c5bc2041f28e29b4a3e2a27c407c4171e49408d70f3e02d75e2109efaf14fda42ba49cb9f2a9e94ece3184248b018fe98f43989f35cca82f234ee73c
7
+ data.tar.gz: 8bef1f124c5fdd3f93f7fac9e9b7a9ad52f071050e1013d07f0be320a015f89b05e05565855f7baa5ead073b60495afe27f9946c7b1f7d53af5722adc5ef65de
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.1.3"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
@@ -0,0 +1,26 @@
1
+ akka.actor.deployment {
2
+ /filtering_actor {
3
+ dispatcher = embulk-filter_key_in_redis-dispatcher
4
+ }
5
+ }
6
+
7
+ embulk-filter_key_in_redis-dispatcher {
8
+ # Dispatcher is the name of the event-based dispatcher
9
+ type = Dispatcher
10
+ # What kind of ExecutionService to use
11
+ executor = "fork-join-executor"
12
+ # Configuration for the fork join pool
13
+ fork-join-executor {
14
+ # Min number of threads to cap factor-based parallelism number to
15
+ parallelism-min = 2
16
+ # Parallelism (threads) ... ceil(available processors * factor)
17
+ parallelism-factor = 2.0
18
+ # Max number of threads to cap factor-based parallelism number to
19
+ parallelism-max = 10
20
+ }
21
+
22
+ # Throughput defines the maximum number of messages to be
23
+ # processed per actor before the thread jumps to the next actor.
24
+ # Set to 1 for as fair as possible.
25
+ throughput = 100
26
+ }
@@ -1,9 +1,14 @@
1
1
  package org.embulk.filter.key_in_redis
2
2
 
3
+ import java.util.concurrent.TimeUnit
4
+
3
5
  import com.google.common.base.Optional
4
- import org.embulk.filter.key_in_redis.column._
6
+ import org.embulk.filter.key_in_redis.actor._
7
+ import org.embulk.filter.key_in_redis.row._
8
+ import org.embulk.filter.key_in_redis.ToFutureExtensionOps._
9
+ import akka.pattern.ask
10
+ import akka.util.Timeout
5
11
 
6
- import scala.collection.mutable.ListBuffer
7
12
  import scala.collection.JavaConverters._
8
13
  import org.embulk.spi.time.TimestampFormatter
9
14
  import org.embulk.spi.{
@@ -20,13 +25,13 @@ case class PageOutput(task: PluginTask,
20
25
  output: EmbulkPageOutput)
21
26
  extends EmbulkPageOutput {
22
27
  val pageBuilder = new PageBuilder(Exec.getBufferAllocator, schema, output)
28
+ var finished = false
23
29
  def timestampFormatter(): TimestampFormatter =
24
30
  new TimestampFormatter(task, Optional.absent())
25
31
 
26
32
  override def add(page: Page): Unit = {
27
33
  val baseReader: PageReader = new PageReader(schema)
28
34
  baseReader.setPage(page)
29
- val rows = new ListBuffer[SetValueColumnVisitor]()
30
35
  while (baseReader.nextRecord()) {
31
36
  val setValueVisitor = SetValueColumnVisitor(
32
37
  baseReader,
@@ -36,19 +41,49 @@ case class PageOutput(task: PluginTask,
36
41
  task.getAppender,
37
42
  task.getMatchAsMD5)
38
43
  schema.visitColumns(setValueVisitor)
39
- rows.append(setValueVisitor)
44
+ Actors.register ! setValueVisitor.getRow(pageBuilder)
40
45
  }
41
- KeyInRedisFilterPlugin.redis.foreach { redis =>
42
- val result = redis.exists(rows.map(_.getMatchKey))
43
- rows.foreach { row =>
44
- if (!result(row.getMatchKey)) {
45
- row.addRecord(pageBuilder)
46
- }
46
+ baseReader.close()
47
+ }
48
+
49
+ def counter(): Int = {
50
+ import scala.concurrent.ExecutionContext.Implicits.global
51
+ implicit val timeout = Timeout(24, TimeUnit.HOURS)
52
+ (Actors.register ? Counter(pageBuilder))
53
+ .mapTo[Int]
54
+ .toTask
55
+ .unsafePerformSync
56
+ }
57
+
58
+ def forceWrite(): Unit = {
59
+ Actors.register ! ForceWrite(pageBuilder)
60
+ }
61
+
62
+ override def finish(): Unit = {
63
+ var lastRecord = false
64
+ while (counter() != 0) {
65
+ if (!lastRecord) {
66
+ forceWrite()
67
+ lastRecord = true
47
68
  }
69
+ Thread.sleep(1000)
70
+ }
71
+ if (!finished) {
72
+ pageBuilder.finish()
73
+ finished = true
48
74
  }
49
75
  }
50
76
 
51
- override def finish(): Unit = pageBuilder.finish()
52
- override def close(): Unit = pageBuilder.close()
77
+ override def close(): Unit = {
78
+ var lastRecord = false
79
+ while (counter() != 0 & finished) {
80
+ if (!lastRecord) {
81
+ forceWrite()
82
+ lastRecord = true
83
+ }
84
+ Thread.sleep(1000)
85
+ }
86
+ pageBuilder.close()
87
+ }
53
88
 
54
- }
89
+ }
@@ -1,8 +1,8 @@
1
- package org.embulk.filter.key_in_redis.redis
1
+ package org.embulk.filter.key_in_redis
2
2
 
3
3
  import scala.concurrent.{ExecutionContext, Future}
4
4
  import scala.util.{Failure, Success}
5
- import scalaz._, Scalaz._
5
+ import scalaz.Scalaz._
6
6
  import scalaz.concurrent._
7
7
 
8
8
  final class FutureExtensionOps[A](self: Future[A]) {
@@ -0,0 +1,12 @@
1
+ package org.embulk.filter.key_in_redis.actor
2
+
3
+ import akka.actor.{ActorRef, ActorSystem, Props}
4
+
5
+ object Actors {
6
+ implicit val actorSystem: ActorSystem = akka.actor.ActorSystem(
7
+ "redis-register",
8
+ classLoader = Some(this.getClass.getClassLoader))
9
+ val register: ActorRef =
10
+ actorSystem.actorOf(Props(classOf[Register]), "filtering_actor")
11
+
12
+ }
@@ -0,0 +1,65 @@
1
+ package org.embulk.filter.key_in_redis.actor
2
+
3
+ import akka.actor._
4
+ import org.embulk.filter.key_in_redis.KeyInRedisFilterPlugin
5
+ import org.embulk.filter.key_in_redis.redis.Redis
6
+ import org.embulk.filter.key_in_redis.row.Row
7
+ import org.embulk.spi.PageBuilder
8
+
9
+ import scala.collection.mutable
10
+ import scala.concurrent._
11
+
12
+ class Register extends Actor {
13
+
14
+ implicit val ec: ExecutionContextExecutor = context.system.dispatcher
15
+ var rowList: List[Row] = List.empty[Row]
16
+ lazy val redis: Redis =
17
+ KeyInRedisFilterPlugin.redis.getOrElse(sys.error("redis is undefined."))
18
+ val counter: mutable.Map[PageBuilder, Int] = mutable.Map[PageBuilder, Int]()
19
+
20
+ override def receive: PartialFunction[Any, Unit] = {
21
+ case row: Row =>
22
+ rowList = row :: rowList
23
+ counter.put(row.pageBuilder, counter.getOrElse(row.pageBuilder, 0) + 1)
24
+ if (rowList.size == 500) {
25
+ addRecords(rowList)
26
+ rowList = List.empty[Row]
27
+ }
28
+ case Counter(pb) =>
29
+ sender() ! counter.getOrElse(pb, 0)
30
+ case ForceWrite(pb) =>
31
+ val (owned, other) =
32
+ rowList.partition(_.pageBuilder == pb)
33
+ addRecords(owned)
34
+ rowList = other
35
+ case Add(row) =>
36
+ counter.put(row.pageBuilder, counter(row.pageBuilder) - 1)
37
+ row.addRecord()
38
+ case Ignore(row) =>
39
+ counter.put(row.pageBuilder, counter(row.pageBuilder) - 1)
40
+ case TotalCount =>
41
+ sender() ! counter.foldLeft[Int](0) {
42
+ case (total, (_, counter: Int)) =>
43
+ total + counter
44
+ }
45
+ }
46
+
47
+ private def addRecords(rows: List[Row]): Unit = {
48
+ redis.exists(rows.map(_.matchKey)).foreach { resultMap =>
49
+ rows.foreach { row =>
50
+ val result = resultMap(row.matchKey)
51
+ if (!result) {
52
+ self ! Add(row)
53
+ } else {
54
+ self ! Ignore(row)
55
+ }
56
+ }
57
+ }
58
+ }
59
+ }
60
+
61
+ case object TotalCount
62
+ case class Add(row: Row)
63
+ case class Ignore(row: Row)
64
+ case class Counter(pageBuilder: PageBuilder)
65
+ case class ForceWrite(pageBuilder: PageBuilder)
@@ -1,14 +1,20 @@
1
1
  package org.embulk.filter.key_in_redis.redis
2
2
 
3
+ import java.util.concurrent.TimeUnit
4
+
5
+ import akka.util.Timeout
6
+ import akka.pattern.ask
3
7
  import org.slf4j.Logger
4
8
  import redis._
9
+ import org.embulk.filter.key_in_redis.actor.Actors._
5
10
 
6
11
  import scala.annotation.tailrec
7
12
  import scala.concurrent.duration._
8
13
  import scala.concurrent._
9
14
  import scala.util._
10
-
11
15
  import scala.collection.mutable
16
+ import org.embulk.filter.key_in_redis.ToFutureExtensionOps._
17
+ import org.embulk.filter.key_in_redis.actor._
12
18
 
13
19
  class Redis(setKey: String,
14
20
  host: String,
@@ -16,10 +22,8 @@ class Redis(setKey: String,
16
22
  replicaHosts: Map[String, Int],
17
23
  db: Option[Int],
18
24
  loadOnMemory: Boolean)(implicit logger: Logger) {
19
- implicit val actorSystem = akka.actor.ActorSystem(
20
- "redis-client",
21
- classLoader = Some(this.getClass.getClassLoader))
22
25
 
26
+ implicit val ec: ExecutionContextExecutor = actorSystem.dispatcher
23
27
  lazy val cacheInstance: Option[Cache] = if (loadOnMemory) {
24
28
  Some(Cache(() => loadAll()))
25
29
  } else None
@@ -37,8 +41,7 @@ class Redis(setKey: String,
37
41
 
38
42
  def loadAll(): mutable.Set[String] = {
39
43
  logger.info(s"Loading from Redis start.")
40
- import scala.concurrent.ExecutionContext.Implicits.global
41
- import ToFutureExtensionOps._
44
+ import org.embulk.filter.key_in_redis.ToFutureExtensionOps._
42
45
  val buffer = mutable.Set.empty[String]
43
46
  @tailrec
44
47
  def _scan(cursor: Int): Unit = {
@@ -57,10 +60,10 @@ class Redis(setKey: String,
57
60
  }
58
61
 
59
62
  def ping(): String = {
60
- import scala.concurrent.ExecutionContext.Implicits.global
61
63
  val s: Future[String] = redis.ping()
62
64
  s.onComplete {
63
- case Success(result) => result
65
+ case Success(result) =>
66
+ result
64
67
  case Failure(t) =>
65
68
  actorSystem.shutdown()
66
69
  throw t
@@ -68,38 +71,63 @@ class Redis(setKey: String,
68
71
  Await.result(s, 10.minute)
69
72
  }
70
73
 
71
- def exists(values: Seq[String]): Map[String, Boolean] = cacheInstance match {
72
- case Some(cached) =>
73
- values.map { v =>
74
- v -> cached.contains(v)
75
- }.toMap
76
- case None =>
77
- import scala.concurrent.ExecutionContext.Implicits.global
78
- import ToFutureExtensionOps._
79
- val input = values.zipWithIndex.map(_.swap).toMap
80
- val transaction = redis.transaction()
81
- val f = values.map { v =>
82
- transaction.sismember(setKey, v)
83
- }
84
- transaction.exec()
85
- val results = Future
86
- .sequence(f)
87
- .toTask
88
- .unsafePerformSync
89
- .zipWithIndex
90
- .map(_.swap)
91
- .toMap
92
- results.map {
93
- case (index, result) =>
94
- input(index) -> result
74
+ def keyExists(): Unit = {
75
+ val s: Future[Boolean] = redis.exists(setKey)
76
+ s.onComplete {
77
+ case Success(_) =>
78
+ case Failure(t) =>
79
+ actorSystem.shutdown()
80
+ throw t
81
+ }
82
+ val result = Await.result(s, 10.minute)
83
+ if (!result) {
84
+ actorSystem.shutdown()
85
+ throw sys.error(s"key not found in redis. $setKey")
86
+ }
87
+ }
88
+
89
+ def exists(values: Seq[String]): Future[mutable.Map[String, Boolean]] = {
90
+ val futureResult = cacheInstance match {
91
+ case Some(cached) =>
92
+ values.map { v =>
93
+ Future.successful(v -> cached.contains(v))
94
+ }
95
+ case None =>
96
+ val transaction = redis.transaction()
97
+ val futures = values.map { v =>
98
+ transaction.sismember(setKey, v).map { result =>
99
+ (v ,result)
100
+ }
101
+ }
102
+ transaction.exec()
103
+ futures
104
+ }
105
+ Future.sequence(futureResult).map { sequence =>
106
+ val result = mutable.ListMap[String,Boolean]()
107
+ sequence.foreach {
108
+ case (key, value) =>
109
+ result.put(key, value)
95
110
  }
111
+ result
112
+ }
96
113
  }
97
114
 
98
115
  def close(): Unit = {
116
+ while (counter() != 0) {
117
+ Thread.sleep(1000)
118
+ }
99
119
  redis.stop()
100
120
  // wait for stopping.
101
121
  Thread.sleep(1000)
102
122
  actorSystem.shutdown()
103
123
  }
104
124
 
125
+ def counter(): Int = {
126
+ implicit val timeout: Timeout = Timeout(24, TimeUnit.HOURS)
127
+ (Actors.register ? TotalCount)
128
+ .mapTo[Int]
129
+ .toTask
130
+ .unsafePerformSync
131
+ }
132
+
105
133
  }
@@ -0,0 +1,38 @@
1
+ package org.embulk.filter.key_in_redis.row
2
+
3
+ import org.embulk.spi.PageBuilder
4
+ import org.embulk.spi.`type`._
5
+ import org.embulk.spi.time.Timestamp
6
+ import org.msgpack.value.Value
7
+
8
+ import scala.collection.mutable
9
+
10
+ case class Row(matchKey: String,
11
+ seq: mutable.Set[ValueHolder[_]],
12
+ pageBuilder: PageBuilder) {
13
+ def addRecord(): Unit = {
14
+ seq.foreach { vh =>
15
+ vh.value match {
16
+ case Some(v: Boolean) if vh.column.getType.isInstanceOf[BooleanType] =>
17
+ pageBuilder.setBoolean(vh.column, v)
18
+ case Some(v: Long) if vh.column.getType.isInstanceOf[LongType] =>
19
+ pageBuilder.setLong(vh.column, v)
20
+ case Some(v: Double) if vh.column.getType.isInstanceOf[DoubleType] =>
21
+ pageBuilder.setDouble(vh.column, v)
22
+ case Some(v: String) if vh.column.getType.isInstanceOf[StringType] =>
23
+ pageBuilder.setString(vh.column, v)
24
+ case Some(v: Timestamp)
25
+ if vh.column.getType.isInstanceOf[TimestampType] =>
26
+ pageBuilder.setTimestamp(vh.column, v)
27
+ case Some(v: Value) if vh.column.getType.isInstanceOf[JsonType] =>
28
+ pageBuilder.setJson(vh.column, v)
29
+ case None =>
30
+ pageBuilder.setNull(vh.column)
31
+ case _ =>
32
+ sys.error("unmatched types.")
33
+ }
34
+ }
35
+ pageBuilder.addRecord()
36
+ }
37
+
38
+ }
@@ -1,18 +1,16 @@
1
- package org.embulk.filter.key_in_redis.column
1
+ package org.embulk.filter.key_in_redis.row
2
2
 
3
3
  import java.security.MessageDigest
4
4
 
5
5
  import org.bouncycastle.util.encoders.Hex
6
6
  import org.embulk.filter.key_in_redis.json.JsonParser
7
- import org.embulk.spi.`type`._
8
- import org.embulk.spi.time.{Timestamp, TimestampFormatter}
7
+ import org.embulk.spi.time.TimestampFormatter
9
8
  import org.embulk.spi.{
10
9
  Column,
11
10
  PageBuilder,
12
11
  PageReader,
13
12
  ColumnVisitor => EmbulkColumnVisitor
14
13
  }
15
- import org.msgpack.value.Value
16
14
 
17
15
  case class SetValueColumnVisitor(reader: PageReader,
18
16
  timestampFormatter: TimestampFormatter,
@@ -84,8 +82,6 @@ case class SetValueColumnVisitor(reader: PageReader,
84
82
  result
85
83
  }
86
84
 
87
- case class ValueHolder[A](column: Column, value: Option[A])
88
-
89
85
  def put(column: Column, value: String): Unit = {
90
86
  if (parameterKeys.contains(column.getName)) {
91
87
  recordMap.put(column.getName, value)
@@ -93,41 +89,18 @@ case class SetValueColumnVisitor(reader: PageReader,
93
89
  ()
94
90
  }
95
91
 
96
- def addRecord(pageBuilder: PageBuilder): Unit = {
97
- valueHolderSet.foreach { vh =>
98
- vh.value match {
99
- case Some(v: Boolean) if vh.column.getType.isInstanceOf[BooleanType] =>
100
- pageBuilder.setBoolean(vh.column, v)
101
- case Some(v: Long) if vh.column.getType.isInstanceOf[LongType] =>
102
- pageBuilder.setLong(vh.column, v)
103
- case Some(v: Double) if vh.column.getType.isInstanceOf[DoubleType] =>
104
- pageBuilder.setDouble(vh.column, v)
105
- case Some(v: String) if vh.column.getType.isInstanceOf[StringType] =>
106
- pageBuilder.setString(vh.column, v)
107
- case Some(v: Timestamp)
108
- if vh.column.getType.isInstanceOf[TimestampType] =>
109
- pageBuilder.setTimestamp(vh.column, v)
110
- case Some(v: Value) if vh.column.getType.isInstanceOf[JsonType] =>
111
- pageBuilder.setJson(vh.column, v)
112
- case None =>
113
- pageBuilder.setNull(vh.column)
114
- case _ =>
115
- sys.error("unmatched types.")
116
- }
117
- }
118
- pageBuilder.addRecord()
119
- }
120
-
121
- def getMatchKey: String = {
92
+ lazy val matchKey: String = {
122
93
  val keys = sortedKeys
123
94
  .flatMap { key =>
124
95
  recordMap.get(key)
125
96
  }
126
97
  .mkString(appender)
127
-
128
98
  if (matchAsMd5) {
129
99
  Hex.toHexString(digestMd5.digest(keys.getBytes()))
130
100
  } else keys
131
101
  }
132
102
 
103
+ def getRow(pageBuilder: PageBuilder): Row =
104
+ Row(matchKey, valueHolderSet, pageBuilder)
105
+
133
106
  }
@@ -0,0 +1,5 @@
1
+ package org.embulk.filter.key_in_redis.row
2
+
3
+ import org.embulk.spi.Column
4
+
5
+ case class ValueHolder[A](column: Column, value: Option[A])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-key_in_redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - smdmts
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-23 00:00:00.000000000 Z
11
+ date: 2017-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -59,14 +59,19 @@ files:
59
59
  - project/build.properties
60
60
  - project/plugins.sbt
61
61
  - settings.gradle
62
+ - src/main/resources/application.conf
62
63
  - src/main/scala/org/embulk/filter/key_in_redis/KeyInRedisFilterPlugin.scala
63
64
  - src/main/scala/org/embulk/filter/key_in_redis/PageOutput.scala
64
65
  - src/main/scala/org/embulk/filter/key_in_redis/PluginTask.scala
65
- - src/main/scala/org/embulk/filter/key_in_redis/column/SetValueColumnVisitor.scala
66
+ - src/main/scala/org/embulk/filter/key_in_redis/TaskExtensionOps.scala
67
+ - src/main/scala/org/embulk/filter/key_in_redis/actor/Actors.scala
68
+ - src/main/scala/org/embulk/filter/key_in_redis/actor/Register.scala
66
69
  - src/main/scala/org/embulk/filter/key_in_redis/json/JsonParser.scala
67
70
  - src/main/scala/org/embulk/filter/key_in_redis/redis/Cache.scala
68
71
  - src/main/scala/org/embulk/filter/key_in_redis/redis/Redis.scala
69
- - src/main/scala/org/embulk/filter/key_in_redis/redis/TaskExtensionOps.scala
72
+ - src/main/scala/org/embulk/filter/key_in_redis/row/Row.scala
73
+ - src/main/scala/org/embulk/filter/key_in_redis/row/SetValueColumnVisitor.scala
74
+ - src/main/scala/org/embulk/filter/key_in_redis/row/ValueHolder.scala
70
75
  - src/test/scala/org/embulk/filter/key_in_redis/json/JsonParserSpec.scala
71
76
  - classpath/akka-actor_2.11-2.3.6.jar
72
77
  - classpath/bcpkix-jdk15on-1.57.jar
@@ -81,7 +86,7 @@ files:
81
86
  - classpath/circe-numbers_2.11-0.8.0.jar
82
87
  - classpath/circe-parser_2.11-0.8.0.jar
83
88
  - classpath/config-1.2.1.jar
84
- - classpath/embulk-filter-key_in_redis-0.1.2.jar
89
+ - classpath/embulk-filter-key_in_redis-0.1.3.jar
85
90
  - classpath/jawn-parser_2.11-0.10.4.jar
86
91
  - classpath/machinist_2.11-0.6.1.jar
87
92
  - classpath/macro-compat_2.11-1.1.1.jar