embulk-filter-key_in_redis 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be35f78cca388bf1225008706a2b8f5264d384fc
4
- data.tar.gz: 24ab40380a5326311caabfb2660d4fad8eb93863
3
+ metadata.gz: 6e093ef9f81cc900cf6eb92fd2a3b277f5e7340a
4
+ data.tar.gz: 6310042c14d5179a8093d2d289b495adb6a43109
5
5
  SHA512:
6
- metadata.gz: 5e6465ebe30c04cf6885500356cb0d9e89c9838a02adf2146728c215ac33b2c61285c46e75d73694f51c29be58e8a73a905f17cf38439f4ec46373c6d221e72a
7
- data.tar.gz: d574680ca584af8bd6ef0000141ff8cd3f30ac4fa865d5c32a326aafbc8595a9d573a39961b59b61d305ca943d0c6db676390ec16fa27125a3346371795001fc
6
+ metadata.gz: 58ffdafa04ed57b99cd88638e72c03798ec91390f3a0ee26aa8956600753fced81f4f4b62f88b557399ed87496946b14f90b041bd7aa8032424830c440401f7a
7
+ data.tar.gz: e6ad1f2a9993e9ea2b4003d04273fa1212ab4825ea91f24e665e42744d77d7440445aaf7930dfc9e66aff4ec96dc450bce2639bf3bde3af0f87a4ced54feebf9
@@ -0,0 +1,27 @@
1
+ version: 2
2
+ jobs:
3
+ build:
4
+ executorType: docker
5
+ docker:
6
+ - image: hseeberger/scala-sbt
7
+ working_directory: /root/embulk-filter-key_in_redis/
8
+ steps:
9
+ - checkout
10
+ - restore_cache:
11
+ name: Restoring Cache
12
+ keys:
13
+ - sbt
14
+ - setup_remote_docker
15
+ - run:
16
+ name: prepare
17
+ command: sbt update exit
18
+ - save_cache:
19
+ name: Saving Cache sbt
20
+ key: sbt
21
+ paths:
22
+ - "/root/.sbt"
23
+ - "/root/.ivy2"
24
+ - run:
25
+ name: compile
26
+ command: |
27
+ sbt compile test scalafmt::test exit
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Key In Redis filter plugin for Embulk
2
2
 
3
- Filter the aggregated key into set of the Redis plugin.
3
+ Filters records by whether their aggregated key is included in a Redis SET.
4
4
 
5
5
  This plugin is designed to extract data set diff files used with the combination in below use cases.
6
6
 
@@ -25,8 +25,8 @@ This plugin is designed to extract data set diff files used with the combination
25
25
  | host | string | optional | "127.0.0.1" | redis servers host |
26
26
  | port | integer | optional | "6379" | redis servers port |
27
27
  | db | integer | optional | "null" | redis servers db |
28
- | flush_on_start | boolean | optional | "false" | flush on start specified redis servers db |
29
28
  | redis_set_key | string | required | | redis of key of set name |
29
+ | load_on_memory | boolean | optional | "false" | load all data from redis *1 |
30
30
  | appender | string | optional | "-" | multi key of appender |
31
31
  | match_as_md5 | boolean | optional | "false" | smembers the value to converted md5 |
32
32
  | key_with_index | hash: Map<Int,String> | required with key_with_index or json_key_with_index or only one || index with key name |
@@ -34,6 +34,10 @@ This plugin is designed to extract data set diff files used with the combination
34
34
  | default_timezone | string | optional | UTC | |
35
35
  | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
36
36
 
37
+
38
+ *1: load_on_memory mode requires enough JVM memory to hold all records stored in the Redis set.
39
+
40
+
37
41
  ## Example
38
42
 
39
43
  - inside redis
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.0"
16
+ version = "0.1.1"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
@@ -21,11 +21,14 @@ targetCompatibility = 1.7
21
21
  dependencies {
22
22
  compile "org.embulk:embulk-core:0.8.26"
23
23
  compile "org.scala-lang:scala-library:2.11.11"
24
+ compile group: 'com.github.pathikrit', name: 'better-files_2.11', version: '2.17.1'
24
25
  compile group: 'io.circe', name: 'circe-core_2.11', version: '0.8.0'
25
26
  compile group: 'io.circe', name: 'circe-generic_2.11', version: '0.8.0'
26
27
  compile group: 'io.circe', name: 'circe-parser_2.11', version: '0.8.0'
27
28
  compile group: 'com.github.etaty', name: 'rediscala_2.11', version: '1.7.0'
28
29
  compile group: 'org.bouncycastle', name: 'bcpkix-jdk15on', version: '1.57'
30
+ compile group: 'org.scalaz', name: 'scalaz-core_2.11', version: '7.2.14'
31
+ compile group: 'org.scalaz', name: 'scalaz-concurrent_2.11', version: '7.2.14'
29
32
  testCompile group: 'org.scalatest', name: 'scalatest_2.11', version: '3.0.1'
30
33
  provided "org.embulk:embulk-core:0.8.26"
31
34
  testCompile "junit:junit:4.+"
data/build.sbt CHANGED
@@ -22,6 +22,9 @@ libraryDependencies ++= Seq(
22
22
  "org.embulk" % "embulk-core" % "0.8.25",
23
23
  "com.github.etaty" %% "rediscala" % "1.7.0",
24
24
  "org.bouncycastle" % "bcpkix-jdk15on" % "1.57",
25
+ "com.github.pathikrit" %% "better-files" % "2.17.1",
26
+ "org.scalaz" %% "scalaz-core" % "7.2.14",
27
+ "org.scalaz" %% "scalaz-concurrent" % "7.2.14",
25
28
  "io.circe" %% "circe-core" % circeVersion,
26
29
  "io.circe" %% "circe-generic" % circeVersion,
27
30
  "io.circe" %% "circe-parser" % circeVersion,
@@ -4,16 +4,22 @@ import org.embulk.config.{ConfigSource, TaskSource}
4
4
  import org.embulk.filter.key_in_redis.redis.Redis
5
5
  import org.embulk.spi
6
6
  import org.embulk.spi._
7
+ import org.slf4j.Logger
8
+
9
+ import scala.collection.JavaConverters._
7
10
 
8
11
  class KeyInRedisFilterPlugin extends FilterPlugin {
9
12
 
10
13
  override def transaction(config: ConfigSource,
11
14
  inputSchema: Schema,
12
15
  control: FilterPlugin.Control): Unit = {
16
+
13
17
  val task = config.loadConfig(classOf[PluginTask])
18
+ val taskSource = task.dump()
19
+
14
20
  KeyInRedisFilterPlugin.createRedisInstance(task)
15
21
  KeyInRedisFilterPlugin.redis.foreach(_.ping())
16
- control.run(task.dump(), inputSchema)
22
+ control.run(taskSource, inputSchema)
17
23
  KeyInRedisFilterPlugin.redis.foreach(_.close())
18
24
  }
19
25
 
@@ -32,12 +38,20 @@ class KeyInRedisFilterPlugin extends FilterPlugin {
32
38
  }
33
39
 
34
40
  object KeyInRedisFilterPlugin {
41
+ lazy val cacheName = s"${this.getClass.getCanonicalName}-cache"
42
+ implicit val logger: Logger = Exec.getLogger(classOf[KeyInRedisFilterPlugin])
35
43
  var redis: Option[Redis] = None
36
44
  def createRedisInstance(task: PluginTask): Unit = {
37
45
  KeyInRedisFilterPlugin.redis = Some(
38
- Redis(task.getRedisSetKey, task.getHost, task.getPort, {
39
- if (task.getDb.isPresent) Some(task.getDb.get())
40
- else None
41
- }))
46
+ new Redis(
47
+ task.getRedisSetKey,
48
+ task.getHost,
49
+ task.getPort,
50
+ task.getReplicaHosts.asScala.toMap.mapValues(_.toInt), {
51
+ if (task.getDb.isPresent) Some(task.getDb.get())
52
+ else None
53
+ },
54
+ task.getLoadOnMemory
55
+ ))
42
56
  }
43
57
  }
@@ -6,6 +6,8 @@ import com.google.common.base.Optional
6
6
  import org.bouncycastle.util.encoders.Hex
7
7
  import org.embulk.filter.key_in_redis.column._
8
8
 
9
+ import scala.collection.mutable.ListBuffer
10
+
9
11
  import scala.collection.JavaConverters._
10
12
  import org.embulk.spi.time.TimestampFormatter
11
13
  import org.embulk.spi.{
@@ -30,6 +32,7 @@ case class PageOutput(task: PluginTask,
30
32
  override def add(page: Page): Unit = {
31
33
  val reader: PageReader = new PageReader(schema)
32
34
  reader.setPage(page)
35
+ val handlerBuffer = new ListBuffer[PageHandler]()
33
36
  while (reader.nextRecord()) {
34
37
  val setValueVisitor = SetValueColumnVisitor(
35
38
  reader,
@@ -41,12 +44,15 @@ case class PageOutput(task: PluginTask,
41
44
  val matchValue = if (task.getMatchAsMD5) {
42
45
  Hex.toHexString(digestMd5.digest(setValueVisitor.getValue.getBytes()))
43
46
  } else setValueVisitor.getValue
44
- KeyInRedisFilterPlugin.redis.foreach { redis =>
45
- val passthroughColumnVisitor =
46
- PassthroughColumnVisitor(reader, pageBuilder)
47
- if (redis.nonExists(matchValue)) {
48
- schema.visitColumns(passthroughColumnVisitor)
49
- passthroughColumnVisitor.addRecord()
47
+ handlerBuffer.append(
48
+ PageHandler(matchValue, PassthroughColumnVisitor(reader, pageBuilder)))
49
+ }
50
+ KeyInRedisFilterPlugin.redis.foreach { redis =>
51
+ val result = redis.exists(handlerBuffer.map(_.matchValue))
52
+ handlerBuffer.foreach { value =>
53
+ if (!result(value.matchValue)) {
54
+ schema.visitColumns(value.visitor)
55
+ value.visitor.addRecord()
50
56
  }
51
57
  }
52
58
  }
@@ -57,3 +63,5 @@ case class PageOutput(task: PluginTask,
57
63
  override def close(): Unit = pageBuilder.close()
58
64
 
59
65
  }
66
+
67
+ case class PageHandler(matchValue: String, visitor: PassthroughColumnVisitor)
@@ -13,6 +13,10 @@ trait PluginTask extends Task with TimestampFormatter.Task {
13
13
  @ConfigDefault("false")
14
14
  def getMatchAsMD5: Boolean
15
15
 
16
+ @Config("load_on_memory")
17
+ @ConfigDefault("false")
18
+ def getLoadOnMemory: Boolean
19
+
16
20
  @Config("key_with_index")
17
21
  @ConfigDefault("{}")
18
22
  def getKeyWithIndex: java.util.Map[String, String]
@@ -33,6 +37,10 @@ trait PluginTask extends Task with TimestampFormatter.Task {
33
37
  @ConfigDefault("6379")
34
38
  def getPort: Int
35
39
 
40
+ @Config("replica_hosts")
41
+ @ConfigDefault("{}")
42
+ def getReplicaHosts: java.util.Map[String, String]
43
+
36
44
  @Config("db")
37
45
  @ConfigDefault("null")
38
46
  def getDb: Optional[Int]
@@ -0,0 +1,11 @@
1
+ package org.embulk.filter.key_in_redis.redis
2
+
3
+ import org.slf4j.Logger
4
+
5
+ import scala.collection.mutable
6
+
7
+ case class Cache(loadFromStorage: () => mutable.Set[String])(
8
+ implicit logger: Logger) {
9
+ private val cache = loadFromStorage()
10
+ def contains(value: String): Boolean = cache.contains(value)
11
+ }
@@ -1,16 +1,60 @@
1
1
  package org.embulk.filter.key_in_redis.redis
2
2
 
3
- import redis.RedisClient
3
+ import org.slf4j.Logger
4
+ import redis._
4
5
 
6
+ import scala.annotation.tailrec
5
7
  import scala.concurrent.duration._
6
8
  import scala.concurrent._
7
9
  import scala.util._
8
10
 
9
- case class Redis(setKey: String, host: String, port: Int, db: Option[Int]) {
11
+ import scala.collection.mutable
12
+
13
+ class Redis(setKey: String,
14
+ host: String,
15
+ port: Int,
16
+ replicaHosts: Map[String, Int],
17
+ db: Option[Int],
18
+ loadOnMemory: Boolean)(implicit logger: Logger) {
10
19
  implicit val actorSystem = akka.actor.ActorSystem(
11
20
  "redis-client",
12
21
  classLoader = Some(this.getClass.getClassLoader))
13
- val redis = RedisClient(host, port, db = db)
22
+
23
+ lazy val cacheInstance: Option[Cache] = if (loadOnMemory) {
24
+ Some(Cache(() => loadAll()))
25
+ } else None
26
+
27
+ val redisServers: Seq[RedisClient] = {
28
+ val primary = RedisClient(host, port, db = db)
29
+ val replica = replicaHosts.map {
30
+ case (host: String, port: Int) =>
31
+ RedisClient(host, port, db = db)
32
+ }
33
+ Seq(primary) ++ replica.toSeq
34
+ }
35
+
36
+ def redis: RedisClient = Random.shuffle(redisServers).head
37
+
38
+ def loadAll(): mutable.Set[String] = {
39
+ logger.info(s"Loading from Redis start.")
40
+ import scala.concurrent.ExecutionContext.Implicits.global
41
+ import ToFutureExtensionOps._
42
+ val buffer = mutable.Set.empty[String]
43
+ @tailrec
44
+ def _scan(cursor: Int): Unit = {
45
+ val task = redis.sscan[String](setKey, cursor, Option(500)).toTask
46
+ val result = task.unsafePerformSync
47
+ result.data.foreach { v =>
48
+ buffer.add(v)
49
+ }
50
+ if (result.index != 0) {
51
+ _scan(result.index)
52
+ }
53
+ }
54
+ _scan(0)
55
+ logger.info(s"Loading from Redis finished. record size is ${buffer.size}")
56
+ buffer
57
+ }
14
58
 
15
59
  def ping(): String = {
16
60
  import scala.concurrent.ExecutionContext.Implicits.global
@@ -24,13 +68,33 @@ case class Redis(setKey: String, host: String, port: Int, db: Option[Int]) {
24
68
  Await.result(s, 10.minute)
25
69
  }
26
70
 
27
- def exists(value: String): Boolean = {
28
- val s = redis.sismember(setKey, value)
29
- Await.result(s, 10.minute)
71
+ def exists(values: Seq[String]): Map[String, Boolean] = cacheInstance match {
72
+ case Some(cached) =>
73
+ values.map { v =>
74
+ v -> cached.contains(v)
75
+ }.toMap
76
+ case None =>
77
+ import scala.concurrent.ExecutionContext.Implicits.global
78
+ import ToFutureExtensionOps._
79
+ val input = values.zipWithIndex.map(_.swap).toMap
80
+ val transaction = redis.transaction()
81
+ val f = values.map { v =>
82
+ transaction.sismember(setKey, v)
83
+ }
84
+ transaction.exec()
85
+ val results = Future
86
+ .sequence(f)
87
+ .toTask
88
+ .unsafePerformSync
89
+ .zipWithIndex
90
+ .map(_.swap)
91
+ .toMap
92
+ results.map {
93
+ case (index, result) =>
94
+ input(index) -> result
95
+ }
30
96
  }
31
97
 
32
- def nonExists(value: String): Boolean = !exists(value)
33
-
34
98
  def close(): Unit = {
35
99
  redis.stop()
36
100
  // wait for stopping.
@@ -0,0 +1,23 @@
1
+ package org.embulk.filter.key_in_redis.redis
2
+
3
+ import scala.concurrent.{ExecutionContext, Future}
4
+ import scala.util.{Failure, Success}
5
+ import scalaz._, Scalaz._
6
+ import scalaz.concurrent._
7
+
8
+ final class FutureExtensionOps[A](self: Future[A]) {
9
+ def toTask(implicit ec: ExecutionContext): Task[A] = Task.async { register =>
10
+ self.onComplete {
11
+ case Success(v) => register(v.right)
12
+ case Failure(ex) => register(ex.left)
13
+ }
14
+ }
15
+ }
16
+
17
+ trait ToFutureExtensionOps {
18
+ implicit def toFutureExtensionOps[A](
19
+ future: Future[A]): FutureExtensionOps[A] =
20
+ new FutureExtensionOps(future)
21
+ }
22
+
23
+ object ToFutureExtensionOps extends ToFutureExtensionOps
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-key_in_redis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - smdmts
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-08-14 00:00:00.000000000 Z
11
+ date: 2017-08-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -45,6 +45,7 @@ executables: []
45
45
  extensions: []
46
46
  extra_rdoc_files: []
47
47
  files:
48
+ - .circleci/config.yml
48
49
  - .gitignore
49
50
  - LICENSE.txt
50
51
  - README.md
@@ -64,12 +65,14 @@ files:
64
65
  - src/main/scala/org/embulk/filter/key_in_redis/column/PassthroughColumnVisitor.scala
65
66
  - src/main/scala/org/embulk/filter/key_in_redis/column/SetValueColumnVisitor.scala
66
67
  - src/main/scala/org/embulk/filter/key_in_redis/json/JsonParser.scala
68
+ - src/main/scala/org/embulk/filter/key_in_redis/redis/Cache.scala
67
69
  - src/main/scala/org/embulk/filter/key_in_redis/redis/Redis.scala
68
- - src/test/scala/org/embulk/filter/key_in_redis/.gitkeep
70
+ - src/main/scala/org/embulk/filter/key_in_redis/redis/TaskExtensionOps.scala
69
71
  - src/test/scala/org/embulk/filter/key_in_redis/json/JsonParserSpec.scala
70
72
  - classpath/akka-actor_2.11-2.3.6.jar
71
73
  - classpath/bcpkix-jdk15on-1.57.jar
72
74
  - classpath/bcprov-jdk15on-1.57.jar
75
+ - classpath/better-files_2.11-2.17.1.jar
73
76
  - classpath/cats-core_2.11-0.9.0.jar
74
77
  - classpath/cats-kernel_2.11-0.9.0.jar
75
78
  - classpath/cats-macros_2.11-0.9.0.jar
@@ -79,7 +82,7 @@ files:
79
82
  - classpath/circe-numbers_2.11-0.8.0.jar
80
83
  - classpath/circe-parser_2.11-0.8.0.jar
81
84
  - classpath/config-1.2.1.jar
82
- - classpath/embulk-filter-key_in_redis-0.1.0.jar
85
+ - classpath/embulk-filter-key_in_redis-0.1.1.jar
83
86
  - classpath/jawn-parser_2.11-0.10.4.jar
84
87
  - classpath/machinist_2.11-0.6.1.jar
85
88
  - classpath/macro-compat_2.11-1.1.1.jar
@@ -87,6 +90,9 @@ files:
87
90
  - classpath/scala-library-2.11.11.jar
88
91
  - classpath/scala-reflect-2.11.8.jar
89
92
  - classpath/scala-stm_2.11-0.7.jar
93
+ - classpath/scalaz-concurrent_2.11-7.2.14.jar
94
+ - classpath/scalaz-core_2.11-7.2.14.jar
95
+ - classpath/scalaz-effect_2.11-7.2.14.jar
90
96
  - classpath/shapeless_2.11-2.3.2.jar
91
97
  - classpath/simulacrum_2.11-0.10.0.jar
92
98
  homepage: