embulk-input-pubsub 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.scalafmt.conf +13 -0
  4. data/LICENSE +21 -0
  5. data/README.md +75 -0
  6. data/build.gradle +87 -0
  7. data/classpath/animal-sniffer-annotations-1.18.jar +0 -0
  8. data/classpath/annotations-4.1.1.4.jar +0 -0
  9. data/classpath/auto-value-annotations-1.6.6.jar +0 -0
  10. data/classpath/checker-compat-qual-2.5.5.jar +0 -0
  11. data/classpath/commons-codec-1.11.jar +0 -0
  12. data/classpath/commons-lang3-3.5.jar +0 -0
  13. data/classpath/commons-logging-1.2.jar +0 -0
  14. data/classpath/embulk-input-pubsub-0.0.1-shadow.jar +0 -0
  15. data/classpath/error_prone_annotations-2.3.2.jar +0 -0
  16. data/classpath/google-auth-library-credentials-0.18.0.jar +0 -0
  17. data/classpath/google-auth-library-oauth2-http-0.18.0.jar +0 -0
  18. data/classpath/google-cloud-core-1.91.3.jar +0 -0
  19. data/classpath/google-cloud-core-grpc-1.91.3.jar +0 -0
  20. data/classpath/google-http-client-1.32.1.jar +0 -0
  21. data/classpath/google-http-client-jackson2-1.32.1.jar +0 -0
  22. data/classpath/grpc-alts-1.23.0.jar +0 -0
  23. data/classpath/grpc-auth-1.23.0.jar +0 -0
  24. data/classpath/grpc-context-1.24.1.jar +0 -0
  25. data/classpath/grpc-google-cloud-pubsub-v1-1.82.0.jar +0 -0
  26. data/classpath/grpc-grpclb-1.23.0.jar +0 -0
  27. data/classpath/grpc-protobuf-1.24.1.jar +0 -0
  28. data/classpath/grpc-protobuf-lite-1.24.1.jar +0 -0
  29. data/classpath/gson-2.8.5.jar +0 -0
  30. data/classpath/httpclient-4.5.10.jar +0 -0
  31. data/classpath/httpcore-4.4.12.jar +0 -0
  32. data/classpath/j2objc-annotations-1.3.jar +0 -0
  33. data/classpath/jackson-core-2.9.9.jar +0 -0
  34. data/classpath/javax.annotation-api-1.3.2.jar +0 -0
  35. data/classpath/jsr305-3.0.2.jar +0 -0
  36. data/classpath/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar +0 -0
  37. data/classpath/opencensus-api-0.24.0.jar +0 -0
  38. data/classpath/opencensus-contrib-grpc-metrics-0.21.0.jar +0 -0
  39. data/classpath/opencensus-contrib-http-util-0.24.0.jar +0 -0
  40. data/classpath/perfmark-api-0.17.0.jar +0 -0
  41. data/classpath/proto-google-cloud-pubsub-v1-1.82.0.jar +0 -0
  42. data/classpath/proto-google-common-protos-1.17.0.jar +0 -0
  43. data/classpath/proto-google-iam-v1-0.13.0.jar +0 -0
  44. data/classpath/protobuf-java-3.10.0.jar +0 -0
  45. data/classpath/protobuf-java-util-3.10.0.jar +0 -0
  46. data/classpath/scala-library-2.13.1.jar +0 -0
  47. data/classpath/threetenbp-1.3.3.jar +0 -0
  48. data/examples/pubsub2stdout.yaml +10 -0
  49. data/gradle.properties +1 -0
  50. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  51. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  52. data/gradlew +172 -0
  53. data/gradlew.bat +84 -0
  54. data/lib/embulk/input/pubsub.rb +3 -0
  55. data/src/main/java/com/embulk/input/pubsub/checkpoint/Checkpoint.java +734 -0
  56. data/src/main/java/com/embulk/input/pubsub/checkpoint/CheckpointOrBuilder.java +33 -0
  57. data/src/main/java/com/embulk/input/pubsub/checkpoint/CheckpointProtos.java +61 -0
  58. data/src/main/resources/checkpoint.proto +11 -0
  59. data/src/main/scala/org/embulk/input/pubsub/PluginTask.scala +42 -0
  60. data/src/main/scala/org/embulk/input/pubsub/PubsubBatchSubscriber.scala +103 -0
  61. data/src/main/scala/org/embulk/input/pubsub/PubsubInputPlugin.scala +142 -0
  62. data/src/main/scala/org/embulk/input/pubsub/checkpoint/StoredCheckpoint.scala +123 -0
  63. metadata +105 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1cee226b5d7da880c431fdd18df237cea9ca5fec030fc9037f806cc74f174bec
4
+ data.tar.gz: 8f5c25a70cd36aa4a56faf3b5125015c20988474b4a7333942c403147d6969a3
5
+ SHA512:
6
+ metadata.gz: 2d8439f6d0581a8c0993b4756277618ceb707d4b2809b2ff68e6351fcd2865c60dca741ebd349b86d202bbd5d671ec4411bbda12d9be7dd841e003bc22d0b762
7
+ data.tar.gz: 8ec794bf18c5a731d5befe75bc3f74a6d32ff03a1b5c957b8ca54a6dfb1dac8fce2d761b247531d6860ee07a5faa19b1f98c739f66874a34e6fe8441bb00d4b2
@@ -0,0 +1,2 @@
1
+ *.class
2
+ *.log
@@ -0,0 +1,13 @@
1
+ version=2.3.2
2
+ project.git = true
3
+ project.excludeFilters = [
4
+ scalafmt-benchmarks/src/resources,
5
+ sbt-test
6
+ bin/issue
7
+ ]
8
+ align = none
9
+ # Disabled in default since this operation is potentially
10
+ # dangerous if you define your own stripMargin with different
11
+ # semantics from the stdlib stripMargin.
12
+ assumeStandardLibraryStripMargin = true
13
+ onTestFailure = "To fix this, run ./scalafmt from the project root directory"
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Ryo Okubo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,75 @@
1
+ # embulk-input-pubsub
2
+
3
+ [Google Cloud Pub/Sub](https://cloud.google.com/pubsub?hl=en) input plugin for Embulk.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: input
8
+ * **Guess supported**: no
9
+
10
+ ## Configuration
11
+
12
+ - **project_id**: GCP project_id (string, required)
13
+ - **subscription_id**: Pub/Sub subscription name (string, required)
14
+ - **json_keyfile**: A path to GCP credential json file (string, required)
15
+ - **max_messages**: Maximum number of messages to pull in a single Pub/Sub call (integer, optional)
16
+ - **checkpoint_basedir**: A path to checkpoint dir (string, optional)
17
+ - **checkpoint**: A path to checkpoint file (string, optional)
18
+
19
+ ### Checkpoint
20
+
21
+ Google Cloud Pub/Sub removes stored messages once they are acked or when they expire.
22
+ So `embulk-input-pubsub` guards against data loss with checkpoints, an approach also used in Apache Flink / Apache Beam.
23
+ It 1) pulls messages from Pub/Sub, 2) persists a checkpoint that contains the messages, and 3) acks the messages to Pub/Sub.
24
+ If an Embulk task fails, you can embulk-resume from the checkpoints. You can also simply `embulk-run` with `checkpoint` set.
25
+
26
+ To enable checkpointing, set `checkpoint_basedir` so that checkpoint files are persisted on the local filesystem. If it is not set, an in-memory store is used.
27
+ To recover state from a checkpoint, set `checkpoint`. The plugin then restores the transaction state from the given checkpoint instead of pulling messages from Pub/Sub.
28
+
29
+ The checkpoint is implemented as a Protocol Buffers message.
30
+
31
+ ## Example
32
+
33
+ - pubsub -> stdout config example
34
+
35
+ ```yaml
36
+ in:
37
+ type: pubsub
38
+ project_id: <your-project-id>
39
+ subscription_id: <your-subscription-name>
40
+ json_keyfile: /path/to/credential.json
41
+ max_messages: 100
42
+ checkpoint_basedir: /tmp/embulk-input-pubsub/
43
+
44
+ out:
45
+ type: stdout
46
+ ```
47
+
48
+ When you run the example, you'll get output like this:
49
+
50
+ ```
51
+ $ embulk run examples/pubsub2stdout.yaml
52
+ 2020-05-06 00:44:05.093 +0900: Embulk v0.9.23
53
+ 2020-05-06 00:44:06.540 +0900 [WARN] (main): DEPRECATION: JRuby org.jruby.embed.ScriptingContainer is directly injected.
54
+ 2020-05-06 00:44:10.743 +0900 [INFO] (main): Gem's home and path are set by default: "/Users/ryo/.embulk/lib/gems"
55
+ 2020-05-06 00:44:12.551 +0900 [INFO] (main): Started Embulk v0.9.23
56
+ 2020-05-06 00:44:12.858 +0900 [INFO] (0001:transaction): Loaded plugin embulk-input-pubsub (0.0.1)
57
+ 2020-05-06 00:44:18.332 +0900 [INFO] (0001:transaction): Created a new checkpoint! : /tmp/embulk-input-pubsub/checkpoint--1576110815
58
+ 2020-05-06 00:44:18.336 +0900 [INFO] (0001:transaction): Using local thread executor with max_threads=8 / output tasks 4 = input tasks 1 * 4
59
+ 2020-05-06 00:44:18.354 +0900 [INFO] (0001:transaction): {done: 0 / 1, running: 0}
60
+ aaa,{}
61
+ 2020-05-06 00:44:18.428 +0900 [INFO] (0001:transaction): {done: 1 / 1, running: 0}
62
+ 2020-05-06 00:44:18.436 +0900 [INFO] (main): Committed.
63
+ 2020-05-06 00:44:18.436 +0900 [INFO] (main): Next config diff: {"in":{},"out":{}}
64
+ ```
65
+
66
+ ## Development
67
+
68
+ ```shell script
69
+ $ ./gradlew gem
70
+ ```
71
+
72
+ ## TODO
73
+
74
+ - Change it to a FileInputPlugin to be applicable for parser plugins
75
+ - Remote filesystem based checkpointing
@@ -0,0 +1,87 @@
1
+ plugins {
2
+ id 'scala'
3
+
4
+ id 'com.github.jruby-gradle.base' version '1.7.0'
5
+ id 'com.github.johnrengelman.shadow' version '5.2.0'
6
+ id 'cz.alenkacz.gradle.scalafmt' version '1.10.0'
7
+ }
8
+ import com.github.jrubygradle.JRubyExec
9
+
10
+ configurations {
11
+ provided
12
+ }
13
+
14
+ // Relocate Guava packages, since the Guava version bundled here is incompatible with the Guava version that ships with Embulk
15
+ shadowJar {
16
+ classifier 'shadow'
17
+
18
+ dependencies {
19
+ include dependency('com.google.guava:guava')
20
+ include dependency('com.google.guava:failureaccess')
21
+
22
+ include dependency('com.google.cloud:google-cloud-pubsub')
23
+ include dependency('com.google.api:api-common')
24
+ include dependency('com.google.api:gax')
25
+ include dependency('com.google.api:gax-grpc')
26
+ include dependency('io.grpc:grpc-api')
27
+ include dependency('io.grpc:grpc-core')
28
+ include dependency('io.grpc:grpc-netty-shaded')
29
+ include dependency('io.grpc:grpc-stub')
30
+ }
31
+
32
+ relocate 'com.google.common', 'relocated.com.google.common'
33
+ }
34
+
35
+ sourceCompatibility = 1.8
36
+ targetCompatibility = 1.8
37
+
38
+ dependencies {
39
+ compile 'org.scala-lang:scala-library:2.13.1'
40
+
41
+ compile 'org.embulk:embulk-core:0.9.12'
42
+ provided 'org.embulk:embulk-core:0.9.12'
43
+
44
+ compile 'com.google.cloud:google-cloud-pubsub:1.100.0'
45
+ compile 'com.google.guava:failureaccess:1.0.1'
46
+ }
47
+
48
+ task classpath(type: Copy, dependsOn: ['jar', 'shadowJar']) {
49
+ doFirst { file('classpath').deleteDir() }
50
+
51
+ from (configurations.runtime
52
+ - configurations.provided
53
+ + configurations.shadow
54
+ - files(shadowJar.getIncludedDependencies())
55
+ + files(shadowJar.archiveFile))
56
+
57
+ into 'classpath'
58
+ }
59
+ clean { delete 'classpath' }
60
+
61
+ task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
62
+ script 'gem'
63
+ scriptArgs 'build', 'build/gemspec'
64
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: 'pkg') }
65
+ }
66
+
67
+ task gemspec { // Writes build/gemspec, the RubyGems specification passed to `gem build` by the 'gem' task; spec.licenses must match the LICENSE file shipped in the gem (MIT).
68
+ doLast {
69
+ file('build').mkdirs()
70
+ file('build/gemspec').write($/
71
+ Gem::Specification.new do |spec|
72
+ spec.name = "${project.name}"
73
+ spec.version = "${project.version}"
74
+ spec.authors = ["Ryo Okubo"]
75
+ spec.summary = %[Google Cloud Pub/Sub input plugin for Embulk]
76
+ spec.description = %[Selects records from Cloud Pub/Sub.]
77
+ spec.email = ["syucream@gmail.com"]
78
+ spec.licenses = ["MIT"]
79
+ spec.homepage = "https://github.com/syucream/embulk-input-pubsub"
80
+
81
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
82
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
83
+ spec.require_paths = ["lib"]
84
+ end
85
+ /$)
86
+ }
87
+ }
Binary file
@@ -0,0 +1,10 @@
1
+ in:
2
+ type: pubsub
3
+ project_id: <your-project-id>
4
+ subscription_id: <your-subscription-name>
5
+ json_keyfile: /path/to/credential.json
6
+ max_messages: 100
7
+ checkpoint_basedir: /tmp/embulk-input-pubsub/
8
+
9
+ out:
10
+ type: stdout
@@ -0,0 +1 @@
1
+ version=0.0.1
@@ -0,0 +1,6 @@
1
+ #Mon Oct 21 20:40:52 JST 2019
2
+ distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip
3
+ distributionBase=GRADLE_USER_HOME
4
+ distributionPath=wrapper/dists
5
+ zipStorePath=wrapper/dists
6
+ zipStoreBase=GRADLE_USER_HOME
data/gradlew ADDED
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env sh
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Attempt to set APP_HOME
10
+ # Resolve links: $0 may be a link
11
+ PRG="$0"
12
+ # Need this for relative symlinks.
13
+ while [ -h "$PRG" ] ; do
14
+ ls=`ls -ld "$PRG"`
15
+ link=`expr "$ls" : '.*-> \(.*\)$'`
16
+ if expr "$link" : '/.*' > /dev/null; then
17
+ PRG="$link"
18
+ else
19
+ PRG=`dirname "$PRG"`"/$link"
20
+ fi
21
+ done
22
+ SAVED="`pwd`"
23
+ cd "`dirname \"$PRG\"`/" >/dev/null
24
+ APP_HOME="`pwd -P`"
25
+ cd "$SAVED" >/dev/null
26
+
27
+ APP_NAME="Gradle"
28
+ APP_BASE_NAME=`basename "$0"`
29
+
30
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31
+ DEFAULT_JVM_OPTS='"-Xmx64m"'
32
+
33
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
34
+ MAX_FD="maximum"
35
+
36
+ warn () { # Print all arguments as a single warning line; the script continues afterwards.
37
+ echo "$*" >&2 # diagnostics go to stderr so they never mix with regular stdout output
38
+ }
39
+
40
+ die () {
41
+ echo
42
+ echo "$*"
43
+ echo
44
+ exit 1
45
+ }
46
+
47
+ # OS specific support (must be 'true' or 'false').
48
+ cygwin=false
49
+ msys=false
50
+ darwin=false
51
+ nonstop=false
52
+ case "`uname`" in
53
+ CYGWIN* )
54
+ cygwin=true
55
+ ;;
56
+ Darwin* )
57
+ darwin=true
58
+ ;;
59
+ MINGW* )
60
+ msys=true
61
+ ;;
62
+ NONSTOP* )
63
+ nonstop=true
64
+ ;;
65
+ esac
66
+
67
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68
+
69
+ # Determine the Java command to use to start the JVM.
70
+ if [ -n "$JAVA_HOME" ] ; then
71
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72
+ # IBM's JDK on AIX uses strange locations for the executables
73
+ JAVACMD="$JAVA_HOME/jre/sh/java"
74
+ else
75
+ JAVACMD="$JAVA_HOME/bin/java"
76
+ fi
77
+ if [ ! -x "$JAVACMD" ] ; then
78
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79
+
80
+ Please set the JAVA_HOME variable in your environment to match the
81
+ location of your Java installation."
82
+ fi
83
+ else
84
+ JAVACMD="java"
85
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86
+
87
+ Please set the JAVA_HOME variable in your environment to match the
88
+ location of your Java installation."
89
+ fi
90
+
91
+ # Increase the maximum file descriptors if we can.
92
+ if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93
+ MAX_FD_LIMIT=`ulimit -H -n`
94
+ if [ $? -eq 0 ] ; then
95
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96
+ MAX_FD="$MAX_FD_LIMIT"
97
+ fi
98
+ ulimit -n $MAX_FD
99
+ if [ $? -ne 0 ] ; then
100
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
101
+ fi
102
+ else
103
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104
+ fi
105
+ fi
106
+
107
+ # For Darwin, add options to specify how the application appears in the dock
108
+ if $darwin; then
109
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110
+ fi
111
+
112
+ # For Cygwin, switch paths to Windows format before running java
113
+ if $cygwin ; then
114
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116
+ JAVACMD=`cygpath --unix "$JAVACMD"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Escape application args: save() prints each of its arguments wrapped in single quotes (an embedded ' is rewritten as '\'') and followed by " \", then a final line holding one space, so the captured text can be re-parsed by the eval further down
158
+ save () {
159
+ for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160
+ echo " "
161
+ }
162
+ APP_ARGS=$(save "$@")
163
+
164
+ # Collect all arguments for the java command, following the shell quoting and substitution rules
165
+ eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166
+
167
+ # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168
+ if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169
+ cd "$(dirname "$0")"
170
+ fi
171
+
172
+ exec "$JAVACMD" "$@"