embulk-input-gcs 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,34 +1,26 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.client.http.HttpResponseException;
4
- import com.google.api.services.storage.Storage;
5
- import com.google.api.services.storage.model.Bucket;
6
- import com.google.api.services.storage.model.Objects;
7
- import com.google.api.services.storage.model.StorageObject;
3
+ import com.google.api.gax.paging.Page;
4
+ import com.google.cloud.storage.Blob;
5
+ import com.google.cloud.storage.Storage;
6
+ import com.google.cloud.storage.StorageException;
7
+ import com.google.common.annotations.VisibleForTesting;
8
8
  import com.google.common.base.Charsets;
9
9
  import com.google.common.io.BaseEncoding;
10
10
  import org.embulk.config.ConfigException;
11
11
  import org.embulk.config.TaskReport;
12
12
  import org.embulk.spi.Exec;
13
13
  import org.embulk.spi.TransactionalFileInput;
14
- import org.embulk.spi.unit.LocalFile;
15
14
  import org.embulk.spi.util.InputStreamFileInput;
16
15
  import org.slf4j.Logger;
17
16
 
18
- import java.io.IOException;
19
- import java.math.BigInteger;
20
- import java.security.GeneralSecurityException;
21
- import java.util.List;
22
- import java.util.Optional;
23
- import java.util.function.Function;
24
-
25
17
  public class GcsFileInput
26
18
  extends InputStreamFileInput
27
19
  implements TransactionalFileInput
28
20
  {
29
- private static final Logger log = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
21
+ private static final Logger LOG = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
30
22
 
31
- public GcsFileInput(PluginTask task, int taskIndex)
23
+ GcsFileInput(PluginTask task, int taskIndex)
32
24
  {
33
25
  super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
34
26
  }
@@ -47,149 +39,77 @@ public class GcsFileInput
47
39
  {
48
40
  }
49
41
 
50
- public static GcsAuthentication newGcsAuth(PluginTask task)
51
- {
52
- try {
53
- return new GcsAuthentication(
54
- task.getAuthMethod().getString(),
55
- task.getServiceAccountEmail(),
56
- task.getP12Keyfile().map(localFileToPathString()),
57
- task.getJsonKeyfile().map(localFileToPathString()),
58
- task.getApplicationName()
59
- );
60
- }
61
- catch (GeneralSecurityException | IOException ex) {
62
- throw new ConfigException(ex);
63
- }
64
- }
65
-
66
- protected static Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
67
- {
68
- Storage client = null;
69
- try {
70
- client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
71
- }
72
- catch (IOException ex) {
73
- throw new ConfigException(ex);
74
- }
75
-
76
- return client;
77
- }
78
-
79
- private static Function<LocalFile, String> localFileToPathString()
80
- {
81
- return new Function<LocalFile, String>()
82
- {
83
- public String apply(LocalFile file)
84
- {
85
- return file.getPath().toString();
86
- }
87
- };
88
- }
89
-
90
- public static FileList listFiles(PluginTask task, Storage client)
91
- {
92
- String bucket = task.getBucket();
93
-
94
- FileList.Builder builder = new FileList.Builder(task);
95
- listGcsFilesByPrefix(builder, client, bucket, task.getPathPrefix().get(), task.getLastPath());
96
- return builder.build();
97
- }
98
-
99
42
  /**
100
43
  * Lists GCS filenames filtered by prefix.
101
44
  *
102
45
  * The resulting list does not include files whose size is 0.
103
46
  */
104
- public static void listGcsFilesByPrefix(FileList.Builder builder, Storage client, String bucket,
105
- String prefix, Optional<String> lastPath)
47
+ static FileList listFiles(PluginTask task)
106
48
  {
107
- String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
108
-
109
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
110
- if (log.isDebugEnabled()) {
111
- try {
112
- Storage.Buckets.Get getBucket = client.buckets().get(bucket);
113
- getBucket.setProjection("full");
114
- Bucket bk = getBucket.execute();
49
+ Storage client = AuthUtils.newClient(task);
50
+ String bucket = task.getBucket();
115
51
 
116
- log.debug("bucket name: " + bucket);
117
- log.debug("bucket location: " + bk.getLocation());
118
- log.debug("bucket timeCreated: " + bk.getTimeCreated());
119
- log.debug("bucket owner: " + bk.getOwner());
120
- }
121
- catch (IOException e) {
122
- log.warn("Could not access to bucket:" + bucket);
123
- log.warn(e.getMessage());
124
- }
52
+ // @see https://cloud.google.com/storage/docs/json_api/v1/buckets/get
53
+ if (LOG.isDebugEnabled()) {
54
+ printBucketInfo(client, bucket);
125
55
  }
126
56
 
57
+ String prefix = task.getPathPrefix().orElse("");
58
+ String lastKey = task.getLastPath().isPresent() ? base64Encode(task.getLastPath().get()) : "";
59
+ FileList.Builder builder = new FileList.Builder(task);
60
+
127
61
  try {
128
62
  // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
129
- Storage.Objects.List listObjects = client.objects().list(bucket);
130
- listObjects.setPrefix(prefix);
131
- listObjects.setPageToken(lastKey);
132
- do {
133
- Objects objects = listObjects.execute();
134
- List<StorageObject> items = objects.getItems();
135
- if (items == null) {
136
- log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
137
- break;
138
- }
139
- for (StorageObject o : items) {
140
- if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
141
- builder.add(o.getName(), o.getSize().longValue());
142
- }
143
- log.debug("filename: " + o.getName());
144
- log.debug("updated: " + o.getUpdated());
63
+ Page<Blob> blobs = client.list(bucket, Storage.BlobListOption.prefix(prefix), Storage.BlobListOption.pageToken(lastKey));
64
+ for (Blob blob : blobs.iterateAll()) {
65
+ if (blob.getSize() > 0) {
66
+ builder.add(blob.getName(), blob.getSize());
145
67
  }
146
- lastKey = objects.getNextPageToken();
147
- listObjects.setPageToken(lastKey);
148
- } while (lastKey != null);
68
+ LOG.debug("filename: {}", blob.getName());
69
+ LOG.debug("updated: {}", blob.getUpdateTime());
70
+ }
149
71
  }
150
- catch (IOException e) {
151
- if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
72
+ catch (RuntimeException e) {
73
+ if ((e instanceof StorageException) && ((StorageException) e).getCode() == 400) {
152
74
  throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
153
75
  }
154
76
 
155
- log.warn(String.format("Could not get file list from bucket:%s", bucket));
156
- log.warn(e.getMessage());
77
+ LOG.warn(String.format("Could not get file list from bucket:%s", bucket));
78
+ LOG.warn(e.getMessage());
157
79
  }
80
+ return builder.build();
158
81
  }
159
82
 
160
83
  // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
161
- private static String base64Encode(String path)
84
+ @VisibleForTesting
85
+ static String base64Encode(String path)
162
86
  {
163
87
  byte[] encoding;
164
88
  byte[] utf8 = path.getBytes(Charsets.UTF_8);
165
- log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
89
+ LOG.debug("path string: {} ,path length:{} \" + ", path, utf8.length);
166
90
 
167
91
  encoding = new byte[utf8.length + 2];
168
92
  encoding[0] = 0x0a;
169
- encoding[1] = new Byte(String.valueOf(path.length()));
93
+ encoding[1] = Byte.valueOf(String.valueOf(path.length()));
170
94
  System.arraycopy(utf8, 0, encoding, 2, utf8.length);
171
95
 
172
96
  String s = BaseEncoding.base64().encode(encoding);
173
- log.debug(String.format("last_path(base64 encoded): %s", s));
97
+ LOG.debug("last_path(base64 encoded): {}", s);
174
98
  return s;
175
99
  }
176
100
 
177
- public enum AuthMethod
101
+ private static void printBucketInfo(Storage client, String bucket)
178
102
  {
179
- private_key("private_key"),
180
- compute_engine("compute_engine"),
181
- json_key("json_key");
182
-
183
- private final String string;
184
-
185
- AuthMethod(String string)
186
- {
187
- this.string = string;
188
- }
189
-
190
- public String getString()
191
- {
192
- return string;
193
- }
103
+ // get Bucket
104
+ Storage.BucketGetOption fields = Storage.BucketGetOption.fields(
105
+ Storage.BucketField.LOCATION,
106
+ Storage.BucketField.TIME_CREATED,
107
+ Storage.BucketField.OWNER
108
+ );
109
+ com.google.cloud.storage.Bucket bk = client.get(bucket, fields);
110
+ LOG.debug("bucket name: {}", bk.getName());
111
+ LOG.debug("bucket location: {}", bk.getLocation());
112
+ LOG.debug("bucket timeCreated: {}", bk.getCreateTime());
113
+ LOG.debug("bucket owner: {}", bk.getOwner());
194
114
  }
195
115
  }
@@ -1,6 +1,5 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.services.storage.Storage;
4
3
  import com.google.common.base.Throwables;
5
4
  import org.embulk.config.ConfigDiff;
6
5
  import org.embulk.config.ConfigException;
@@ -11,19 +10,14 @@ import org.embulk.spi.Exec;
11
10
  import org.embulk.spi.FileInputPlugin;
12
11
  import org.embulk.spi.TransactionalFileInput;
13
12
  import org.embulk.spi.unit.LocalFile;
14
- import org.slf4j.Logger;
15
13
 
16
14
  import java.io.IOException;
17
- import java.security.GeneralSecurityException;
18
15
  import java.util.List;
19
16
  import java.util.Optional;
20
- import java.util.function.Function;
21
17
 
22
18
  public class GcsFileInputPlugin
23
19
  implements FileInputPlugin
24
20
  {
25
- private static final Logger log = Exec.getLogger(GcsFileInputPlugin.class);
26
-
27
21
  @Override
28
22
  public ConfigDiff transaction(ConfigSource config,
29
23
  FileInputPlugin.Control control)
@@ -42,12 +36,12 @@ public class GcsFileInputPlugin
42
36
  }
43
37
  }
44
38
 
45
- if (task.getAuthMethod().getString().equals("json_key")) {
39
+ if (AuthUtils.AuthMethod.json_key.equals(task.getAuthMethod())) {
46
40
  if (!task.getJsonKeyfile().isPresent()) {
47
41
  throw new ConfigException("If auth_method is json_key, you have to set json_keyfile");
48
42
  }
49
43
  }
50
- else if (task.getAuthMethod().getString().equals("private_key")) {
44
+ else if (AuthUtils.AuthMethod.private_key.equals(task.getAuthMethod())) {
51
45
  if (!task.getP12Keyfile().isPresent() || !task.getServiceAccountEmail().isPresent()) {
52
46
  throw new ConfigException("If auth_method is private_key, you have to set both service_account_email and p12_keyfile");
53
47
  }
@@ -60,11 +54,9 @@ public class GcsFileInputPlugin
60
54
  }
61
55
  }
62
56
 
63
- Storage client = GcsFileInput.newGcsClient(task, newGcsAuth(task));
64
-
65
57
  // list files recursively if path_prefix is specified
66
58
  if (task.getPathPrefix().isPresent()) {
67
- task.setFiles(GcsFileInput.listFiles(task, client));
59
+ task.setFiles(GcsFileInput.listFiles(task));
68
60
  }
69
61
  else {
70
62
  if (task.getPathFiles().isEmpty()) {
@@ -80,22 +72,6 @@ public class GcsFileInputPlugin
80
72
  return resume(task.dump(), task.getFiles().getTaskCount(), control);
81
73
  }
82
74
 
83
- private GcsAuthentication newGcsAuth(PluginTask task)
84
- {
85
- try {
86
- return new GcsAuthentication(
87
- task.getAuthMethod().getString(),
88
- task.getServiceAccountEmail(),
89
- task.getP12Keyfile().map(localFileToPathString()),
90
- task.getJsonKeyfile().map(localFileToPathString()),
91
- task.getApplicationName()
92
- );
93
- }
94
- catch (GeneralSecurityException | IOException ex) {
95
- throw new ConfigException(ex);
96
- }
97
- }
98
-
99
75
  @Override
100
76
  public ConfigDiff resume(TaskSource taskSource,
101
77
  int taskCount,
@@ -121,17 +97,6 @@ public class GcsFileInputPlugin
121
97
  {
122
98
  }
123
99
 
124
- private Function<LocalFile, String> localFileToPathString()
125
- {
126
- return new Function<LocalFile, String>()
127
- {
128
- public String apply(LocalFile file)
129
- {
130
- return file.getPath().toString();
131
- }
132
- };
133
- }
134
-
135
100
  @Override
136
101
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
137
102
  {
@@ -5,13 +5,12 @@ import org.embulk.config.ConfigDefault;
5
5
  import org.embulk.config.ConfigInject;
6
6
  import org.embulk.config.Task;
7
7
  import org.embulk.spi.BufferAllocator;
8
- import org.embulk.spi.unit.LocalFile;
9
8
 
10
9
  import java.util.List;
11
10
  import java.util.Optional;
12
11
 
13
12
  public interface PluginTask
14
- extends Task, FileList.Task
13
+ extends Task, AuthUtils.Task, FileList.Task, RetryUtils.Task
15
14
  {
16
15
  @Config("bucket")
17
16
  String getBucket();
@@ -28,44 +27,17 @@ public interface PluginTask
28
27
  @ConfigDefault("true")
29
28
  boolean getIncremental();
30
29
 
31
- @Config("auth_method")
32
- @ConfigDefault("\"private_key\"")
33
- GcsFileInput.AuthMethod getAuthMethod();
34
-
35
- @Config("service_account_email")
36
- @ConfigDefault("null")
37
- Optional<String> getServiceAccountEmail();
38
-
39
30
  @Config("application_name")
40
31
  @ConfigDefault("\"Embulk GCS input plugin\"")
41
32
  String getApplicationName();
42
33
 
43
- // kept for backward compatibility
44
- @Config("p12_keyfile_fullpath")
45
- @ConfigDefault("null")
46
- Optional<String> getP12KeyfileFullpath();
47
-
48
- @Config("p12_keyfile")
49
- @ConfigDefault("null")
50
- Optional<LocalFile> getP12Keyfile();
51
- void setP12Keyfile(Optional<LocalFile> p12Keyfile);
52
-
53
- @Config("json_keyfile")
54
- @ConfigDefault("null")
55
- Optional<LocalFile> getJsonKeyfile();
56
-
57
34
  @Config("paths")
58
35
  @ConfigDefault("[]")
59
36
  List<String> getPathFiles();
60
- void setPathFiles(List<String> files);
61
37
 
62
38
  FileList getFiles();
63
39
  void setFiles(FileList files);
64
40
 
65
- @Config("max_connection_retry")
66
- @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
67
- int getMaxConnectionRetry();
68
-
69
41
  @ConfigInject
70
42
  BufferAllocator getBufferAllocator();
71
43
  }
@@ -0,0 +1,153 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.api.client.auth.oauth2.TokenErrorResponse;
4
+ import com.google.api.client.auth.oauth2.TokenResponseException;
5
+ import com.google.api.client.googleapis.json.GoogleJsonResponseException;
6
+ import com.google.cloud.storage.Blob;
7
+ import com.google.cloud.storage.Storage;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.spi.Exec;
11
+ import org.embulk.spi.util.RetryExecutor;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.util.Optional;
15
+ import java.util.function.Predicate;
16
+
17
+ class RetryUtils
18
+ {
19
+ interface Task extends org.embulk.config.Task
20
+ {
21
+ @Config("max_connection_retry")
22
+ @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
23
+ int getMaxConnectionRetry();
24
+
25
+ @Config("initial_retry_interval_millis")
26
+ @ConfigDefault("1000")
27
+ int getInitialRetryIntervalMillis();
28
+
29
+ @Config("maximum_retry_interval_millis")
30
+ @ConfigDefault("300000")
31
+ int getMaximumRetryIntervalMillis();
32
+ }
33
+
34
+ private RetryUtils()
35
+ {
36
+ }
37
+
38
+ private static final Logger LOG = Exec.getLogger(RetryUtils.class);
39
+
40
+ /**
41
+ * Decides whether a `GoogleJsonResponseException` is retryable: true unless its status code is 4xx (also true when details are missing but content is present)
42
+ */
43
+ private static final Predicate<GoogleJsonResponseException> API_ERROR_NOT_RETRY_4XX = e -> {
44
+ if (e.getDetails() == null && e.getContent() != null) {
45
+ LOG.warn("Invalid response was returned : {}", e.getContent());
46
+ return true;
47
+ }
48
+ int statusCode = e.getDetails().getCode();
49
+ return statusCode / 100 != 4;
50
+ };
51
+
52
+ /**
53
+ * Decides whether a `TokenResponseException` is retryable: true unless its status code is 4xx,
54
+ * except that an error whose description contains "Invalid JWT" (e.g. 400 invalid_grant) is still retried.
55
+ */
56
+ private static final Predicate<TokenResponseException> TOKEN_ERROR_NOT_RETRY_4XX = e -> {
57
+ Optional<String> errDesc = Optional.ofNullable(e.getDetails()).map(TokenErrorResponse::getErrorDescription);
58
+ if (errDesc.isPresent()) {
59
+ // Retry: 400 BadRequest "Invalid JWT..."
60
+ // Caused by: com.google.api.client.auth.oauth2.TokenResponseException: 400 Bad Request
61
+ // {
62
+ // "error" : "invalid_grant",
63
+ // "error_description" : "Invalid JWT: No valid verifier found for issuer."
64
+ // }
65
+ if (errDesc.get().contains("Invalid JWT")) {
66
+ LOG.warn("Invalid response was returned : {}", errDesc.get());
67
+ return true;
68
+ }
69
+ }
70
+ return e.getStatusCode() / 100 != 4;
71
+ };
72
+
73
+ /**
74
+ * A default (abstract) retryable impl, which makes use of above 2 predicates
75
+ * With default behaviors onRetry, etc.
76
+ *
77
+ * @param <T> type of the value returned by {@code call()}
78
+ */
79
+ public abstract static class DefaultRetryable<T> implements RetryExecutor.Retryable<T>
80
+ {
81
+ @Override
82
+ public boolean isRetryableException(Exception exception)
83
+ {
84
+ if (exception instanceof GoogleJsonResponseException) {
85
+ return API_ERROR_NOT_RETRY_4XX.test((GoogleJsonResponseException) exception);
86
+ }
87
+ else if (exception instanceof TokenResponseException) {
88
+ return TOKEN_ERROR_NOT_RETRY_4XX.test((TokenResponseException) exception);
89
+ }
90
+ return true;
91
+ }
92
+
93
+ @Override
94
+ public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
95
+ {
96
+ String message = String.format("GCS GET request failed. Retrying %d/%d after %d seconds. Message: %s: %s",
97
+ retryCount, retryLimit, retryWait / 1000, exception.getClass(), exception.getMessage());
98
+ if (retryCount % 3 == 0) {
99
+ LOG.warn(message, exception);
100
+ }
101
+ else {
102
+ LOG.warn(message);
103
+ }
104
+ }
105
+
106
+ @Override
107
+ public void onGiveup(Exception firstException, Exception lastException)
108
+ {
109
+ }
110
+ }
111
+
112
+ /**
113
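+ * Returns a retryable Blob GET operation, ready to be passed to {@code withRetry}.
114
+ *
115
+ * @param client the Storage client whose {@code get(bucket, key)} is invoked
116
+ * @param bucket the bucket name passed to {@code client.get}
117
+ * @param key the object key passed to {@code client.get}
118
+ * @return a {@code DefaultRetryable} whose {@code call()} returns {@code client.get(bucket, key)}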
119
+ */
120
+ static DefaultRetryable<Blob> get(Storage client, String bucket, String key)
121
+ {
122
+ return new DefaultRetryable<Blob>()
123
+ {
124
+ @Override
125
+ public Blob call()
126
+ {
127
+ return client.get(bucket, key);
128
+ }
129
+ };
130
+ }
131
+
132
+ /**
133
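+ * Runs {@code op} through a {@code RetryExecutor} configured from the task's retry settings.
134
+ * A give-up or an interruption is rethrown wrapped in a {@code RuntimeException}.
135
+ * @param task supplies the initial retry wait, the maximum retry wait and the retry limit
136
+ * @param op the operation to run with retries
137
+ * @param <T> type of the value returned by {@code op}
138
+ * @return the value returned by {@code runInterruptible(op)}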
139
+ */
140
+ static <T> T withRetry(Task task, RetryExecutor.Retryable<T> op)
141
+ {
142
+ try {
143
+ return RetryExecutor.retryExecutor()
144
+ .withInitialRetryWait(task.getInitialRetryIntervalMillis())
145
+ .withMaxRetryWait(task.getMaximumRetryIntervalMillis())
146
+ .withRetryLimit(task.getMaxConnectionRetry())
147
+ .runInterruptible(op);
148
+ }
149
+ catch (RetryExecutor.RetryGiveupException | InterruptedException e) {
150
+ throw new RuntimeException(e);
151
+ }
152
+ }
153
+ }