embulk-input-gcs 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,34 +1,26 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.client.http.HttpResponseException;
4
- import com.google.api.services.storage.Storage;
5
- import com.google.api.services.storage.model.Bucket;
6
- import com.google.api.services.storage.model.Objects;
7
- import com.google.api.services.storage.model.StorageObject;
3
+ import com.google.api.gax.paging.Page;
4
+ import com.google.cloud.storage.Blob;
5
+ import com.google.cloud.storage.Storage;
6
+ import com.google.cloud.storage.StorageException;
7
+ import com.google.common.annotations.VisibleForTesting;
8
8
  import com.google.common.base.Charsets;
9
9
  import com.google.common.io.BaseEncoding;
10
10
  import org.embulk.config.ConfigException;
11
11
  import org.embulk.config.TaskReport;
12
12
  import org.embulk.spi.Exec;
13
13
  import org.embulk.spi.TransactionalFileInput;
14
- import org.embulk.spi.unit.LocalFile;
15
14
  import org.embulk.spi.util.InputStreamFileInput;
16
15
  import org.slf4j.Logger;
17
16
 
18
- import java.io.IOException;
19
- import java.math.BigInteger;
20
- import java.security.GeneralSecurityException;
21
- import java.util.List;
22
- import java.util.Optional;
23
- import java.util.function.Function;
24
-
25
17
  public class GcsFileInput
26
18
  extends InputStreamFileInput
27
19
  implements TransactionalFileInput
28
20
  {
29
- private static final Logger log = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
21
+ private static final Logger LOG = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
30
22
 
31
- public GcsFileInput(PluginTask task, int taskIndex)
23
+ GcsFileInput(PluginTask task, int taskIndex)
32
24
  {
33
25
  super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
34
26
  }
@@ -47,149 +39,77 @@ public class GcsFileInput
47
39
  {
48
40
  }
49
41
 
50
- public static GcsAuthentication newGcsAuth(PluginTask task)
51
- {
52
- try {
53
- return new GcsAuthentication(
54
- task.getAuthMethod().getString(),
55
- task.getServiceAccountEmail(),
56
- task.getP12Keyfile().map(localFileToPathString()),
57
- task.getJsonKeyfile().map(localFileToPathString()),
58
- task.getApplicationName()
59
- );
60
- }
61
- catch (GeneralSecurityException | IOException ex) {
62
- throw new ConfigException(ex);
63
- }
64
- }
65
-
66
- protected static Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
67
- {
68
- Storage client = null;
69
- try {
70
- client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
71
- }
72
- catch (IOException ex) {
73
- throw new ConfigException(ex);
74
- }
75
-
76
- return client;
77
- }
78
-
79
- private static Function<LocalFile, String> localFileToPathString()
80
- {
81
- return new Function<LocalFile, String>()
82
- {
83
- public String apply(LocalFile file)
84
- {
85
- return file.getPath().toString();
86
- }
87
- };
88
- }
89
-
90
- public static FileList listFiles(PluginTask task, Storage client)
91
- {
92
- String bucket = task.getBucket();
93
-
94
- FileList.Builder builder = new FileList.Builder(task);
95
- listGcsFilesByPrefix(builder, client, bucket, task.getPathPrefix().get(), task.getLastPath());
96
- return builder.build();
97
- }
98
-
99
42
  /**
100
43
  * Lists GCS filenames filtered by prefix.
101
44
  *
102
45
  * The resulting list does not include files whose size is 0.
103
46
  */
104
- public static void listGcsFilesByPrefix(FileList.Builder builder, Storage client, String bucket,
105
- String prefix, Optional<String> lastPath)
47
+ static FileList listFiles(PluginTask task)
106
48
  {
107
- String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
108
-
109
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
110
- if (log.isDebugEnabled()) {
111
- try {
112
- Storage.Buckets.Get getBucket = client.buckets().get(bucket);
113
- getBucket.setProjection("full");
114
- Bucket bk = getBucket.execute();
49
+ Storage client = AuthUtils.newClient(task);
50
+ String bucket = task.getBucket();
115
51
 
116
- log.debug("bucket name: " + bucket);
117
- log.debug("bucket location: " + bk.getLocation());
118
- log.debug("bucket timeCreated: " + bk.getTimeCreated());
119
- log.debug("bucket owner: " + bk.getOwner());
120
- }
121
- catch (IOException e) {
122
- log.warn("Could not access to bucket:" + bucket);
123
- log.warn(e.getMessage());
124
- }
52
+ // @see https://cloud.google.com/storage/docs/json_api/v1/buckets/get
53
+ if (LOG.isDebugEnabled()) {
54
+ printBucketInfo(client, bucket);
125
55
  }
126
56
 
57
+ String prefix = task.getPathPrefix().orElse("");
58
+ String lastKey = task.getLastPath().isPresent() ? base64Encode(task.getLastPath().get()) : "";
59
+ FileList.Builder builder = new FileList.Builder(task);
60
+
127
61
  try {
128
62
  // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
129
- Storage.Objects.List listObjects = client.objects().list(bucket);
130
- listObjects.setPrefix(prefix);
131
- listObjects.setPageToken(lastKey);
132
- do {
133
- Objects objects = listObjects.execute();
134
- List<StorageObject> items = objects.getItems();
135
- if (items == null) {
136
- log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
137
- break;
138
- }
139
- for (StorageObject o : items) {
140
- if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
141
- builder.add(o.getName(), o.getSize().longValue());
142
- }
143
- log.debug("filename: " + o.getName());
144
- log.debug("updated: " + o.getUpdated());
63
+ Page<Blob> blobs = client.list(bucket, Storage.BlobListOption.prefix(prefix), Storage.BlobListOption.pageToken(lastKey));
64
+ for (Blob blob : blobs.iterateAll()) {
65
+ if (blob.getSize() > 0) {
66
+ builder.add(blob.getName(), blob.getSize());
145
67
  }
146
- lastKey = objects.getNextPageToken();
147
- listObjects.setPageToken(lastKey);
148
- } while (lastKey != null);
68
+ LOG.debug("filename: {}", blob.getName());
69
+ LOG.debug("updated: {}", blob.getUpdateTime());
70
+ }
149
71
  }
150
- catch (IOException e) {
151
- if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
72
+ catch (RuntimeException e) {
73
+ if ((e instanceof StorageException) && ((StorageException) e).getCode() == 400) {
152
74
  throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
153
75
  }
154
76
 
155
- log.warn(String.format("Could not get file list from bucket:%s", bucket));
156
- log.warn(e.getMessage());
77
+ LOG.warn(String.format("Could not get file list from bucket:%s", bucket));
78
+ LOG.warn(e.getMessage());
157
79
  }
80
+ return builder.build();
158
81
  }
159
82
 
160
83
  // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
161
- private static String base64Encode(String path)
84
+ @VisibleForTesting
85
+ static String base64Encode(String path)
162
86
  {
163
87
  byte[] encoding;
164
88
  byte[] utf8 = path.getBytes(Charsets.UTF_8);
165
- log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
89
+ LOG.debug("path string: {} ,path length:{} \" + ", path, utf8.length);
166
90
 
167
91
  encoding = new byte[utf8.length + 2];
168
92
  encoding[0] = 0x0a;
169
- encoding[1] = new Byte(String.valueOf(path.length()));
93
+ encoding[1] = Byte.valueOf(String.valueOf(path.length()));
170
94
  System.arraycopy(utf8, 0, encoding, 2, utf8.length);
171
95
 
172
96
  String s = BaseEncoding.base64().encode(encoding);
173
- log.debug(String.format("last_path(base64 encoded): %s", s));
97
+ LOG.debug("last_path(base64 encoded): {}", s);
174
98
  return s;
175
99
  }
176
100
 
177
- public enum AuthMethod
101
+ private static void printBucketInfo(Storage client, String bucket)
178
102
  {
179
- private_key("private_key"),
180
- compute_engine("compute_engine"),
181
- json_key("json_key");
182
-
183
- private final String string;
184
-
185
- AuthMethod(String string)
186
- {
187
- this.string = string;
188
- }
189
-
190
- public String getString()
191
- {
192
- return string;
193
- }
103
+ // get Bucket
104
+ Storage.BucketGetOption fields = Storage.BucketGetOption.fields(
105
+ Storage.BucketField.LOCATION,
106
+ Storage.BucketField.TIME_CREATED,
107
+ Storage.BucketField.OWNER
108
+ );
109
+ com.google.cloud.storage.Bucket bk = client.get(bucket, fields);
110
+ LOG.debug("bucket name: {}", bk.getName());
111
+ LOG.debug("bucket location: {}", bk.getLocation());
112
+ LOG.debug("bucket timeCreated: {}", bk.getCreateTime());
113
+ LOG.debug("bucket owner: {}", bk.getOwner());
194
114
  }
195
115
  }
@@ -1,6 +1,5 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.services.storage.Storage;
4
3
  import com.google.common.base.Throwables;
5
4
  import org.embulk.config.ConfigDiff;
6
5
  import org.embulk.config.ConfigException;
@@ -11,19 +10,14 @@ import org.embulk.spi.Exec;
11
10
  import org.embulk.spi.FileInputPlugin;
12
11
  import org.embulk.spi.TransactionalFileInput;
13
12
  import org.embulk.spi.unit.LocalFile;
14
- import org.slf4j.Logger;
15
13
 
16
14
  import java.io.IOException;
17
- import java.security.GeneralSecurityException;
18
15
  import java.util.List;
19
16
  import java.util.Optional;
20
- import java.util.function.Function;
21
17
 
22
18
  public class GcsFileInputPlugin
23
19
  implements FileInputPlugin
24
20
  {
25
- private static final Logger log = Exec.getLogger(GcsFileInputPlugin.class);
26
-
27
21
  @Override
28
22
  public ConfigDiff transaction(ConfigSource config,
29
23
  FileInputPlugin.Control control)
@@ -42,12 +36,12 @@ public class GcsFileInputPlugin
42
36
  }
43
37
  }
44
38
 
45
- if (task.getAuthMethod().getString().equals("json_key")) {
39
+ if (AuthUtils.AuthMethod.json_key.equals(task.getAuthMethod())) {
46
40
  if (!task.getJsonKeyfile().isPresent()) {
47
41
  throw new ConfigException("If auth_method is json_key, you have to set json_keyfile");
48
42
  }
49
43
  }
50
- else if (task.getAuthMethod().getString().equals("private_key")) {
44
+ else if (AuthUtils.AuthMethod.private_key.equals(task.getAuthMethod())) {
51
45
  if (!task.getP12Keyfile().isPresent() || !task.getServiceAccountEmail().isPresent()) {
52
46
  throw new ConfigException("If auth_method is private_key, you have to set both service_account_email and p12_keyfile");
53
47
  }
@@ -60,11 +54,9 @@ public class GcsFileInputPlugin
60
54
  }
61
55
  }
62
56
 
63
- Storage client = GcsFileInput.newGcsClient(task, newGcsAuth(task));
64
-
65
57
  // list files recursively if path_prefix is specified
66
58
  if (task.getPathPrefix().isPresent()) {
67
- task.setFiles(GcsFileInput.listFiles(task, client));
59
+ task.setFiles(GcsFileInput.listFiles(task));
68
60
  }
69
61
  else {
70
62
  if (task.getPathFiles().isEmpty()) {
@@ -80,22 +72,6 @@ public class GcsFileInputPlugin
80
72
  return resume(task.dump(), task.getFiles().getTaskCount(), control);
81
73
  }
82
74
 
83
- private GcsAuthentication newGcsAuth(PluginTask task)
84
- {
85
- try {
86
- return new GcsAuthentication(
87
- task.getAuthMethod().getString(),
88
- task.getServiceAccountEmail(),
89
- task.getP12Keyfile().map(localFileToPathString()),
90
- task.getJsonKeyfile().map(localFileToPathString()),
91
- task.getApplicationName()
92
- );
93
- }
94
- catch (GeneralSecurityException | IOException ex) {
95
- throw new ConfigException(ex);
96
- }
97
- }
98
-
99
75
  @Override
100
76
  public ConfigDiff resume(TaskSource taskSource,
101
77
  int taskCount,
@@ -121,17 +97,6 @@ public class GcsFileInputPlugin
121
97
  {
122
98
  }
123
99
 
124
- private Function<LocalFile, String> localFileToPathString()
125
- {
126
- return new Function<LocalFile, String>()
127
- {
128
- public String apply(LocalFile file)
129
- {
130
- return file.getPath().toString();
131
- }
132
- };
133
- }
134
-
135
100
  @Override
136
101
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
137
102
  {
@@ -5,13 +5,12 @@ import org.embulk.config.ConfigDefault;
5
5
  import org.embulk.config.ConfigInject;
6
6
  import org.embulk.config.Task;
7
7
  import org.embulk.spi.BufferAllocator;
8
- import org.embulk.spi.unit.LocalFile;
9
8
 
10
9
  import java.util.List;
11
10
  import java.util.Optional;
12
11
 
13
12
  public interface PluginTask
14
- extends Task, FileList.Task
13
+ extends Task, AuthUtils.Task, FileList.Task, RetryUtils.Task
15
14
  {
16
15
  @Config("bucket")
17
16
  String getBucket();
@@ -28,44 +27,17 @@ public interface PluginTask
28
27
  @ConfigDefault("true")
29
28
  boolean getIncremental();
30
29
 
31
- @Config("auth_method")
32
- @ConfigDefault("\"private_key\"")
33
- GcsFileInput.AuthMethod getAuthMethod();
34
-
35
- @Config("service_account_email")
36
- @ConfigDefault("null")
37
- Optional<String> getServiceAccountEmail();
38
-
39
30
  @Config("application_name")
40
31
  @ConfigDefault("\"Embulk GCS input plugin\"")
41
32
  String getApplicationName();
42
33
 
43
- // kept for backward compatibility
44
- @Config("p12_keyfile_fullpath")
45
- @ConfigDefault("null")
46
- Optional<String> getP12KeyfileFullpath();
47
-
48
- @Config("p12_keyfile")
49
- @ConfigDefault("null")
50
- Optional<LocalFile> getP12Keyfile();
51
- void setP12Keyfile(Optional<LocalFile> p12Keyfile);
52
-
53
- @Config("json_keyfile")
54
- @ConfigDefault("null")
55
- Optional<LocalFile> getJsonKeyfile();
56
-
57
34
  @Config("paths")
58
35
  @ConfigDefault("[]")
59
36
  List<String> getPathFiles();
60
- void setPathFiles(List<String> files);
61
37
 
62
38
  FileList getFiles();
63
39
  void setFiles(FileList files);
64
40
 
65
- @Config("max_connection_retry")
66
- @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
67
- int getMaxConnectionRetry();
68
-
69
41
  @ConfigInject
70
42
  BufferAllocator getBufferAllocator();
71
43
  }
@@ -0,0 +1,153 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.api.client.auth.oauth2.TokenErrorResponse;
4
+ import com.google.api.client.auth.oauth2.TokenResponseException;
5
+ import com.google.api.client.googleapis.json.GoogleJsonResponseException;
6
+ import com.google.cloud.storage.Blob;
7
+ import com.google.cloud.storage.Storage;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.spi.Exec;
11
+ import org.embulk.spi.util.RetryExecutor;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.util.Optional;
15
+ import java.util.function.Predicate;
16
+
17
+ class RetryUtils
18
+ {
19
+ interface Task extends org.embulk.config.Task
20
+ {
21
+ @Config("max_connection_retry")
22
+ @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
23
+ int getMaxConnectionRetry();
24
+
25
+ @Config("initial_retry_interval_millis")
26
+ @ConfigDefault("1000")
27
+ int getInitialRetryIntervalMillis();
28
+
29
+ @Config("maximum_retry_interval_millis")
30
+ @ConfigDefault("300000")
31
+ int getMaximumRetryIntervalMillis();
32
+ }
33
+
34
+ private RetryUtils()
35
+ {
36
+ }
37
+
38
+ private static final Logger LOG = Exec.getLogger(RetryUtils.class);
39
+
40
+ /**
41
+ * A utility predicate to detect status code 4xx of `GoogleJsonResponseException`
42
+ */
43
+ private static final Predicate<GoogleJsonResponseException> API_ERROR_NOT_RETRY_4XX = e -> {
44
+ if (e.getDetails() == null && e.getContent() != null) {
45
+ LOG.warn("Invalid response was returned : {}", e.getContent());
46
+ return true;
47
+ }
48
+ int statusCode = e.getDetails().getCode();
49
+ return statusCode / 100 != 4;
50
+ };
51
+
52
+ /**
53
+ * A utility predicate to detect status code 4xx of `TokenResponseException`
54
+ * But will retry 400 "Invalid JWT..."
55
+ */
56
+ private static final Predicate<TokenResponseException> TOKEN_ERROR_NOT_RETRY_4XX = e -> {
57
+ Optional<String> errDesc = Optional.ofNullable(e.getDetails()).map(TokenErrorResponse::getErrorDescription);
58
+ if (errDesc.isPresent()) {
59
+ // Retry: 400 BadRequest "Invalid JWT..."
60
+ // Caused by: com.google.api.client.auth.oauth2.TokenResponseException: 400 Bad Request
61
+ // {
62
+ // "error" : "invalid_grant",
63
+ // "error_description" : "Invalid JWT: No valid verifier found for issuer."
64
+ // }
65
+ if (errDesc.get().contains("Invalid JWT")) {
66
+ LOG.warn("Invalid response was returned : {}", errDesc.get());
67
+ return true;
68
+ }
69
+ }
70
+ return e.getStatusCode() / 100 != 4;
71
+ };
72
+
73
+ /**
74
+ * A default (abstract) retryable impl, which makes use of above 2 predicates
75
+ * With default behaviors onRetry, etc.
76
+ *
77
+ * @param <T>
78
+ */
79
+ public abstract static class DefaultRetryable<T> implements RetryExecutor.Retryable<T>
80
+ {
81
+ @Override
82
+ public boolean isRetryableException(Exception exception)
83
+ {
84
+ if (exception instanceof GoogleJsonResponseException) {
85
+ return API_ERROR_NOT_RETRY_4XX.test((GoogleJsonResponseException) exception);
86
+ }
87
+ else if (exception instanceof TokenResponseException) {
88
+ return TOKEN_ERROR_NOT_RETRY_4XX.test((TokenResponseException) exception);
89
+ }
90
+ return true;
91
+ }
92
+
93
+ @Override
94
+ public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
95
+ {
96
+ String message = String.format("GCS GET request failed. Retrying %d/%d after %d seconds. Message: %s: %s",
97
+ retryCount, retryLimit, retryWait / 1000, exception.getClass(), exception.getMessage());
98
+ if (retryCount % 3 == 0) {
99
+ LOG.warn(message, exception);
100
+ }
101
+ else {
102
+ LOG.warn(message);
103
+ }
104
+ }
105
+
106
+ @Override
107
+ public void onGiveup(Exception firstException, Exception lastException)
108
+ {
109
+ }
110
+ }
111
+
112
+ /**
113
+ * Return Blob GET op that is ready for {@code withRetry}
114
+ *
115
+ * @param client
116
+ * @param bucket
117
+ * @param key
118
+ * @return
119
+ */
120
+ static DefaultRetryable<Blob> get(Storage client, String bucket, String key)
121
+ {
122
+ return new DefaultRetryable<Blob>()
123
+ {
124
+ @Override
125
+ public Blob call()
126
+ {
127
+ return client.get(bucket, key);
128
+ }
129
+ };
130
+ }
131
+
132
+ /**
133
+ * Utility method
134
+ *
135
+ * @param task
136
+ * @param op
137
+ * @param <T>
138
+ * @return
139
+ */
140
+ static <T> T withRetry(Task task, RetryExecutor.Retryable<T> op)
141
+ {
142
+ try {
143
+ return RetryExecutor.retryExecutor()
144
+ .withInitialRetryWait(task.getInitialRetryIntervalMillis())
145
+ .withMaxRetryWait(task.getMaximumRetryIntervalMillis())
146
+ .withRetryLimit(task.getMaxConnectionRetry())
147
+ .runInterruptible(op);
148
+ }
149
+ catch (RetryExecutor.RetryGiveupException | InterruptedException e) {
150
+ throw new RuntimeException(e);
151
+ }
152
+ }
153
+ }