embulk-input-gcs 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,113 +1,27 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.client.http.HttpResponseException;
4
- import com.google.api.client.util.IOUtils;
5
3
  import com.google.api.services.storage.Storage;
6
- import com.google.api.services.storage.model.Bucket;
7
- import com.google.api.services.storage.model.Objects;
8
- import com.google.api.services.storage.model.StorageObject;
9
- import com.google.common.annotations.VisibleForTesting;
10
- import com.google.common.base.Charsets;
11
4
  import com.google.common.base.Function;
12
5
  import com.google.common.base.Optional;
13
6
  import com.google.common.base.Throwables;
14
- import com.google.common.collect.ImmutableList;
15
- import com.google.common.io.BaseEncoding;
16
- import org.embulk.config.Config;
17
- import org.embulk.config.ConfigDefault;
18
7
  import org.embulk.config.ConfigDiff;
19
8
  import org.embulk.config.ConfigException;
20
- import org.embulk.config.ConfigInject;
21
9
  import org.embulk.config.ConfigSource;
22
- import org.embulk.config.Task;
23
10
  import org.embulk.config.TaskReport;
24
11
  import org.embulk.config.TaskSource;
25
- import org.embulk.spi.BufferAllocator;
26
12
  import org.embulk.spi.Exec;
27
13
  import org.embulk.spi.FileInputPlugin;
28
14
  import org.embulk.spi.TransactionalFileInput;
29
15
  import org.embulk.spi.unit.LocalFile;
30
- import org.embulk.spi.util.InputStreamFileInput;
31
- import org.embulk.spi.util.ResumableInputStream;
32
- import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
33
- import org.embulk.spi.util.RetryExecutor.Retryable;
34
16
  import org.slf4j.Logger;
35
- import static org.embulk.spi.util.RetryExecutor.retryExecutor;
36
17
 
37
- import java.io.BufferedInputStream;
38
- import java.io.BufferedOutputStream;
39
- import java.io.File;
40
- import java.io.FileInputStream;
41
- import java.io.FileOutputStream;
42
18
  import java.io.IOException;
43
- import java.io.InputStream;
44
- import java.io.InterruptedIOException;
45
- import java.math.BigInteger;
46
19
  import java.security.GeneralSecurityException;
47
- import java.util.ArrayList;
48
- import java.util.Collections;
49
20
  import java.util.List;
50
21
 
51
22
  public class GcsFileInputPlugin
52
23
  implements FileInputPlugin
53
24
  {
54
- public interface PluginTask
55
- extends Task
56
- {
57
- @Config("bucket")
58
- String getBucket();
59
-
60
- @Config("path_prefix")
61
- @ConfigDefault("null")
62
- Optional<String> getPathPrefix();
63
-
64
- @Config("last_path")
65
- @ConfigDefault("null")
66
- Optional<String> getLastPath();
67
-
68
- @Config("incremental")
69
- @ConfigDefault("true")
70
- boolean getIncremental();
71
-
72
- @Config("auth_method")
73
- @ConfigDefault("\"private_key\"")
74
- AuthMethod getAuthMethod();
75
-
76
- @Config("service_account_email")
77
- @ConfigDefault("null")
78
- Optional<String> getServiceAccountEmail();
79
-
80
- @Config("application_name")
81
- @ConfigDefault("\"Embulk GCS input plugin\"")
82
- String getApplicationName();
83
-
84
- // kept for backward compatibility
85
- @Config("p12_keyfile_fullpath")
86
- @ConfigDefault("null")
87
- Optional<String> getP12KeyfileFullpath();
88
-
89
- @Config("p12_keyfile")
90
- @ConfigDefault("null")
91
- Optional<LocalFile> getP12Keyfile();
92
- void setP12Keyfile(Optional<LocalFile> p12Keyfile);
93
-
94
- @Config("json_keyfile")
95
- @ConfigDefault("null")
96
- Optional<LocalFile> getJsonKeyfile();
97
-
98
- @Config("paths")
99
- @ConfigDefault("[]")
100
- List<String> getFiles();
101
- void setFiles(List<String> files);
102
-
103
- @Config("max_connection_retry")
104
- @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
105
- int getMaxConnectionRetry();
106
-
107
- @ConfigInject
108
- BufferAllocator getBufferAllocator();
109
- }
110
-
111
25
  private static final Logger log = Exec.getLogger(GcsFileInputPlugin.class);
112
26
 
113
27
  @Override
@@ -146,19 +60,24 @@ public class GcsFileInputPlugin
146
60
  }
147
61
  }
148
62
 
149
- Storage client = newGcsClient(task, newGcsAuth(task));
63
+ Storage client = GcsFileInput.newGcsClient(task, newGcsAuth(task));
150
64
 
151
65
  // list files recursively if path_prefix is specified
152
66
  if (task.getPathPrefix().isPresent()) {
153
- task.setFiles(listFiles(task, client));
67
+ task.setFiles(GcsFileInput.listFiles(task, client));
154
68
  }
155
69
  else {
156
- if (task.getFiles().isEmpty()) {
70
+ if (task.getPathFiles().isEmpty()) {
157
71
  throw new ConfigException("No file is found. Confirm paths option isn't empty");
158
72
  }
73
+ FileList.Builder builder = new FileList.Builder(config);
74
+ for (String file: task.getPathFiles()) {
75
+ builder.add(file, 1);
76
+ }
77
+ task.setFiles(builder.build());
159
78
  }
160
79
  // number of processors is same with number of files
161
- return resume(task.dump(), task.getFiles().size(), control);
80
+ return resume(task.dump(), task.getFiles().getTaskCount(), control);
162
81
  }
163
82
 
164
83
  private GcsAuthentication newGcsAuth(PluginTask task)
@@ -188,18 +107,8 @@ public class GcsFileInputPlugin
188
107
 
189
108
  ConfigDiff configDiff = Exec.newConfigDiff();
190
109
 
191
- List<String> files = new ArrayList<String>(task.getFiles());
192
110
  if (task.getIncremental()) {
193
- if (files.isEmpty()) {
194
- // keep the last value if any
195
- if (task.getLastPath().isPresent()) {
196
- configDiff.set("last_path", task.getLastPath().get());
197
- }
198
- }
199
- else {
200
- Collections.sort(files);
201
- configDiff.set("last_path", files.get(files.size() - 1));
202
- }
111
+ configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
203
112
  }
204
113
 
205
114
  return configDiff;
@@ -212,19 +121,6 @@ public class GcsFileInputPlugin
212
121
  {
213
122
  }
214
123
 
215
- protected Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
216
- {
217
- Storage client = null;
218
- try {
219
- client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
220
- }
221
- catch (IOException ex) {
222
- throw new ConfigException(ex);
223
- }
224
-
225
- return client;
226
- }
227
-
228
124
  private Function<LocalFile, String> localFileToPathString()
229
125
  {
230
126
  return new Function<LocalFile, String>()
@@ -236,258 +132,10 @@ public class GcsFileInputPlugin
236
132
  };
237
133
  }
238
134
 
239
- public List<String> listFiles(PluginTask task, Storage client)
240
- {
241
- String bucket = task.getBucket();
242
-
243
- return listGcsFilesByPrefix(client, bucket, task.getPathPrefix().get(), task.getLastPath());
244
- }
245
-
246
- /**
247
- * Lists GCS filenames filtered by prefix.
248
- *
249
- * The resulting list does not include the file that's size == 0.
250
- */
251
- public static List<String> listGcsFilesByPrefix(Storage client, String bucket,
252
- String prefix, Optional<String> lastPath)
253
- {
254
- ImmutableList.Builder<String> builder = ImmutableList.builder();
255
-
256
- String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
257
-
258
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
259
- if (log.isDebugEnabled()) {
260
- try {
261
- Storage.Buckets.Get getBucket = client.buckets().get(bucket);
262
- getBucket.setProjection("full");
263
- Bucket bk = getBucket.execute();
264
-
265
- log.debug("bucket name: " + bucket);
266
- log.debug("bucket location: " + bk.getLocation());
267
- log.debug("bucket timeCreated: " + bk.getTimeCreated());
268
- log.debug("bucket owner: " + bk.getOwner());
269
- }
270
- catch (IOException e) {
271
- log.warn("Could not access to bucket:" + bucket);
272
- log.warn(e.getMessage());
273
- }
274
- }
275
-
276
- try {
277
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
278
- Storage.Objects.List listObjects = client.objects().list(bucket);
279
- listObjects.setPrefix(prefix);
280
- listObjects.setPageToken(lastKey);
281
- do {
282
- Objects objects = listObjects.execute();
283
- List<StorageObject> items = objects.getItems();
284
- if (items == null) {
285
- log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
286
- break;
287
- }
288
- for (StorageObject o : items) {
289
- if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
290
- builder.add(o.getName());
291
- }
292
- log.debug("filename: " + o.getName());
293
- log.debug("updated: " + o.getUpdated());
294
- }
295
- lastKey = objects.getNextPageToken();
296
- listObjects.setPageToken(lastKey);
297
- } while (lastKey != null);
298
- }
299
- catch (IOException e) {
300
- if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
301
- throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
302
- }
303
-
304
- log.warn(String.format("Could not get file list from bucket:%s", bucket));
305
- log.warn(e.getMessage());
306
- }
307
-
308
- return builder.build();
309
- }
310
-
311
135
  @Override
312
136
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
313
137
  {
314
138
  PluginTask task = taskSource.loadTask(PluginTask.class);
315
139
  return new GcsFileInput(task, taskIndex);
316
140
  }
317
-
318
- @VisibleForTesting
319
- static class GcsInputStreamReopener
320
- implements ResumableInputStream.Reopener
321
- {
322
- private final Logger log = Exec.getLogger(GcsInputStreamReopener.class);
323
- private final File tempFile;
324
- private final Storage client;
325
- private final String bucket;
326
- private final String key;
327
- private final int maxConnectionRetry;
328
-
329
- public GcsInputStreamReopener(File tempFile, Storage client, String bucket, String key, int maxConnectionRetry)
330
- {
331
- this.tempFile = tempFile;
332
- this.client = client;
333
- this.bucket = bucket;
334
- this.key = key;
335
- this.maxConnectionRetry = maxConnectionRetry;
336
- }
337
-
338
- @Override
339
- public InputStream reopen(final long offset, final Exception closedCause) throws IOException
340
- {
341
- try {
342
- return retryExecutor()
343
- .withRetryLimit(maxConnectionRetry)
344
- .withInitialRetryWait(500)
345
- .withMaxRetryWait(30 * 1000)
346
- .runInterruptible(new Retryable<InputStream>() {
347
- @Override
348
- public InputStream call() throws IOException
349
- {
350
- log.warn(String.format("GCS read failed. Retrying GET request with %,d bytes offset", offset), closedCause);
351
- Storage.Objects.Get getObject = client.objects().get(bucket, key);
352
-
353
- try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
354
- IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
355
- }
356
- return new BufferedInputStream(new FileInputStream(tempFile));
357
- }
358
-
359
- @Override
360
- public boolean isRetryableException(Exception exception)
361
- {
362
- return true; // TODO
363
- }
364
-
365
- @Override
366
- public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
367
- throws RetryGiveupException
368
- {
369
- String message = String.format("GCS GET request failed. Retrying %d/%d after %d seconds. Message: %s",
370
- retryCount, retryLimit, retryWait / 1000, exception.getMessage());
371
- if (retryCount % 3 == 0) {
372
- log.warn(message, exception);
373
- }
374
- else {
375
- log.warn(message);
376
- }
377
- }
378
-
379
- @Override
380
- public void onGiveup(Exception firstException, Exception lastException)
381
- throws RetryGiveupException
382
- {
383
- }
384
- });
385
- }
386
- catch (RetryGiveupException ex) {
387
- Throwables.propagateIfInstanceOf(ex.getCause(), IOException.class);
388
- throw Throwables.propagate(ex.getCause());
389
- }
390
- catch (InterruptedException ex) {
391
- throw new InterruptedIOException();
392
- }
393
- }
394
- }
395
-
396
- public class GcsFileInput
397
- extends InputStreamFileInput
398
- implements TransactionalFileInput
399
- {
400
- public GcsFileInput(PluginTask task, int taskIndex)
401
- {
402
- super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
403
- }
404
-
405
- public void abort()
406
- {
407
- }
408
-
409
- public TaskReport commit()
410
- {
411
- return Exec.newTaskReport();
412
- }
413
-
414
- @Override
415
- public void close()
416
- {
417
- }
418
- }
419
-
420
- private class SingleFileProvider
421
- implements InputStreamFileInput.Provider
422
- {
423
- private final Storage client;
424
- private final String bucket;
425
- private final String key;
426
- private final int maxConnectionRetry;
427
- private boolean opened = false;
428
-
429
- public SingleFileProvider(PluginTask task, int taskIndex)
430
- {
431
- this.client = newGcsClient(task, newGcsAuth(task));
432
- this.bucket = task.getBucket();
433
- this.key = task.getFiles().get(taskIndex);
434
- this.maxConnectionRetry = task.getMaxConnectionRetry();
435
- }
436
-
437
- @Override
438
- public InputStream openNext() throws IOException
439
- {
440
- if (opened) {
441
- return null;
442
- }
443
- opened = true;
444
- Storage.Objects.Get getObject = client.objects().get(bucket, key);
445
- File tempFile = Exec.getTempFileSpace().createTempFile();
446
- try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
447
- IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
448
- }
449
- return new ResumableInputStream(new BufferedInputStream(new FileInputStream(tempFile)), new GcsInputStreamReopener(tempFile, client, bucket, key, maxConnectionRetry));
450
- }
451
-
452
- @Override
453
- public void close()
454
- {
455
- }
456
- }
457
-
458
- // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
459
- private static String base64Encode(String path)
460
- {
461
- byte[] encoding;
462
- byte[] utf8 = path.getBytes(Charsets.UTF_8);
463
- log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
464
-
465
- encoding = new byte[utf8.length + 2];
466
- encoding[0] = 0x0a;
467
- encoding[1] = new Byte(String.valueOf(path.length()));
468
- System.arraycopy(utf8, 0, encoding, 2, utf8.length);
469
-
470
- String s = BaseEncoding.base64().encode(encoding);
471
- log.debug(String.format("last_path(base64 encoded): %s", s));
472
- return s;
473
- }
474
-
475
- public enum AuthMethod
476
- {
477
- private_key("private_key"),
478
- compute_engine("compute_engine"),
479
- json_key("json_key");
480
-
481
- private final String string;
482
-
483
- AuthMethod(String string)
484
- {
485
- this.string = string;
486
- }
487
-
488
- public String getString()
489
- {
490
- return string;
491
- }
492
- }
493
141
  }
@@ -0,0 +1,71 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigInject;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.spi.BufferAllocator;
9
+ import org.embulk.spi.unit.LocalFile;
10
+
11
+ import java.util.List;
12
+
13
+ public interface PluginTask
14
+ extends Task, FileList.Task
15
+ {
16
+ @Config("bucket")
17
+ String getBucket();
18
+
19
+ @Config("path_prefix")
20
+ @ConfigDefault("null")
21
+ Optional<String> getPathPrefix();
22
+
23
+ @Config("last_path")
24
+ @ConfigDefault("null")
25
+ Optional<String> getLastPath();
26
+
27
+ @Config("incremental")
28
+ @ConfigDefault("true")
29
+ boolean getIncremental();
30
+
31
+ @Config("auth_method")
32
+ @ConfigDefault("\"private_key\"")
33
+ GcsFileInput.AuthMethod getAuthMethod();
34
+
35
+ @Config("service_account_email")
36
+ @ConfigDefault("null")
37
+ Optional<String> getServiceAccountEmail();
38
+
39
+ @Config("application_name")
40
+ @ConfigDefault("\"Embulk GCS input plugin\"")
41
+ String getApplicationName();
42
+
43
+ // kept for backward compatibility
44
+ @Config("p12_keyfile_fullpath")
45
+ @ConfigDefault("null")
46
+ Optional<String> getP12KeyfileFullpath();
47
+
48
+ @Config("p12_keyfile")
49
+ @ConfigDefault("null")
50
+ Optional<LocalFile> getP12Keyfile();
51
+ void setP12Keyfile(Optional<LocalFile> p12Keyfile);
52
+
53
+ @Config("json_keyfile")
54
+ @ConfigDefault("null")
55
+ Optional<LocalFile> getJsonKeyfile();
56
+
57
+ @Config("paths")
58
+ @ConfigDefault("[]")
59
+ List<String> getPathFiles();
60
+ void setPathFiles(List<String> files);
61
+
62
+ FileList getFiles();
63
+ void setFiles(FileList files);
64
+
65
+ @Config("max_connection_retry")
66
+ @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
67
+ int getMaxConnectionRetry();
68
+
69
+ @ConfigInject
70
+ BufferAllocator getBufferAllocator();
71
+ }