embulk-input-gcs 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,113 +1,27 @@
1
1
  package org.embulk.input.gcs;
2
2
 
3
- import com.google.api.client.http.HttpResponseException;
4
- import com.google.api.client.util.IOUtils;
5
3
  import com.google.api.services.storage.Storage;
6
- import com.google.api.services.storage.model.Bucket;
7
- import com.google.api.services.storage.model.Objects;
8
- import com.google.api.services.storage.model.StorageObject;
9
- import com.google.common.annotations.VisibleForTesting;
10
- import com.google.common.base.Charsets;
11
4
  import com.google.common.base.Function;
12
5
  import com.google.common.base.Optional;
13
6
  import com.google.common.base.Throwables;
14
- import com.google.common.collect.ImmutableList;
15
- import com.google.common.io.BaseEncoding;
16
- import org.embulk.config.Config;
17
- import org.embulk.config.ConfigDefault;
18
7
  import org.embulk.config.ConfigDiff;
19
8
  import org.embulk.config.ConfigException;
20
- import org.embulk.config.ConfigInject;
21
9
  import org.embulk.config.ConfigSource;
22
- import org.embulk.config.Task;
23
10
  import org.embulk.config.TaskReport;
24
11
  import org.embulk.config.TaskSource;
25
- import org.embulk.spi.BufferAllocator;
26
12
  import org.embulk.spi.Exec;
27
13
  import org.embulk.spi.FileInputPlugin;
28
14
  import org.embulk.spi.TransactionalFileInput;
29
15
  import org.embulk.spi.unit.LocalFile;
30
- import org.embulk.spi.util.InputStreamFileInput;
31
- import org.embulk.spi.util.ResumableInputStream;
32
- import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
33
- import org.embulk.spi.util.RetryExecutor.Retryable;
34
16
  import org.slf4j.Logger;
35
- import static org.embulk.spi.util.RetryExecutor.retryExecutor;
36
17
 
37
- import java.io.BufferedInputStream;
38
- import java.io.BufferedOutputStream;
39
- import java.io.File;
40
- import java.io.FileInputStream;
41
- import java.io.FileOutputStream;
42
18
  import java.io.IOException;
43
- import java.io.InputStream;
44
- import java.io.InterruptedIOException;
45
- import java.math.BigInteger;
46
19
  import java.security.GeneralSecurityException;
47
- import java.util.ArrayList;
48
- import java.util.Collections;
49
20
  import java.util.List;
50
21
 
51
22
  public class GcsFileInputPlugin
52
23
  implements FileInputPlugin
53
24
  {
54
- public interface PluginTask
55
- extends Task
56
- {
57
- @Config("bucket")
58
- String getBucket();
59
-
60
- @Config("path_prefix")
61
- @ConfigDefault("null")
62
- Optional<String> getPathPrefix();
63
-
64
- @Config("last_path")
65
- @ConfigDefault("null")
66
- Optional<String> getLastPath();
67
-
68
- @Config("incremental")
69
- @ConfigDefault("true")
70
- boolean getIncremental();
71
-
72
- @Config("auth_method")
73
- @ConfigDefault("\"private_key\"")
74
- AuthMethod getAuthMethod();
75
-
76
- @Config("service_account_email")
77
- @ConfigDefault("null")
78
- Optional<String> getServiceAccountEmail();
79
-
80
- @Config("application_name")
81
- @ConfigDefault("\"Embulk GCS input plugin\"")
82
- String getApplicationName();
83
-
84
- // kept for backward compatibility
85
- @Config("p12_keyfile_fullpath")
86
- @ConfigDefault("null")
87
- Optional<String> getP12KeyfileFullpath();
88
-
89
- @Config("p12_keyfile")
90
- @ConfigDefault("null")
91
- Optional<LocalFile> getP12Keyfile();
92
- void setP12Keyfile(Optional<LocalFile> p12Keyfile);
93
-
94
- @Config("json_keyfile")
95
- @ConfigDefault("null")
96
- Optional<LocalFile> getJsonKeyfile();
97
-
98
- @Config("paths")
99
- @ConfigDefault("[]")
100
- List<String> getFiles();
101
- void setFiles(List<String> files);
102
-
103
- @Config("max_connection_retry")
104
- @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
105
- int getMaxConnectionRetry();
106
-
107
- @ConfigInject
108
- BufferAllocator getBufferAllocator();
109
- }
110
-
111
25
  private static final Logger log = Exec.getLogger(GcsFileInputPlugin.class);
112
26
 
113
27
  @Override
@@ -146,19 +60,24 @@ public class GcsFileInputPlugin
146
60
  }
147
61
  }
148
62
 
149
- Storage client = newGcsClient(task, newGcsAuth(task));
63
+ Storage client = GcsFileInput.newGcsClient(task, newGcsAuth(task));
150
64
 
151
65
  // list files recursively if path_prefix is specified
152
66
  if (task.getPathPrefix().isPresent()) {
153
- task.setFiles(listFiles(task, client));
67
+ task.setFiles(GcsFileInput.listFiles(task, client));
154
68
  }
155
69
  else {
156
- if (task.getFiles().isEmpty()) {
70
+ if (task.getPathFiles().isEmpty()) {
157
71
  throw new ConfigException("No file is found. Confirm paths option isn't empty");
158
72
  }
73
+ FileList.Builder builder = new FileList.Builder(config);
74
+ for (String file: task.getPathFiles()) {
75
+ builder.add(file, 1);
76
+ }
77
+ task.setFiles(builder.build());
159
78
  }
160
79
  // number of processors is same with number of files
161
- return resume(task.dump(), task.getFiles().size(), control);
80
+ return resume(task.dump(), task.getFiles().getTaskCount(), control);
162
81
  }
163
82
 
164
83
  private GcsAuthentication newGcsAuth(PluginTask task)
@@ -188,18 +107,8 @@ public class GcsFileInputPlugin
188
107
 
189
108
  ConfigDiff configDiff = Exec.newConfigDiff();
190
109
 
191
- List<String> files = new ArrayList<String>(task.getFiles());
192
110
  if (task.getIncremental()) {
193
- if (files.isEmpty()) {
194
- // keep the last value if any
195
- if (task.getLastPath().isPresent()) {
196
- configDiff.set("last_path", task.getLastPath().get());
197
- }
198
- }
199
- else {
200
- Collections.sort(files);
201
- configDiff.set("last_path", files.get(files.size() - 1));
202
- }
111
+ configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
203
112
  }
204
113
 
205
114
  return configDiff;
@@ -212,19 +121,6 @@ public class GcsFileInputPlugin
212
121
  {
213
122
  }
214
123
 
215
- protected Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
216
- {
217
- Storage client = null;
218
- try {
219
- client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
220
- }
221
- catch (IOException ex) {
222
- throw new ConfigException(ex);
223
- }
224
-
225
- return client;
226
- }
227
-
228
124
  private Function<LocalFile, String> localFileToPathString()
229
125
  {
230
126
  return new Function<LocalFile, String>()
@@ -236,258 +132,10 @@ public class GcsFileInputPlugin
236
132
  };
237
133
  }
238
134
 
239
- public List<String> listFiles(PluginTask task, Storage client)
240
- {
241
- String bucket = task.getBucket();
242
-
243
- return listGcsFilesByPrefix(client, bucket, task.getPathPrefix().get(), task.getLastPath());
244
- }
245
-
246
- /**
247
- * Lists GCS filenames filtered by prefix.
248
- *
249
- * The resulting list does not include the file that's size == 0.
250
- */
251
- public static List<String> listGcsFilesByPrefix(Storage client, String bucket,
252
- String prefix, Optional<String> lastPath)
253
- {
254
- ImmutableList.Builder<String> builder = ImmutableList.builder();
255
-
256
- String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
257
-
258
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
259
- if (log.isDebugEnabled()) {
260
- try {
261
- Storage.Buckets.Get getBucket = client.buckets().get(bucket);
262
- getBucket.setProjection("full");
263
- Bucket bk = getBucket.execute();
264
-
265
- log.debug("bucket name: " + bucket);
266
- log.debug("bucket location: " + bk.getLocation());
267
- log.debug("bucket timeCreated: " + bk.getTimeCreated());
268
- log.debug("bucket owner: " + bk.getOwner());
269
- }
270
- catch (IOException e) {
271
- log.warn("Could not access to bucket:" + bucket);
272
- log.warn(e.getMessage());
273
- }
274
- }
275
-
276
- try {
277
- // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
278
- Storage.Objects.List listObjects = client.objects().list(bucket);
279
- listObjects.setPrefix(prefix);
280
- listObjects.setPageToken(lastKey);
281
- do {
282
- Objects objects = listObjects.execute();
283
- List<StorageObject> items = objects.getItems();
284
- if (items == null) {
285
- log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
286
- break;
287
- }
288
- for (StorageObject o : items) {
289
- if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
290
- builder.add(o.getName());
291
- }
292
- log.debug("filename: " + o.getName());
293
- log.debug("updated: " + o.getUpdated());
294
- }
295
- lastKey = objects.getNextPageToken();
296
- listObjects.setPageToken(lastKey);
297
- } while (lastKey != null);
298
- }
299
- catch (IOException e) {
300
- if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
301
- throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
302
- }
303
-
304
- log.warn(String.format("Could not get file list from bucket:%s", bucket));
305
- log.warn(e.getMessage());
306
- }
307
-
308
- return builder.build();
309
- }
310
-
311
135
  @Override
312
136
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
313
137
  {
314
138
  PluginTask task = taskSource.loadTask(PluginTask.class);
315
139
  return new GcsFileInput(task, taskIndex);
316
140
  }
317
-
318
- @VisibleForTesting
319
- static class GcsInputStreamReopener
320
- implements ResumableInputStream.Reopener
321
- {
322
- private final Logger log = Exec.getLogger(GcsInputStreamReopener.class);
323
- private final File tempFile;
324
- private final Storage client;
325
- private final String bucket;
326
- private final String key;
327
- private final int maxConnectionRetry;
328
-
329
- public GcsInputStreamReopener(File tempFile, Storage client, String bucket, String key, int maxConnectionRetry)
330
- {
331
- this.tempFile = tempFile;
332
- this.client = client;
333
- this.bucket = bucket;
334
- this.key = key;
335
- this.maxConnectionRetry = maxConnectionRetry;
336
- }
337
-
338
- @Override
339
- public InputStream reopen(final long offset, final Exception closedCause) throws IOException
340
- {
341
- try {
342
- return retryExecutor()
343
- .withRetryLimit(maxConnectionRetry)
344
- .withInitialRetryWait(500)
345
- .withMaxRetryWait(30 * 1000)
346
- .runInterruptible(new Retryable<InputStream>() {
347
- @Override
348
- public InputStream call() throws IOException
349
- {
350
- log.warn(String.format("GCS read failed. Retrying GET request with %,d bytes offset", offset), closedCause);
351
- Storage.Objects.Get getObject = client.objects().get(bucket, key);
352
-
353
- try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
354
- IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
355
- }
356
- return new BufferedInputStream(new FileInputStream(tempFile));
357
- }
358
-
359
- @Override
360
- public boolean isRetryableException(Exception exception)
361
- {
362
- return true; // TODO
363
- }
364
-
365
- @Override
366
- public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
367
- throws RetryGiveupException
368
- {
369
- String message = String.format("GCS GET request failed. Retrying %d/%d after %d seconds. Message: %s",
370
- retryCount, retryLimit, retryWait / 1000, exception.getMessage());
371
- if (retryCount % 3 == 0) {
372
- log.warn(message, exception);
373
- }
374
- else {
375
- log.warn(message);
376
- }
377
- }
378
-
379
- @Override
380
- public void onGiveup(Exception firstException, Exception lastException)
381
- throws RetryGiveupException
382
- {
383
- }
384
- });
385
- }
386
- catch (RetryGiveupException ex) {
387
- Throwables.propagateIfInstanceOf(ex.getCause(), IOException.class);
388
- throw Throwables.propagate(ex.getCause());
389
- }
390
- catch (InterruptedException ex) {
391
- throw new InterruptedIOException();
392
- }
393
- }
394
- }
395
-
396
- public class GcsFileInput
397
- extends InputStreamFileInput
398
- implements TransactionalFileInput
399
- {
400
- public GcsFileInput(PluginTask task, int taskIndex)
401
- {
402
- super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
403
- }
404
-
405
- public void abort()
406
- {
407
- }
408
-
409
- public TaskReport commit()
410
- {
411
- return Exec.newTaskReport();
412
- }
413
-
414
- @Override
415
- public void close()
416
- {
417
- }
418
- }
419
-
420
- private class SingleFileProvider
421
- implements InputStreamFileInput.Provider
422
- {
423
- private final Storage client;
424
- private final String bucket;
425
- private final String key;
426
- private final int maxConnectionRetry;
427
- private boolean opened = false;
428
-
429
- public SingleFileProvider(PluginTask task, int taskIndex)
430
- {
431
- this.client = newGcsClient(task, newGcsAuth(task));
432
- this.bucket = task.getBucket();
433
- this.key = task.getFiles().get(taskIndex);
434
- this.maxConnectionRetry = task.getMaxConnectionRetry();
435
- }
436
-
437
- @Override
438
- public InputStream openNext() throws IOException
439
- {
440
- if (opened) {
441
- return null;
442
- }
443
- opened = true;
444
- Storage.Objects.Get getObject = client.objects().get(bucket, key);
445
- File tempFile = Exec.getTempFileSpace().createTempFile();
446
- try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
447
- IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
448
- }
449
- return new ResumableInputStream(new BufferedInputStream(new FileInputStream(tempFile)), new GcsInputStreamReopener(tempFile, client, bucket, key, maxConnectionRetry));
450
- }
451
-
452
- @Override
453
- public void close()
454
- {
455
- }
456
- }
457
-
458
- // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
459
- private static String base64Encode(String path)
460
- {
461
- byte[] encoding;
462
- byte[] utf8 = path.getBytes(Charsets.UTF_8);
463
- log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
464
-
465
- encoding = new byte[utf8.length + 2];
466
- encoding[0] = 0x0a;
467
- encoding[1] = new Byte(String.valueOf(path.length()));
468
- System.arraycopy(utf8, 0, encoding, 2, utf8.length);
469
-
470
- String s = BaseEncoding.base64().encode(encoding);
471
- log.debug(String.format("last_path(base64 encoded): %s", s));
472
- return s;
473
- }
474
-
475
- public enum AuthMethod
476
- {
477
- private_key("private_key"),
478
- compute_engine("compute_engine"),
479
- json_key("json_key");
480
-
481
- private final String string;
482
-
483
- AuthMethod(String string)
484
- {
485
- this.string = string;
486
- }
487
-
488
- public String getString()
489
- {
490
- return string;
491
- }
492
- }
493
141
  }
@@ -0,0 +1,71 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigInject;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.spi.BufferAllocator;
9
+ import org.embulk.spi.unit.LocalFile;
10
+
11
+ import java.util.List;
12
+
13
+ public interface PluginTask
14
+ extends Task, FileList.Task
15
+ {
16
+ @Config("bucket")
17
+ String getBucket();
18
+
19
+ @Config("path_prefix")
20
+ @ConfigDefault("null")
21
+ Optional<String> getPathPrefix();
22
+
23
+ @Config("last_path")
24
+ @ConfigDefault("null")
25
+ Optional<String> getLastPath();
26
+
27
+ @Config("incremental")
28
+ @ConfigDefault("true")
29
+ boolean getIncremental();
30
+
31
+ @Config("auth_method")
32
+ @ConfigDefault("\"private_key\"")
33
+ GcsFileInput.AuthMethod getAuthMethod();
34
+
35
+ @Config("service_account_email")
36
+ @ConfigDefault("null")
37
+ Optional<String> getServiceAccountEmail();
38
+
39
+ @Config("application_name")
40
+ @ConfigDefault("\"Embulk GCS input plugin\"")
41
+ String getApplicationName();
42
+
43
+ // kept for backward compatibility
44
+ @Config("p12_keyfile_fullpath")
45
+ @ConfigDefault("null")
46
+ Optional<String> getP12KeyfileFullpath();
47
+
48
+ @Config("p12_keyfile")
49
+ @ConfigDefault("null")
50
+ Optional<LocalFile> getP12Keyfile();
51
+ void setP12Keyfile(Optional<LocalFile> p12Keyfile);
52
+
53
+ @Config("json_keyfile")
54
+ @ConfigDefault("null")
55
+ Optional<LocalFile> getJsonKeyfile();
56
+
57
+ @Config("paths")
58
+ @ConfigDefault("[]")
59
+ List<String> getPathFiles();
60
+ void setPathFiles(List<String> files);
61
+
62
+ FileList getFiles();
63
+ void setFiles(FileList files);
64
+
65
+ @Config("max_connection_retry")
66
+ @ConfigDefault("10") // 10 times retry to connect GCS server if failed.
67
+ int getMaxConnectionRetry();
68
+
69
+ @ConfigInject
70
+ BufferAllocator getBufferAllocator();
71
+ }