embulk-input-gcs 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.api.client.util.IOUtils;
4
+ import com.google.api.services.storage.Storage;
5
+ import com.google.common.annotations.VisibleForTesting;
6
+ import com.google.common.base.Throwables;
7
+ import org.embulk.spi.Exec;
8
+ import org.embulk.spi.util.InputStreamFileInput;
9
+ import org.embulk.spi.util.ResumableInputStream;
10
+ import org.embulk.spi.util.RetryExecutor;
11
+ import org.slf4j.Logger;
12
+
13
+ import java.io.BufferedInputStream;
14
+ import java.io.BufferedOutputStream;
15
+ import java.io.File;
16
+ import java.io.FileInputStream;
17
+ import java.io.FileOutputStream;
18
+ import java.io.IOException;
19
+ import java.io.InputStream;
20
+ import java.io.InterruptedIOException;
21
+ import java.util.Iterator;
22
+
23
+ import static org.embulk.spi.util.RetryExecutor.retryExecutor;
24
+
25
+ public class SingleFileProvider
26
+ implements InputStreamFileInput.Provider
27
+ {
28
+ private final Storage client;
29
+ private final String bucket;
30
+ private final Iterator<String> iterator;
31
+ private final int maxConnectionRetry;
32
+ private boolean opened = false;
33
+
34
+ public SingleFileProvider(PluginTask task, int taskIndex)
35
+ {
36
+ this.client = GcsFileInput.newGcsClient(task, GcsFileInput.newGcsAuth(task));
37
+ this.bucket = task.getBucket();
38
+ this.iterator = task.getFiles().get(taskIndex).iterator();
39
+ this.maxConnectionRetry = task.getMaxConnectionRetry();
40
+ }
41
+
42
+ @Override
43
+ public InputStream openNext() throws IOException
44
+ {
45
+ if (opened) {
46
+ return null;
47
+ }
48
+ opened = true;
49
+ if (!iterator.hasNext()) {
50
+ return null;
51
+ }
52
+ String key = iterator.next();
53
+ Storage.Objects.Get getObject = client.objects().get(bucket, key);
54
+ File tempFile = Exec.getTempFileSpace().createTempFile();
55
+ try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
56
+ IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
57
+ }
58
+ return new ResumableInputStream(new BufferedInputStream(new FileInputStream(tempFile)), new GcsInputStreamReopener(tempFile, client, bucket, key, maxConnectionRetry));
59
+ }
60
+
61
+ @Override
62
+ public void close()
63
+ {
64
+ }
65
+
66
+ @VisibleForTesting
67
+ static class GcsInputStreamReopener
68
+ implements ResumableInputStream.Reopener
69
+ {
70
+ private final Logger log = Exec.getLogger(GcsInputStreamReopener.class);
71
+ private final File tempFile;
72
+ private final Storage client;
73
+ private final String bucket;
74
+ private final String key;
75
+ private final int maxConnectionRetry;
76
+
77
+ public GcsInputStreamReopener(File tempFile, Storage client, String bucket, String key, int maxConnectionRetry)
78
+ {
79
+ this.tempFile = tempFile;
80
+ this.client = client;
81
+ this.bucket = bucket;
82
+ this.key = key;
83
+ this.maxConnectionRetry = maxConnectionRetry;
84
+ }
85
+
86
+ @Override
87
+ public InputStream reopen(final long offset, final Exception closedCause) throws IOException
88
+ {
89
+ try {
90
+ return retryExecutor()
91
+ .withRetryLimit(maxConnectionRetry)
92
+ .withInitialRetryWait(500)
93
+ .withMaxRetryWait(30 * 1000)
94
+ .runInterruptible(new RetryExecutor.Retryable<InputStream>() {
95
+ @Override
96
+ public InputStream call() throws IOException
97
+ {
98
+ log.warn(String.format("GCS read failed. Retrying GET request with %,d bytes offset", offset), closedCause);
99
+ Storage.Objects.Get getObject = client.objects().get(bucket, key);
100
+
101
+ try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(tempFile))) {
102
+ IOUtils.copy(getObject.executeMediaAsInputStream(), outputStream);
103
+ }
104
+ return new BufferedInputStream(new FileInputStream(tempFile));
105
+ }
106
+
107
+ @Override
108
+ public boolean isRetryableException(Exception exception)
109
+ {
110
+ return true; // TODO
111
+ }
112
+
113
+ @Override
114
+ public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
115
+ throws RetryExecutor.RetryGiveupException
116
+ {
117
+ String message = String.format("GCS GET request failed. Retrying %d/%d after %d seconds. Message: %s",
118
+ retryCount, retryLimit, retryWait / 1000, exception.getMessage());
119
+ if (retryCount % 3 == 0) {
120
+ log.warn(message, exception);
121
+ }
122
+ else {
123
+ log.warn(message);
124
+ }
125
+ }
126
+
127
+ @Override
128
+ public void onGiveup(Exception firstException, Exception lastException)
129
+ throws RetryExecutor.RetryGiveupException
130
+ {
131
+ }
132
+ });
133
+ }
134
+ catch (RetryExecutor.RetryGiveupException ex) {
135
+ Throwables.propagateIfInstanceOf(ex.getCause(), IOException.class);
136
+ throw Throwables.propagate(ex.getCause());
137
+ }
138
+ catch (InterruptedException ex) {
139
+ throw new InterruptedIOException();
140
+ }
141
+ }
142
+ }
143
+ }
@@ -11,7 +11,6 @@ import org.embulk.config.ConfigException;
11
11
  import org.embulk.config.ConfigSource;
12
12
  import org.embulk.config.TaskReport;
13
13
  import org.embulk.config.TaskSource;
14
- import org.embulk.input.gcs.GcsFileInputPlugin.PluginTask;
15
14
  import org.embulk.spi.Exec;
16
15
  import org.embulk.spi.FileInputPlugin;
17
16
  import org.embulk.spi.FileInputRunner;
@@ -37,7 +36,6 @@ import java.util.Arrays;
37
36
  import java.util.List;
38
37
 
39
38
  import static org.junit.Assert.assertEquals;
40
- import static org.junit.Assert.assertFalse;
41
39
  import static org.junit.Assume.assumeNotNull;
42
40
 
43
41
  import java.lang.reflect.InvocationTargetException;
@@ -105,7 +103,7 @@ public class TestGcsFileInputPlugin
105
103
  .set("bucket", GCP_BUCKET)
106
104
  .set("path_prefix", "my-prefix");
107
105
 
108
- GcsFileInputPlugin.PluginTask task = config.loadConfig(PluginTask.class);
106
+ PluginTask task = config.loadConfig(PluginTask.class);
109
107
  assertEquals(true, task.getIncremental());
110
108
  assertEquals("private_key", task.getAuthMethod().toString());
111
109
  assertEquals("Embulk GCS input plugin", task.getApplicationName());
@@ -124,8 +122,8 @@ public class TestGcsFileInputPlugin
124
122
  .set("p12_keyfile_fullpath", GCP_P12_KEYFILE)
125
123
  .set("parser", parserConfig(schemaConfig()));
126
124
 
127
- GcsFileInputPlugin.PluginTask task = config.loadConfig(PluginTask.class);
128
- assertFalse(task.getFiles().isEmpty());
125
+ PluginTask task = config.loadConfig(PluginTask.class);
126
+ assertEquals(2, task.getPathFiles().size());
129
127
  }
130
128
 
131
129
  // both path_prefix and paths are not set
@@ -230,7 +228,7 @@ public class TestGcsFileInputPlugin
230
228
 
231
229
  Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
232
230
  method.setAccessible(true);
233
- plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task)); // no errors happens
231
+ GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task)); // no errors happens
234
232
  }
235
233
 
236
234
  @Test(expected = ConfigException.class)
@@ -251,14 +249,16 @@ public class TestGcsFileInputPlugin
251
249
 
252
250
  Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
253
251
  method.setAccessible(true);
254
- plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
252
+ GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
255
253
  }
256
254
 
257
255
  @Test
258
256
  public void testResume()
259
257
  {
260
258
  PluginTask task = config.loadConfig(PluginTask.class);
261
- task.setFiles(Arrays.asList(new String[]{"in/aa/a"}));
259
+ FileList.Builder builder = new FileList.Builder(config);
260
+ builder.add("in/aa/a", 1);
261
+ task.setFiles(builder.build());
262
262
  ConfigDiff configDiff = plugin.resume(task.dump(), 0, new FileInputPlugin.Control()
263
263
  {
264
264
  @Override
@@ -298,12 +298,44 @@ public class TestGcsFileInputPlugin
298
298
 
299
299
  Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
300
300
  method.setAccessible(true);
301
- Storage client = plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
302
- List<String> actual = plugin.listGcsFilesByPrefix(client, GCP_BUCKET, GCP_PATH_PREFIX, Optional.<String>absent());
303
- assertEquals(expected, actual);
301
+ Storage client = GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
302
+ FileList.Builder builder = new FileList.Builder(config);
303
+ GcsFileInput.listGcsFilesByPrefix(builder, client, GCP_BUCKET, GCP_PATH_PREFIX, Optional.<String>absent());
304
+ FileList fileList = builder.build();
305
+ assertEquals(expected.get(0), fileList.get(0).get(0));
306
+ assertEquals(expected.get(1), fileList.get(1).get(0));
304
307
  assertEquals(GCP_BUCKET_DIRECTORY + "sample_02.csv", configDiff.get(String.class, "last_path"));
305
308
  }
306
309
 
310
+ @Test
311
+ public void testListFilesByPrefixWithPattern()
312
+ throws NoSuchMethodException, IllegalAccessException, InvocationTargetException
313
+ {
314
+ List<String> expected = Arrays.asList(
315
+ GCP_BUCKET_DIRECTORY + "sample_01.csv"
316
+ );
317
+
318
+ ConfigSource configWithPattern = config.deepCopy().set("path_match_pattern", "1");
319
+ PluginTask task = configWithPattern.loadConfig(PluginTask.class);
320
+ ConfigDiff configDiff = plugin.transaction(configWithPattern, new FileInputPlugin.Control() {
321
+ @Override
322
+ public List<TaskReport> run(TaskSource taskSource, int taskCount)
323
+ {
324
+ assertEquals(1, taskCount);
325
+ return emptyTaskReports(taskCount);
326
+ }
327
+ });
328
+
329
+ Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
330
+ method.setAccessible(true);
331
+ Storage client = GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
332
+ FileList.Builder builder = new FileList.Builder(configWithPattern);
333
+ GcsFileInput.listGcsFilesByPrefix(builder, client, GCP_BUCKET, GCP_PATH_PREFIX, Optional.<String>absent());
334
+ FileList fileList = builder.build();
335
+ assertEquals(expected.get(0), fileList.get(0).get(0));
336
+ assertEquals(GCP_BUCKET_DIRECTORY + "sample_01.csv", configDiff.get(String.class, "last_path"));
337
+ }
338
+
307
339
  @Test
308
340
  public void testListFilesByPrefixIncrementalFalse() throws Exception
309
341
  {
@@ -324,8 +356,9 @@ public class TestGcsFileInputPlugin
324
356
 
325
357
  Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
326
358
  method.setAccessible(true);
327
- Storage client = plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
328
- plugin.listGcsFilesByPrefix(client, "non-exists-bucket", "prefix", Optional.<String>absent()); // no errors happens
359
+ Storage client = GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
360
+ FileList.Builder builder = new FileList.Builder(config);
361
+ GcsFileInput.listGcsFilesByPrefix(builder, client, "non-exists-bucket", "prefix", Optional.<String>absent()); // no errors happens
329
362
  }
330
363
 
331
364
  @Test
@@ -379,10 +412,10 @@ public class TestGcsFileInputPlugin
379
412
  PluginTask task = config.loadConfig(PluginTask.class);
380
413
  runner.transaction(config, new Control());
381
414
 
382
- Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
415
+ Method method = GcsFileInput.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
383
416
  method.setAccessible(true);
384
- Storage client = plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
385
- task.setFiles(plugin.listFiles(task, client));
417
+ Storage client = GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
418
+ task.setFiles(GcsFileInput.listFiles(task, client));
386
419
 
387
420
  assertRecords(config, output);
388
421
  }
@@ -402,14 +435,14 @@ public class TestGcsFileInputPlugin
402
435
  PluginTask task = config.loadConfig(PluginTask.class);
403
436
  runner.transaction(config, new Control());
404
437
 
405
- Method method = GcsFileInputPlugin.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
438
+ Method method = GcsFileInput.class.getDeclaredMethod("newGcsAuth", PluginTask.class);
406
439
  method.setAccessible(true);
407
- Storage client = plugin.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
440
+ Storage client = GcsFileInput.newGcsClient(task, (GcsAuthentication) method.invoke(plugin, task));
408
441
  File tempFile = Exec.getTempFileSpace().createTempFile();
409
- task.setFiles(plugin.listFiles(task, client));
442
+ task.setFiles(GcsFileInput.listFiles(task, client));
410
443
 
411
444
  String key = GCP_BUCKET_DIRECTORY + "sample_01.csv";
412
- GcsFileInputPlugin.GcsInputStreamReopener opener = new GcsFileInputPlugin.GcsInputStreamReopener(tempFile, client, GCP_BUCKET, key, MAX_CONNECTION_RETRY);
445
+ SingleFileProvider.GcsInputStreamReopener opener = new SingleFileProvider.GcsInputStreamReopener(tempFile, client, GCP_BUCKET, key, MAX_CONNECTION_RETRY);
413
446
  try (InputStream in = opener.reopen(0, new RuntimeException())) {
414
447
  BufferedReader r = new BufferedReader(new InputStreamReader(in));
415
448
  assertEquals("id,account,time,purchase,comment", r.readLine());
@@ -420,7 +453,7 @@ public class TestGcsFileInputPlugin
420
453
  public void testBase64()
421
454
  throws NoSuchMethodException, IllegalAccessException, InvocationTargetException
422
455
  {
423
- Method method = GcsFileInputPlugin.class.getDeclaredMethod("base64Encode", String.class);
456
+ Method method = GcsFileInput.class.getDeclaredMethod("base64Encode", String.class);
424
457
  method.setAccessible(true);
425
458
 
426
459
  assertEquals("CgFj", method.invoke(plugin, "c"));
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-gcs
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-19 00:00:00.000000000 Z
11
+ date: 2018-03-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -58,8 +58,12 @@ files:
58
58
  - gradlew.bat
59
59
  - lib/embulk/input/gcs.rb
60
60
  - settings.gradle
61
+ - src/main/java/org/embulk/input/gcs/FileList.java
61
62
  - src/main/java/org/embulk/input/gcs/GcsAuthentication.java
63
+ - src/main/java/org/embulk/input/gcs/GcsFileInput.java
62
64
  - src/main/java/org/embulk/input/gcs/GcsFileInputPlugin.java
65
+ - src/main/java/org/embulk/input/gcs/PluginTask.java
66
+ - src/main/java/org/embulk/input/gcs/SingleFileProvider.java
63
67
  - src/test/java/org/embulk/input/gcs/TestGcsAuthentication.java
64
68
  - src/test/java/org/embulk/input/gcs/TestGcsFileInputPlugin.java
65
69
  - src/test/resources/sample_01.csv
@@ -67,7 +71,7 @@ files:
67
71
  - src/test/resources/secretkeys.tar.enc
68
72
  - classpath/commons-codec-1.3.jar
69
73
  - classpath/commons-logging-1.1.1.jar
70
- - classpath/embulk-input-gcs-0.2.5.jar
74
+ - classpath/embulk-input-gcs-0.2.6.jar
71
75
  - classpath/google-api-client-1.21.0.jar
72
76
  - classpath/google-api-services-storage-v1-rev59-1.21.0.jar
73
77
  - classpath/google-http-client-1.21.0.jar
Binary file