embulk-input-gcs 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 01e9c70bf2d1f4c9a25784ab64828913c056078d
4
- data.tar.gz: 22d6b84e12d965045719b49026924f5247a6609a
3
+ metadata.gz: 04cb5b37d8fb8c70e1c9c4c306cf792ac1ad1ec9
4
+ data.tar.gz: 44ec9518fc188320a19ffbe2ce7db0b07cda30f3
5
5
  SHA512:
6
- metadata.gz: d090628aec5d9512976ceff29215f99e04eb939e4e927106bca4bee47fe95cef7e636fc74b239cde7222cc71f5b259b43905b36ad90d3c515f8b97f8b6c9c220
7
- data.tar.gz: 0ad764f0f9f85818b5924b3477bfe7d65806113ca56e689b45e919d3e8c0b156bfbb39cf60e9f349bb1005264655eaf0ad4446115f0254c554c5e425edac746a
6
+ metadata.gz: 3d2a59336002f07d48bcf8b5b22b3626e9c59df7aad93a4f91721667f6f7e1552bd1db72ebd1323e19ce1dc67f32da64b9ad88bc6d6785591904b8d87059249f
7
+ data.tar.gz: 1533eccae86b7355303ab91bc5ab08cf49c816438fef3541ad1318fc24f0eab948667132952746173a2dd3dabea41bc06cfc2493b8b4701c1908d2e07ff9b635
data/.travis.yml CHANGED
@@ -1,3 +1,4 @@
1
+ dist: precise
1
2
  language: java
2
3
 
3
4
  jdk:
@@ -21,3 +22,7 @@ env:
21
22
  script:
22
23
  - ./gradlew gem
23
24
  - ./gradlew --info check jacocoTestReport
25
+ addons:
26
+ hosts:
27
+ - example.com
28
+ hostname: example.com
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ ## 0.2.6 - 2018-03-05
2
+ * [maintenance] Support "path_match_pattern" option [#32](https://github.com/embulk/embulk-input-gcs/pull/32)
3
+
1
4
  ## 0.2.5 - 2017-05-19
2
5
  * [maintenance] Fix InputStream handling so that the plugin no longer reads fewer records than expected [#27](https://github.com/embulk/embulk-input-gcs/pull/27)
3
6
 
data/README.md CHANGED
@@ -40,6 +40,7 @@ embulk run /path/to/config.yml
40
40
  - **bucket** Google Cloud Storage bucket name (string, required)
41
41
  - **path_prefix** prefix of target keys (string, either of "path_prefix" or "paths" is required)
42
42
  - **paths** list of target keys (array of string, either of "path_prefix" or "paths" is required)
43
+ - **path_match_pattern**: regexp to match file paths. If a file path doesn't match this pattern, the file will be skipped (regexp string, optional)
43
44
  - **incremental**: enables incremental loading (boolean, optional, default: true). If incremental loading is enabled, the config diff for the next execution will include the `last_path` parameter so that the next execution skips files before that path. Otherwise, `last_path` will not be included.
44
45
  - **auth_method** (string, optional, "private_key", "json_key" or "compute_engine". default value is "private_key")
45
46
  - **service_account_email** Google Cloud Storage service_account_email (string, required when auth_method is private_key)
@@ -89,6 +90,21 @@ in:
89
90
  out: {type: stdout}
90
91
  ```
91
92
 
93
+ To skip files using regexp:
94
+
95
+ ```yaml
96
+ in:
97
+ type: gcs
98
+ bucket: my-gcs-bucket
99
+ path_prefix: logs/csv-
100
+ # ...
101
+ path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match this pattern
102
+ ## some examples of regexp:
103
+ #path_match_pattern: /archive/ # match files in .../archive/... directory
104
+ #path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
105
+ #path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
106
+ ```
107
+
92
108
  ## Authentication
93
109
 
94
110
  There are three methods supported to fetch access token for the service account.
data/build.gradle CHANGED
@@ -17,7 +17,7 @@ configurations {
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
20
- version = "0.2.5"
20
+ version = "0.2.6"
21
21
 
22
22
  dependencies {
23
23
  compile "org.embulk:embulk-core:0.8.2"
Binary file
@@ -1,6 +1,6 @@
1
- #Wed Jan 13 12:41:02 JST 2016
1
+ #Sun Jan 08 00:35:58 PST 2017
2
2
  distributionBase=GRADLE_USER_HOME
3
3
  distributionPath=wrapper/dists
4
4
  zipStoreBase=GRADLE_USER_HOME
5
5
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-bin.zip
@@ -0,0 +1,335 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import com.google.common.base.Throwables;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.ConfigSource;
11
+
12
+ import java.io.BufferedInputStream;
13
+ import java.io.BufferedOutputStream;
14
+ import java.io.ByteArrayInputStream;
15
+ import java.io.ByteArrayOutputStream;
16
+ import java.io.IOException;
17
+ import java.io.InputStream;
18
+ import java.io.OutputStream;
19
+ import java.nio.ByteBuffer;
20
+ import java.nio.charset.StandardCharsets;
21
+ import java.util.AbstractList;
22
+ import java.util.ArrayList;
23
+ import java.util.List;
24
+ import java.util.regex.Pattern;
25
+ import java.util.zip.GZIPInputStream;
26
+ import java.util.zip.GZIPOutputStream;
27
+
28
+ public class FileList
29
+ {
30
+ public interface Task
31
+ {
32
+ @Config("path_match_pattern")
33
+ @ConfigDefault("\".*\"")
34
+ String getPathMatchPattern();
35
+
36
+ @Config("total_file_count_limit")
37
+ @ConfigDefault("2147483647")
38
+ int getTotalFileCountLimit();
39
+
40
+ // TODO support more algorithms to combine tasks
41
+ @Config("min_task_size")
42
+ @ConfigDefault("0")
43
+ long getMinTaskSize();
44
+ }
45
+
46
+ public static class Entry
47
+ {
48
+ private int index;
49
+ private long size;
50
+
51
+ @JsonCreator
52
+ public Entry(
53
+ @JsonProperty("index") int index,
54
+ @JsonProperty("size") long size)
55
+ {
56
+ this.index = index;
57
+ this.size = size;
58
+ }
59
+
60
+ @JsonProperty("index")
61
+ public int getIndex()
62
+ {
63
+ return index;
64
+ }
65
+
66
+ @JsonProperty("size")
67
+ public long getSize()
68
+ {
69
+ return size;
70
+ }
71
+ }
72
+
73
+ public static class Builder
74
+ {
75
+ private final ByteArrayOutputStream binary;
76
+ private final OutputStream stream;
77
+ private final List<Entry> entries = new ArrayList<>();
78
+ private String last = null;
79
+
80
+ private int limitCount = Integer.MAX_VALUE;
81
+ private long minTaskSize = 1;
82
+ private Pattern pathMatchPattern;
83
+
84
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
85
+
86
+ public Builder(Task task)
87
+ {
88
+ this();
89
+ this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
90
+ this.limitCount = task.getTotalFileCountLimit();
91
+ this.minTaskSize = task.getMinTaskSize();
92
+ }
93
+
94
+ public Builder(ConfigSource config)
95
+ {
96
+ this();
97
+ this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
98
+ this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
99
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
100
+ }
101
+
102
+ public Builder()
103
+ {
104
+ binary = new ByteArrayOutputStream();
105
+ try {
106
+ stream = new BufferedOutputStream(new GZIPOutputStream(binary));
107
+ }
108
+ catch (IOException ex) {
109
+ throw Throwables.propagate(ex);
110
+ }
111
+ }
112
+
113
+ public Builder limitTotalFileCount(int limitCount)
114
+ {
115
+ this.limitCount = limitCount;
116
+ return this;
117
+ }
118
+
119
+ public Builder minTaskSize(long bytes)
120
+ {
121
+ this.minTaskSize = bytes;
122
+ return this;
123
+ }
124
+
125
+ public Builder pathMatchPattern(String pattern)
126
+ {
127
+ this.pathMatchPattern = Pattern.compile(pattern);
128
+ return this;
129
+ }
130
+
131
+ public int size()
132
+ {
133
+ return entries.size();
134
+ }
135
+
136
+ public boolean needsMore()
137
+ {
138
+ return size() < limitCount;
139
+ }
140
+
141
+ // returns true if this file is used
142
+ public synchronized boolean add(String path, long size)
143
+ {
144
+ // TODO throw IllegalStateException if stream is already closed
145
+
146
+ if (!needsMore()) {
147
+ return false;
148
+ }
149
+
150
+ if (!pathMatchPattern.matcher(path).find()) {
151
+ return false;
152
+ }
153
+
154
+ int index = entries.size();
155
+ entries.add(new Entry(index, size));
156
+
157
+ byte[] data = path.getBytes(StandardCharsets.UTF_8);
158
+ castBuffer.putInt(0, data.length);
159
+ try {
160
+ stream.write(castBuffer.array());
161
+ stream.write(data);
162
+ }
163
+ catch (IOException ex) {
164
+ throw Throwables.propagate(ex);
165
+ }
166
+
167
+ last = path;
168
+ return true;
169
+ }
170
+
171
+ public FileList build()
172
+ {
173
+ try {
174
+ stream.close();
175
+ }
176
+ catch (IOException ex) {
177
+ throw Throwables.propagate(ex);
178
+ }
179
+ return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
180
+ }
181
+
182
+ private List<List<Entry>> getSplits(List<Entry> all)
183
+ {
184
+ List<List<Entry>> tasks = new ArrayList<>();
185
+ long currentTaskSize = 0;
186
+ List<Entry> currentTask = new ArrayList<>();
187
+ for (Entry entry : all) {
188
+ currentTask.add(entry);
189
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
190
+ if (currentTaskSize >= minTaskSize) {
191
+ tasks.add(currentTask);
192
+ currentTask = new ArrayList<>();
193
+ currentTaskSize = 0;
194
+ }
195
+ }
196
+ if (!currentTask.isEmpty()) {
197
+ tasks.add(currentTask);
198
+ }
199
+ return tasks;
200
+ }
201
+ }
202
+
203
+ private final byte[] data;
204
+ private final List<List<Entry>> tasks;
205
+ private final Optional<String> last;
206
+
207
+ @JsonCreator
208
+ @Deprecated
209
+ public FileList(
210
+ @JsonProperty("data") byte[] data,
211
+ @JsonProperty("tasks") List<List<Entry>> tasks,
212
+ @JsonProperty("last") Optional<String> last)
213
+ {
214
+ this.data = data;
215
+ this.tasks = tasks;
216
+ this.last = last;
217
+ }
218
+
219
+ @JsonIgnore
220
+ public Optional<String> getLastPath(Optional<String> lastLastPath)
221
+ {
222
+ if (last.isPresent()) {
223
+ return last;
224
+ }
225
+ return lastLastPath;
226
+ }
227
+
228
+ @JsonIgnore
229
+ public int getTaskCount()
230
+ {
231
+ return tasks.size();
232
+ }
233
+
234
+ @JsonIgnore
235
+ public List<String> get(int i)
236
+ {
237
+ return new EntryList(data, tasks.get(i));
238
+ }
239
+
240
+ @JsonProperty("data")
241
+ @Deprecated
242
+ public byte[] getData()
243
+ {
244
+ return data;
245
+ }
246
+
247
+ @JsonProperty("tasks")
248
+ @Deprecated
249
+ public List<List<Entry>> getTasks()
250
+ {
251
+ return tasks;
252
+ }
253
+
254
+ @JsonProperty("last")
255
+ @Deprecated
256
+ public Optional<String> getLast()
257
+ {
258
+ return last;
259
+ }
260
+
261
+ private class EntryList
262
+ extends AbstractList<String>
263
+ {
264
+ private final byte[] data;
265
+ private final List<Entry> entries;
266
+ private InputStream stream;
267
+ private int current;
268
+
269
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
270
+
271
+ public EntryList(byte[] data, List<Entry> entries)
272
+ {
273
+ this.data = data;
274
+ this.entries = entries;
275
+ try {
276
+ this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
277
+ }
278
+ catch (IOException ex) {
279
+ throw Throwables.propagate(ex);
280
+ }
281
+ this.current = 0;
282
+ }
283
+
284
+ @Override
285
+ public synchronized String get(int i)
286
+ {
287
+ Entry e = entries.get(i);
288
+ if (e.getIndex() < current) {
289
+ // rewind to the head
290
+ try {
291
+ stream.close();
292
+ stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
293
+ }
294
+ catch (IOException ex) {
295
+ throw Throwables.propagate(ex);
296
+ }
297
+ current = 0;
298
+ }
299
+
300
+ while (current < e.getIndex()) {
301
+ readNext();
302
+ }
303
+ // now current == e.getIndex()
304
+ return readNextString();
305
+ }
306
+
307
+ @Override
308
+ public int size()
309
+ {
310
+ return entries.size();
311
+ }
312
+
313
+ private byte[] readNext()
314
+ {
315
+ try {
316
+ stream.read(castBuffer.array());
317
+ int n = castBuffer.getInt(0);
318
+ byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
319
+ stream.read(b);
320
+
321
+ current++;
322
+
323
+ return b;
324
+ }
325
+ catch (IOException ex) {
326
+ throw Throwables.propagate(ex);
327
+ }
328
+ }
329
+
330
+ private String readNextString()
331
+ {
332
+ return new String(readNext(), StandardCharsets.UTF_8);
333
+ }
334
+ }
335
+ }
@@ -0,0 +1,195 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.api.client.http.HttpResponseException;
4
+ import com.google.api.services.storage.Storage;
5
+ import com.google.api.services.storage.model.Bucket;
6
+ import com.google.api.services.storage.model.Objects;
7
+ import com.google.api.services.storage.model.StorageObject;
8
+ import com.google.common.base.Charsets;
9
+ import com.google.common.base.Function;
10
+ import com.google.common.base.Optional;
11
+ import com.google.common.io.BaseEncoding;
12
+ import org.embulk.config.ConfigException;
13
+ import org.embulk.config.TaskReport;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.TransactionalFileInput;
16
+ import org.embulk.spi.unit.LocalFile;
17
+ import org.embulk.spi.util.InputStreamFileInput;
18
+ import org.slf4j.Logger;
19
+
20
+ import java.io.IOException;
21
+ import java.math.BigInteger;
22
+ import java.security.GeneralSecurityException;
23
+ import java.util.List;
24
+
25
+ public class GcsFileInput
26
+ extends InputStreamFileInput
27
+ implements TransactionalFileInput
28
+ {
29
+ private static final Logger log = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
30
+
31
+ public GcsFileInput(PluginTask task, int taskIndex)
32
+ {
33
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
34
+ }
35
+
36
+ public void abort()
37
+ {
38
+ }
39
+
40
+ public TaskReport commit()
41
+ {
42
+ return Exec.newTaskReport();
43
+ }
44
+
45
+ @Override
46
+ public void close()
47
+ {
48
+ }
49
+
50
+ public static GcsAuthentication newGcsAuth(PluginTask task)
51
+ {
52
+ try {
53
+ return new GcsAuthentication(
54
+ task.getAuthMethod().getString(),
55
+ task.getServiceAccountEmail(),
56
+ task.getP12Keyfile().transform(localFileToPathString()),
57
+ task.getJsonKeyfile().transform(localFileToPathString()),
58
+ task.getApplicationName()
59
+ );
60
+ }
61
+ catch (GeneralSecurityException | IOException ex) {
62
+ throw new ConfigException(ex);
63
+ }
64
+ }
65
+
66
+ protected static Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
67
+ {
68
+ Storage client = null;
69
+ try {
70
+ client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
71
+ }
72
+ catch (IOException ex) {
73
+ throw new ConfigException(ex);
74
+ }
75
+
76
+ return client;
77
+ }
78
+
79
+ private static Function<LocalFile, String> localFileToPathString()
80
+ {
81
+ return new Function<LocalFile, String>()
82
+ {
83
+ public String apply(LocalFile file)
84
+ {
85
+ return file.getPath().toString();
86
+ }
87
+ };
88
+ }
89
+
90
+ public static FileList listFiles(PluginTask task, Storage client)
91
+ {
92
+ String bucket = task.getBucket();
93
+
94
+ FileList.Builder builder = new FileList.Builder(task);
95
+ listGcsFilesByPrefix(builder, client, bucket, task.getPathPrefix().get(), task.getLastPath());
96
+ return builder.build();
97
+ }
98
+
99
+ /**
100
+ * Lists GCS filenames filtered by prefix.
101
+ *
102
+ * The resulting list does not include files whose size is 0.
103
+ */
104
+ public static void listGcsFilesByPrefix(FileList.Builder builder, Storage client, String bucket,
105
+ String prefix, Optional<String> lastPath)
106
+ {
107
+ String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
108
+
109
+ // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
110
+ if (log.isDebugEnabled()) {
111
+ try {
112
+ Storage.Buckets.Get getBucket = client.buckets().get(bucket);
113
+ getBucket.setProjection("full");
114
+ Bucket bk = getBucket.execute();
115
+
116
+ log.debug("bucket name: " + bucket);
117
+ log.debug("bucket location: " + bk.getLocation());
118
+ log.debug("bucket timeCreated: " + bk.getTimeCreated());
119
+ log.debug("bucket owner: " + bk.getOwner());
120
+ }
121
+ catch (IOException e) {
122
+ log.warn("Could not access to bucket:" + bucket);
123
+ log.warn(e.getMessage());
124
+ }
125
+ }
126
+
127
+ try {
128
+ // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
129
+ Storage.Objects.List listObjects = client.objects().list(bucket);
130
+ listObjects.setPrefix(prefix);
131
+ listObjects.setPageToken(lastKey);
132
+ do {
133
+ Objects objects = listObjects.execute();
134
+ List<StorageObject> items = objects.getItems();
135
+ if (items == null) {
136
+ log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
137
+ break;
138
+ }
139
+ for (StorageObject o : items) {
140
+ if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
141
+ builder.add(o.getName(), o.getSize().longValue());
142
+ }
143
+ log.debug("filename: " + o.getName());
144
+ log.debug("updated: " + o.getUpdated());
145
+ }
146
+ lastKey = objects.getNextPageToken();
147
+ listObjects.setPageToken(lastKey);
148
+ } while (lastKey != null);
149
+ }
150
+ catch (IOException e) {
151
+ if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
152
+ throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
153
+ }
154
+
155
+ log.warn(String.format("Could not get file list from bucket:%s", bucket));
156
+ log.warn(e.getMessage());
157
+ }
158
+ }
159
+
160
+ // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
161
+ private static String base64Encode(String path)
162
+ {
163
+ byte[] encoding;
164
+ byte[] utf8 = path.getBytes(Charsets.UTF_8);
165
+ log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
166
+
167
+ encoding = new byte[utf8.length + 2];
168
+ encoding[0] = 0x0a;
169
+ encoding[1] = new Byte(String.valueOf(path.length()));
170
+ System.arraycopy(utf8, 0, encoding, 2, utf8.length);
171
+
172
+ String s = BaseEncoding.base64().encode(encoding);
173
+ log.debug(String.format("last_path(base64 encoded): %s", s));
174
+ return s;
175
+ }
176
+
177
+ public enum AuthMethod
178
+ {
179
+ private_key("private_key"),
180
+ compute_engine("compute_engine"),
181
+ json_key("json_key");
182
+
183
+ private final String string;
184
+
185
+ AuthMethod(String string)
186
+ {
187
+ this.string = string;
188
+ }
189
+
190
+ public String getString()
191
+ {
192
+ return string;
193
+ }
194
+ }
195
+ }