embulk-input-gcs 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 01e9c70bf2d1f4c9a25784ab64828913c056078d
4
- data.tar.gz: 22d6b84e12d965045719b49026924f5247a6609a
3
+ metadata.gz: 04cb5b37d8fb8c70e1c9c4c306cf792ac1ad1ec9
4
+ data.tar.gz: 44ec9518fc188320a19ffbe2ce7db0b07cda30f3
5
5
  SHA512:
6
- metadata.gz: d090628aec5d9512976ceff29215f99e04eb939e4e927106bca4bee47fe95cef7e636fc74b239cde7222cc71f5b259b43905b36ad90d3c515f8b97f8b6c9c220
7
- data.tar.gz: 0ad764f0f9f85818b5924b3477bfe7d65806113ca56e689b45e919d3e8c0b156bfbb39cf60e9f349bb1005264655eaf0ad4446115f0254c554c5e425edac746a
6
+ metadata.gz: 3d2a59336002f07d48bcf8b5b22b3626e9c59df7aad93a4f91721667f6f7e1552bd1db72ebd1323e19ce1dc67f32da64b9ad88bc6d6785591904b8d87059249f
7
+ data.tar.gz: 1533eccae86b7355303ab91bc5ab08cf49c816438fef3541ad1318fc24f0eab948667132952746173a2dd3dabea41bc06cfc2493b8b4701c1908d2e07ff9b635
data/.travis.yml CHANGED
@@ -1,3 +1,4 @@
1
+ dist: precise
1
2
  language: java
2
3
 
3
4
  jdk:
@@ -21,3 +22,7 @@ env:
21
22
  script:
22
23
  - ./gradlew gem
23
24
  - ./gradlew --info check jacocoTestReport
25
+ addons:
26
+ hosts:
27
+ - example.com
28
+ hostname: example.com
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ ## 0.2.6 - 2018-03-05
2
+ * [maintenance] Support "path_match_pattern" option [#32](https://github.com/embulk/embulk-input-gcs/pull/32)
3
+
1
4
  ## 0.2.5 - 2017-05-19
2
5
  * [maintenance] Fix InputStream handling to prevent the plugin from getting fewer records than expected [#27](https://github.com/embulk/embulk-input-gcs/pull/27)
3
6
 
data/README.md CHANGED
@@ -40,6 +40,7 @@ embulk run /path/to/config.yml
40
40
  - **bucket** Google Cloud Storage bucket name (string, required)
41
41
  - **path_prefix** prefix of target keys (string, either of "path_prefix" or "paths" is required)
42
42
  - **paths** list of target keys (array of string, either of "path_prefix" or "paths" is required)
43
+ * **path_match_pattern**: regexp to match file paths. If a file path doesn't match with this pattern, the file will be skipped (regexp string, optional)
43
44
  - **incremental**: enables incremental loading (boolean, optional. default: true). If incremental loading is enabled, config diff for the next execution will include `last_path` parameter so that next execution skips files before the path. Otherwise, `last_path` will not be included.
44
45
  - **auth_method** (string, optional, "private_key", "json_key" or "compute_engine". default value is "private_key")
45
46
  - **service_account_email** Google Cloud Storage service_account_email (string, required when auth_method is private_key)
@@ -89,6 +90,21 @@ in:
89
90
  out: {type: stdout}
90
91
  ```
91
92
 
93
+ To skip files using regexp:
94
+
95
+ ```yaml
96
+ in:
97
+ type: gcs
98
+ bucket: my-gcs-bucket
99
+ path_prefix: logs/csv-
100
+ # ...
101
+ path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match with this pattern
102
+ ## some examples of regexp:
103
+ #path_match_pattern: /archive/ # match files in .../archive/... directory
104
+ #path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
105
+ #path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
106
+ ```
107
+
92
108
  ## Authentication
93
109
 
94
110
  There are three methods supported to fetch access token for the service account.
data/build.gradle CHANGED
@@ -17,7 +17,7 @@ configurations {
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
20
- version = "0.2.5"
20
+ version = "0.2.6"
21
21
 
22
22
  dependencies {
23
23
  compile "org.embulk:embulk-core:0.8.2"
Binary file
@@ -1,6 +1,6 @@
1
- #Wed Jan 13 12:41:02 JST 2016
1
+ #Sun Jan 08 00:35:58 PST 2017
2
2
  distributionBase=GRADLE_USER_HOME
3
3
  distributionPath=wrapper/dists
4
4
  zipStoreBase=GRADLE_USER_HOME
5
5
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-bin.zip
@@ -0,0 +1,335 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import com.google.common.base.Throwables;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.ConfigSource;
11
+
12
+ import java.io.BufferedInputStream;
13
+ import java.io.BufferedOutputStream;
14
+ import java.io.ByteArrayInputStream;
15
+ import java.io.ByteArrayOutputStream;
16
+ import java.io.IOException;
17
+ import java.io.InputStream;
18
+ import java.io.OutputStream;
19
+ import java.nio.ByteBuffer;
20
+ import java.nio.charset.StandardCharsets;
21
+ import java.util.AbstractList;
22
+ import java.util.ArrayList;
23
+ import java.util.List;
24
+ import java.util.regex.Pattern;
25
+ import java.util.zip.GZIPInputStream;
26
+ import java.util.zip.GZIPOutputStream;
27
+
28
+ public class FileList
29
+ {
30
+ public interface Task
31
+ {
32
+ @Config("path_match_pattern")
33
+ @ConfigDefault("\".*\"")
34
+ String getPathMatchPattern();
35
+
36
+ @Config("total_file_count_limit")
37
+ @ConfigDefault("2147483647")
38
+ int getTotalFileCountLimit();
39
+
40
+ // TODO support more algorithms to combine tasks
41
+ @Config("min_task_size")
42
+ @ConfigDefault("0")
43
+ long getMinTaskSize();
44
+ }
45
+
46
+ public static class Entry
47
+ {
48
+ private int index;
49
+ private long size;
50
+
51
+ @JsonCreator
52
+ public Entry(
53
+ @JsonProperty("index") int index,
54
+ @JsonProperty("size") long size)
55
+ {
56
+ this.index = index;
57
+ this.size = size;
58
+ }
59
+
60
+ @JsonProperty("index")
61
+ public int getIndex()
62
+ {
63
+ return index;
64
+ }
65
+
66
+ @JsonProperty("size")
67
+ public long getSize()
68
+ {
69
+ return size;
70
+ }
71
+ }
72
+
73
+ public static class Builder
74
+ {
75
+ private final ByteArrayOutputStream binary;
76
+ private final OutputStream stream;
77
+ private final List<Entry> entries = new ArrayList<>();
78
+ private String last = null;
79
+
80
+ private int limitCount = Integer.MAX_VALUE;
81
+ private long minTaskSize = 1;
82
+ private Pattern pathMatchPattern;
83
+
84
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
85
+
86
+ public Builder(Task task)
87
+ {
88
+ this();
89
+ this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
90
+ this.limitCount = task.getTotalFileCountLimit();
91
+ this.minTaskSize = task.getMinTaskSize();
92
+ }
93
+
94
+ public Builder(ConfigSource config)
95
+ {
96
+ this();
97
+ this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
98
+ this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
99
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
100
+ }
101
+
102
+ public Builder()
103
+ {
104
+ binary = new ByteArrayOutputStream();
105
+ try {
106
+ stream = new BufferedOutputStream(new GZIPOutputStream(binary));
107
+ }
108
+ catch (IOException ex) {
109
+ throw Throwables.propagate(ex);
110
+ }
111
+ }
112
+
113
+ public Builder limitTotalFileCount(int limitCount)
114
+ {
115
+ this.limitCount = limitCount;
116
+ return this;
117
+ }
118
+
119
+ public Builder minTaskSize(long bytes)
120
+ {
121
+ this.minTaskSize = bytes;
122
+ return this;
123
+ }
124
+
125
+ public Builder pathMatchPattern(String pattern)
126
+ {
127
+ this.pathMatchPattern = Pattern.compile(pattern);
128
+ return this;
129
+ }
130
+
131
+ public int size()
132
+ {
133
+ return entries.size();
134
+ }
135
+
136
+ public boolean needsMore()
137
+ {
138
+ return size() < limitCount;
139
+ }
140
+
141
+ // returns true if this file is used
142
+ public synchronized boolean add(String path, long size)
143
+ {
144
+ // TODO throw IllegalStateException if stream is already closed
145
+
146
+ if (!needsMore()) {
147
+ return false;
148
+ }
149
+
150
+ if (!pathMatchPattern.matcher(path).find()) {
151
+ return false;
152
+ }
153
+
154
+ int index = entries.size();
155
+ entries.add(new Entry(index, size));
156
+
157
+ byte[] data = path.getBytes(StandardCharsets.UTF_8);
158
+ castBuffer.putInt(0, data.length);
159
+ try {
160
+ stream.write(castBuffer.array());
161
+ stream.write(data);
162
+ }
163
+ catch (IOException ex) {
164
+ throw Throwables.propagate(ex);
165
+ }
166
+
167
+ last = path;
168
+ return true;
169
+ }
170
+
171
+ public FileList build()
172
+ {
173
+ try {
174
+ stream.close();
175
+ }
176
+ catch (IOException ex) {
177
+ throw Throwables.propagate(ex);
178
+ }
179
+ return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
180
+ }
181
+
182
+ private List<List<Entry>> getSplits(List<Entry> all)
183
+ {
184
+ List<List<Entry>> tasks = new ArrayList<>();
185
+ long currentTaskSize = 0;
186
+ List<Entry> currentTask = new ArrayList<>();
187
+ for (Entry entry : all) {
188
+ currentTask.add(entry);
189
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
190
+ if (currentTaskSize >= minTaskSize) {
191
+ tasks.add(currentTask);
192
+ currentTask = new ArrayList<>();
193
+ currentTaskSize = 0;
194
+ }
195
+ }
196
+ if (!currentTask.isEmpty()) {
197
+ tasks.add(currentTask);
198
+ }
199
+ return tasks;
200
+ }
201
+ }
202
+
203
+ private final byte[] data;
204
+ private final List<List<Entry>> tasks;
205
+ private final Optional<String> last;
206
+
207
+ @JsonCreator
208
+ @Deprecated
209
+ public FileList(
210
+ @JsonProperty("data") byte[] data,
211
+ @JsonProperty("tasks") List<List<Entry>> tasks,
212
+ @JsonProperty("last") Optional<String> last)
213
+ {
214
+ this.data = data;
215
+ this.tasks = tasks;
216
+ this.last = last;
217
+ }
218
+
219
+ @JsonIgnore
220
+ public Optional<String> getLastPath(Optional<String> lastLastPath)
221
+ {
222
+ if (last.isPresent()) {
223
+ return last;
224
+ }
225
+ return lastLastPath;
226
+ }
227
+
228
+ @JsonIgnore
229
+ public int getTaskCount()
230
+ {
231
+ return tasks.size();
232
+ }
233
+
234
+ @JsonIgnore
235
+ public List<String> get(int i)
236
+ {
237
+ return new EntryList(data, tasks.get(i));
238
+ }
239
+
240
+ @JsonProperty("data")
241
+ @Deprecated
242
+ public byte[] getData()
243
+ {
244
+ return data;
245
+ }
246
+
247
+ @JsonProperty("tasks")
248
+ @Deprecated
249
+ public List<List<Entry>> getTasks()
250
+ {
251
+ return tasks;
252
+ }
253
+
254
+ @JsonProperty("last")
255
+ @Deprecated
256
+ public Optional<String> getLast()
257
+ {
258
+ return last;
259
+ }
260
+
261
+ private class EntryList
262
+ extends AbstractList<String>
263
+ {
264
+ private final byte[] data;
265
+ private final List<Entry> entries;
266
+ private InputStream stream;
267
+ private int current;
268
+
269
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
270
+
271
+ public EntryList(byte[] data, List<Entry> entries)
272
+ {
273
+ this.data = data;
274
+ this.entries = entries;
275
+ try {
276
+ this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
277
+ }
278
+ catch (IOException ex) {
279
+ throw Throwables.propagate(ex);
280
+ }
281
+ this.current = 0;
282
+ }
283
+
284
+ @Override
285
+ public synchronized String get(int i)
286
+ {
287
+ Entry e = entries.get(i);
288
+ if (e.getIndex() < current) {
289
+ // rewind to the head
290
+ try {
291
+ stream.close();
292
+ stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
293
+ }
294
+ catch (IOException ex) {
295
+ throw Throwables.propagate(ex);
296
+ }
297
+ current = 0;
298
+ }
299
+
300
+ while (current < e.getIndex()) {
301
+ readNext();
302
+ }
303
+ // now current == e.getIndex()
304
+ return readNextString();
305
+ }
306
+
307
+ @Override
308
+ public int size()
309
+ {
310
+ return entries.size();
311
+ }
312
+
313
+ private byte[] readNext()
314
+ {
315
+ try {
316
+ stream.read(castBuffer.array());
317
+ int n = castBuffer.getInt(0);
318
+ byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
319
+ stream.read(b);
320
+
321
+ current++;
322
+
323
+ return b;
324
+ }
325
+ catch (IOException ex) {
326
+ throw Throwables.propagate(ex);
327
+ }
328
+ }
329
+
330
+ private String readNextString()
331
+ {
332
+ return new String(readNext(), StandardCharsets.UTF_8);
333
+ }
334
+ }
335
+ }
@@ -0,0 +1,195 @@
1
+ package org.embulk.input.gcs;
2
+
3
+ import com.google.api.client.http.HttpResponseException;
4
+ import com.google.api.services.storage.Storage;
5
+ import com.google.api.services.storage.model.Bucket;
6
+ import com.google.api.services.storage.model.Objects;
7
+ import com.google.api.services.storage.model.StorageObject;
8
+ import com.google.common.base.Charsets;
9
+ import com.google.common.base.Function;
10
+ import com.google.common.base.Optional;
11
+ import com.google.common.io.BaseEncoding;
12
+ import org.embulk.config.ConfigException;
13
+ import org.embulk.config.TaskReport;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.TransactionalFileInput;
16
+ import org.embulk.spi.unit.LocalFile;
17
+ import org.embulk.spi.util.InputStreamFileInput;
18
+ import org.slf4j.Logger;
19
+
20
+ import java.io.IOException;
21
+ import java.math.BigInteger;
22
+ import java.security.GeneralSecurityException;
23
+ import java.util.List;
24
+
25
+ public class GcsFileInput
26
+ extends InputStreamFileInput
27
+ implements TransactionalFileInput
28
+ {
29
+ private static final Logger log = Exec.getLogger(org.embulk.input.gcs.GcsFileInput.class);
30
+
31
+ public GcsFileInput(PluginTask task, int taskIndex)
32
+ {
33
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
34
+ }
35
+
36
+ public void abort()
37
+ {
38
+ }
39
+
40
+ public TaskReport commit()
41
+ {
42
+ return Exec.newTaskReport();
43
+ }
44
+
45
+ @Override
46
+ public void close()
47
+ {
48
+ }
49
+
50
+ public static GcsAuthentication newGcsAuth(PluginTask task)
51
+ {
52
+ try {
53
+ return new GcsAuthentication(
54
+ task.getAuthMethod().getString(),
55
+ task.getServiceAccountEmail(),
56
+ task.getP12Keyfile().transform(localFileToPathString()),
57
+ task.getJsonKeyfile().transform(localFileToPathString()),
58
+ task.getApplicationName()
59
+ );
60
+ }
61
+ catch (GeneralSecurityException | IOException ex) {
62
+ throw new ConfigException(ex);
63
+ }
64
+ }
65
+
66
+ protected static Storage newGcsClient(final PluginTask task, final GcsAuthentication auth)
67
+ {
68
+ Storage client = null;
69
+ try {
70
+ client = auth.getGcsClient(task.getBucket(), task.getMaxConnectionRetry());
71
+ }
72
+ catch (IOException ex) {
73
+ throw new ConfigException(ex);
74
+ }
75
+
76
+ return client;
77
+ }
78
+
79
+ private static Function<LocalFile, String> localFileToPathString()
80
+ {
81
+ return new Function<LocalFile, String>()
82
+ {
83
+ public String apply(LocalFile file)
84
+ {
85
+ return file.getPath().toString();
86
+ }
87
+ };
88
+ }
89
+
90
+ public static FileList listFiles(PluginTask task, Storage client)
91
+ {
92
+ String bucket = task.getBucket();
93
+
94
+ FileList.Builder builder = new FileList.Builder(task);
95
+ listGcsFilesByPrefix(builder, client, bucket, task.getPathPrefix().get(), task.getLastPath());
96
+ return builder.build();
97
+ }
98
+
99
+ /**
100
+ * Lists GCS filenames filtered by prefix.
101
+ *
102
+ * The resulting list does not include the file that's size == 0.
103
+ */
104
+ public static void listGcsFilesByPrefix(FileList.Builder builder, Storage client, String bucket,
105
+ String prefix, Optional<String> lastPath)
106
+ {
107
+ String lastKey = lastPath.isPresent() ? base64Encode(lastPath.get()) : null;
108
+
109
+ // @see https://cloud.google.com/storage/docs/json_api/v1/objects#resource
110
+ if (log.isDebugEnabled()) {
111
+ try {
112
+ Storage.Buckets.Get getBucket = client.buckets().get(bucket);
113
+ getBucket.setProjection("full");
114
+ Bucket bk = getBucket.execute();
115
+
116
+ log.debug("bucket name: " + bucket);
117
+ log.debug("bucket location: " + bk.getLocation());
118
+ log.debug("bucket timeCreated: " + bk.getTimeCreated());
119
+ log.debug("bucket owner: " + bk.getOwner());
120
+ }
121
+ catch (IOException e) {
122
+ log.warn("Could not access to bucket:" + bucket);
123
+ log.warn(e.getMessage());
124
+ }
125
+ }
126
+
127
+ try {
128
+ // @see https://cloud.google.com/storage/docs/json_api/v1/objects/list
129
+ Storage.Objects.List listObjects = client.objects().list(bucket);
130
+ listObjects.setPrefix(prefix);
131
+ listObjects.setPageToken(lastKey);
132
+ do {
133
+ Objects objects = listObjects.execute();
134
+ List<StorageObject> items = objects.getItems();
135
+ if (items == null) {
136
+ log.info(String.format("No file was found in bucket:%s prefix:%s", bucket, prefix));
137
+ break;
138
+ }
139
+ for (StorageObject o : items) {
140
+ if (o.getSize().compareTo(BigInteger.ZERO) > 0) {
141
+ builder.add(o.getName(), o.getSize().longValue());
142
+ }
143
+ log.debug("filename: " + o.getName());
144
+ log.debug("updated: " + o.getUpdated());
145
+ }
146
+ lastKey = objects.getNextPageToken();
147
+ listObjects.setPageToken(lastKey);
148
+ } while (lastKey != null);
149
+ }
150
+ catch (IOException e) {
151
+ if ((e instanceof HttpResponseException) && ((HttpResponseException) e).getStatusCode() == 400) {
152
+ throw new ConfigException(String.format("Files listing failed: bucket:%s, prefix:%s, last_path:%s", bucket, prefix, lastKey), e);
153
+ }
154
+
155
+ log.warn(String.format("Could not get file list from bucket:%s", bucket));
156
+ log.warn(e.getMessage());
157
+ }
158
+ }
159
+
160
+ // String nextToken = base64Encode(0x0a + 0x01~0x27 + filePath);
161
+ private static String base64Encode(String path)
162
+ {
163
+ byte[] encoding;
164
+ byte[] utf8 = path.getBytes(Charsets.UTF_8);
165
+ log.debug(String.format("path string: %s ,path length:%s \" + ", path, utf8.length));
166
+
167
+ encoding = new byte[utf8.length + 2];
168
+ encoding[0] = 0x0a;
169
+ encoding[1] = new Byte(String.valueOf(path.length()));
170
+ System.arraycopy(utf8, 0, encoding, 2, utf8.length);
171
+
172
+ String s = BaseEncoding.base64().encode(encoding);
173
+ log.debug(String.format("last_path(base64 encoded): %s", s));
174
+ return s;
175
+ }
176
+
177
+ public enum AuthMethod
178
+ {
179
+ private_key("private_key"),
180
+ compute_engine("compute_engine"),
181
+ json_key("json_key");
182
+
183
+ private final String string;
184
+
185
+ AuthMethod(String string)
186
+ {
187
+ this.string = string;
188
+ }
189
+
190
+ public String getString()
191
+ {
192
+ return string;
193
+ }
194
+ }
195
+ }