embulk-input-azure_blob_storage 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a85c5e311d426a1f67c47b4947683827a3bb2140
4
- data.tar.gz: 2ae6c9a16e143e4731e4de7518fa504d2cb2f8c1
3
+ metadata.gz: 49b71df05579f8c392ec70089aaeca95f1b81144
4
+ data.tar.gz: a63ffb1663b9bbed5827211dbb62eeb04b20a987
5
5
  SHA512:
6
- metadata.gz: 68588df49d434087541ea542a4731236d5496356df5b66bdb81f620803c305f76c33b39ff7340d176378f80c0f41e0275efedc95b180198828776218a3148f55
7
- data.tar.gz: 7cc8c4758c67360d7cbbd5cf4ca12bfe8f8b975641fcc31fa2b3308e4ba3c8093e7aa0edf577e078d9f3441e8636bae700dc07fbb16c0b181ec71742b3ae963c
6
+ metadata.gz: 1a43a45c850746c1a50b9f12be823f0081ea9deddd4403e831f2533e6af4ab4861405e45cc62824fcb637b6509178fceeb6cad4f583542677e0ee3383beb8e43
7
+ data.tar.gz: d5e6066c047f60034d50ff6084919157476f962adf5c9f2b1a59685a97d83a254637ed0037276629714e2b1223c229c0564b58c162cfffbb58b10a4822f563aa
@@ -1,6 +1,11 @@
1
+ ## 0.1.4 - 2016-03-22
2
+
3
+ * [new feature] Support `last_path` option [#7](https://github.com/sakama/embulk-input-azure_blob_storage/pull/7)
4
+ * [new feature] Support `path_match_pattern` option [#6](https://github.com/sakama/embulk-input-azure_blob_storage/pull/6)
5
+
1
6
  ## 0.1.3 - 2016-03-16
2
7
 
3
- * [maintenance] Add unit test[#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
8
+ * [maintenance] Add unit test [#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
4
9
  * [maintenance] Add retry logic [#3](https://github.com/sakama/embulk-input-azure_blob_storage/pull/3)
5
10
 
6
11
  ## 0.1.2 - 2015-10-11
data/README.md CHANGED
@@ -16,6 +16,8 @@ First, create Azure [Storage Account](https://azure.microsoft.com/en-us/document
16
16
  - **account_key**: primary access key (string, required)
17
17
  - **container**: container name data stored (string, required)
18
18
  - **path_prefix**: prefix of target keys (string, required)
19
+ - **path_match_pattern**: regexp to match file paths. If a file path doesn't match with this pattern, the file will be skipped (regexp string, optional)
20
+ - **total_file_count_limit**: maximum number of files to read (integer, optional)
19
21
 
20
22
  ## Example
21
23
 
@@ -55,6 +57,21 @@ in:
55
57
  out: {type: stdout}
56
58
  ```
57
59
 
60
+ To filter files using regexp:
61
+
62
+ ```yaml
63
+ in:
64
+ type: azure_blob_storage
65
+ path_prefix: logs/csv-
66
+ ...
67
+ path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match with this pattern
68
+
69
+ ## some examples of regexp:
70
+ #path_match_pattern: /archive/ # match files in .../archive/... directory
71
+ #path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
72
+ #path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
73
+ ```
74
+
58
75
  ## Build
59
76
 
60
77
  ```
@@ -17,7 +17,7 @@ configurations {
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
20
- version = "0.1.3"
20
+ version = "0.1.4"
21
21
 
22
22
  dependencies {
23
23
  compile "org.embulk:embulk-core:0.8.2"
@@ -1,10 +1,12 @@
1
1
  package org.embulk.input.azure_blob_storage;
2
2
 
3
+ import com.google.common.base.Charsets;
3
4
  import com.google.common.base.Optional;
4
5
  import com.google.common.base.Throwables;
5
- import com.google.common.collect.ImmutableList;
6
+ import com.google.common.io.BaseEncoding;
6
7
  import com.microsoft.azure.storage.CloudStorageAccount;
7
8
  import com.microsoft.azure.storage.ResultContinuation;
9
+ import com.microsoft.azure.storage.ResultContinuationType;
8
10
  import com.microsoft.azure.storage.ResultSegment;
9
11
  import com.microsoft.azure.storage.StorageException;
10
12
  import com.microsoft.azure.storage.blob.CloudBlob;
@@ -31,15 +33,14 @@ import java.io.IOException;
31
33
  import java.io.InputStream;
32
34
  import java.net.URISyntaxException;
33
35
  import java.security.InvalidKeyException;
34
- import java.util.ArrayList;
35
- import java.util.Collections;
36
+ import java.util.Iterator;
36
37
  import java.util.List;
37
38
 
38
39
  public class AzureBlobStorageFileInputPlugin
39
40
  implements FileInputPlugin
40
41
  {
41
42
  public interface PluginTask
42
- extends Task
43
+ extends Task, FileList.Task
43
44
  {
44
45
  @Config("account_name")
45
46
  String getAccountName();
@@ -65,9 +66,8 @@ public class AzureBlobStorageFileInputPlugin
65
66
  @ConfigDefault("5") // 5 times retry to connect sftp server if failed.
66
67
  int getMaxConnectionRetry();
67
68
 
68
- List<String> getFiles();
69
-
70
- void setFiles(List<String> files);
69
+ FileList getFiles();
70
+ void setFiles(FileList files);
71
71
 
72
72
  @ConfigInject
73
73
  BufferAllocator getBufferAllocator();
@@ -83,28 +83,18 @@ public class AzureBlobStorageFileInputPlugin
83
83
  CloudBlobClient blobClient = newAzureClient(task.getAccountName(), task.getAccountKey());
84
84
  task.setFiles(listFiles(blobClient, task));
85
85
 
86
- return resume(task.dump(), task.getFiles().size(), control);
86
+ return resume(task.dump(), task.getFiles().getTaskCount(), control);
87
87
  }
88
88
 
89
89
  @Override
90
90
  public ConfigDiff resume(TaskSource taskSource, int taskCount, FileInputPlugin.Control control)
91
91
  {
92
92
  PluginTask task = taskSource.loadTask(PluginTask.class);
93
-
94
93
  control.run(taskSource, taskCount);
95
94
 
96
95
  ConfigDiff configDiff = Exec.newConfigDiff();
96
+ configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
97
97
 
98
- List<String> files = new ArrayList<>(task.getFiles());
99
- if (files.isEmpty()) {
100
- if (task.getLastPath().isPresent()) {
101
- configDiff.set("last_path", task.getLastPath().get());
102
- }
103
- }
104
- else {
105
- Collections.sort(files);
106
- configDiff.set("last_path", files.get(files.size() - 1));
107
- }
108
98
  return configDiff;
109
99
  }
110
100
 
@@ -129,22 +119,28 @@ public class AzureBlobStorageFileInputPlugin
129
119
  return account.createCloudBlobClient();
130
120
  }
131
121
 
132
- private List<String> listFiles(CloudBlobClient client, PluginTask task)
122
+ private FileList listFiles(CloudBlobClient client, PluginTask task)
133
123
  {
134
124
  if (task.getPathPrefix().equals("/")) {
135
125
  log.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
136
126
  }
127
+ FileList.Builder builder = new FileList.Builder(task);
137
128
 
138
- return listFilesWithPrefix(client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
129
+ return listFilesWithPrefix(builder, client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
139
130
  }
140
131
 
141
- private static List<String> listFilesWithPrefix(CloudBlobClient client, String containerName,
132
+ private static FileList listFilesWithPrefix(FileList.Builder builder, CloudBlobClient client, String containerName,
142
133
  String prefix, Optional<String> lastPath, int maxResults)
143
134
  {
144
- ImmutableList.Builder<String> builder = ImmutableList.builder();
145
- // It seems I can't cast lastKey<String> to token<ResultContinuation> by Azure SDK for Java
146
- String lastKey = lastPath.orNull();
135
+ String lastKey = (lastPath.isPresent() && !lastPath.get().isEmpty()) ? createNextToken(lastPath.get()) : null;
147
136
  ResultContinuation token = null;
137
+ if (lastKey != null) {
138
+ token = new ResultContinuation();
139
+ token.setContinuationType(ResultContinuationType.BLOB);
140
+ log.debug("lastPath: {}", lastPath.get());
141
+ log.debug("lastPath(Base64encoded): {}", lastKey);
142
+ token.setNextMarker(lastKey);
143
+ }
148
144
 
149
145
  try {
150
146
  CloudBlobContainer container = client.getContainerReference(containerName);
@@ -156,7 +152,7 @@ public class AzureBlobStorageFileInputPlugin
156
152
  if (blobItem instanceof CloudBlob) {
157
153
  CloudBlob blob = (CloudBlob) blobItem;
158
154
  if (blob.exists() && !blob.getUri().toString().endsWith("/")) {
159
- builder.add(blob.getName());
155
+ builder.add(blob.getName(), blob.getProperties().getLength());
160
156
  log.debug(String.format("name:%s, class:%s, uri:%s", blob.getName(), blob.getClass(), blob.getUri()));
161
157
  }
162
158
  }
@@ -201,7 +197,7 @@ public class AzureBlobStorageFileInputPlugin
201
197
  {
202
198
  private CloudBlobClient client;
203
199
  private final String containerName;
204
- private final String key;
200
+ private final Iterator<String> iterator;
205
201
  private final int maxConnectionRetry;
206
202
  private boolean opened = false;
207
203
 
@@ -209,14 +205,14 @@ public class AzureBlobStorageFileInputPlugin
209
205
  {
210
206
  this.client = newAzureClient(task.getAccountName(), task.getAccountKey());
211
207
  this.containerName = task.getContainer();
212
- this.key = task.getFiles().get(taskIndex);
208
+ this.iterator = task.getFiles().get(taskIndex).iterator();
213
209
  this.maxConnectionRetry = task.getMaxConnectionRetry();
214
210
  }
215
211
 
216
212
  @Override
217
213
  public InputStream openNext() throws IOException
218
214
  {
219
- if (opened) {
215
+ if (opened || !iterator.hasNext()) {
220
216
  return null;
221
217
  }
222
218
  opened = true;
@@ -225,7 +221,7 @@ public class AzureBlobStorageFileInputPlugin
225
221
  while (true) {
226
222
  try {
227
223
  CloudBlobContainer container = client.getContainerReference(containerName);
228
- CloudBlob blob = container.getBlockBlobReference(key);
224
+ CloudBlob blob = container.getBlockBlobReference(iterator.next());
229
225
  return blob.openInputStream();
230
226
  }
231
227
  catch (StorageException | URISyntaxException ex) {
@@ -250,4 +246,23 @@ public class AzureBlobStorageFileInputPlugin
250
246
  @Override
251
247
  public void close() {}
252
248
  }
249
+
250
+ private static String createNextToken(String path)
251
+ {
252
+ StringBuilder sb = new StringBuilder()
253
+ .append(String.format("%06d", path.length()))
254
+ .append("!")
255
+ .append(path)
256
+ .append("!000028!9999-12-31T23:59:59.9999999Z!");
257
+
258
+ String encodedString = BaseEncoding.base64().encode(sb.toString().getBytes(Charsets.UTF_8));
259
+
260
+ StringBuilder marker = new StringBuilder()
261
+ .append("2")
262
+ .append("!")
263
+ .append(encodedString.length())
264
+ .append("!")
265
+ .append(encodedString);
266
+ return marker.toString();
267
+ }
253
268
  }
@@ -0,0 +1,341 @@
1
+ package org.embulk.input.azure_blob_storage;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import com.google.common.base.Throwables;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.ConfigSource;
11
+ import org.embulk.spi.Exec;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.io.BufferedInputStream;
15
+ import java.io.BufferedOutputStream;
16
+ import java.io.ByteArrayInputStream;
17
+ import java.io.ByteArrayOutputStream;
18
+
19
+ import java.io.IOException;
20
+ import java.io.InputStream;
21
+ import java.io.OutputStream;
22
+ import java.nio.ByteBuffer;
23
+ import java.nio.charset.StandardCharsets;
24
+ import java.util.AbstractList;
25
+ import java.util.ArrayList;
26
+ import java.util.List;
27
+ import java.util.regex.Pattern;
28
+ import java.util.zip.GZIPInputStream;
29
+ import java.util.zip.GZIPOutputStream;
30
+
31
+ // this class should be moved to embulk-core
32
+ public class FileList
33
+ {
34
+ public interface Task
35
+ {
36
+ @Config("path_match_pattern")
37
+ @ConfigDefault("\".*\"")
38
+ String getPathMatchPattern();
39
+
40
+ @Config("total_file_count_limit")
41
+ @ConfigDefault("2147483647")
42
+ int getTotalFileCountLimit();
43
+
44
+ // TODO support more algorithms to combine tasks
45
+ @Config("min_task_size")
46
+ @ConfigDefault("0")
47
+ long getMinTaskSize();
48
+ }
49
+
50
+ public static class Entry
51
+ {
52
+ private int index;
53
+ private long size;
54
+
55
+ @JsonCreator
56
+ public Entry(
57
+ @JsonProperty("index") int index,
58
+ @JsonProperty("size") long size)
59
+ {
60
+ this.index = index;
61
+ this.size = size;
62
+ }
63
+
64
+ @JsonProperty("index")
65
+ public int getIndex()
66
+ {
67
+ return index;
68
+ }
69
+
70
+ @JsonProperty("size")
71
+ public long getSize()
72
+ {
73
+ return size;
74
+ }
75
+ }
76
+
77
+ public static class Builder
78
+ {
79
+ private final Logger log = Exec.getLogger(FileList.class);
80
+ private final ByteArrayOutputStream binary;
81
+ private final OutputStream stream;
82
+ private final List<Entry> entries = new ArrayList<>();
83
+ private String last = null;
84
+
85
+ private int limitCount = Integer.MAX_VALUE;
86
+ private long minTaskSize = 1;
87
+ private Pattern pathMatchPattern;
88
+
89
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
90
+
91
+ public Builder(Task task)
92
+ {
93
+ this();
94
+ this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
95
+ this.limitCount = task.getTotalFileCountLimit();
96
+ this.minTaskSize = task.getMinTaskSize();
97
+ }
98
+
99
+ public Builder(ConfigSource config)
100
+ {
101
+ this();
102
+ this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
103
+ this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
104
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
105
+ }
106
+
107
+ public Builder()
108
+ {
109
+ binary = new ByteArrayOutputStream();
110
+ try {
111
+ stream = new BufferedOutputStream(new GZIPOutputStream(binary));
112
+ }
113
+ catch (IOException ex) {
114
+ throw Throwables.propagate(ex);
115
+ }
116
+ }
117
+
118
+ public Builder limitTotalFileCount(int limitCount)
119
+ {
120
+ this.limitCount = limitCount;
121
+ return this;
122
+ }
123
+
124
+ public Builder minTaskSize(long bytes)
125
+ {
126
+ this.minTaskSize = bytes;
127
+ return this;
128
+ }
129
+
130
+ public Builder pathMatchPattern(String pattern)
131
+ {
132
+ this.pathMatchPattern = Pattern.compile(pattern);
133
+ return this;
134
+ }
135
+
136
+ public int size()
137
+ {
138
+ return entries.size();
139
+ }
140
+
141
+ public boolean needsMore()
142
+ {
143
+ return size() < limitCount;
144
+ }
145
+
146
+ // returns true if this file is used
147
+ public synchronized boolean add(String path, long size)
148
+ {
149
+ // TODO throw IllegalStateException if stream is already closed
150
+
151
+ if (!needsMore()) {
152
+ return false;
153
+ }
154
+
155
+ if (!pathMatchPattern.matcher(path).find()) {
156
+ return false;
157
+ }
158
+
159
+ int index = entries.size();
160
+ entries.add(new Entry(index, size));
161
+ log.info("add file to the request list: {}", path);
162
+
163
+ byte[] data = path.getBytes(StandardCharsets.UTF_8);
164
+ castBuffer.putInt(0, data.length);
165
+ try {
166
+ stream.write(castBuffer.array());
167
+ stream.write(data);
168
+ }
169
+ catch (IOException ex) {
170
+ throw Throwables.propagate(ex);
171
+ }
172
+
173
+ last = path;
174
+ return true;
175
+ }
176
+
177
+ public FileList build()
178
+ {
179
+ try {
180
+ stream.close();
181
+ }
182
+ catch (IOException ex) {
183
+ throw Throwables.propagate(ex);
184
+ }
185
+ return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
186
+ }
187
+
188
+ private List<List<Entry>> getSplits(List<Entry> all)
189
+ {
190
+ List<List<Entry>> tasks = new ArrayList<>();
191
+ long currentTaskSize = 0;
192
+ List<Entry> currentTask = new ArrayList<>();
193
+ for (Entry entry : all) {
194
+ currentTask.add(entry);
195
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
196
+ if (currentTaskSize >= minTaskSize) {
197
+ tasks.add(currentTask);
198
+ currentTask = new ArrayList<>();
199
+ currentTaskSize = 0;
200
+ }
201
+ }
202
+ if (!currentTask.isEmpty()) {
203
+ tasks.add(currentTask);
204
+ }
205
+ return tasks;
206
+ }
207
+ }
208
+
209
+ private final byte[] data;
210
+ private final List<List<Entry>> tasks;
211
+ private final Optional<String> last;
212
+
213
+ @JsonCreator
214
+ @Deprecated
215
+ public FileList(
216
+ @JsonProperty("data") byte[] data,
217
+ @JsonProperty("tasks") List<List<Entry>> tasks,
218
+ @JsonProperty("last") Optional<String> last)
219
+ {
220
+ this.data = data;
221
+ this.tasks = tasks;
222
+ this.last = last;
223
+ }
224
+
225
+ @JsonIgnore
226
+ public Optional<String> getLastPath(Optional<String> lastLastPath)
227
+ {
228
+ if (last.isPresent()) {
229
+ return last;
230
+ }
231
+ return lastLastPath;
232
+ }
233
+
234
+ @JsonIgnore
235
+ public int getTaskCount()
236
+ {
237
+ return tasks.size();
238
+ }
239
+
240
+ @JsonIgnore
241
+ public List<String> get(int i)
242
+ {
243
+ return new EntryList(data, tasks.get(i));
244
+ }
245
+
246
+ @JsonProperty("data")
247
+ @Deprecated
248
+ public byte[] getData()
249
+ {
250
+ return data;
251
+ }
252
+
253
+ @JsonProperty("tasks")
254
+ @Deprecated
255
+ public List<List<Entry>> getTasks()
256
+ {
257
+ return tasks;
258
+ }
259
+
260
+ @JsonProperty("last")
261
+ @Deprecated
262
+ public Optional<String> getLast()
263
+ {
264
+ return last;
265
+ }
266
+
267
+ private class EntryList
268
+ extends AbstractList<String>
269
+ {
270
+ private final byte[] data;
271
+ private final List<Entry> entries;
272
+ private InputStream stream;
273
+ private int current;
274
+
275
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
276
+
277
+ public EntryList(byte[] data, List<Entry> entries)
278
+ {
279
+ this.data = data;
280
+ this.entries = entries;
281
+ try {
282
+ this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
283
+ }
284
+ catch (IOException ex) {
285
+ throw Throwables.propagate(ex);
286
+ }
287
+ this.current = 0;
288
+ }
289
+
290
+ @Override
291
+ public synchronized String get(int i)
292
+ {
293
+ Entry e = entries.get(i);
294
+ if (e.getIndex() < current) {
295
+ // rewind to the head
296
+ try {
297
+ stream.close();
298
+ stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
299
+ }
300
+ catch (IOException ex) {
301
+ throw Throwables.propagate(ex);
302
+ }
303
+ current = 0;
304
+ }
305
+
306
+ while (current < e.getIndex()) {
307
+ readNext();
308
+ }
309
+ // now current == e.getIndex()
310
+ return readNextString();
311
+ }
312
+
313
+ @Override
314
+ public int size()
315
+ {
316
+ return entries.size();
317
+ }
318
+
319
+ private byte[] readNext()
320
+ {
321
+ try {
322
+ stream.read(castBuffer.array());
323
+ int n = castBuffer.getInt(0);
324
+ byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
325
+ stream.read(b);
326
+
327
+ current++;
328
+
329
+ return b;
330
+ }
331
+ catch (IOException ex) {
332
+ throw Throwables.propagate(ex);
333
+ }
334
+ }
335
+
336
+ private String readNextString()
337
+ {
338
+ return new String(readNext(), StandardCharsets.UTF_8);
339
+ }
340
+ }
341
+ }
@@ -93,18 +93,6 @@ public class TestAzureBlobStorageFileInputPlugin
93
93
  assertEquals(5, task.getMaxConnectionRetry());
94
94
  }
95
95
 
96
- public ConfigSource config()
97
- {
98
- return Exec.newConfigSource()
99
- .set("account_name", AZURE_ACCOUNT_NAME)
100
- .set("account_key", AZURE_ACCOUNT_KEY)
101
- .set("container", AZURE_CONTAINER)
102
- .set("path_prefix", AZURE_PATH_PREFIX)
103
- .set("last_path", "")
104
- .set("file_ext", ".csv")
105
- .set("parser", parserConfig(schemaConfig()));
106
- }
107
-
108
96
  @Test(expected = ConfigException.class)
109
97
  public void checkDefaultValuesAccountNameIsNull()
110
98
  {
@@ -114,7 +102,6 @@ public class TestAzureBlobStorageFileInputPlugin
114
102
  .set("container", AZURE_CONTAINER)
115
103
  .set("path_prefix", AZURE_PATH_PREFIX)
116
104
  .set("last_path", "")
117
- .set("file_ext", ".csv")
118
105
  .set("parser", parserConfig(schemaConfig()));
119
106
 
120
107
  runner.transaction(config, new Control());
@@ -129,7 +116,6 @@ public class TestAzureBlobStorageFileInputPlugin
129
116
  .set("container", AZURE_CONTAINER)
130
117
  .set("path_prefix", AZURE_PATH_PREFIX)
131
118
  .set("last_path", "")
132
- .set("file_ext", ".csv")
133
119
  .set("parser", parserConfig(schemaConfig()));
134
120
 
135
121
  runner.transaction(config, new Control());
@@ -144,7 +130,6 @@ public class TestAzureBlobStorageFileInputPlugin
144
130
  .set("container", null)
145
131
  .set("path_prefix", AZURE_PATH_PREFIX)
146
132
  .set("last_path", "")
147
- .set("file_ext", ".csv")
148
133
  .set("parser", parserConfig(schemaConfig()));
149
134
 
150
135
  runner.transaction(config, new Control());
@@ -166,7 +151,7 @@ public class TestAzureBlobStorageFileInputPlugin
166
151
  public void testResume()
167
152
  {
168
153
  PluginTask task = config.loadConfig(PluginTask.class);
169
- task.setFiles(Arrays.asList("in/aa/a"));
154
+ task.setFiles(createFileList(Arrays.asList("in/aa/a"), task));
170
155
  ConfigDiff configDiff = plugin.resume(task.dump(), 0, new FileInputPlugin.Control()
171
156
  {
172
157
  @Override
@@ -190,8 +175,8 @@ public class TestAzureBlobStorageFileInputPlugin
190
175
  throws NoSuchMethodException, IllegalAccessException, InvocationTargetException
191
176
  {
192
177
  List<String> expected = Arrays.asList(
193
- AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_01.csv",
194
- AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv"
178
+ AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_01.csv",
179
+ AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv"
195
180
  );
196
181
 
197
182
  PluginTask task = config.loadConfig(PluginTask.class);
@@ -210,8 +195,9 @@ public class TestAzureBlobStorageFileInputPlugin
210
195
 
211
196
  Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
212
197
  listFiles.setAccessible(true);
213
- List<String> actual = (List<String>) listFiles.invoke(plugin, client, task);
214
- assertEquals(expected, actual);
198
+ FileList actual = (FileList) listFiles.invoke(plugin, client, task);
199
+ assertEquals(expected.get(0), actual.get(0).get(0));
200
+ assertEquals(expected.get(1), actual.get(1).get(0));
215
201
  assertEquals(AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv", configDiff.get(String.class, "last_path"));
216
202
  }
217
203
 
@@ -228,11 +214,26 @@ public class TestAzureBlobStorageFileInputPlugin
228
214
 
229
215
  Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
230
216
  listFiles.setAccessible(true);
231
- task.setFiles((List<String>) listFiles.invoke(plugin, client, task));
217
+ task.setFiles((FileList) listFiles.invoke(plugin, client, task));
232
218
 
233
219
  assertRecords(config, output);
234
220
  }
235
221
 
222
+ @Test
223
+ public void testCreateNextToken() throws Exception
224
+ {
225
+ Method base64Encode = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("createNextToken", String.class);
226
+ base64Encode.setAccessible(true);
227
+
228
+ String expected = "2!92!MDAwMDI1IXJlYWRvbmx5L3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
229
+ String lastPath = "readonly/sample_01.tsv.gz";
230
+ assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
231
+
232
+ expected = "2!120!MDAwMDQ2IXBhdGgvdGhhdC9oYXZlL2xvbmcvcGF0aC9uYW1lL3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
233
+ lastPath = "path/that/have/long/path/name/sample_01.tsv.gz";
234
+ assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
235
+ }
236
+
236
237
  static List<TaskReport> emptyTaskReports(int taskCount)
237
238
  {
238
239
  ImmutableList.Builder<TaskReport> reports = new ImmutableList.Builder<>();
@@ -256,6 +257,17 @@ public class TestAzureBlobStorageFileInputPlugin
256
257
  }
257
258
  }
258
259
 
260
+ public ConfigSource config()
261
+ {
262
+ return Exec.newConfigSource()
263
+ .set("account_name", AZURE_ACCOUNT_NAME)
264
+ .set("account_key", AZURE_ACCOUNT_KEY)
265
+ .set("container", AZURE_CONTAINER)
266
+ .set("path_prefix", AZURE_PATH_PREFIX)
267
+ .set("last_path", "")
268
+ .set("parser", parserConfig(schemaConfig()));
269
+ }
270
+
259
271
  private ImmutableMap<String, Object> parserConfig(ImmutableList<Object> schemaConfig)
260
272
  {
261
273
  ImmutableMap.Builder<String, Object> builder = new ImmutableMap.Builder<>();
@@ -340,4 +352,13 @@ public class TestAzureBlobStorageFileInputPlugin
340
352
  }
341
353
  return dir;
342
354
  }
355
+
356
+ private FileList createFileList(List<String> fileList, PluginTask task)
357
+ {
358
+ FileList.Builder builder = new FileList.Builder(task);
359
+ for (String file : fileList) {
360
+ builder.add(file, 0);
361
+ }
362
+ return builder.build();
363
+ }
343
364
  }
@@ -0,0 +1,87 @@
1
+ package org.embulk.input.azure_blob_storage;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.config.ConfigSource;
5
+ import org.junit.Before;
6
+ import org.junit.Rule;
7
+ import org.junit.Test;
8
+
9
+ import static org.junit.Assert.assertEquals;
10
+
11
+ public class TestFileList
12
+ {
13
+ @Rule
14
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
15
+
16
+ private ConfigSource config;
17
+
18
+ @Before
19
+ public void createConfigSource()
20
+ {
21
+ config = runtime.getExec().newConfigSource();
22
+ }
23
+
24
+ @Test
25
+ public void checkMinTaskSize()
26
+ throws Exception
27
+ {
28
+ { // not specify min_task_size
29
+ FileList fileList = newFileList(config.deepCopy(),
30
+ "sample_00", 100L,
31
+ "sample_01", 150L,
32
+ "sample_02", 350L);
33
+
34
+ assertEquals(3, fileList.getTaskCount());
35
+ assertEquals("sample_00", fileList.get(0).get(0));
36
+ assertEquals("sample_01", fileList.get(1).get(0));
37
+ assertEquals("sample_02", fileList.get(2).get(0));
38
+ }
39
+
40
+ {
41
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
42
+ "sample_00", 100L,
43
+ "sample_01", 150L,
44
+ "sample_02", 350L);
45
+
46
+ assertEquals(3, fileList.getTaskCount());
47
+ assertEquals("sample_00", fileList.get(0).get(0));
48
+ assertEquals("sample_01", fileList.get(1).get(0));
49
+ assertEquals("sample_02", fileList.get(2).get(0));
50
+ }
51
+
52
+ {
53
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
54
+ "sample_00", 100L,
55
+ "sample_01", 150L,
56
+ "sample_02", 350L);
57
+
58
+ assertEquals(2, fileList.getTaskCount());
59
+ assertEquals("sample_00", fileList.get(0).get(0));
60
+ assertEquals("sample_01", fileList.get(0).get(1));
61
+ assertEquals("sample_02", fileList.get(1).get(0));
62
+ }
63
+
64
+ {
65
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
66
+ "sample_00", 100L,
67
+ "sample_01", 150L,
68
+ "sample_02", 350L);
69
+
70
+ assertEquals(1, fileList.getTaskCount());
71
+ assertEquals("sample_00", fileList.get(0).get(0));
72
+ assertEquals("sample_01", fileList.get(0).get(1));
73
+ assertEquals("sample_02", fileList.get(0).get(2));
74
+ }
75
+ }
76
+
77
+ private static FileList newFileList(ConfigSource config, Object... nameAndSize)
78
+ {
79
+ FileList.Builder builder = new FileList.Builder(config);
80
+
81
+ for (int i = 0; i < nameAndSize.length; i += 2) {
82
+ builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
83
+ }
84
+
85
+ return builder.build();
86
+ }
87
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-azure_blob_storage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-16 00:00:00.000000000 Z
11
+ date: 2016-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -57,12 +57,14 @@ files:
57
57
  - gradlew.bat
58
58
  - lib/embulk/input/azure_blob_storage.rb
59
59
  - src/main/java/org/embulk/input/azure_blob_storage/AzureBlobStorageFileInputPlugin.java
60
+ - src/main/java/org/embulk/input/azure_blob_storage/FileList.java
60
61
  - src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java
62
+ - src/test/java/org/embulk/input/azure_blob_storage/TestFileList.java
61
63
  - src/test/resources/sample_01.csv
62
64
  - src/test/resources/sample_02.csv
63
65
  - classpath/azure-storage-4.0.0.jar
64
66
  - classpath/commons-lang3-3.4.jar
65
- - classpath/embulk-input-azure_blob_storage-0.1.3.jar
67
+ - classpath/embulk-input-azure_blob_storage-0.1.4.jar
66
68
  - classpath/jackson-core-2.6.0.jar
67
69
  homepage: https://github.com/sakama/embulk-input-azure_blob_storage
68
70
  licenses: