embulk-input-azure_blob_storage 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/README.md +17 -0
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/azure_blob_storage/AzureBlobStorageFileInputPlugin.java +45 -30
- data/src/main/java/org/embulk/input/azure_blob_storage/FileList.java +341 -0
- data/src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java +42 -21
- data/src/test/java/org/embulk/input/azure_blob_storage/TestFileList.java +87 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49b71df05579f8c392ec70089aaeca95f1b81144
|
4
|
+
data.tar.gz: a63ffb1663b9bbed5827211dbb62eeb04b20a987
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a43a45c850746c1a50b9f12be823f0081ea9deddd4403e831f2533e6af4ab4861405e45cc62824fcb637b6509178fceeb6cad4f583542677e0ee3383beb8e43
|
7
|
+
data.tar.gz: d5e6066c047f60034d50ff6084919157476f962adf5c9f2b1a59685a97d83a254637ed0037276629714e2b1223c229c0564b58c162cfffbb58b10a4822f563aa
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
## 0.1.4 - 2016-03-22
|
2
|
+
|
3
|
+
* [new feature] Support `last_path` option [#7](https://github.com/sakama/embulk-input-azure_blob_storage/pull/7)
|
4
|
+
* [new feature] Support `path_match_pattern` option [#6](https://github.com/sakama/embulk-input-azure_blob_storage/pull/6)
|
5
|
+
|
1
6
|
## 0.1.3 - 2016-03-16
|
2
7
|
|
3
|
-
* [maintenance] Add unit test[#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
|
8
|
+
* [maintenance] Add unit test [#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
|
4
9
|
* [maintenance] Add retry logic [#3](https://github.com/sakama/embulk-input-azure_blob_storage/pull/3)
|
5
10
|
|
6
11
|
## 0.1.2 - 2015-10-11
|
data/README.md
CHANGED
@@ -16,6 +16,8 @@ First, create Azure [Storage Account](https://azure.microsoft.com/en-us/document
|
|
16
16
|
- **account_key**: primary access key (string, required)
|
17
17
|
- **container**: name of the container where the data is stored (string, required)
|
18
18
|
- **path_prefix**: prefix of target keys (string, required)
|
19
|
+
- **path_match_pattern**: regexp to match file paths. If a file path doesn't match this pattern, the file will be skipped (regexp string, optional)
|
20
|
+
- **total_file_count_limit**: maximum number of files to read (integer, optional)
|
19
21
|
|
20
22
|
## Example
|
21
23
|
|
@@ -55,6 +57,21 @@ in:
|
|
55
57
|
out: {type: stdout}
|
56
58
|
```
|
57
59
|
|
60
|
+
To filter files using regexp:
|
61
|
+
|
62
|
+
```yaml
|
63
|
+
in:
|
64
|
+
type: azure_blob_storage
|
65
|
+
path_prefix: logs/csv-
|
66
|
+
...
|
67
|
+
path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match with this pattern
|
68
|
+
|
69
|
+
## some examples of regexp:
|
70
|
+
#path_match_pattern: /archive/ # match files in .../archive/... directory
|
71
|
+
#path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
|
72
|
+
#path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
|
73
|
+
```
|
74
|
+
|
58
75
|
## Build
|
59
76
|
|
60
77
|
```
|
data/build.gradle
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
package org.embulk.input.azure_blob_storage;
|
2
2
|
|
3
|
+
import com.google.common.base.Charsets;
|
3
4
|
import com.google.common.base.Optional;
|
4
5
|
import com.google.common.base.Throwables;
|
5
|
-
import com.google.common.
|
6
|
+
import com.google.common.io.BaseEncoding;
|
6
7
|
import com.microsoft.azure.storage.CloudStorageAccount;
|
7
8
|
import com.microsoft.azure.storage.ResultContinuation;
|
9
|
+
import com.microsoft.azure.storage.ResultContinuationType;
|
8
10
|
import com.microsoft.azure.storage.ResultSegment;
|
9
11
|
import com.microsoft.azure.storage.StorageException;
|
10
12
|
import com.microsoft.azure.storage.blob.CloudBlob;
|
@@ -31,15 +33,14 @@ import java.io.IOException;
|
|
31
33
|
import java.io.InputStream;
|
32
34
|
import java.net.URISyntaxException;
|
33
35
|
import java.security.InvalidKeyException;
|
34
|
-
import java.util.
|
35
|
-
import java.util.Collections;
|
36
|
+
import java.util.Iterator;
|
36
37
|
import java.util.List;
|
37
38
|
|
38
39
|
public class AzureBlobStorageFileInputPlugin
|
39
40
|
implements FileInputPlugin
|
40
41
|
{
|
41
42
|
public interface PluginTask
|
42
|
-
extends Task
|
43
|
+
extends Task, FileList.Task
|
43
44
|
{
|
44
45
|
@Config("account_name")
|
45
46
|
String getAccountName();
|
@@ -65,9 +66,8 @@ public class AzureBlobStorageFileInputPlugin
|
|
65
66
|
@ConfigDefault("5") // retry up to 5 times to connect to Azure Blob Storage if the connection fails.
|
66
67
|
int getMaxConnectionRetry();
|
67
68
|
|
68
|
-
|
69
|
-
|
70
|
-
void setFiles(List<String> files);
|
69
|
+
FileList getFiles();
|
70
|
+
void setFiles(FileList files);
|
71
71
|
|
72
72
|
@ConfigInject
|
73
73
|
BufferAllocator getBufferAllocator();
|
@@ -83,28 +83,18 @@ public class AzureBlobStorageFileInputPlugin
|
|
83
83
|
CloudBlobClient blobClient = newAzureClient(task.getAccountName(), task.getAccountKey());
|
84
84
|
task.setFiles(listFiles(blobClient, task));
|
85
85
|
|
86
|
-
return resume(task.dump(), task.getFiles().
|
86
|
+
return resume(task.dump(), task.getFiles().getTaskCount(), control);
|
87
87
|
}
|
88
88
|
|
89
89
|
@Override
|
90
90
|
public ConfigDiff resume(TaskSource taskSource, int taskCount, FileInputPlugin.Control control)
|
91
91
|
{
|
92
92
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
93
|
-
|
94
93
|
control.run(taskSource, taskCount);
|
95
94
|
|
96
95
|
ConfigDiff configDiff = Exec.newConfigDiff();
|
96
|
+
configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
|
97
97
|
|
98
|
-
List<String> files = new ArrayList<>(task.getFiles());
|
99
|
-
if (files.isEmpty()) {
|
100
|
-
if (task.getLastPath().isPresent()) {
|
101
|
-
configDiff.set("last_path", task.getLastPath().get());
|
102
|
-
}
|
103
|
-
}
|
104
|
-
else {
|
105
|
-
Collections.sort(files);
|
106
|
-
configDiff.set("last_path", files.get(files.size() - 1));
|
107
|
-
}
|
108
98
|
return configDiff;
|
109
99
|
}
|
110
100
|
|
@@ -129,22 +119,28 @@ public class AzureBlobStorageFileInputPlugin
|
|
129
119
|
return account.createCloudBlobClient();
|
130
120
|
}
|
131
121
|
|
132
|
-
private
|
122
|
+
private FileList listFiles(CloudBlobClient client, PluginTask task)
|
133
123
|
{
|
134
124
|
if (task.getPathPrefix().equals("/")) {
|
135
125
|
log.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
136
126
|
}
|
127
|
+
FileList.Builder builder = new FileList.Builder(task);
|
137
128
|
|
138
|
-
return listFilesWithPrefix(client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
|
129
|
+
return listFilesWithPrefix(builder, client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
|
139
130
|
}
|
140
131
|
|
141
|
-
private static
|
132
|
+
private static FileList listFilesWithPrefix(FileList.Builder builder, CloudBlobClient client, String containerName,
|
142
133
|
String prefix, Optional<String> lastPath, int maxResults)
|
143
134
|
{
|
144
|
-
|
145
|
-
// It seems I can't cast lastKey<String> to token<ResultContinuation> by Azure SDK for Java
|
146
|
-
String lastKey = lastPath.orNull();
|
135
|
+
String lastKey = (lastPath.isPresent() && !lastPath.get().isEmpty()) ? createNextToken(lastPath.get()) : null;
|
147
136
|
ResultContinuation token = null;
|
137
|
+
if (lastKey != null) {
|
138
|
+
token = new ResultContinuation();
|
139
|
+
token.setContinuationType(ResultContinuationType.BLOB);
|
140
|
+
log.debug("lastPath: {}", lastPath.get());
|
141
|
+
log.debug("lastPath(Base64encoded): {}", lastKey);
|
142
|
+
token.setNextMarker(lastKey);
|
143
|
+
}
|
148
144
|
|
149
145
|
try {
|
150
146
|
CloudBlobContainer container = client.getContainerReference(containerName);
|
@@ -156,7 +152,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
156
152
|
if (blobItem instanceof CloudBlob) {
|
157
153
|
CloudBlob blob = (CloudBlob) blobItem;
|
158
154
|
if (blob.exists() && !blob.getUri().toString().endsWith("/")) {
|
159
|
-
builder.add(blob.getName());
|
155
|
+
builder.add(blob.getName(), blob.getProperties().getLength());
|
160
156
|
log.debug(String.format("name:%s, class:%s, uri:%s", blob.getName(), blob.getClass(), blob.getUri()));
|
161
157
|
}
|
162
158
|
}
|
@@ -201,7 +197,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
201
197
|
{
|
202
198
|
private CloudBlobClient client;
|
203
199
|
private final String containerName;
|
204
|
-
private final String
|
200
|
+
private final Iterator<String> iterator;
|
205
201
|
private final int maxConnectionRetry;
|
206
202
|
private boolean opened = false;
|
207
203
|
|
@@ -209,14 +205,14 @@ public class AzureBlobStorageFileInputPlugin
|
|
209
205
|
{
|
210
206
|
this.client = newAzureClient(task.getAccountName(), task.getAccountKey());
|
211
207
|
this.containerName = task.getContainer();
|
212
|
-
this.
|
208
|
+
this.iterator = task.getFiles().get(taskIndex).iterator();
|
213
209
|
this.maxConnectionRetry = task.getMaxConnectionRetry();
|
214
210
|
}
|
215
211
|
|
216
212
|
@Override
|
217
213
|
public InputStream openNext() throws IOException
|
218
214
|
{
|
219
|
-
if (opened) {
|
215
|
+
if (opened || !iterator.hasNext()) {
|
220
216
|
return null;
|
221
217
|
}
|
222
218
|
opened = true;
|
@@ -225,7 +221,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
225
221
|
while (true) {
|
226
222
|
try {
|
227
223
|
CloudBlobContainer container = client.getContainerReference(containerName);
|
228
|
-
CloudBlob blob = container.getBlockBlobReference(
|
224
|
+
CloudBlob blob = container.getBlockBlobReference(iterator.next());
|
229
225
|
return blob.openInputStream();
|
230
226
|
}
|
231
227
|
catch (StorageException | URISyntaxException ex) {
|
@@ -250,4 +246,23 @@ public class AzureBlobStorageFileInputPlugin
|
|
250
246
|
@Override
|
251
247
|
public void close() {}
|
252
248
|
}
|
249
|
+
|
250
|
+
private static String createNextToken(String path)
|
251
|
+
{
|
252
|
+
StringBuilder sb = new StringBuilder()
|
253
|
+
.append(String.format("%06d", path.length()))
|
254
|
+
.append("!")
|
255
|
+
.append(path)
|
256
|
+
.append("!000028!9999-12-31T23:59:59.9999999Z!");
|
257
|
+
|
258
|
+
String encodedString = BaseEncoding.base64().encode(sb.toString().getBytes(Charsets.UTF_8));
|
259
|
+
|
260
|
+
StringBuilder marker = new StringBuilder()
|
261
|
+
.append("2")
|
262
|
+
.append("!")
|
263
|
+
.append(encodedString.length())
|
264
|
+
.append("!")
|
265
|
+
.append(encodedString);
|
266
|
+
return marker.toString();
|
267
|
+
}
|
253
268
|
}
|
@@ -0,0 +1,341 @@
|
|
1
|
+
package org.embulk.input.azure_blob_storage;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
6
|
+
import com.google.common.base.Optional;
|
7
|
+
import com.google.common.base.Throwables;
|
8
|
+
import org.embulk.config.Config;
|
9
|
+
import org.embulk.config.ConfigDefault;
|
10
|
+
import org.embulk.config.ConfigSource;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
|
14
|
+
import java.io.BufferedInputStream;
|
15
|
+
import java.io.BufferedOutputStream;
|
16
|
+
import java.io.ByteArrayInputStream;
|
17
|
+
import java.io.ByteArrayOutputStream;
|
18
|
+
|
19
|
+
import java.io.IOException;
|
20
|
+
import java.io.InputStream;
|
21
|
+
import java.io.OutputStream;
|
22
|
+
import java.nio.ByteBuffer;
|
23
|
+
import java.nio.charset.StandardCharsets;
|
24
|
+
import java.util.AbstractList;
|
25
|
+
import java.util.ArrayList;
|
26
|
+
import java.util.List;
|
27
|
+
import java.util.regex.Pattern;
|
28
|
+
import java.util.zip.GZIPInputStream;
|
29
|
+
import java.util.zip.GZIPOutputStream;
|
30
|
+
|
31
|
+
// this class should be moved to embulk-core
|
32
|
+
public class FileList
|
33
|
+
{
|
34
|
+
public interface Task
|
35
|
+
{
|
36
|
+
@Config("path_match_pattern")
|
37
|
+
@ConfigDefault("\".*\"")
|
38
|
+
String getPathMatchPattern();
|
39
|
+
|
40
|
+
@Config("total_file_count_limit")
|
41
|
+
@ConfigDefault("2147483647")
|
42
|
+
int getTotalFileCountLimit();
|
43
|
+
|
44
|
+
// TODO support more algorithms to combine tasks
|
45
|
+
@Config("min_task_size")
|
46
|
+
@ConfigDefault("0")
|
47
|
+
long getMinTaskSize();
|
48
|
+
}
|
49
|
+
|
50
|
+
public static class Entry
|
51
|
+
{
|
52
|
+
private int index;
|
53
|
+
private long size;
|
54
|
+
|
55
|
+
@JsonCreator
|
56
|
+
public Entry(
|
57
|
+
@JsonProperty("index") int index,
|
58
|
+
@JsonProperty("size") long size)
|
59
|
+
{
|
60
|
+
this.index = index;
|
61
|
+
this.size = size;
|
62
|
+
}
|
63
|
+
|
64
|
+
@JsonProperty("index")
|
65
|
+
public int getIndex()
|
66
|
+
{
|
67
|
+
return index;
|
68
|
+
}
|
69
|
+
|
70
|
+
@JsonProperty("size")
|
71
|
+
public long getSize()
|
72
|
+
{
|
73
|
+
return size;
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
public static class Builder
|
78
|
+
{
|
79
|
+
private final Logger log = Exec.getLogger(FileList.class);
|
80
|
+
private final ByteArrayOutputStream binary;
|
81
|
+
private final OutputStream stream;
|
82
|
+
private final List<Entry> entries = new ArrayList<>();
|
83
|
+
private String last = null;
|
84
|
+
|
85
|
+
private int limitCount = Integer.MAX_VALUE;
|
86
|
+
private long minTaskSize = 1;
|
87
|
+
private Pattern pathMatchPattern;
|
88
|
+
|
89
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
90
|
+
|
91
|
+
public Builder(Task task)
|
92
|
+
{
|
93
|
+
this();
|
94
|
+
this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
|
95
|
+
this.limitCount = task.getTotalFileCountLimit();
|
96
|
+
this.minTaskSize = task.getMinTaskSize();
|
97
|
+
}
|
98
|
+
|
99
|
+
public Builder(ConfigSource config)
|
100
|
+
{
|
101
|
+
this();
|
102
|
+
this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
|
103
|
+
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
104
|
+
this.minTaskSize = config.get(long.class, "min_task_size", 0L);
|
105
|
+
}
|
106
|
+
|
107
|
+
public Builder()
|
108
|
+
{
|
109
|
+
binary = new ByteArrayOutputStream();
|
110
|
+
try {
|
111
|
+
stream = new BufferedOutputStream(new GZIPOutputStream(binary));
|
112
|
+
}
|
113
|
+
catch (IOException ex) {
|
114
|
+
throw Throwables.propagate(ex);
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
public Builder limitTotalFileCount(int limitCount)
|
119
|
+
{
|
120
|
+
this.limitCount = limitCount;
|
121
|
+
return this;
|
122
|
+
}
|
123
|
+
|
124
|
+
public Builder minTaskSize(long bytes)
|
125
|
+
{
|
126
|
+
this.minTaskSize = bytes;
|
127
|
+
return this;
|
128
|
+
}
|
129
|
+
|
130
|
+
public Builder pathMatchPattern(String pattern)
|
131
|
+
{
|
132
|
+
this.pathMatchPattern = Pattern.compile(pattern);
|
133
|
+
return this;
|
134
|
+
}
|
135
|
+
|
136
|
+
public int size()
|
137
|
+
{
|
138
|
+
return entries.size();
|
139
|
+
}
|
140
|
+
|
141
|
+
public boolean needsMore()
|
142
|
+
{
|
143
|
+
return size() < limitCount;
|
144
|
+
}
|
145
|
+
|
146
|
+
// returns true if this file is used
|
147
|
+
public synchronized boolean add(String path, long size)
|
148
|
+
{
|
149
|
+
// TODO throw IllegalStateException if stream is already closed
|
150
|
+
|
151
|
+
if (!needsMore()) {
|
152
|
+
return false;
|
153
|
+
}
|
154
|
+
|
155
|
+
if (!pathMatchPattern.matcher(path).find()) {
|
156
|
+
return false;
|
157
|
+
}
|
158
|
+
|
159
|
+
int index = entries.size();
|
160
|
+
entries.add(new Entry(index, size));
|
161
|
+
log.info("add file to the request list: {}", path);
|
162
|
+
|
163
|
+
byte[] data = path.getBytes(StandardCharsets.UTF_8);
|
164
|
+
castBuffer.putInt(0, data.length);
|
165
|
+
try {
|
166
|
+
stream.write(castBuffer.array());
|
167
|
+
stream.write(data);
|
168
|
+
}
|
169
|
+
catch (IOException ex) {
|
170
|
+
throw Throwables.propagate(ex);
|
171
|
+
}
|
172
|
+
|
173
|
+
last = path;
|
174
|
+
return true;
|
175
|
+
}
|
176
|
+
|
177
|
+
public FileList build()
|
178
|
+
{
|
179
|
+
try {
|
180
|
+
stream.close();
|
181
|
+
}
|
182
|
+
catch (IOException ex) {
|
183
|
+
throw Throwables.propagate(ex);
|
184
|
+
}
|
185
|
+
return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
186
|
+
}
|
187
|
+
|
188
|
+
private List<List<Entry>> getSplits(List<Entry> all)
|
189
|
+
{
|
190
|
+
List<List<Entry>> tasks = new ArrayList<>();
|
191
|
+
long currentTaskSize = 0;
|
192
|
+
List<Entry> currentTask = new ArrayList<>();
|
193
|
+
for (Entry entry : all) {
|
194
|
+
currentTask.add(entry);
|
195
|
+
currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
|
196
|
+
if (currentTaskSize >= minTaskSize) {
|
197
|
+
tasks.add(currentTask);
|
198
|
+
currentTask = new ArrayList<>();
|
199
|
+
currentTaskSize = 0;
|
200
|
+
}
|
201
|
+
}
|
202
|
+
if (!currentTask.isEmpty()) {
|
203
|
+
tasks.add(currentTask);
|
204
|
+
}
|
205
|
+
return tasks;
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
private final byte[] data;
|
210
|
+
private final List<List<Entry>> tasks;
|
211
|
+
private final Optional<String> last;
|
212
|
+
|
213
|
+
@JsonCreator
|
214
|
+
@Deprecated
|
215
|
+
public FileList(
|
216
|
+
@JsonProperty("data") byte[] data,
|
217
|
+
@JsonProperty("tasks") List<List<Entry>> tasks,
|
218
|
+
@JsonProperty("last") Optional<String> last)
|
219
|
+
{
|
220
|
+
this.data = data;
|
221
|
+
this.tasks = tasks;
|
222
|
+
this.last = last;
|
223
|
+
}
|
224
|
+
|
225
|
+
@JsonIgnore
|
226
|
+
public Optional<String> getLastPath(Optional<String> lastLastPath)
|
227
|
+
{
|
228
|
+
if (last.isPresent()) {
|
229
|
+
return last;
|
230
|
+
}
|
231
|
+
return lastLastPath;
|
232
|
+
}
|
233
|
+
|
234
|
+
@JsonIgnore
|
235
|
+
public int getTaskCount()
|
236
|
+
{
|
237
|
+
return tasks.size();
|
238
|
+
}
|
239
|
+
|
240
|
+
@JsonIgnore
|
241
|
+
public List<String> get(int i)
|
242
|
+
{
|
243
|
+
return new EntryList(data, tasks.get(i));
|
244
|
+
}
|
245
|
+
|
246
|
+
@JsonProperty("data")
|
247
|
+
@Deprecated
|
248
|
+
public byte[] getData()
|
249
|
+
{
|
250
|
+
return data;
|
251
|
+
}
|
252
|
+
|
253
|
+
@JsonProperty("tasks")
|
254
|
+
@Deprecated
|
255
|
+
public List<List<Entry>> getTasks()
|
256
|
+
{
|
257
|
+
return tasks;
|
258
|
+
}
|
259
|
+
|
260
|
+
@JsonProperty("last")
|
261
|
+
@Deprecated
|
262
|
+
public Optional<String> getLast()
|
263
|
+
{
|
264
|
+
return last;
|
265
|
+
}
|
266
|
+
|
267
|
+
private class EntryList
|
268
|
+
extends AbstractList<String>
|
269
|
+
{
|
270
|
+
private final byte[] data;
|
271
|
+
private final List<Entry> entries;
|
272
|
+
private InputStream stream;
|
273
|
+
private int current;
|
274
|
+
|
275
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
276
|
+
|
277
|
+
public EntryList(byte[] data, List<Entry> entries)
|
278
|
+
{
|
279
|
+
this.data = data;
|
280
|
+
this.entries = entries;
|
281
|
+
try {
|
282
|
+
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
283
|
+
}
|
284
|
+
catch (IOException ex) {
|
285
|
+
throw Throwables.propagate(ex);
|
286
|
+
}
|
287
|
+
this.current = 0;
|
288
|
+
}
|
289
|
+
|
290
|
+
@Override
|
291
|
+
public synchronized String get(int i)
|
292
|
+
{
|
293
|
+
Entry e = entries.get(i);
|
294
|
+
if (e.getIndex() < current) {
|
295
|
+
// rewind to the head
|
296
|
+
try {
|
297
|
+
stream.close();
|
298
|
+
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
299
|
+
}
|
300
|
+
catch (IOException ex) {
|
301
|
+
throw Throwables.propagate(ex);
|
302
|
+
}
|
303
|
+
current = 0;
|
304
|
+
}
|
305
|
+
|
306
|
+
while (current < e.getIndex()) {
|
307
|
+
readNext();
|
308
|
+
}
|
309
|
+
// now current == e.getIndex()
|
310
|
+
return readNextString();
|
311
|
+
}
|
312
|
+
|
313
|
+
@Override
|
314
|
+
public int size()
|
315
|
+
{
|
316
|
+
return entries.size();
|
317
|
+
}
|
318
|
+
|
319
|
+
private byte[] readNext()
|
320
|
+
{
|
321
|
+
try {
|
322
|
+
stream.read(castBuffer.array());
|
323
|
+
int n = castBuffer.getInt(0);
|
324
|
+
byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
|
325
|
+
stream.read(b);
|
326
|
+
|
327
|
+
current++;
|
328
|
+
|
329
|
+
return b;
|
330
|
+
}
|
331
|
+
catch (IOException ex) {
|
332
|
+
throw Throwables.propagate(ex);
|
333
|
+
}
|
334
|
+
}
|
335
|
+
|
336
|
+
private String readNextString()
|
337
|
+
{
|
338
|
+
return new String(readNext(), StandardCharsets.UTF_8);
|
339
|
+
}
|
340
|
+
}
|
341
|
+
}
|
data/src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java
CHANGED
@@ -93,18 +93,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
93
93
|
assertEquals(5, task.getMaxConnectionRetry());
|
94
94
|
}
|
95
95
|
|
96
|
-
public ConfigSource config()
|
97
|
-
{
|
98
|
-
return Exec.newConfigSource()
|
99
|
-
.set("account_name", AZURE_ACCOUNT_NAME)
|
100
|
-
.set("account_key", AZURE_ACCOUNT_KEY)
|
101
|
-
.set("container", AZURE_CONTAINER)
|
102
|
-
.set("path_prefix", AZURE_PATH_PREFIX)
|
103
|
-
.set("last_path", "")
|
104
|
-
.set("file_ext", ".csv")
|
105
|
-
.set("parser", parserConfig(schemaConfig()));
|
106
|
-
}
|
107
|
-
|
108
96
|
@Test(expected = ConfigException.class)
|
109
97
|
public void checkDefaultValuesAccountNameIsNull()
|
110
98
|
{
|
@@ -114,7 +102,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
114
102
|
.set("container", AZURE_CONTAINER)
|
115
103
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
116
104
|
.set("last_path", "")
|
117
|
-
.set("file_ext", ".csv")
|
118
105
|
.set("parser", parserConfig(schemaConfig()));
|
119
106
|
|
120
107
|
runner.transaction(config, new Control());
|
@@ -129,7 +116,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
129
116
|
.set("container", AZURE_CONTAINER)
|
130
117
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
131
118
|
.set("last_path", "")
|
132
|
-
.set("file_ext", ".csv")
|
133
119
|
.set("parser", parserConfig(schemaConfig()));
|
134
120
|
|
135
121
|
runner.transaction(config, new Control());
|
@@ -144,7 +130,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
144
130
|
.set("container", null)
|
145
131
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
146
132
|
.set("last_path", "")
|
147
|
-
.set("file_ext", ".csv")
|
148
133
|
.set("parser", parserConfig(schemaConfig()));
|
149
134
|
|
150
135
|
runner.transaction(config, new Control());
|
@@ -166,7 +151,7 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
166
151
|
public void testResume()
|
167
152
|
{
|
168
153
|
PluginTask task = config.loadConfig(PluginTask.class);
|
169
|
-
task.setFiles(Arrays.asList("in/aa/a"));
|
154
|
+
task.setFiles(createFileList(Arrays.asList("in/aa/a"), task));
|
170
155
|
ConfigDiff configDiff = plugin.resume(task.dump(), 0, new FileInputPlugin.Control()
|
171
156
|
{
|
172
157
|
@Override
|
@@ -190,8 +175,8 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
190
175
|
throws NoSuchMethodException, IllegalAccessException, InvocationTargetException
|
191
176
|
{
|
192
177
|
List<String> expected = Arrays.asList(
|
193
|
-
|
194
|
-
|
178
|
+
AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_01.csv",
|
179
|
+
AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv"
|
195
180
|
);
|
196
181
|
|
197
182
|
PluginTask task = config.loadConfig(PluginTask.class);
|
@@ -210,8 +195,9 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
210
195
|
|
211
196
|
Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
|
212
197
|
listFiles.setAccessible(true);
|
213
|
-
|
214
|
-
assertEquals(expected, actual);
|
198
|
+
FileList actual = (FileList) listFiles.invoke(plugin, client, task);
|
199
|
+
assertEquals(expected.get(0), actual.get(0).get(0));
|
200
|
+
assertEquals(expected.get(1), actual.get(1).get(0));
|
215
201
|
assertEquals(AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv", configDiff.get(String.class, "last_path"));
|
216
202
|
}
|
217
203
|
|
@@ -228,11 +214,26 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
228
214
|
|
229
215
|
Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
|
230
216
|
listFiles.setAccessible(true);
|
231
|
-
task.setFiles((
|
217
|
+
task.setFiles((FileList) listFiles.invoke(plugin, client, task));
|
232
218
|
|
233
219
|
assertRecords(config, output);
|
234
220
|
}
|
235
221
|
|
222
|
+
@Test
|
223
|
+
public void testCreateNextToken() throws Exception
|
224
|
+
{
|
225
|
+
Method base64Encode = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("createNextToken", String.class);
|
226
|
+
base64Encode.setAccessible(true);
|
227
|
+
|
228
|
+
String expected = "2!92!MDAwMDI1IXJlYWRvbmx5L3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
|
229
|
+
String lastPath = "readonly/sample_01.tsv.gz";
|
230
|
+
assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
|
231
|
+
|
232
|
+
expected = "2!120!MDAwMDQ2IXBhdGgvdGhhdC9oYXZlL2xvbmcvcGF0aC9uYW1lL3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
|
233
|
+
lastPath = "path/that/have/long/path/name/sample_01.tsv.gz";
|
234
|
+
assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
|
235
|
+
}
|
236
|
+
|
236
237
|
static List<TaskReport> emptyTaskReports(int taskCount)
|
237
238
|
{
|
238
239
|
ImmutableList.Builder<TaskReport> reports = new ImmutableList.Builder<>();
|
@@ -256,6 +257,17 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
256
257
|
}
|
257
258
|
}
|
258
259
|
|
260
|
+
public ConfigSource config()
|
261
|
+
{
|
262
|
+
return Exec.newConfigSource()
|
263
|
+
.set("account_name", AZURE_ACCOUNT_NAME)
|
264
|
+
.set("account_key", AZURE_ACCOUNT_KEY)
|
265
|
+
.set("container", AZURE_CONTAINER)
|
266
|
+
.set("path_prefix", AZURE_PATH_PREFIX)
|
267
|
+
.set("last_path", "")
|
268
|
+
.set("parser", parserConfig(schemaConfig()));
|
269
|
+
}
|
270
|
+
|
259
271
|
private ImmutableMap<String, Object> parserConfig(ImmutableList<Object> schemaConfig)
|
260
272
|
{
|
261
273
|
ImmutableMap.Builder<String, Object> builder = new ImmutableMap.Builder<>();
|
@@ -340,4 +352,13 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
340
352
|
}
|
341
353
|
return dir;
|
342
354
|
}
|
355
|
+
|
356
|
+
private FileList createFileList(List<String> fileList, PluginTask task)
|
357
|
+
{
|
358
|
+
FileList.Builder builder = new FileList.Builder(task);
|
359
|
+
for (String file : fileList) {
|
360
|
+
builder.add(file, 0);
|
361
|
+
}
|
362
|
+
return builder.build();
|
363
|
+
}
|
343
364
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
package org.embulk.input.azure_blob_storage;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.junit.Before;
|
6
|
+
import org.junit.Rule;
|
7
|
+
import org.junit.Test;
|
8
|
+
|
9
|
+
import static org.junit.Assert.assertEquals;
|
10
|
+
|
11
|
+
public class TestFileList
|
12
|
+
{
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
private ConfigSource config;
|
17
|
+
|
18
|
+
@Before
|
19
|
+
public void createConfigSource()
|
20
|
+
{
|
21
|
+
config = runtime.getExec().newConfigSource();
|
22
|
+
}
|
23
|
+
|
24
|
+
@Test
|
25
|
+
public void checkMinTaskSize()
|
26
|
+
throws Exception
|
27
|
+
{
|
28
|
+
{ // not specify min_task_size
|
29
|
+
FileList fileList = newFileList(config.deepCopy(),
|
30
|
+
"sample_00", 100L,
|
31
|
+
"sample_01", 150L,
|
32
|
+
"sample_02", 350L);
|
33
|
+
|
34
|
+
assertEquals(3, fileList.getTaskCount());
|
35
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
36
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
37
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
38
|
+
}
|
39
|
+
|
40
|
+
{
|
41
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
|
42
|
+
"sample_00", 100L,
|
43
|
+
"sample_01", 150L,
|
44
|
+
"sample_02", 350L);
|
45
|
+
|
46
|
+
assertEquals(3, fileList.getTaskCount());
|
47
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
48
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
49
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
50
|
+
}
|
51
|
+
|
52
|
+
{
|
53
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
|
54
|
+
"sample_00", 100L,
|
55
|
+
"sample_01", 150L,
|
56
|
+
"sample_02", 350L);
|
57
|
+
|
58
|
+
assertEquals(2, fileList.getTaskCount());
|
59
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
60
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
61
|
+
assertEquals("sample_02", fileList.get(1).get(0));
|
62
|
+
}
|
63
|
+
|
64
|
+
{
|
65
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
|
66
|
+
"sample_00", 100L,
|
67
|
+
"sample_01", 150L,
|
68
|
+
"sample_02", 350L);
|
69
|
+
|
70
|
+
assertEquals(1, fileList.getTaskCount());
|
71
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
72
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
73
|
+
assertEquals("sample_02", fileList.get(0).get(2));
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
private static FileList newFileList(ConfigSource config, Object... nameAndSize)
|
78
|
+
{
|
79
|
+
FileList.Builder builder = new FileList.Builder(config);
|
80
|
+
|
81
|
+
for (int i = 0; i < nameAndSize.length; i += 2) {
|
82
|
+
builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
|
83
|
+
}
|
84
|
+
|
85
|
+
return builder.build();
|
86
|
+
}
|
87
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-azure_blob_storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,12 +57,14 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/input/azure_blob_storage.rb
|
59
59
|
- src/main/java/org/embulk/input/azure_blob_storage/AzureBlobStorageFileInputPlugin.java
|
60
|
+
- src/main/java/org/embulk/input/azure_blob_storage/FileList.java
|
60
61
|
- src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java
|
62
|
+
- src/test/java/org/embulk/input/azure_blob_storage/TestFileList.java
|
61
63
|
- src/test/resources/sample_01.csv
|
62
64
|
- src/test/resources/sample_02.csv
|
63
65
|
- classpath/azure-storage-4.0.0.jar
|
64
66
|
- classpath/commons-lang3-3.4.jar
|
65
|
-
- classpath/embulk-input-azure_blob_storage-0.1.
|
67
|
+
- classpath/embulk-input-azure_blob_storage-0.1.4.jar
|
66
68
|
- classpath/jackson-core-2.6.0.jar
|
67
69
|
homepage: https://github.com/sakama/embulk-input-azure_blob_storage
|
68
70
|
licenses:
|