embulk-input-azure_blob_storage 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/README.md +17 -0
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/azure_blob_storage/AzureBlobStorageFileInputPlugin.java +45 -30
- data/src/main/java/org/embulk/input/azure_blob_storage/FileList.java +341 -0
- data/src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java +42 -21
- data/src/test/java/org/embulk/input/azure_blob_storage/TestFileList.java +87 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49b71df05579f8c392ec70089aaeca95f1b81144
|
4
|
+
data.tar.gz: a63ffb1663b9bbed5827211dbb62eeb04b20a987
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a43a45c850746c1a50b9f12be823f0081ea9deddd4403e831f2533e6af4ab4861405e45cc62824fcb637b6509178fceeb6cad4f583542677e0ee3383beb8e43
|
7
|
+
data.tar.gz: d5e6066c047f60034d50ff6084919157476f962adf5c9f2b1a59685a97d83a254637ed0037276629714e2b1223c229c0564b58c162cfffbb58b10a4822f563aa
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
## 0.1.4 - 2016-03-22
|
2
|
+
|
3
|
+
* [new feature] Support `last_path` option [#7](https://github.com/sakama/embulk-input-azure_blob_storage/pull/7)
|
4
|
+
* [new feature] Support `path_match_pattern` option [#6](https://github.com/sakama/embulk-input-azure_blob_storage/pull/6)
|
5
|
+
|
1
6
|
## 0.1.3 - 2016-03-16
|
2
7
|
|
3
|
-
* [maintenance] Add unit test[#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
|
8
|
+
* [maintenance] Add unit test [#4](https://github.com/sakama/embulk-input-azure_blob_storage/pull/4)
|
4
9
|
* [maintenance] Add retry logic [#3](https://github.com/sakama/embulk-input-azure_blob_storage/pull/3)
|
5
10
|
|
6
11
|
## 0.1.2 - 2015-10-11
|
data/README.md
CHANGED
@@ -16,6 +16,8 @@ First, create Azure [Storage Account](https://azure.microsoft.com/en-us/document
|
|
16
16
|
- **account_key**: primary access key (string, required)
|
17
17
|
- **container**: container name data stored (string, required)
|
18
18
|
- **path_prefix**: prefix of target keys (string, required)
|
19
|
+
- **path_match_pattern**: regexp to match file paths. If a file path doesn't match this pattern, the file will be skipped (regexp string, optional)
|
20
|
+
- **total_file_count_limit**: maximum number of files to read (integer, optional)
|
19
21
|
|
20
22
|
## Example
|
21
23
|
|
@@ -55,6 +57,21 @@ in:
|
|
55
57
|
out: {type: stdout}
|
56
58
|
```
|
57
59
|
|
60
|
+
To filter files using regexp:
|
61
|
+
|
62
|
+
```yaml
|
63
|
+
in:
|
64
|
+
type: azure_blob_storage
|
65
|
+
path_prefix: logs/csv-
|
66
|
+
...
|
67
|
+
path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match with this pattern
|
68
|
+
|
69
|
+
## some examples of regexp:
|
70
|
+
#path_match_pattern: /archive/ # match files in .../archive/... directory
|
71
|
+
#path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
|
72
|
+
#path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
|
73
|
+
```
|
74
|
+
|
58
75
|
## Build
|
59
76
|
|
60
77
|
```
|
data/build.gradle
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
package org.embulk.input.azure_blob_storage;
|
2
2
|
|
3
|
+
import com.google.common.base.Charsets;
|
3
4
|
import com.google.common.base.Optional;
|
4
5
|
import com.google.common.base.Throwables;
|
5
|
-
import com.google.common.
|
6
|
+
import com.google.common.io.BaseEncoding;
|
6
7
|
import com.microsoft.azure.storage.CloudStorageAccount;
|
7
8
|
import com.microsoft.azure.storage.ResultContinuation;
|
9
|
+
import com.microsoft.azure.storage.ResultContinuationType;
|
8
10
|
import com.microsoft.azure.storage.ResultSegment;
|
9
11
|
import com.microsoft.azure.storage.StorageException;
|
10
12
|
import com.microsoft.azure.storage.blob.CloudBlob;
|
@@ -31,15 +33,14 @@ import java.io.IOException;
|
|
31
33
|
import java.io.InputStream;
|
32
34
|
import java.net.URISyntaxException;
|
33
35
|
import java.security.InvalidKeyException;
|
34
|
-
import java.util.
|
35
|
-
import java.util.Collections;
|
36
|
+
import java.util.Iterator;
|
36
37
|
import java.util.List;
|
37
38
|
|
38
39
|
public class AzureBlobStorageFileInputPlugin
|
39
40
|
implements FileInputPlugin
|
40
41
|
{
|
41
42
|
public interface PluginTask
|
42
|
-
extends Task
|
43
|
+
extends Task, FileList.Task
|
43
44
|
{
|
44
45
|
@Config("account_name")
|
45
46
|
String getAccountName();
|
@@ -65,9 +66,8 @@ public class AzureBlobStorageFileInputPlugin
|
|
65
66
|
@ConfigDefault("5") // 5 times retry to connect sftp server if failed.
|
66
67
|
int getMaxConnectionRetry();
|
67
68
|
|
68
|
-
|
69
|
-
|
70
|
-
void setFiles(List<String> files);
|
69
|
+
FileList getFiles();
|
70
|
+
void setFiles(FileList files);
|
71
71
|
|
72
72
|
@ConfigInject
|
73
73
|
BufferAllocator getBufferAllocator();
|
@@ -83,28 +83,18 @@ public class AzureBlobStorageFileInputPlugin
|
|
83
83
|
CloudBlobClient blobClient = newAzureClient(task.getAccountName(), task.getAccountKey());
|
84
84
|
task.setFiles(listFiles(blobClient, task));
|
85
85
|
|
86
|
-
return resume(task.dump(), task.getFiles().
|
86
|
+
return resume(task.dump(), task.getFiles().getTaskCount(), control);
|
87
87
|
}
|
88
88
|
|
89
89
|
@Override
|
90
90
|
public ConfigDiff resume(TaskSource taskSource, int taskCount, FileInputPlugin.Control control)
|
91
91
|
{
|
92
92
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
93
|
-
|
94
93
|
control.run(taskSource, taskCount);
|
95
94
|
|
96
95
|
ConfigDiff configDiff = Exec.newConfigDiff();
|
96
|
+
configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
|
97
97
|
|
98
|
-
List<String> files = new ArrayList<>(task.getFiles());
|
99
|
-
if (files.isEmpty()) {
|
100
|
-
if (task.getLastPath().isPresent()) {
|
101
|
-
configDiff.set("last_path", task.getLastPath().get());
|
102
|
-
}
|
103
|
-
}
|
104
|
-
else {
|
105
|
-
Collections.sort(files);
|
106
|
-
configDiff.set("last_path", files.get(files.size() - 1));
|
107
|
-
}
|
108
98
|
return configDiff;
|
109
99
|
}
|
110
100
|
|
@@ -129,22 +119,28 @@ public class AzureBlobStorageFileInputPlugin
|
|
129
119
|
return account.createCloudBlobClient();
|
130
120
|
}
|
131
121
|
|
132
|
-
private
|
122
|
+
private FileList listFiles(CloudBlobClient client, PluginTask task)
|
133
123
|
{
|
134
124
|
if (task.getPathPrefix().equals("/")) {
|
135
125
|
log.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
136
126
|
}
|
127
|
+
FileList.Builder builder = new FileList.Builder(task);
|
137
128
|
|
138
|
-
return listFilesWithPrefix(client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
|
129
|
+
return listFilesWithPrefix(builder, client, task.getContainer(), task.getPathPrefix(), task.getLastPath(), task.getMaxResults());
|
139
130
|
}
|
140
131
|
|
141
|
-
private static
|
132
|
+
private static FileList listFilesWithPrefix(FileList.Builder builder, CloudBlobClient client, String containerName,
|
142
133
|
String prefix, Optional<String> lastPath, int maxResults)
|
143
134
|
{
|
144
|
-
|
145
|
-
// It seems I can't cast lastKey<String> to token<ResultContinuation> by Azure SDK for Java
|
146
|
-
String lastKey = lastPath.orNull();
|
135
|
+
String lastKey = (lastPath.isPresent() && !lastPath.get().isEmpty()) ? createNextToken(lastPath.get()) : null;
|
147
136
|
ResultContinuation token = null;
|
137
|
+
if (lastKey != null) {
|
138
|
+
token = new ResultContinuation();
|
139
|
+
token.setContinuationType(ResultContinuationType.BLOB);
|
140
|
+
log.debug("lastPath: {}", lastPath.get());
|
141
|
+
log.debug("lastPath(Base64encoded): {}", lastKey);
|
142
|
+
token.setNextMarker(lastKey);
|
143
|
+
}
|
148
144
|
|
149
145
|
try {
|
150
146
|
CloudBlobContainer container = client.getContainerReference(containerName);
|
@@ -156,7 +152,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
156
152
|
if (blobItem instanceof CloudBlob) {
|
157
153
|
CloudBlob blob = (CloudBlob) blobItem;
|
158
154
|
if (blob.exists() && !blob.getUri().toString().endsWith("/")) {
|
159
|
-
builder.add(blob.getName());
|
155
|
+
builder.add(blob.getName(), blob.getProperties().getLength());
|
160
156
|
log.debug(String.format("name:%s, class:%s, uri:%s", blob.getName(), blob.getClass(), blob.getUri()));
|
161
157
|
}
|
162
158
|
}
|
@@ -201,7 +197,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
201
197
|
{
|
202
198
|
private CloudBlobClient client;
|
203
199
|
private final String containerName;
|
204
|
-
private final String
|
200
|
+
private final Iterator<String> iterator;
|
205
201
|
private final int maxConnectionRetry;
|
206
202
|
private boolean opened = false;
|
207
203
|
|
@@ -209,14 +205,14 @@ public class AzureBlobStorageFileInputPlugin
|
|
209
205
|
{
|
210
206
|
this.client = newAzureClient(task.getAccountName(), task.getAccountKey());
|
211
207
|
this.containerName = task.getContainer();
|
212
|
-
this.
|
208
|
+
this.iterator = task.getFiles().get(taskIndex).iterator();
|
213
209
|
this.maxConnectionRetry = task.getMaxConnectionRetry();
|
214
210
|
}
|
215
211
|
|
216
212
|
@Override
|
217
213
|
public InputStream openNext() throws IOException
|
218
214
|
{
|
219
|
-
if (opened) {
|
215
|
+
if (opened || !iterator.hasNext()) {
|
220
216
|
return null;
|
221
217
|
}
|
222
218
|
opened = true;
|
@@ -225,7 +221,7 @@ public class AzureBlobStorageFileInputPlugin
|
|
225
221
|
while (true) {
|
226
222
|
try {
|
227
223
|
CloudBlobContainer container = client.getContainerReference(containerName);
|
228
|
-
CloudBlob blob = container.getBlockBlobReference(
|
224
|
+
CloudBlob blob = container.getBlockBlobReference(iterator.next());
|
229
225
|
return blob.openInputStream();
|
230
226
|
}
|
231
227
|
catch (StorageException | URISyntaxException ex) {
|
@@ -250,4 +246,23 @@ public class AzureBlobStorageFileInputPlugin
|
|
250
246
|
@Override
|
251
247
|
public void close() {}
|
252
248
|
}
|
249
|
+
|
250
|
+
private static String createNextToken(String path)
|
251
|
+
{
|
252
|
+
StringBuilder sb = new StringBuilder()
|
253
|
+
.append(String.format("%06d", path.length()))
|
254
|
+
.append("!")
|
255
|
+
.append(path)
|
256
|
+
.append("!000028!9999-12-31T23:59:59.9999999Z!");
|
257
|
+
|
258
|
+
String encodedString = BaseEncoding.base64().encode(sb.toString().getBytes(Charsets.UTF_8));
|
259
|
+
|
260
|
+
StringBuilder marker = new StringBuilder()
|
261
|
+
.append("2")
|
262
|
+
.append("!")
|
263
|
+
.append(encodedString.length())
|
264
|
+
.append("!")
|
265
|
+
.append(encodedString);
|
266
|
+
return marker.toString();
|
267
|
+
}
|
253
268
|
}
|
@@ -0,0 +1,341 @@
|
|
1
|
+
package org.embulk.input.azure_blob_storage;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
6
|
+
import com.google.common.base.Optional;
|
7
|
+
import com.google.common.base.Throwables;
|
8
|
+
import org.embulk.config.Config;
|
9
|
+
import org.embulk.config.ConfigDefault;
|
10
|
+
import org.embulk.config.ConfigSource;
|
11
|
+
import org.embulk.spi.Exec;
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
|
14
|
+
import java.io.BufferedInputStream;
|
15
|
+
import java.io.BufferedOutputStream;
|
16
|
+
import java.io.ByteArrayInputStream;
|
17
|
+
import java.io.ByteArrayOutputStream;
|
18
|
+
|
19
|
+
import java.io.IOException;
|
20
|
+
import java.io.InputStream;
|
21
|
+
import java.io.OutputStream;
|
22
|
+
import java.nio.ByteBuffer;
|
23
|
+
import java.nio.charset.StandardCharsets;
|
24
|
+
import java.util.AbstractList;
|
25
|
+
import java.util.ArrayList;
|
26
|
+
import java.util.List;
|
27
|
+
import java.util.regex.Pattern;
|
28
|
+
import java.util.zip.GZIPInputStream;
|
29
|
+
import java.util.zip.GZIPOutputStream;
|
30
|
+
|
31
|
+
// this class should be moved to embulk-core
|
32
|
+
public class FileList
|
33
|
+
{
|
34
|
+
public interface Task
|
35
|
+
{
|
36
|
+
@Config("path_match_pattern")
|
37
|
+
@ConfigDefault("\".*\"")
|
38
|
+
String getPathMatchPattern();
|
39
|
+
|
40
|
+
@Config("total_file_count_limit")
|
41
|
+
@ConfigDefault("2147483647")
|
42
|
+
int getTotalFileCountLimit();
|
43
|
+
|
44
|
+
// TODO support more algorithms to combine tasks
|
45
|
+
@Config("min_task_size")
|
46
|
+
@ConfigDefault("0")
|
47
|
+
long getMinTaskSize();
|
48
|
+
}
|
49
|
+
|
50
|
+
public static class Entry
|
51
|
+
{
|
52
|
+
private int index;
|
53
|
+
private long size;
|
54
|
+
|
55
|
+
@JsonCreator
|
56
|
+
public Entry(
|
57
|
+
@JsonProperty("index") int index,
|
58
|
+
@JsonProperty("size") long size)
|
59
|
+
{
|
60
|
+
this.index = index;
|
61
|
+
this.size = size;
|
62
|
+
}
|
63
|
+
|
64
|
+
@JsonProperty("index")
|
65
|
+
public int getIndex()
|
66
|
+
{
|
67
|
+
return index;
|
68
|
+
}
|
69
|
+
|
70
|
+
@JsonProperty("size")
|
71
|
+
public long getSize()
|
72
|
+
{
|
73
|
+
return size;
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
public static class Builder
|
78
|
+
{
|
79
|
+
private final Logger log = Exec.getLogger(FileList.class);
|
80
|
+
private final ByteArrayOutputStream binary;
|
81
|
+
private final OutputStream stream;
|
82
|
+
private final List<Entry> entries = new ArrayList<>();
|
83
|
+
private String last = null;
|
84
|
+
|
85
|
+
private int limitCount = Integer.MAX_VALUE;
|
86
|
+
private long minTaskSize = 1;
|
87
|
+
private Pattern pathMatchPattern;
|
88
|
+
|
89
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
90
|
+
|
91
|
+
public Builder(Task task)
|
92
|
+
{
|
93
|
+
this();
|
94
|
+
this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
|
95
|
+
this.limitCount = task.getTotalFileCountLimit();
|
96
|
+
this.minTaskSize = task.getMinTaskSize();
|
97
|
+
}
|
98
|
+
|
99
|
+
public Builder(ConfigSource config)
|
100
|
+
{
|
101
|
+
this();
|
102
|
+
this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
|
103
|
+
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
104
|
+
this.minTaskSize = config.get(long.class, "min_task_size", 0L);
|
105
|
+
}
|
106
|
+
|
107
|
+
public Builder()
|
108
|
+
{
|
109
|
+
binary = new ByteArrayOutputStream();
|
110
|
+
try {
|
111
|
+
stream = new BufferedOutputStream(new GZIPOutputStream(binary));
|
112
|
+
}
|
113
|
+
catch (IOException ex) {
|
114
|
+
throw Throwables.propagate(ex);
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
public Builder limitTotalFileCount(int limitCount)
|
119
|
+
{
|
120
|
+
this.limitCount = limitCount;
|
121
|
+
return this;
|
122
|
+
}
|
123
|
+
|
124
|
+
public Builder minTaskSize(long bytes)
|
125
|
+
{
|
126
|
+
this.minTaskSize = bytes;
|
127
|
+
return this;
|
128
|
+
}
|
129
|
+
|
130
|
+
public Builder pathMatchPattern(String pattern)
|
131
|
+
{
|
132
|
+
this.pathMatchPattern = Pattern.compile(pattern);
|
133
|
+
return this;
|
134
|
+
}
|
135
|
+
|
136
|
+
public int size()
|
137
|
+
{
|
138
|
+
return entries.size();
|
139
|
+
}
|
140
|
+
|
141
|
+
public boolean needsMore()
|
142
|
+
{
|
143
|
+
return size() < limitCount;
|
144
|
+
}
|
145
|
+
|
146
|
+
// returns true if this file is used
|
147
|
+
public synchronized boolean add(String path, long size)
|
148
|
+
{
|
149
|
+
// TODO throw IllegalStateException if stream is already closed
|
150
|
+
|
151
|
+
if (!needsMore()) {
|
152
|
+
return false;
|
153
|
+
}
|
154
|
+
|
155
|
+
if (!pathMatchPattern.matcher(path).find()) {
|
156
|
+
return false;
|
157
|
+
}
|
158
|
+
|
159
|
+
int index = entries.size();
|
160
|
+
entries.add(new Entry(index, size));
|
161
|
+
log.info("add file to the request list: {}", path);
|
162
|
+
|
163
|
+
byte[] data = path.getBytes(StandardCharsets.UTF_8);
|
164
|
+
castBuffer.putInt(0, data.length);
|
165
|
+
try {
|
166
|
+
stream.write(castBuffer.array());
|
167
|
+
stream.write(data);
|
168
|
+
}
|
169
|
+
catch (IOException ex) {
|
170
|
+
throw Throwables.propagate(ex);
|
171
|
+
}
|
172
|
+
|
173
|
+
last = path;
|
174
|
+
return true;
|
175
|
+
}
|
176
|
+
|
177
|
+
public FileList build()
|
178
|
+
{
|
179
|
+
try {
|
180
|
+
stream.close();
|
181
|
+
}
|
182
|
+
catch (IOException ex) {
|
183
|
+
throw Throwables.propagate(ex);
|
184
|
+
}
|
185
|
+
return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
186
|
+
}
|
187
|
+
|
188
|
+
private List<List<Entry>> getSplits(List<Entry> all)
|
189
|
+
{
|
190
|
+
List<List<Entry>> tasks = new ArrayList<>();
|
191
|
+
long currentTaskSize = 0;
|
192
|
+
List<Entry> currentTask = new ArrayList<>();
|
193
|
+
for (Entry entry : all) {
|
194
|
+
currentTask.add(entry);
|
195
|
+
currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
|
196
|
+
if (currentTaskSize >= minTaskSize) {
|
197
|
+
tasks.add(currentTask);
|
198
|
+
currentTask = new ArrayList<>();
|
199
|
+
currentTaskSize = 0;
|
200
|
+
}
|
201
|
+
}
|
202
|
+
if (!currentTask.isEmpty()) {
|
203
|
+
tasks.add(currentTask);
|
204
|
+
}
|
205
|
+
return tasks;
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
private final byte[] data;
|
210
|
+
private final List<List<Entry>> tasks;
|
211
|
+
private final Optional<String> last;
|
212
|
+
|
213
|
+
@JsonCreator
|
214
|
+
@Deprecated
|
215
|
+
public FileList(
|
216
|
+
@JsonProperty("data") byte[] data,
|
217
|
+
@JsonProperty("tasks") List<List<Entry>> tasks,
|
218
|
+
@JsonProperty("last") Optional<String> last)
|
219
|
+
{
|
220
|
+
this.data = data;
|
221
|
+
this.tasks = tasks;
|
222
|
+
this.last = last;
|
223
|
+
}
|
224
|
+
|
225
|
+
@JsonIgnore
|
226
|
+
public Optional<String> getLastPath(Optional<String> lastLastPath)
|
227
|
+
{
|
228
|
+
if (last.isPresent()) {
|
229
|
+
return last;
|
230
|
+
}
|
231
|
+
return lastLastPath;
|
232
|
+
}
|
233
|
+
|
234
|
+
@JsonIgnore
|
235
|
+
public int getTaskCount()
|
236
|
+
{
|
237
|
+
return tasks.size();
|
238
|
+
}
|
239
|
+
|
240
|
+
@JsonIgnore
|
241
|
+
public List<String> get(int i)
|
242
|
+
{
|
243
|
+
return new EntryList(data, tasks.get(i));
|
244
|
+
}
|
245
|
+
|
246
|
+
@JsonProperty("data")
|
247
|
+
@Deprecated
|
248
|
+
public byte[] getData()
|
249
|
+
{
|
250
|
+
return data;
|
251
|
+
}
|
252
|
+
|
253
|
+
@JsonProperty("tasks")
|
254
|
+
@Deprecated
|
255
|
+
public List<List<Entry>> getTasks()
|
256
|
+
{
|
257
|
+
return tasks;
|
258
|
+
}
|
259
|
+
|
260
|
+
@JsonProperty("last")
|
261
|
+
@Deprecated
|
262
|
+
public Optional<String> getLast()
|
263
|
+
{
|
264
|
+
return last;
|
265
|
+
}
|
266
|
+
|
267
|
+
private class EntryList
|
268
|
+
extends AbstractList<String>
|
269
|
+
{
|
270
|
+
private final byte[] data;
|
271
|
+
private final List<Entry> entries;
|
272
|
+
private InputStream stream;
|
273
|
+
private int current;
|
274
|
+
|
275
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
276
|
+
|
277
|
+
public EntryList(byte[] data, List<Entry> entries)
|
278
|
+
{
|
279
|
+
this.data = data;
|
280
|
+
this.entries = entries;
|
281
|
+
try {
|
282
|
+
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
283
|
+
}
|
284
|
+
catch (IOException ex) {
|
285
|
+
throw Throwables.propagate(ex);
|
286
|
+
}
|
287
|
+
this.current = 0;
|
288
|
+
}
|
289
|
+
|
290
|
+
@Override
|
291
|
+
public synchronized String get(int i)
|
292
|
+
{
|
293
|
+
Entry e = entries.get(i);
|
294
|
+
if (e.getIndex() < current) {
|
295
|
+
// rewind to the head
|
296
|
+
try {
|
297
|
+
stream.close();
|
298
|
+
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
299
|
+
}
|
300
|
+
catch (IOException ex) {
|
301
|
+
throw Throwables.propagate(ex);
|
302
|
+
}
|
303
|
+
current = 0;
|
304
|
+
}
|
305
|
+
|
306
|
+
while (current < e.getIndex()) {
|
307
|
+
readNext();
|
308
|
+
}
|
309
|
+
// now current == e.getIndex()
|
310
|
+
return readNextString();
|
311
|
+
}
|
312
|
+
|
313
|
+
@Override
|
314
|
+
public int size()
|
315
|
+
{
|
316
|
+
return entries.size();
|
317
|
+
}
|
318
|
+
|
319
|
+
private byte[] readNext()
|
320
|
+
{
|
321
|
+
try {
|
322
|
+
stream.read(castBuffer.array());
|
323
|
+
int n = castBuffer.getInt(0);
|
324
|
+
byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
|
325
|
+
stream.read(b);
|
326
|
+
|
327
|
+
current++;
|
328
|
+
|
329
|
+
return b;
|
330
|
+
}
|
331
|
+
catch (IOException ex) {
|
332
|
+
throw Throwables.propagate(ex);
|
333
|
+
}
|
334
|
+
}
|
335
|
+
|
336
|
+
private String readNextString()
|
337
|
+
{
|
338
|
+
return new String(readNext(), StandardCharsets.UTF_8);
|
339
|
+
}
|
340
|
+
}
|
341
|
+
}
|
data/src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java
CHANGED
@@ -93,18 +93,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
93
93
|
assertEquals(5, task.getMaxConnectionRetry());
|
94
94
|
}
|
95
95
|
|
96
|
-
public ConfigSource config()
|
97
|
-
{
|
98
|
-
return Exec.newConfigSource()
|
99
|
-
.set("account_name", AZURE_ACCOUNT_NAME)
|
100
|
-
.set("account_key", AZURE_ACCOUNT_KEY)
|
101
|
-
.set("container", AZURE_CONTAINER)
|
102
|
-
.set("path_prefix", AZURE_PATH_PREFIX)
|
103
|
-
.set("last_path", "")
|
104
|
-
.set("file_ext", ".csv")
|
105
|
-
.set("parser", parserConfig(schemaConfig()));
|
106
|
-
}
|
107
|
-
|
108
96
|
@Test(expected = ConfigException.class)
|
109
97
|
public void checkDefaultValuesAccountNameIsNull()
|
110
98
|
{
|
@@ -114,7 +102,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
114
102
|
.set("container", AZURE_CONTAINER)
|
115
103
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
116
104
|
.set("last_path", "")
|
117
|
-
.set("file_ext", ".csv")
|
118
105
|
.set("parser", parserConfig(schemaConfig()));
|
119
106
|
|
120
107
|
runner.transaction(config, new Control());
|
@@ -129,7 +116,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
129
116
|
.set("container", AZURE_CONTAINER)
|
130
117
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
131
118
|
.set("last_path", "")
|
132
|
-
.set("file_ext", ".csv")
|
133
119
|
.set("parser", parserConfig(schemaConfig()));
|
134
120
|
|
135
121
|
runner.transaction(config, new Control());
|
@@ -144,7 +130,6 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
144
130
|
.set("container", null)
|
145
131
|
.set("path_prefix", AZURE_PATH_PREFIX)
|
146
132
|
.set("last_path", "")
|
147
|
-
.set("file_ext", ".csv")
|
148
133
|
.set("parser", parserConfig(schemaConfig()));
|
149
134
|
|
150
135
|
runner.transaction(config, new Control());
|
@@ -166,7 +151,7 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
166
151
|
public void testResume()
|
167
152
|
{
|
168
153
|
PluginTask task = config.loadConfig(PluginTask.class);
|
169
|
-
task.setFiles(Arrays.asList("in/aa/a"));
|
154
|
+
task.setFiles(createFileList(Arrays.asList("in/aa/a"), task));
|
170
155
|
ConfigDiff configDiff = plugin.resume(task.dump(), 0, new FileInputPlugin.Control()
|
171
156
|
{
|
172
157
|
@Override
|
@@ -190,8 +175,8 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
190
175
|
throws NoSuchMethodException, IllegalAccessException, InvocationTargetException
|
191
176
|
{
|
192
177
|
List<String> expected = Arrays.asList(
|
193
|
-
|
194
|
-
|
178
|
+
AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_01.csv",
|
179
|
+
AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv"
|
195
180
|
);
|
196
181
|
|
197
182
|
PluginTask task = config.loadConfig(PluginTask.class);
|
@@ -210,8 +195,9 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
210
195
|
|
211
196
|
Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
|
212
197
|
listFiles.setAccessible(true);
|
213
|
-
|
214
|
-
assertEquals(expected, actual);
|
198
|
+
FileList actual = (FileList) listFiles.invoke(plugin, client, task);
|
199
|
+
assertEquals(expected.get(0), actual.get(0).get(0));
|
200
|
+
assertEquals(expected.get(1), actual.get(1).get(0));
|
215
201
|
assertEquals(AZURE_CONTAINER_IMPORT_DIRECTORY + "sample_02.csv", configDiff.get(String.class, "last_path"));
|
216
202
|
}
|
217
203
|
|
@@ -228,11 +214,26 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
228
214
|
|
229
215
|
Method listFiles = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("listFiles", CloudBlobClient.class, PluginTask.class);
|
230
216
|
listFiles.setAccessible(true);
|
231
|
-
task.setFiles((
|
217
|
+
task.setFiles((FileList) listFiles.invoke(plugin, client, task));
|
232
218
|
|
233
219
|
assertRecords(config, output);
|
234
220
|
}
|
235
221
|
|
222
|
+
@Test
|
223
|
+
public void testCreateNextToken() throws Exception
|
224
|
+
{
|
225
|
+
Method base64Encode = AzureBlobStorageFileInputPlugin.class.getDeclaredMethod("createNextToken", String.class);
|
226
|
+
base64Encode.setAccessible(true);
|
227
|
+
|
228
|
+
String expected = "2!92!MDAwMDI1IXJlYWRvbmx5L3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
|
229
|
+
String lastPath = "readonly/sample_01.tsv.gz";
|
230
|
+
assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
|
231
|
+
|
232
|
+
expected = "2!120!MDAwMDQ2IXBhdGgvdGhhdC9oYXZlL2xvbmcvcGF0aC9uYW1lL3NhbXBsZV8wMS50c3YuZ3ohMDAwMDI4ITk5OTktMTItMzFUMjM6NTk6NTkuOTk5OTk5OVoh";
|
233
|
+
lastPath = "path/that/have/long/path/name/sample_01.tsv.gz";
|
234
|
+
assertEquals(expected, base64Encode.invoke(plugin, lastPath).toString());
|
235
|
+
}
|
236
|
+
|
236
237
|
static List<TaskReport> emptyTaskReports(int taskCount)
|
237
238
|
{
|
238
239
|
ImmutableList.Builder<TaskReport> reports = new ImmutableList.Builder<>();
|
@@ -256,6 +257,17 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
256
257
|
}
|
257
258
|
}
|
258
259
|
|
260
|
+
public ConfigSource config()
|
261
|
+
{
|
262
|
+
return Exec.newConfigSource()
|
263
|
+
.set("account_name", AZURE_ACCOUNT_NAME)
|
264
|
+
.set("account_key", AZURE_ACCOUNT_KEY)
|
265
|
+
.set("container", AZURE_CONTAINER)
|
266
|
+
.set("path_prefix", AZURE_PATH_PREFIX)
|
267
|
+
.set("last_path", "")
|
268
|
+
.set("parser", parserConfig(schemaConfig()));
|
269
|
+
}
|
270
|
+
|
259
271
|
private ImmutableMap<String, Object> parserConfig(ImmutableList<Object> schemaConfig)
|
260
272
|
{
|
261
273
|
ImmutableMap.Builder<String, Object> builder = new ImmutableMap.Builder<>();
|
@@ -340,4 +352,13 @@ public class TestAzureBlobStorageFileInputPlugin
|
|
340
352
|
}
|
341
353
|
return dir;
|
342
354
|
}
|
355
|
+
|
356
|
+
private FileList createFileList(List<String> fileList, PluginTask task)
|
357
|
+
{
|
358
|
+
FileList.Builder builder = new FileList.Builder(task);
|
359
|
+
for (String file : fileList) {
|
360
|
+
builder.add(file, 0);
|
361
|
+
}
|
362
|
+
return builder.build();
|
363
|
+
}
|
343
364
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
package org.embulk.input.azure_blob_storage;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.junit.Before;
|
6
|
+
import org.junit.Rule;
|
7
|
+
import org.junit.Test;
|
8
|
+
|
9
|
+
import static org.junit.Assert.assertEquals;
|
10
|
+
|
11
|
+
public class TestFileList
|
12
|
+
{
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
private ConfigSource config;
|
17
|
+
|
18
|
+
@Before
|
19
|
+
public void createConfigSource()
|
20
|
+
{
|
21
|
+
config = runtime.getExec().newConfigSource();
|
22
|
+
}
|
23
|
+
|
24
|
+
@Test
|
25
|
+
public void checkMinTaskSize()
|
26
|
+
throws Exception
|
27
|
+
{
|
28
|
+
{ // not specify min_task_size
|
29
|
+
FileList fileList = newFileList(config.deepCopy(),
|
30
|
+
"sample_00", 100L,
|
31
|
+
"sample_01", 150L,
|
32
|
+
"sample_02", 350L);
|
33
|
+
|
34
|
+
assertEquals(3, fileList.getTaskCount());
|
35
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
36
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
37
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
38
|
+
}
|
39
|
+
|
40
|
+
{
|
41
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
|
42
|
+
"sample_00", 100L,
|
43
|
+
"sample_01", 150L,
|
44
|
+
"sample_02", 350L);
|
45
|
+
|
46
|
+
assertEquals(3, fileList.getTaskCount());
|
47
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
48
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
49
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
50
|
+
}
|
51
|
+
|
52
|
+
{
|
53
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
|
54
|
+
"sample_00", 100L,
|
55
|
+
"sample_01", 150L,
|
56
|
+
"sample_02", 350L);
|
57
|
+
|
58
|
+
assertEquals(2, fileList.getTaskCount());
|
59
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
60
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
61
|
+
assertEquals("sample_02", fileList.get(1).get(0));
|
62
|
+
}
|
63
|
+
|
64
|
+
{
|
65
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
|
66
|
+
"sample_00", 100L,
|
67
|
+
"sample_01", 150L,
|
68
|
+
"sample_02", 350L);
|
69
|
+
|
70
|
+
assertEquals(1, fileList.getTaskCount());
|
71
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
72
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
73
|
+
assertEquals("sample_02", fileList.get(0).get(2));
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
private static FileList newFileList(ConfigSource config, Object... nameAndSize)
|
78
|
+
{
|
79
|
+
FileList.Builder builder = new FileList.Builder(config);
|
80
|
+
|
81
|
+
for (int i = 0; i < nameAndSize.length; i += 2) {
|
82
|
+
builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
|
83
|
+
}
|
84
|
+
|
85
|
+
return builder.build();
|
86
|
+
}
|
87
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-azure_blob_storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -57,12 +57,14 @@ files:
|
|
57
57
|
- gradlew.bat
|
58
58
|
- lib/embulk/input/azure_blob_storage.rb
|
59
59
|
- src/main/java/org/embulk/input/azure_blob_storage/AzureBlobStorageFileInputPlugin.java
|
60
|
+
- src/main/java/org/embulk/input/azure_blob_storage/FileList.java
|
60
61
|
- src/test/java/org/embulk/input/azure_blob_storage/TestAzureBlobStorageFileInputPlugin.java
|
62
|
+
- src/test/java/org/embulk/input/azure_blob_storage/TestFileList.java
|
61
63
|
- src/test/resources/sample_01.csv
|
62
64
|
- src/test/resources/sample_02.csv
|
63
65
|
- classpath/azure-storage-4.0.0.jar
|
64
66
|
- classpath/commons-lang3-3.4.jar
|
65
|
-
- classpath/embulk-input-azure_blob_storage-0.1.
|
67
|
+
- classpath/embulk-input-azure_blob_storage-0.1.4.jar
|
66
68
|
- classpath/jackson-core-2.6.0.jar
|
67
69
|
homepage: https://github.com/sakama/embulk-input-azure_blob_storage
|
68
70
|
licenses:
|