embulk-input-s3 0.2.6 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3fde7e23b02972cc6b615c1845ab820c3e960770
|
4
|
+
data.tar.gz: 7ab94c1a292321bbee5c4642be906c1404db35d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed3cfe60161e427fdffe19b0622feae7aa1e5138d2c3f10243125eb82349ee91efbda869ab21644fb7141125415c29172e36c77f20cf352128c85855222a8fc6
|
7
|
+
data.tar.gz: 7235b4b49d0f7df056098084c70d8dea60be33dbe0335b589eac8e0478a835514caa4a82ab1ee1b5a45370a2b0353985b2f6b66a01ca9ce123202417917fcbbf
|
Binary file
|
@@ -38,6 +38,11 @@ public class FileList
|
|
38
38
|
@Config("total_file_count_limit")
|
39
39
|
@ConfigDefault("2147483647")
|
40
40
|
int getTotalFileCountLimit();
|
41
|
+
|
42
|
+
// TODO support more algorithms to combine tasks
|
43
|
+
@Config("min_task_size")
|
44
|
+
@ConfigDefault("0")
|
45
|
+
long getMinTaskSize();
|
41
46
|
}
|
42
47
|
|
43
48
|
public static class Entry
|
@@ -69,6 +74,7 @@ public class FileList
|
|
69
74
|
private String last = null;
|
70
75
|
|
71
76
|
private int limitCount = Integer.MAX_VALUE;
|
77
|
+
private long minTaskSize = 1;
|
72
78
|
private Pattern pathMatchPattern;
|
73
79
|
|
74
80
|
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
@@ -76,8 +82,9 @@ public class FileList
|
|
76
82
|
public Builder(Task task)
|
77
83
|
{
|
78
84
|
this();
|
79
|
-
this.limitCount = task.getTotalFileCountLimit();
|
80
85
|
this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
|
86
|
+
this.limitCount = task.getTotalFileCountLimit();
|
87
|
+
this.minTaskSize = task.getMinTaskSize();
|
81
88
|
}
|
82
89
|
|
83
90
|
public Builder(ConfigSource config)
|
@@ -85,6 +92,7 @@ public class FileList
|
|
85
92
|
this();
|
86
93
|
this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
|
87
94
|
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
95
|
+
this.minTaskSize = config.get(long.class, "min_task_size", 0L);
|
88
96
|
}
|
89
97
|
|
90
98
|
public Builder()
|
@@ -104,6 +112,12 @@ public class FileList
|
|
104
112
|
return this;
|
105
113
|
}
|
106
114
|
|
115
|
+
public Builder minTaskSize(long bytes)
|
116
|
+
{
|
117
|
+
this.minTaskSize = bytes;
|
118
|
+
return this;
|
119
|
+
}
|
120
|
+
|
107
121
|
public Builder pathMatchPattern(String pattern)
|
108
122
|
{
|
109
123
|
this.pathMatchPattern = Pattern.compile(pattern);
|
@@ -163,10 +177,20 @@ public class FileList
|
|
163
177
|
|
164
178
|
private List<List<Entry>> getSplits(List<Entry> all)
|
165
179
|
{
|
166
|
-
// TODO combine multiple entries into one task using some configuration parameters
|
167
180
|
List<List<Entry>> tasks = new ArrayList<>();
|
181
|
+
long currentTaskSize = 0;
|
182
|
+
List<Entry> currentTask = new ArrayList<>();
|
168
183
|
for (Entry entry : all) {
|
169
|
-
|
184
|
+
currentTask.add(entry);
|
185
|
+
currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
|
186
|
+
if (currentTaskSize >= minTaskSize) {
|
187
|
+
tasks.add(currentTask);
|
188
|
+
currentTask = new ArrayList<>();
|
189
|
+
currentTaskSize = 0;
|
190
|
+
}
|
191
|
+
}
|
192
|
+
if (!currentTask.isEmpty()) {
|
193
|
+
tasks.add(currentTask);
|
170
194
|
}
|
171
195
|
return tasks;
|
172
196
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.junit.Before;
|
6
|
+
import org.junit.Rule;
|
7
|
+
import org.junit.Test;
|
8
|
+
|
9
|
+
import static org.junit.Assert.assertEquals;
|
10
|
+
|
11
|
+
public class TestFileList
|
12
|
+
{
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
private ConfigSource config;
|
17
|
+
|
18
|
+
@Before
|
19
|
+
public void createConfigSource()
|
20
|
+
{
|
21
|
+
config = runtime.getExec().newConfigSource();
|
22
|
+
}
|
23
|
+
|
24
|
+
@Test
|
25
|
+
public void checkMinTaskSize()
|
26
|
+
throws Exception
|
27
|
+
{
|
28
|
+
{ // not specify min_task_size
|
29
|
+
FileList fileList = newFileList(config.deepCopy(),
|
30
|
+
"sample_00", 100L,
|
31
|
+
"sample_01", 150L,
|
32
|
+
"sample_02", 350L);
|
33
|
+
|
34
|
+
assertEquals(3, fileList.getTaskCount());
|
35
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
36
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
37
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
38
|
+
}
|
39
|
+
|
40
|
+
{
|
41
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
|
42
|
+
"sample_00", 100L,
|
43
|
+
"sample_01", 150L,
|
44
|
+
"sample_02", 350L);
|
45
|
+
|
46
|
+
assertEquals(3, fileList.getTaskCount());
|
47
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
48
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
49
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
50
|
+
}
|
51
|
+
|
52
|
+
{
|
53
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
|
54
|
+
"sample_00", 100L,
|
55
|
+
"sample_01", 150L,
|
56
|
+
"sample_02", 350L);
|
57
|
+
|
58
|
+
assertEquals(2, fileList.getTaskCount());
|
59
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
60
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
61
|
+
assertEquals("sample_02", fileList.get(1).get(0));
|
62
|
+
}
|
63
|
+
|
64
|
+
{
|
65
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
|
66
|
+
"sample_00", 100L,
|
67
|
+
"sample_01", 150L,
|
68
|
+
"sample_02", 350L);
|
69
|
+
|
70
|
+
assertEquals(1, fileList.getTaskCount());
|
71
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
72
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
73
|
+
assertEquals("sample_02", fileList.get(0).get(2));
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
private static FileList newFileList(ConfigSource config, Object... nameAndSize)
|
78
|
+
{
|
79
|
+
FileList.Builder builder = new FileList.Builder(config);
|
80
|
+
|
81
|
+
for (int i = 0; i < nameAndSize.length; i += 2) {
|
82
|
+
builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
|
83
|
+
}
|
84
|
+
|
85
|
+
return builder.build();
|
86
|
+
}
|
87
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-s3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-12-
|
11
|
+
date: 2015-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- src/main/java/org/embulk/input/s3/FileList.java
|
54
54
|
- src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
|
55
55
|
- src/test/java/org/embulk/input/s3/TestAwsCredentials.java
|
56
|
+
- src/test/java/org/embulk/input/s3/TestFileList.java
|
56
57
|
- src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
|
57
58
|
- src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
|
58
59
|
- src/test/resources/sample_01.csv
|
@@ -60,7 +61,7 @@ files:
|
|
60
61
|
- classpath/aws-java-sdk-kms-1.10.33.jar
|
61
62
|
- classpath/aws-java-sdk-s3-1.10.33.jar
|
62
63
|
- classpath/commons-codec-1.6.jar
|
63
|
-
- classpath/embulk-input-s3-0.2.6.jar
|
64
|
+
- classpath/embulk-input-s3-0.2.7.jar
|
64
65
|
- classpath/httpclient-4.3.6.jar
|
65
66
|
- classpath/httpcore-4.3.3.jar
|
66
67
|
- classpath/jcl-over-slf4j-1.7.12.jar
|