embulk-input-s3 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3fde7e23b02972cc6b615c1845ab820c3e960770
|
4
|
+
data.tar.gz: 7ab94c1a292321bbee5c4642be906c1404db35d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed3cfe60161e427fdffe19b0622feae7aa1e5138d2c3f10243125eb82349ee91efbda869ab21644fb7141125415c29172e36c77f20cf352128c85855222a8fc6
|
7
|
+
data.tar.gz: 7235b4b49d0f7df056098084c70d8dea60be33dbe0335b589eac8e0478a835514caa4a82ab1ee1b5a45370a2b0353985b2f6b66a01ca9ce123202417917fcbbf
|
Binary file
|
@@ -38,6 +38,11 @@ public class FileList
|
|
38
38
|
@Config("total_file_count_limit")
|
39
39
|
@ConfigDefault("2147483647")
|
40
40
|
int getTotalFileCountLimit();
|
41
|
+
|
42
|
+
// TODO support more algorithms to combine tasks
|
43
|
+
@Config("min_task_size")
|
44
|
+
@ConfigDefault("0")
|
45
|
+
long getMinTaskSize();
|
41
46
|
}
|
42
47
|
|
43
48
|
public static class Entry
|
@@ -69,6 +74,7 @@ public class FileList
|
|
69
74
|
private String last = null;
|
70
75
|
|
71
76
|
private int limitCount = Integer.MAX_VALUE;
|
77
|
+
private long minTaskSize = 1;
|
72
78
|
private Pattern pathMatchPattern;
|
73
79
|
|
74
80
|
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
@@ -76,8 +82,9 @@ public class FileList
|
|
76
82
|
public Builder(Task task)
|
77
83
|
{
|
78
84
|
this();
|
79
|
-
this.limitCount = task.getTotalFileCountLimit();
|
80
85
|
this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
|
86
|
+
this.limitCount = task.getTotalFileCountLimit();
|
87
|
+
this.minTaskSize = task.getMinTaskSize();
|
81
88
|
}
|
82
89
|
|
83
90
|
public Builder(ConfigSource config)
|
@@ -85,6 +92,7 @@ public class FileList
|
|
85
92
|
this();
|
86
93
|
this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
|
87
94
|
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
95
|
+
this.minTaskSize = config.get(long.class, "min_task_size", 0L);
|
88
96
|
}
|
89
97
|
|
90
98
|
public Builder()
|
@@ -104,6 +112,12 @@ public class FileList
|
|
104
112
|
return this;
|
105
113
|
}
|
106
114
|
|
115
|
+
public Builder minTaskSize(long bytes)
|
116
|
+
{
|
117
|
+
this.minTaskSize = bytes;
|
118
|
+
return this;
|
119
|
+
}
|
120
|
+
|
107
121
|
public Builder pathMatchPattern(String pattern)
|
108
122
|
{
|
109
123
|
this.pathMatchPattern = Pattern.compile(pattern);
|
@@ -163,10 +177,20 @@ public class FileList
|
|
163
177
|
|
164
178
|
private List<List<Entry>> getSplits(List<Entry> all)
|
165
179
|
{
|
166
|
-
// TODO combine multiple entries into one task using some configuration parameters
|
167
180
|
List<List<Entry>> tasks = new ArrayList<>();
|
181
|
+
long currentTaskSize = 0;
|
182
|
+
List<Entry> currentTask = new ArrayList<>();
|
168
183
|
for (Entry entry : all) {
|
169
|
-
|
184
|
+
currentTask.add(entry);
|
185
|
+
currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
|
186
|
+
if (currentTaskSize >= minTaskSize) {
|
187
|
+
tasks.add(currentTask);
|
188
|
+
currentTask = new ArrayList<>();
|
189
|
+
currentTaskSize = 0;
|
190
|
+
}
|
191
|
+
}
|
192
|
+
if (!currentTask.isEmpty()) {
|
193
|
+
tasks.add(currentTask);
|
170
194
|
}
|
171
195
|
return tasks;
|
172
196
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.junit.Before;
|
6
|
+
import org.junit.Rule;
|
7
|
+
import org.junit.Test;
|
8
|
+
|
9
|
+
import static org.junit.Assert.assertEquals;
|
10
|
+
|
11
|
+
public class TestFileList
|
12
|
+
{
|
13
|
+
@Rule
|
14
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
15
|
+
|
16
|
+
private ConfigSource config;
|
17
|
+
|
18
|
+
@Before
|
19
|
+
public void createConfigSource()
|
20
|
+
{
|
21
|
+
config = runtime.getExec().newConfigSource();
|
22
|
+
}
|
23
|
+
|
24
|
+
@Test
|
25
|
+
public void checkMinTaskSize()
|
26
|
+
throws Exception
|
27
|
+
{
|
28
|
+
{ // not specify min_task_size
|
29
|
+
FileList fileList = newFileList(config.deepCopy(),
|
30
|
+
"sample_00", 100L,
|
31
|
+
"sample_01", 150L,
|
32
|
+
"sample_02", 350L);
|
33
|
+
|
34
|
+
assertEquals(3, fileList.getTaskCount());
|
35
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
36
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
37
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
38
|
+
}
|
39
|
+
|
40
|
+
{
|
41
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
|
42
|
+
"sample_00", 100L,
|
43
|
+
"sample_01", 150L,
|
44
|
+
"sample_02", 350L);
|
45
|
+
|
46
|
+
assertEquals(3, fileList.getTaskCount());
|
47
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
48
|
+
assertEquals("sample_01", fileList.get(1).get(0));
|
49
|
+
assertEquals("sample_02", fileList.get(2).get(0));
|
50
|
+
}
|
51
|
+
|
52
|
+
{
|
53
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
|
54
|
+
"sample_00", 100L,
|
55
|
+
"sample_01", 150L,
|
56
|
+
"sample_02", 350L);
|
57
|
+
|
58
|
+
assertEquals(2, fileList.getTaskCount());
|
59
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
60
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
61
|
+
assertEquals("sample_02", fileList.get(1).get(0));
|
62
|
+
}
|
63
|
+
|
64
|
+
{
|
65
|
+
FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
|
66
|
+
"sample_00", 100L,
|
67
|
+
"sample_01", 150L,
|
68
|
+
"sample_02", 350L);
|
69
|
+
|
70
|
+
assertEquals(1, fileList.getTaskCount());
|
71
|
+
assertEquals("sample_00", fileList.get(0).get(0));
|
72
|
+
assertEquals("sample_01", fileList.get(0).get(1));
|
73
|
+
assertEquals("sample_02", fileList.get(0).get(2));
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
private static FileList newFileList(ConfigSource config, Object... nameAndSize)
|
78
|
+
{
|
79
|
+
FileList.Builder builder = new FileList.Builder(config);
|
80
|
+
|
81
|
+
for (int i = 0; i < nameAndSize.length; i += 2) {
|
82
|
+
builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
|
83
|
+
}
|
84
|
+
|
85
|
+
return builder.build();
|
86
|
+
}
|
87
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-s3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-12-
|
11
|
+
date: 2015-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- src/main/java/org/embulk/input/s3/FileList.java
|
54
54
|
- src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
|
55
55
|
- src/test/java/org/embulk/input/s3/TestAwsCredentials.java
|
56
|
+
- src/test/java/org/embulk/input/s3/TestFileList.java
|
56
57
|
- src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
|
57
58
|
- src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
|
58
59
|
- src/test/resources/sample_01.csv
|
@@ -60,7 +61,7 @@ files:
|
|
60
61
|
- classpath/aws-java-sdk-kms-1.10.33.jar
|
61
62
|
- classpath/aws-java-sdk-s3-1.10.33.jar
|
62
63
|
- classpath/commons-codec-1.6.jar
|
63
|
-
- classpath/embulk-input-s3-0.2.6.jar
|
64
|
+
- classpath/embulk-input-s3-0.2.7.jar
|
64
65
|
- classpath/httpclient-4.3.6.jar
|
65
66
|
- classpath/httpcore-4.3.3.jar
|
66
67
|
- classpath/jcl-over-slf4j-1.7.12.jar
|