embulk-input-s3 0.2.6 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee2494b43bcc58768baa64772d07f73c521b87f4
4
- data.tar.gz: e12a159f3ad9e6e3cf012bcdc2e779be3242580a
3
+ metadata.gz: 3fde7e23b02972cc6b615c1845ab820c3e960770
4
+ data.tar.gz: 7ab94c1a292321bbee5c4642be906c1404db35d2
5
5
  SHA512:
6
- metadata.gz: 8cf67d08d7f4b11b0c4c35e2b15e2537e9bf984f3a9a77adc0ecd30222c958d493b28817c273045d8bcbfd99bde327ed79760e8fd5118f1dc8c466b93b6f74de
7
- data.tar.gz: 36f4ee1d0bcddd46a5945865b9d2f2f41e997b562d15783b96f94b2389e3037acdc104ebe967108110550eeaed12ebf5a1a961c3826358f247c8b06f21fa1e0c
6
+ metadata.gz: ed3cfe60161e427fdffe19b0622feae7aa1e5138d2c3f10243125eb82349ee91efbda869ab21644fb7141125415c29172e36c77f20cf352128c85855222a8fc6
7
+ data.tar.gz: 7235b4b49d0f7df056098084c70d8dea60be33dbe0335b589eac8e0478a835514caa4a82ab1ee1b5a45370a2b0353985b2f6b66a01ca9ce123202417917fcbbf
@@ -38,6 +38,11 @@ public class FileList
38
38
  @Config("total_file_count_limit")
39
39
  @ConfigDefault("2147483647")
40
40
  int getTotalFileCountLimit();
41
+
42
+ // TODO support more algorithms to combine tasks
43
+ @Config("min_task_size")
44
+ @ConfigDefault("0")
45
+ long getMinTaskSize();
41
46
  }
42
47
 
43
48
  public static class Entry
@@ -69,6 +74,7 @@ public class FileList
69
74
  private String last = null;
70
75
 
71
76
  private int limitCount = Integer.MAX_VALUE;
77
+ private long minTaskSize = 1;
72
78
  private Pattern pathMatchPattern;
73
79
 
74
80
  private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
@@ -76,8 +82,9 @@ public class FileList
76
82
  public Builder(Task task)
77
83
  {
78
84
  this();
79
- this.limitCount = task.getTotalFileCountLimit();
80
85
  this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
86
+ this.limitCount = task.getTotalFileCountLimit();
87
+ this.minTaskSize = task.getMinTaskSize();
81
88
  }
82
89
 
83
90
  public Builder(ConfigSource config)
@@ -85,6 +92,7 @@ public class FileList
85
92
  this();
86
93
  this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
87
94
  this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
95
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
88
96
  }
89
97
 
90
98
  public Builder()
@@ -104,6 +112,12 @@ public class FileList
104
112
  return this;
105
113
  }
106
114
 
115
+ public Builder minTaskSize(long bytes)
116
+ {
117
+ this.minTaskSize = bytes;
118
+ return this;
119
+ }
120
+
107
121
  public Builder pathMatchPattern(String pattern)
108
122
  {
109
123
  this.pathMatchPattern = Pattern.compile(pattern);
@@ -163,10 +177,20 @@ public class FileList
163
177
 
164
178
  private List<List<Entry>> getSplits(List<Entry> all)
165
179
  {
166
- // TODO combine multiple entries into one task using some configuration parameters
167
180
  List<List<Entry>> tasks = new ArrayList<>();
181
+ long currentTaskSize = 0;
182
+ List<Entry> currentTask = new ArrayList<>();
168
183
  for (Entry entry : all) {
169
- tasks.add(ImmutableList.of(entry));
184
+ currentTask.add(entry);
185
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
186
+ if (currentTaskSize >= minTaskSize) {
187
+ tasks.add(currentTask);
188
+ currentTask = new ArrayList<>();
189
+ currentTaskSize = 0;
190
+ }
191
+ }
192
+ if (!currentTask.isEmpty()) {
193
+ tasks.add(currentTask);
170
194
  }
171
195
  return tasks;
172
196
  }
@@ -0,0 +1,87 @@
1
+ package org.embulk.input.s3;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.config.ConfigSource;
5
+ import org.junit.Before;
6
+ import org.junit.Rule;
7
+ import org.junit.Test;
8
+
9
+ import static org.junit.Assert.assertEquals;
10
+
11
+ public class TestFileList
12
+ {
13
+ @Rule
14
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
15
+
16
+ private ConfigSource config;
17
+
18
+ @Before
19
+ public void createConfigSource()
20
+ {
21
+ config = runtime.getExec().newConfigSource();
22
+ }
23
+
24
+ @Test
25
+ public void checkMinTaskSize()
26
+ throws Exception
27
+ {
28
+ { // not specify min_task_size
29
+ FileList fileList = newFileList(config.deepCopy(),
30
+ "sample_00", 100L,
31
+ "sample_01", 150L,
32
+ "sample_02", 350L);
33
+
34
+ assertEquals(3, fileList.getTaskCount());
35
+ assertEquals("sample_00", fileList.get(0).get(0));
36
+ assertEquals("sample_01", fileList.get(1).get(0));
37
+ assertEquals("sample_02", fileList.get(2).get(0));
38
+ }
39
+
40
+ {
41
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
42
+ "sample_00", 100L,
43
+ "sample_01", 150L,
44
+ "sample_02", 350L);
45
+
46
+ assertEquals(3, fileList.getTaskCount());
47
+ assertEquals("sample_00", fileList.get(0).get(0));
48
+ assertEquals("sample_01", fileList.get(1).get(0));
49
+ assertEquals("sample_02", fileList.get(2).get(0));
50
+ }
51
+
52
+ {
53
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
54
+ "sample_00", 100L,
55
+ "sample_01", 150L,
56
+ "sample_02", 350L);
57
+
58
+ assertEquals(2, fileList.getTaskCount());
59
+ assertEquals("sample_00", fileList.get(0).get(0));
60
+ assertEquals("sample_01", fileList.get(0).get(1));
61
+ assertEquals("sample_02", fileList.get(1).get(0));
62
+ }
63
+
64
+ {
65
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
66
+ "sample_00", 100L,
67
+ "sample_01", 150L,
68
+ "sample_02", 350L);
69
+
70
+ assertEquals(1, fileList.getTaskCount());
71
+ assertEquals("sample_00", fileList.get(0).get(0));
72
+ assertEquals("sample_01", fileList.get(0).get(1));
73
+ assertEquals("sample_02", fileList.get(0).get(2));
74
+ }
75
+ }
76
+
77
+ private static FileList newFileList(ConfigSource config, Object... nameAndSize)
78
+ {
79
+ FileList.Builder builder = new FileList.Builder(config);
80
+
81
+ for (int i = 0; i < nameAndSize.length; i += 2) {
82
+ builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
83
+ }
84
+
85
+ return builder.build();
86
+ }
87
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-s3
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-05 00:00:00.000000000 Z
11
+ date: 2015-12-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -53,6 +53,7 @@ files:
53
53
  - src/main/java/org/embulk/input/s3/FileList.java
54
54
  - src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
55
55
  - src/test/java/org/embulk/input/s3/TestAwsCredentials.java
56
+ - src/test/java/org/embulk/input/s3/TestFileList.java
56
57
  - src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
57
58
  - src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
58
59
  - src/test/resources/sample_01.csv
@@ -60,7 +61,7 @@ files:
60
61
  - classpath/aws-java-sdk-kms-1.10.33.jar
61
62
  - classpath/aws-java-sdk-s3-1.10.33.jar
62
63
  - classpath/commons-codec-1.6.jar
63
- - classpath/embulk-input-s3-0.2.6.jar
64
+ - classpath/embulk-input-s3-0.2.7.jar
64
65
  - classpath/httpclient-4.3.6.jar
65
66
  - classpath/httpcore-4.3.3.jar
66
67
  - classpath/jcl-over-slf4j-1.7.12.jar