embulk-input-s3 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee2494b43bcc58768baa64772d07f73c521b87f4
4
- data.tar.gz: e12a159f3ad9e6e3cf012bcdc2e779be3242580a
3
+ metadata.gz: 3fde7e23b02972cc6b615c1845ab820c3e960770
4
+ data.tar.gz: 7ab94c1a292321bbee5c4642be906c1404db35d2
5
5
  SHA512:
6
- metadata.gz: 8cf67d08d7f4b11b0c4c35e2b15e2537e9bf984f3a9a77adc0ecd30222c958d493b28817c273045d8bcbfd99bde327ed79760e8fd5118f1dc8c466b93b6f74de
7
- data.tar.gz: 36f4ee1d0bcddd46a5945865b9d2f2f41e997b562d15783b96f94b2389e3037acdc104ebe967108110550eeaed12ebf5a1a961c3826358f247c8b06f21fa1e0c
6
+ metadata.gz: ed3cfe60161e427fdffe19b0622feae7aa1e5138d2c3f10243125eb82349ee91efbda869ab21644fb7141125415c29172e36c77f20cf352128c85855222a8fc6
7
+ data.tar.gz: 7235b4b49d0f7df056098084c70d8dea60be33dbe0335b589eac8e0478a835514caa4a82ab1ee1b5a45370a2b0353985b2f6b66a01ca9ce123202417917fcbbf
@@ -38,6 +38,11 @@ public class FileList
38
38
  @Config("total_file_count_limit")
39
39
  @ConfigDefault("2147483647")
40
40
  int getTotalFileCountLimit();
41
+
42
+ // TODO support more algorithms to combine tasks
43
+ @Config("min_task_size")
44
+ @ConfigDefault("0")
45
+ long getMinTaskSize();
41
46
  }
42
47
 
43
48
  public static class Entry
@@ -69,6 +74,7 @@ public class FileList
69
74
  private String last = null;
70
75
 
71
76
  private int limitCount = Integer.MAX_VALUE;
77
+ private long minTaskSize = 1;
72
78
  private Pattern pathMatchPattern;
73
79
 
74
80
  private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
@@ -76,8 +82,9 @@ public class FileList
76
82
  public Builder(Task task)
77
83
  {
78
84
  this();
79
- this.limitCount = task.getTotalFileCountLimit();
80
85
  this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
86
+ this.limitCount = task.getTotalFileCountLimit();
87
+ this.minTaskSize = task.getMinTaskSize();
81
88
  }
82
89
 
83
90
  public Builder(ConfigSource config)
@@ -85,6 +92,7 @@ public class FileList
85
92
  this();
86
93
  this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
87
94
  this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
95
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
88
96
  }
89
97
 
90
98
  public Builder()
@@ -104,6 +112,12 @@ public class FileList
104
112
  return this;
105
113
  }
106
114
 
115
+ public Builder minTaskSize(long bytes)
116
+ {
117
+ this.minTaskSize = bytes;
118
+ return this;
119
+ }
120
+
107
121
  public Builder pathMatchPattern(String pattern)
108
122
  {
109
123
  this.pathMatchPattern = Pattern.compile(pattern);
@@ -163,10 +177,20 @@ public class FileList
163
177
 
164
178
  private List<List<Entry>> getSplits(List<Entry> all)
165
179
  {
166
- // TODO combine multiple entries into one task using some configuration parameters
167
180
  List<List<Entry>> tasks = new ArrayList<>();
181
+ long currentTaskSize = 0;
182
+ List<Entry> currentTask = new ArrayList<>();
168
183
  for (Entry entry : all) {
169
- tasks.add(ImmutableList.of(entry));
184
+ currentTask.add(entry);
185
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
186
+ if (currentTaskSize >= minTaskSize) {
187
+ tasks.add(currentTask);
188
+ currentTask = new ArrayList<>();
189
+ currentTaskSize = 0;
190
+ }
191
+ }
192
+ if (!currentTask.isEmpty()) {
193
+ tasks.add(currentTask);
170
194
  }
171
195
  return tasks;
172
196
  }
@@ -0,0 +1,87 @@
1
+ package org.embulk.input.s3;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.config.ConfigSource;
5
+ import org.junit.Before;
6
+ import org.junit.Rule;
7
+ import org.junit.Test;
8
+
9
+ import static org.junit.Assert.assertEquals;
10
+
11
+ public class TestFileList
12
+ {
13
+ @Rule
14
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
15
+
16
+ private ConfigSource config;
17
+
18
+ @Before
19
+ public void createConfigSource()
20
+ {
21
+ config = runtime.getExec().newConfigSource();
22
+ }
23
+
24
+ @Test
25
+ public void checkMinTaskSize()
26
+ throws Exception
27
+ {
28
+ { // not specify min_task_size
29
+ FileList fileList = newFileList(config.deepCopy(),
30
+ "sample_00", 100L,
31
+ "sample_01", 150L,
32
+ "sample_02", 350L);
33
+
34
+ assertEquals(3, fileList.getTaskCount());
35
+ assertEquals("sample_00", fileList.get(0).get(0));
36
+ assertEquals("sample_01", fileList.get(1).get(0));
37
+ assertEquals("sample_02", fileList.get(2).get(0));
38
+ }
39
+
40
+ {
41
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 100),
42
+ "sample_00", 100L,
43
+ "sample_01", 150L,
44
+ "sample_02", 350L);
45
+
46
+ assertEquals(3, fileList.getTaskCount());
47
+ assertEquals("sample_00", fileList.get(0).get(0));
48
+ assertEquals("sample_01", fileList.get(1).get(0));
49
+ assertEquals("sample_02", fileList.get(2).get(0));
50
+ }
51
+
52
+ {
53
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 200),
54
+ "sample_00", 100L,
55
+ "sample_01", 150L,
56
+ "sample_02", 350L);
57
+
58
+ assertEquals(2, fileList.getTaskCount());
59
+ assertEquals("sample_00", fileList.get(0).get(0));
60
+ assertEquals("sample_01", fileList.get(0).get(1));
61
+ assertEquals("sample_02", fileList.get(1).get(0));
62
+ }
63
+
64
+ {
65
+ FileList fileList = newFileList(config.deepCopy().set("min_task_size", 700),
66
+ "sample_00", 100L,
67
+ "sample_01", 150L,
68
+ "sample_02", 350L);
69
+
70
+ assertEquals(1, fileList.getTaskCount());
71
+ assertEquals("sample_00", fileList.get(0).get(0));
72
+ assertEquals("sample_01", fileList.get(0).get(1));
73
+ assertEquals("sample_02", fileList.get(0).get(2));
74
+ }
75
+ }
76
+
77
+ private static FileList newFileList(ConfigSource config, Object... nameAndSize)
78
+ {
79
+ FileList.Builder builder = new FileList.Builder(config);
80
+
81
+ for (int i = 0; i < nameAndSize.length; i += 2) {
82
+ builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
83
+ }
84
+
85
+ return builder.build();
86
+ }
87
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-s3
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-05 00:00:00.000000000 Z
11
+ date: 2015-12-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -53,6 +53,7 @@ files:
53
53
  - src/main/java/org/embulk/input/s3/FileList.java
54
54
  - src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
55
55
  - src/test/java/org/embulk/input/s3/TestAwsCredentials.java
56
+ - src/test/java/org/embulk/input/s3/TestFileList.java
56
57
  - src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
57
58
  - src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
58
59
  - src/test/resources/sample_01.csv
@@ -60,7 +61,7 @@ files:
60
61
  - classpath/aws-java-sdk-kms-1.10.33.jar
61
62
  - classpath/aws-java-sdk-s3-1.10.33.jar
62
63
  - classpath/commons-codec-1.6.jar
63
- - classpath/embulk-input-s3-0.2.6.jar
64
+ - classpath/embulk-input-s3-0.2.7.jar
64
65
  - classpath/httpclient-4.3.6.jar
65
66
  - classpath/httpcore-4.3.3.jar
66
67
  - classpath/jcl-over-slf4j-1.7.12.jar