embulk-input-sftp 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 72aaee2da1604f1a298e13d37ab74be174cddb5f
4
- data.tar.gz: 3c062e178443efd8b44874eb9cb2d446385d010b
3
+ metadata.gz: 66e8dab8cd68fdd0a9844dac72d51eaf070cf93a
4
+ data.tar.gz: 9de2db631fa24c526e448a2985003063f05a2ae7
5
5
  SHA512:
6
- metadata.gz: d21a1f92aae66016ea648dc94e2f757e7bc3d968bda4953f35623307f649a49bde0d8b0bbbc84b79cedfb25c35e069f7b6b30134a3959d9211baad7581cb3c59
7
- data.tar.gz: 67c3b0a1c2b054093f61aa5ca0946c9fed2f74a0f223e0ee60c44737fa33cbbd5fd90469f3b5baafc62eb45d74c19e504cd87c498a3ebdf8d3bb1dc7a8e5967f
6
+ metadata.gz: 5079bf38ede9c5f4b611831144cdcc307c3b71acc6f2b2e43f3570d67abf21f4ca762dddfc0273b3be6cb7740072beaad0865fd5e4a98360d8206c1c88cfe3b0
7
+ data.tar.gz: 538c81c868addc2d141770ab5c88d3d38aa0e9e297214a2a42255e80f4f7de91da7ac646eae6aababe301b611ace77df261e7a43931e98c4d3e76bb0100c9e02
data/CHANGELOG.md ADDED
@@ -0,0 +1,6 @@
1
+ ## 0.1.1 - 2015-03-18
2
+
3
+ * [feature] Support last_path option [#2](https://github.com/sakama/embulk-input-sftp/pull/2) [#4](https://github.com/sakama/embulk-input-sftp/pull/4) [#7](https://github.com/sakama/embulk-input-sftp/pull/7)
4
+ * [feature] Support path_match_pattern option [#6](https://github.com/sakama/embulk-input-sftp/pull/6)
5
+ * [maintenance] Add unit test [#3](https://github.com/sakama/embulk-input-sftp/pull/3)
6
+ * [maintenance] Skip retry of file downloading when permission denied error happens [#1](https://github.com/sakama/embulk-input-sftp/pull/1)
data/README.md CHANGED
@@ -19,8 +19,11 @@ Reads files stored on remote server using SFTP
19
19
  - **user_directory_is_root**: (boolean, default: `true`)
20
20
  - **timeout**: sftp connection timeout seconds (integer, default: `600`)
21
21
  - **path_prefix**: Prefix of output paths (string, required)
22
+ - **path_match_pattern**: regexp to match file paths. If a file path doesn't match this pattern, the file will be skipped (regexp string, optional)
23
+ - **total_file_count_limit**: maximum number of files to read (integer, optional)
22
24
  - **file_ext**: Extension of output files (string, required)
23
25
  - **sequence_format**: Format for sequence part of output files (string, default: `".%03d.%02d"`)
26
+ - **min_task_size (experimental)**: minimum size of a task in bytes. If this is larger than 0, one task may include multiple input files. This is useful when too many tasks would badly impact the performance of output or executor plugins. (integer, optional)
24
27
 
25
28
  ### Proxy configuration
26
29
 
@@ -50,6 +53,21 @@ in:
50
53
  path_prefix: /data/sftp
51
54
  ```
52
55
 
56
+ To filter files using regexp:
57
+
58
+ ```yaml
59
+ in:
60
+ type: sftp
61
+ path_prefix: logs/csv-
62
+ ...
63
+ path_match_pattern: \.csv$ # a file will be skipped if its path doesn't match this pattern
64
+
65
+ ## some examples of regexp:
66
+ #path_match_pattern: /archive/ # match files in .../archive/... directory
67
+ #path_match_pattern: /data1/|/data2/ # match files in .../data1/... or .../data2/... directory
68
+ #path_match_pattern: \.csv$|\.csv\.gz$ # match files whose suffix is .csv or .csv.gz
69
+ ```
70
+
53
71
  With proxy
54
72
  ```yaml
55
73
  in:
data/build.gradle CHANGED
@@ -3,6 +3,7 @@ plugins {
3
3
  id "com.github.jruby-gradle.base" version "0.1.5"
4
4
  id "java"
5
5
  id "checkstyle"
6
+ id "jacoco"
6
7
  }
7
8
  import com.github.jrubygradle.JRubyExec
8
9
  repositories {
@@ -13,7 +14,7 @@ configurations {
13
14
  provided
14
15
  }
15
16
 
16
- version = "0.1.0"
17
+ version = "0.1.1"
17
18
 
18
19
  sourceCompatibility = 1.7
19
20
  targetCompatibility = 1.7
@@ -0,0 +1,341 @@
1
+ package org.embulk.input.sftp;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import com.google.common.base.Throwables;
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.ConfigSource;
11
+ import org.embulk.spi.Exec;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.io.BufferedInputStream;
15
+ import java.io.BufferedOutputStream;
16
+ import java.io.ByteArrayInputStream;
17
+ import java.io.ByteArrayOutputStream;
18
+
19
+ import java.io.IOException;
20
+ import java.io.InputStream;
21
+ import java.io.OutputStream;
22
+ import java.nio.ByteBuffer;
23
+ import java.nio.charset.StandardCharsets;
24
+ import java.util.AbstractList;
25
+ import java.util.ArrayList;
26
+ import java.util.List;
27
+ import java.util.regex.Pattern;
28
+ import java.util.zip.GZIPInputStream;
29
+ import java.util.zip.GZIPOutputStream;
30
+
31
+ // this class should be moved to embulk-core
32
+ public class FileList
33
+ {
34
+ public interface Task
35
+ {
36
+ @Config("path_match_pattern")
37
+ @ConfigDefault("\".*\"")
38
+ String getPathMatchPattern();
39
+
40
+ @Config("total_file_count_limit")
41
+ @ConfigDefault("2147483647")
42
+ int getTotalFileCountLimit();
43
+
44
+ // TODO support more algorithms to combine tasks
45
+ @Config("min_task_size")
46
+ @ConfigDefault("0")
47
+ long getMinTaskSize();
48
+ }
49
+
50
+ public static class Entry
51
+ {
52
+ private int index;
53
+ private long size;
54
+
55
+ @JsonCreator
56
+ public Entry(
57
+ @JsonProperty("index") int index,
58
+ @JsonProperty("size") long size)
59
+ {
60
+ this.index = index;
61
+ this.size = size;
62
+ }
63
+
64
+ @JsonProperty("index")
65
+ public int getIndex()
66
+ {
67
+ return index;
68
+ }
69
+
70
+ @JsonProperty("size")
71
+ public long getSize()
72
+ {
73
+ return size;
74
+ }
75
+ }
76
+
77
+ public static class Builder
78
+ {
79
+ private final Logger log = Exec.getLogger(FileList.class);
80
+ private final ByteArrayOutputStream binary;
81
+ private final OutputStream stream;
82
+ private final List<Entry> entries = new ArrayList<>();
83
+ private String last = null;
84
+
85
+ private int limitCount = Integer.MAX_VALUE;
86
+ private long minTaskSize = 1;
87
+ private Pattern pathMatchPattern;
88
+
89
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
90
+
91
+ public Builder(Task task)
92
+ {
93
+ this();
94
+ this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
95
+ this.limitCount = task.getTotalFileCountLimit();
96
+ this.minTaskSize = task.getMinTaskSize();
97
+ }
98
+
99
+ public Builder(ConfigSource config)
100
+ {
101
+ this();
102
+ this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
103
+ this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
104
+ this.minTaskSize = config.get(long.class, "min_task_size", 0L);
105
+ }
106
+
107
+ public Builder()
108
+ {
109
+ binary = new ByteArrayOutputStream();
110
+ try {
111
+ stream = new BufferedOutputStream(new GZIPOutputStream(binary));
112
+ }
113
+ catch (IOException ex) {
114
+ throw Throwables.propagate(ex);
115
+ }
116
+ }
117
+
118
+ public Builder limitTotalFileCount(int limitCount)
119
+ {
120
+ this.limitCount = limitCount;
121
+ return this;
122
+ }
123
+
124
+ public Builder minTaskSize(long bytes)
125
+ {
126
+ this.minTaskSize = bytes;
127
+ return this;
128
+ }
129
+
130
+ public Builder pathMatchPattern(String pattern)
131
+ {
132
+ this.pathMatchPattern = Pattern.compile(pattern);
133
+ return this;
134
+ }
135
+
136
+ public int size()
137
+ {
138
+ return entries.size();
139
+ }
140
+
141
+ public boolean needsMore()
142
+ {
143
+ return size() < limitCount;
144
+ }
145
+
146
+ // returns true if this file is used
147
+ public synchronized boolean add(String path, long size)
148
+ {
149
+ // TODO throw IllegalStateException if stream is already closed
150
+
151
+ if (!needsMore()) {
152
+ return false;
153
+ }
154
+
155
+ if (!pathMatchPattern.matcher(path).find()) {
156
+ return false;
157
+ }
158
+
159
+ int index = entries.size();
160
+ entries.add(new Entry(index, size));
161
+ log.info("add file to the request list: {}", path);
162
+
163
+ byte[] data = path.getBytes(StandardCharsets.UTF_8);
164
+ castBuffer.putInt(0, data.length);
165
+ try {
166
+ stream.write(castBuffer.array());
167
+ stream.write(data);
168
+ }
169
+ catch (IOException ex) {
170
+ throw Throwables.propagate(ex);
171
+ }
172
+
173
+ last = path;
174
+ return true;
175
+ }
176
+
177
+ public FileList build()
178
+ {
179
+ try {
180
+ stream.close();
181
+ }
182
+ catch (IOException ex) {
183
+ throw Throwables.propagate(ex);
184
+ }
185
+ return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
186
+ }
187
+
188
+ private List<List<Entry>> getSplits(List<Entry> all)
189
+ {
190
+ List<List<Entry>> tasks = new ArrayList<>();
191
+ long currentTaskSize = 0;
192
+ List<Entry> currentTask = new ArrayList<>();
193
+ for (Entry entry : all) {
194
+ currentTask.add(entry);
195
+ currentTaskSize += entry.getSize(); // TODO consider to multiply the size by cost_per_byte, and add cost_per_file
196
+ if (currentTaskSize >= minTaskSize) {
197
+ tasks.add(currentTask);
198
+ currentTask = new ArrayList<>();
199
+ currentTaskSize = 0;
200
+ }
201
+ }
202
+ if (!currentTask.isEmpty()) {
203
+ tasks.add(currentTask);
204
+ }
205
+ return tasks;
206
+ }
207
+ }
208
+
209
+ private final byte[] data;
210
+ private final List<List<Entry>> tasks;
211
+ private final Optional<String> last;
212
+
213
+ @JsonCreator
214
+ @Deprecated
215
+ public FileList(
216
+ @JsonProperty("data") byte[] data,
217
+ @JsonProperty("tasks") List<List<Entry>> tasks,
218
+ @JsonProperty("last") Optional<String> last)
219
+ {
220
+ this.data = data;
221
+ this.tasks = tasks;
222
+ this.last = last;
223
+ }
224
+
225
+ @JsonIgnore
226
+ public Optional<String> getLastPath(Optional<String> lastLastPath)
227
+ {
228
+ if (last.isPresent()) {
229
+ return last;
230
+ }
231
+ return lastLastPath;
232
+ }
233
+
234
+ @JsonIgnore
235
+ public int getTaskCount()
236
+ {
237
+ return tasks.size();
238
+ }
239
+
240
+ @JsonIgnore
241
+ public List<String> get(int i)
242
+ {
243
+ return new EntryList(data, tasks.get(i));
244
+ }
245
+
246
+ @JsonProperty("data")
247
+ @Deprecated
248
+ public byte[] getData()
249
+ {
250
+ return data;
251
+ }
252
+
253
+ @JsonProperty("tasks")
254
+ @Deprecated
255
+ public List<List<Entry>> getTasks()
256
+ {
257
+ return tasks;
258
+ }
259
+
260
+ @JsonProperty("last")
261
+ @Deprecated
262
+ public Optional<String> getLast()
263
+ {
264
+ return last;
265
+ }
266
+
267
+ private class EntryList
268
+ extends AbstractList<String>
269
+ {
270
+ private final byte[] data;
271
+ private final List<Entry> entries;
272
+ private InputStream stream;
273
+ private int current;
274
+
275
+ private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
276
+
277
+ public EntryList(byte[] data, List<Entry> entries)
278
+ {
279
+ this.data = data;
280
+ this.entries = entries;
281
+ try {
282
+ this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
283
+ }
284
+ catch (IOException ex) {
285
+ throw Throwables.propagate(ex);
286
+ }
287
+ this.current = 0;
288
+ }
289
+
290
+ @Override
291
+ public synchronized String get(int i)
292
+ {
293
+ Entry e = entries.get(i);
294
+ if (e.getIndex() < current) {
295
+ // rewind to the head
296
+ try {
297
+ stream.close();
298
+ stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
299
+ }
300
+ catch (IOException ex) {
301
+ throw Throwables.propagate(ex);
302
+ }
303
+ current = 0;
304
+ }
305
+
306
+ while (current < e.getIndex()) {
307
+ readNext();
308
+ }
309
+ // now current == e.getIndex()
310
+ return readNextString();
311
+ }
312
+
313
+ @Override
314
+ public int size()
315
+ {
316
+ return entries.size();
317
+ }
318
+
319
+ private byte[] readNext()
320
+ {
321
+ try {
322
+ stream.read(castBuffer.array());
323
+ int n = castBuffer.getInt(0);
324
+ byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
325
+ stream.read(b);
326
+
327
+ current++;
328
+
329
+ return b;
330
+ }
331
+ catch (IOException ex) {
332
+ throw Throwables.propagate(ex);
333
+ }
334
+ }
335
+
336
+ private String readNextString()
337
+ {
338
+ return new String(readNext(), StandardCharsets.UTF_8);
339
+ }
340
+ }
341
+ }
@@ -8,10 +8,8 @@ import org.embulk.config.Task;
8
8
  import org.embulk.spi.BufferAllocator;
9
9
  import org.embulk.spi.unit.LocalFile;
10
10
 
11
- import java.util.List;
12
-
13
11
  public interface PluginTask
14
- extends Task
12
+ extends Task, FileList.Task
15
13
  {
16
14
  @Config("host")
17
15
  String getHost();
@@ -59,8 +57,8 @@ public interface PluginTask
59
57
  @ConfigDefault("null")
60
58
  Optional<ProxyTask> getProxy();
61
59
 
62
- List<String> getFiles();
63
- void setFiles(List<String> files);
60
+ FileList getFiles();
61
+ void setFiles(FileList files);
64
62
 
65
63
  @ConfigInject
66
64
  BufferAllocator getBufferAllocator();
@@ -1,8 +1,8 @@
1
1
  package org.embulk.input.sftp;
2
2
 
3
3
  import com.google.common.base.Function;
4
+ import com.google.common.base.Optional;
4
5
  import com.google.common.base.Throwables;
5
- import com.google.common.collect.ImmutableList;
6
6
  import org.apache.commons.io.FilenameUtils;
7
7
  import org.apache.commons.vfs2.FileObject;
8
8
  import org.apache.commons.vfs2.FileSystemException;
@@ -21,17 +21,17 @@ import org.slf4j.Logger;
21
21
  import java.io.File;
22
22
  import java.net.URI;
23
23
  import java.net.URISyntaxException;
24
- import java.util.List;
25
24
 
26
25
  public class SftpFileInput
27
26
  extends InputStreamFileInput
28
27
  implements TransactionalFileInput
29
28
  {
30
29
  private static final Logger log = Exec.getLogger(SftpFileInput.class);
30
+ private static boolean isMatchLastKey = false;
31
31
 
32
32
  public SftpFileInput(PluginTask task, int taskIndex)
33
33
  {
34
- super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex, initializeStandardFileSystemManager()));
34
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex, initializeStandardFileSystemManager(), initializeFsOptions(task)));
35
35
  }
36
36
 
37
37
  public void abort()
@@ -124,33 +124,54 @@ public class SftpFileInput
124
124
  return fsOptions;
125
125
  }
126
126
 
127
- public static String getSftpFileUri(PluginTask task)
127
+ public static String getSftpFileUri(PluginTask task, String path)
128
128
  {
129
129
  try {
130
- return new URI("sftp", initializeUserInfo(task), task.getHost(), task.getPort(), task.getPathPrefix(), null, null).toString();
130
+ return new URI("sftp", initializeUserInfo(task), task.getHost(), task.getPort(), path, null, null).toString();
131
131
  }
132
132
  catch (URISyntaxException ex) {
133
133
  throw new ConfigException(ex);
134
134
  }
135
135
  }
136
136
 
137
- public static List<String> listFilesByPrefix(PluginTask task)
137
+ public static String getRelativePath(Optional<String> path)
138
138
  {
139
- ImmutableList.Builder<String> builder = ImmutableList.builder();
139
+ try {
140
+ if (path.isPresent()) {
141
+ return new URI(path.get()).getPath();
142
+ }
143
+ else {
144
+ return null;
145
+ }
146
+ }
147
+ catch (URISyntaxException ex) {
148
+ return null;
149
+ }
150
+ }
151
+
152
+ public static FileList listFilesByPrefix(PluginTask task)
153
+ {
154
+ FileList.Builder builder = new FileList.Builder(task);
140
155
  int maxConnectionRetry = task.getMaxConnectionRetry();
156
+ String lastKey = null;
141
157
 
142
158
  StandardFileSystemManager manager = null;
143
159
  int count = 0;
144
160
  while (true) {
145
161
  try {
146
162
  manager = initializeStandardFileSystemManager();
147
- FileObject files = manager.resolveFile(getSftpFileUri(task), initializeFsOptions(task));
148
- String basename = FilenameUtils.getBaseName(task.getPathPrefix());
163
+ FileSystemOptions fsOptions = initializeFsOptions(task);
149
164
 
165
+ if (task.getLastPath().isPresent() && !task.getLastPath().get().isEmpty()) {
166
+ lastKey = manager.resolveFile(getSftpFileUri(task, task.getLastPath().get()), fsOptions).toString();
167
+ }
168
+
169
+ FileObject files = manager.resolveFile(getSftpFileUri(task, task.getPathPrefix()), fsOptions);
170
+ String basename = FilenameUtils.getBaseName(task.getPathPrefix());
150
171
  if (files.isFolder()) {
151
172
  for (FileObject f : files.getChildren()) {
152
173
  if (f.isFile()) {
153
- addFileToList(builder, f.toString(), "");
174
+ addFileToList(builder, f.toString(), f.getContent().getSize(), "", lastKey);
154
175
  }
155
176
  }
156
177
  }
@@ -158,7 +179,7 @@ public class SftpFileInput
158
179
  FileObject parent = files.getParent();
159
180
  for (FileObject f : parent.getChildren()) {
160
181
  if (f.isFile()) {
161
- addFileToList(builder, f.toString(), basename);
182
+ addFileToList(builder, f.toString(), f.getContent().getSize(), basename, lastKey);
162
183
  }
163
184
  }
164
185
  }
@@ -189,18 +210,32 @@ public class SftpFileInput
189
210
  }
190
211
  }
191
212
 
192
- private static void addFileToList(ImmutableList.Builder<String> builder, String fileName, String basename)
213
+ private static void addFileToList(FileList.Builder builder, String fileName, long fileSize, String basename, String lastKey)
193
214
  {
194
215
  if (!basename.isEmpty()) {
195
216
  String remoteBasename = FilenameUtils.getBaseName(fileName);
196
217
  if (remoteBasename.startsWith(basename)) {
197
- builder.add(fileName);
198
- log.info("add file to the request list: {}", fileName);
218
+ if (lastKey != null && !isMatchLastKey) {
219
+ if (!fileName.equals(lastKey)) {
220
+ return;
221
+ }
222
+ else {
223
+ isMatchLastKey = true;
224
+ }
225
+ }
226
+ builder.add(fileName, fileSize);
199
227
  }
200
228
  }
201
229
  else {
202
- builder.add(fileName);
203
- log.info("add file to the request list: {}", fileName);
230
+ if (lastKey != null && !isMatchLastKey) {
231
+ if (!fileName.equals(lastKey)) {
232
+ return;
233
+ }
234
+ else {
235
+ isMatchLastKey = true;
236
+ }
237
+ }
238
+ builder.add(fileName, fileSize);
204
239
  }
205
240
  }
206
241