embulk 0.5.1 → 0.5.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 0abe724d59ea2a21922310c9b7a575b612369147
-   data.tar.gz: 753c67c46772c9fd44a7ed46d1deb185a386edba
+   metadata.gz: c731d85a93052e3fa0e40a49c048f365e150eace
+   data.tar.gz: 5266d8207396e64c995d8fd6e2b9e60cc228041c
  SHA512:
-   metadata.gz: e2632ab7e8c77ffc4d31a4c63c58b2e88cd617dcb2c941e545aa6c51ff75e0909f3bf05faf679b95f176e63a7057c42e07992fcafef8fd5ffc135b5f21008eb3
-   data.tar.gz: 50619880fc999eaa53cf637f5a8fca1f96740b4966280aa83408d55a6b997ab78fc74cfcb213542116f8804bcf20d87b83590ef4dd0e44fdf9309fc9089a8cce
+   metadata.gz: 658da0f21342555a4d267bfa5bd7ee048dddd875fdc45dff82d22fb7f00e19d1f735afed15549adf94f135ccae3ed327636a86b37a5b6a252de0821f4049a5a9
+   data.tar.gz: 7888c38856e8e3816eb819bc1b12f33852634882c7e40b68496ad5700cf7c1a202174e097cfeb1589f6847752e768d40fdfc9ef5d5465b698708882488c9aeeb
data/README.md CHANGED
@@ -25,23 +25,31 @@ The single-file package is the simplest way to try Embulk. You can download the
 
  ### Linux & Mac & BSD
 
+ Embulk is a Java application. Please make sure that you have installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
+
  Following 4 commands install embulk to your home directory:
 
  ```
- curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar
+ curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar
  chmod +x ~/.embulk/bin/embulk
  echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
  source ~/.bashrc
  ```
 
+ Next step: [Trying examples](#trying-examples)
+
  ### Windows
 
+ Embulk is a Java application. Please make sure that you have installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
+
  You can assume the jar file is a .bat file.
 
  ```
- PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar -OutFile embulk.bat}"
+ PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -OutFile embulk.bat}"
  ```
 
+ Next step: [Trying examples](#trying-examples)
+
  ### Trying examples
 
  Let's load a CSV file, for example. The `embulk example` subcommand generates a CSV file and a config file for you.
@@ -53,6 +61,8 @@ embulk preview config.yml
  embulk run config.yml
  ```
 
+ Next step: [Using plugins](#using-plugins)
+
  ### Using plugins
 
  You can use plugins to load data from/to various systems and file formats.
@@ -63,14 +73,14 @@ embulk gem install embulk-output-postgres-json
  embulk gem list
  ```
 
- You can search plugins on RubyGems: [search for "embulk"](https://rubygems.org/search?utf8=%E2%9C%93&query=embulk).
+ You can find plugins at the [list of plugins by category](http://www.embulk.org/plugins/).
 
  ### Using plugin bundle
 
- `embulk bundle` subcommand creates (or updates if already exists) a *plugin bundle* directory.
+ The `embulk bundle` subcommand creates (or updates, if it already exists) a private (isolated) bundle of plugins.
  You can use the bundle with the `-b <bundle_dir>` option. `embulk bundle` also generates some example plugins in the \<bundle_dir>/embulk/\*.rb directory.
 
- See generated \<bundle_dir>/Gemfile file how to plugin bundles work.
+ See the generated \<bundle_dir>/Gemfile file to learn how plugin bundles work.
 
  ```
  embulk bundle ./embulk_bundle
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
 
  allprojects {
      group = 'org.embulk'
-     version = '0.5.1'
+     version = '0.5.2'
 
      apply plugin: 'java'
      apply plugin: 'maven-publish'
@@ -35,7 +35,7 @@ public class PooledBufferAllocator
          extends Buffer
  {
      private ByteBuf buf;
-     private Exception doubleFreeCheck;
+     private BufferReleasedBeforeAt doubleFreeCheck;
 
      public NettyByteBufBuffer(ByteBuf buf)
      {
@@ -46,13 +46,26 @@ public class PooledBufferAllocator
      public void release()
      {
          if (doubleFreeCheck != null) {
-             doubleFreeCheck.printStackTrace();
+             new BufferDoubleReleasedException(doubleFreeCheck).printStackTrace();
          }
          if (buf != null) {
              buf.release();
              buf = null;
-             doubleFreeCheck = new NullPointerException();
+             doubleFreeCheck = new BufferReleasedBeforeAt();
          }
      }
  }
+
+ static class BufferReleasedBeforeAt
+         extends Throwable
+ { }
+
+ static class BufferDoubleReleasedException
+         extends IllegalStateException
+ {
+     public BufferDoubleReleasedException(BufferReleasedBeforeAt releasedAt)
+     {
+         super("Detected double release() call of a buffer", releasedAt);
+     }
+ }
  }
@@ -98,12 +98,13 @@ public class PreviewExecutor
  {
      InputPlugin input = newInputPlugin(task);
      List<FilterPlugin> filterPlugins = newFilterPlugins(task);
-     Schema filteredSchema = filterSchemas.get(filterSchemas.size() - 1);
+     Schema inputSchema = filterSchemas.get(0);
+     Schema outputSchema = filterSchemas.get(filterSchemas.size() - 1);
 
-     PageOutput out = new SamplingPageOutput(task.getSampleRows(), filteredSchema);
+     PageOutput out = new SamplingPageOutput(task.getSampleRows(), outputSchema);
      try {
          out = Filters.open(filterPlugins, filterTasks, filterSchemas, out);
-         input.run(inputTask, filteredSchema, 0, out);
+         input.run(inputTask, inputSchema, 0, out);
      } finally {
          out.close();
      }
@@ -46,17 +46,17 @@ public class SamplingParserPlugin
          throw new SampledNoticeError(buffer);
      }
 
-     public static Buffer runFileInputSampling(final FileInputRunner input, ConfigSource inputConfig)
+     public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig)
      {
          // override in.parser.type so that FileInputRunner creates GuessParserPlugin
          ConfigSource samplingInputConfig = inputConfig.deepCopy();
          samplingInputConfig.getNestedOrSetEmpty("parser").set("type", "system_sampling");
 
          try {
-             input.transaction(samplingInputConfig, new InputPlugin.Control() {
+             runner.transaction(samplingInputConfig, new InputPlugin.Control() {
                  public List<CommitReport> run(TaskSource taskSource, Schema schema, int taskCount)
                  {
-                     input.run(taskSource, schema, 0, new PageOutput() {
+                     runner.run(taskSource, schema, 0, new PageOutput() {
                          @Override
                          public void add(Page page)
                          {
@@ -8,6 +8,12 @@ Embulk documentation
 
  https://github.com/embulk/embulk
 
+ * `Quick Start <https://github.com/embulk/embulk#quick-start>`_
+
+   * `Linux and Mac OS X <https://github.com/embulk/embulk#linux--mac--bsd>`_
+
+   * `Windows <https://github.com/embulk/embulk#windows>`_
+
  * `List of Plugins by Category <http://www.embulk.org/plugins/>`_
 
  .. toctree::
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
 
  .. code-block:: console
 
-     $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar -O /usr/local/bin/embulk
+     $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -O /usr/local/bin/embulk
      $ sudo chmod +x /usr/local/bin/embulk
 
  Step 2. Install Elasticsearch plugin
@@ -23,4 +23,5 @@ Release Notes
      release/release-0.4.10
      release/release-0.5.0
      release/release-0.5.1
+     release/release-0.5.2
 
@@ -0,0 +1,30 @@
+ Release 0.5.2
+ ==================================
+
+ Built-in plugins
+ ------------------
+
+ * ``parser-csv`` plugin supports the ``skip_header_lines`` parameter to skip the first lines of a file (see the example config below).
+
+ * The ``header_line`` parameter is now obsolete. It still works for backward compatibility, but setting both ``header_line`` and ``skip_header_lines`` is a configuration error.
+
+ * ``guess-csv`` plugin detects ignorable leading lines and sets the ``skip_header_lines`` parameter automatically.
+
+ * ``guess-csv`` plugin guesses quoted column names correctly.
+
+ * ``formatter-csv`` plugin supports the ``delimiter`` parameter (@hiroyuki-sato++).
+
+ * ``output-stdout`` plugin no longer prints warning messages caused by double-releasing buffers.
+
+
+ General Changes
+ ------------------
+
+ * Improved the error message shown when a double release of a ``spi.Buffer`` is detected.
+ * Fixed ``preview`` when a filter plugin changes the schema (@llibra++).
+ * Fixed an infinite loop in ``Embulk::FileOutput#flush`` (@goronao++). It happened when a formatter plugin written in Ruby wrote more than 32KB of data.
+
+
+ Release Date
+ ------------------
+ 2015-03-11
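
For reference, a minimal sketch of how the new `skip_header_lines` option appears in a config file. The `file` input, `stdout` output, path, and columns below are illustrative assumptions, not taken from this diff:

```
in:
  type: file
  path_prefix: ./csv/sample_      # hypothetical input path
  parser:
    type: csv
    skip_header_lines: 2          # new in 0.5.2; replaces the old header_line flag
    columns:
      - {name: id, type: long}
      - {name: name, type: string}
out:
  type: stdout
```

Setting both `header_line` and `skip_header_lines` in the same parser block is rejected with a `ConfigException`, as implemented in the CsvParserPlugin hunk further below.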
@@ -31,6 +31,10 @@ public class CsvFormatterPlugin
          @Config("header_line")
          @ConfigDefault("true")
          public boolean getHeaderLine();
+
+         @Config("delimiter")
+         @ConfigDefault("\",\"")
+         public String getDelimiterChar();
      }
 
      @Override
@@ -62,13 +66,14 @@ public class CsvFormatterPlugin
          final LineEncoder encoder = new LineEncoder(output, task);
          final Map<Integer, TimestampFormatter> timestampFormatters =
                  newTimestampFormatters(task, schema);
+         final String delimiter = task.getDelimiterChar();
 
          // create a file
          encoder.nextFile();
 
          // write header
          if (task.getHeaderLine()) {
-             writeHeader(schema, encoder);
+             writeHeader(schema, encoder, delimiter);
          }
 
          return new PageOutput() {
@@ -124,7 +129,7 @@ public class CsvFormatterPlugin
              private void addDelimiter(Column column)
              {
                  if (column.getIndex() != 0) {
-                     encoder.addText(",");
+                     encoder.addText(delimiter);
                  }
              }
          });
@@ -145,11 +150,11 @@ public class CsvFormatterPlugin
          };
      }
 
-     private void writeHeader(Schema schema, LineEncoder encoder)
+     private void writeHeader(Schema schema, LineEncoder encoder, String delimiter)
      {
          for (Column column : schema.getColumns()) {
              if (column.getIndex() != 0) {
-                 encoder.addText(",");
+                 encoder.addText(delimiter);
              }
              encoder.addText(column.getName());
          }
@@ -1,13 +1,13 @@
  package org.embulk.standards;
 
  import com.google.common.base.Preconditions;
- import com.google.common.collect.ImmutableMap;
  import com.google.common.base.Optional;
  import com.google.common.collect.ImmutableSet;
  import org.embulk.config.Task;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigSource;
+ import org.embulk.config.ConfigException;
  import org.embulk.config.TaskSource;
  import org.embulk.spi.type.TimestampType;
  import org.embulk.spi.time.TimestampParser;
@@ -25,8 +25,6 @@ import org.embulk.spi.BufferAllocator;
  import org.embulk.spi.util.LineDecoder;
  import org.slf4j.Logger;
 
- import java.util.Map;
-
  public class CsvParserPlugin
          implements ParserPlugin
  {
@@ -44,9 +42,14 @@ public class CsvParserPlugin
          @Config("columns")
          public SchemaConfig getSchemaConfig();
 
-         @Config("header_line") // how to set default value?? TODO @Default("true")
-         @ConfigDefault("false")
-         public boolean getHeaderLine();
+         @Config("header_line")
+         @ConfigDefault("null")
+         public Optional<Boolean> getHeaderLine();
+
+         @Config("skip_header_lines")
+         @ConfigDefault("0")
+         public int getSkipHeaderLines();
+         public void setSkipHeaderLines(int n);
 
          @Config("delimiter")
          @ConfigDefault("\",\"")
@@ -86,20 +89,33 @@ public class CsvParserPlugin
      public void transaction(ConfigSource config, ParserPlugin.Control control)
      {
          PluginTask task = config.loadConfig(PluginTask.class);
+
+         // backward compatibility
+         if (task.getHeaderLine().isPresent()) {
+             if (task.getSkipHeaderLines() > 0) {
+                 throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
+             }
+             if (task.getHeaderLine().get()) {
+                 task.setSkipHeaderLines(1);
+             } else {
+                 task.setSkipHeaderLines(0);
+             }
+         }
+
          control.run(task.dump(), task.getSchemaConfig().toSchema());
      }
 
-     private Map<Integer, TimestampParser> newTimestampParsers(
+     private TimestampParser[] newTimestampParsers(
              TimestampParser.ParserTask task, Schema schema)
      {
-         ImmutableMap.Builder<Integer, TimestampParser> builder = new ImmutableMap.Builder<>();
+         TimestampParser[] parsers = new TimestampParser[schema.getColumnCount()];
          for (Column column : schema.getColumns()) {
              if (column.getType() instanceof TimestampType) {
                  TimestampType tt = (TimestampType) column.getType();
-                 builder.put(column.getIndex(), new TimestampParser(tt.getFormat(), task));
+                 parsers[column.getIndex()] = new TimestampParser(tt.getFormat(), task);
              }
          }
-         return builder.build();
+         return parsers;
      }
 
      @Override
@@ -107,19 +123,18 @@ public class CsvParserPlugin
              FileInput input, PageOutput output)
      {
          PluginTask task = taskSource.loadTask(PluginTask.class);
-         final Map<Integer, TimestampParser> timestampFormatters = newTimestampParsers(task, schema);
-         final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
+         final TimestampParser[] timestampFormatters = newTimestampParsers(task, schema);
+         LineDecoder lineDecoder = new LineDecoder(input, task);
+         final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
          final String nullStringOrNull = task.getNullString().orNull();
-         boolean skipHeaderLine = task.getHeaderLine();
+         int skipHeaderLines = task.getSkipHeaderLines();
 
          try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
              while (tokenizer.nextFile()) {
-                 if (skipHeaderLine) {
-                     // skip the first line
-                     if (tokenizer.nextRecord()) {
-                         for (int i=0; i < schema.getColumnCount(); i++) {
-                             tokenizer.nextColumn(); // TODO check return value?
-                         }
+                 // skip the header lines for each file
+                 for (; skipHeaderLines > 0; skipHeaderLines--) {
+                     if (lineDecoder.poll() == null) {
+                         break;
                      }
                  }
 
@@ -187,7 +202,7 @@ public class CsvParserPlugin
                      pageBuilder.setNull(column);
                  } else {
                      try {
-                         pageBuilder.setTimestamp(column, (timestampFormatters.get(column.getIndex()).parse(v)));
+                         pageBuilder.setTimestamp(column, timestampFormatters[column.getIndex()].parse(v));
                      } catch (TimestampParseException e) {
                          // TODO support default value
                          throw new CsvRecordValidateException(e);
@@ -115,9 +115,14 @@ public class CsvTokenizer
          }
      }
 
+     public boolean hasNextColumn()
+     {
+         return recordState == RecordState.NOT_END;
+     }
+
      public String nextColumn()
      {
-         Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
+         Preconditions.checkState(hasNextColumn(), "doesn't have enough columns"); // TODO exception class
 
          // reset last state
          wasQuotedColumn = false;
@@ -64,7 +64,6 @@ public class StdoutOutputPlugin
              while (reader.nextRecord()) {
                  System.out.println(printer.printRecord(reader, ","));
              }
-             page.release();
          }
 
          public void finish()
@@ -72,7 +71,10 @@ public class StdoutOutputPlugin
              System.out.flush();
          }
 
-         public void close() { }
+         public void close()
+         {
+             reader.close();
+         }
 
          public void abort() { }
 
@@ -31,7 +31,7 @@ public class TestCsvParserPlugin
          CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
          assertEquals(Charset.forName("utf-8"), task.getCharset());
          assertEquals(Newline.CRLF, task.getNewline());
-         assertEquals(false, task.getHeaderLine());
+         assertEquals(false, task.getHeaderLine().or(false));
          assertEquals(',', task.getDelimiterChar());
          assertEquals('\"', task.getQuoteChar());
      }
@@ -62,7 +62,7 @@ public class TestCsvParserPlugin
          CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
          assertEquals(Charset.forName("utf-16"), task.getCharset());
          assertEquals(Newline.LF, task.getNewline());
-         assertEquals(true, task.getHeaderLine());
+         assertEquals(true, task.getHeaderLine().or(false));
          assertEquals('\t', task.getDelimiterChar());
          assertEquals('\\', task.getQuoteChar());
      }
@@ -376,7 +376,7 @@ examples:
    end
 
    def self.print_exception(ex)
-     if ex.respond_to?(:to_java)
+     if ex.respond_to?(:to_java) && ex.is_a?(java.lang.Throwable)
        ex.to_java.printStackTrace(java.lang.System.out)
      else
        puts "#{ex.to_s}"
@@ -34,6 +34,7 @@ module Embulk
        def flush
          unless @buffer.empty?
            @java_file_output.add(@buffer.to_java)
+           @buffer.clear
          end
          nil
        end
@@ -44,7 +45,7 @@ module Embulk
        end
 
        def close
-         @java_file_output.finish
+         @java_file_output.close
        end
      end
 
@@ -24,6 +24,9 @@ module Embulk
        "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
      ]
 
+     MAX_SKIP_LINES = 10
+     NO_SKIP_DETECT_LINES = 10
+
      def guess_lines(config, sample_lines)
        delim = guess_delimiter(sample_lines)
        unless delim
@@ -32,7 +35,7 @@ module Embulk
        end
 
        parser_config = config["parser"] || {}
-       parser_guessed = {"type" => "csv", "delimiter" => delim}
+       parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})
 
        quote = guess_quote(sample_lines, delim)
        parser_guessed["quote"] = quote ? quote : ''
@@ -44,7 +47,10 @@ module Embulk
        parser_guessed["null_string"] = null_string if null_string
        # don't even set null_string to avoid confusion of null and 'null' in YAML format
 
-       sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
+       sample_records = split_lines(parser_guessed, sample_lines, delim)
+       skip_header_lines = guess_skip_header_lines(sample_records)
+       sample_records = sample_records[skip_header_lines..-1]
+
        first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
        other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
 
@@ -53,12 +59,16 @@ module Embulk
          return {}
        end
 
-       unless parser_config.has_key?("header_line")
-         parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != "string" })
+       header_line = (first_types != other_types && !first_types.any? {|t| t != "string" })
+
+       if header_line
+         parser_guessed["skip_header_lines"] = skip_header_lines + 1
+       else
+         parser_guessed["skip_header_lines"] = skip_header_lines
        end
 
        unless parser_config.has_key?("columns")
-         if parser_guessed["header_line"] || parser_config["header_line"]
+         if header_line
            column_names = sample_records.first
          else
            column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
@@ -81,6 +91,32 @@ module Embulk
 
      private
 
+     def split_lines(parser_config, sample_lines, delim)
+       parser_task = parser_config.merge({"columns" => []}).load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
+       data = sample_lines.map {|x| x.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
+       sample = Buffer.from_ruby_string(data)
+       decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
+       tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
+       rows = []
+       while tokenizer.nextFile
+         while tokenizer.nextRecord
+           columns = []
+           while true
+             begin
+               columns << tokenizer.nextColumn
+             rescue java.lang.IllegalStateException # TODO exception class
+               rows << columns
+               break
+             end
+           end
+         end
+       end
+       return rows
+     rescue
+       # TODO warning if fallback to this ad-hoc implementation
+       sample_lines.map {|line| line.split(delim) }
+     end
+
      def guess_delimiter(sample_lines)
        delim_weights = DELIMITER_CANDIDATES.map do |d|
          counts = sample_lines.map {|line| line.count(d) }
@@ -154,6 +190,17 @@ module Embulk
        return found ? found[0] : nil
      end
 
+     def guess_skip_header_lines(sample_records)
+       counts = sample_records.map {|records| records.size }
+       (1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
+         check_row_count = counts[i-1]
+         if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c == check_row_count }
+           return i - 1
+         end
+       end
+       return 0
+     end
+
      def array_sum(array)
        array.inject(0) {|r,i| r += i }
      end
@@ -115,6 +115,7 @@ module Embulk
          while line = decoder.poll
            sample_lines << line
          end
+         sample_lines.pop unless sample_lines.empty? # last line can be partial
        end
 
        return guess_lines(config, sample_lines);
@@ -38,8 +38,6 @@ module Embulk
        return true
      rescue LoadError => e
        # catch LoadError but don't catch ClassNotFoundException
-       # TODO: the best code here is to raise exception only if
-       # `name` file is not in $LOAD_PATH.
        raise e if e.to_s =~ /java.lang.ClassNotFoundException/
        raise e if $LOAD_PATH.any? {|dir| File.exists? File.join(dir, "#{name}.rb") }
      end
@@ -1,3 +1,3 @@
  module Embulk
-   VERSION = '0.5.1'
+   VERSION = '0.5.2'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk
  version: !ruby/object:Gem::Version
-   version: 0.5.1
+   version: 0.5.2
  platform: ruby
  authors:
  - Sadayuki Furuhashi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-03-05 00:00:00.000000000 Z
+ date: 2015-03-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -273,6 +273,7 @@ files:
  - embulk-docs/src/release/release-0.4.9.rst
  - embulk-docs/src/release/release-0.5.0.rst
  - embulk-docs/src/release/release-0.5.1.rst
+ - embulk-docs/src/release/release-0.5.2.rst
  - embulk-standards/build.gradle
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -377,8 +378,8 @@ files:
  - classpath/bval-jsr303-0.5.jar
  - classpath/commons-beanutils-core-1.8.3.jar
  - classpath/commons-lang3-3.1.jar
- - classpath/embulk-core-0.5.1.jar
- - classpath/embulk-standards-0.5.1.jar
+ - classpath/embulk-core-0.5.2.jar
+ - classpath/embulk-standards-0.5.2.jar
  - classpath/guava-18.0.jar
  - classpath/guice-3.0.jar
  - classpath/guice-multibindings-3.0.jar