embulk 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0abe724d59ea2a21922310c9b7a575b612369147
4
- data.tar.gz: 753c67c46772c9fd44a7ed46d1deb185a386edba
3
+ metadata.gz: c731d85a93052e3fa0e40a49c048f365e150eace
4
+ data.tar.gz: 5266d8207396e64c995d8fd6e2b9e60cc228041c
5
5
  SHA512:
6
- metadata.gz: e2632ab7e8c77ffc4d31a4c63c58b2e88cd617dcb2c941e545aa6c51ff75e0909f3bf05faf679b95f176e63a7057c42e07992fcafef8fd5ffc135b5f21008eb3
7
- data.tar.gz: 50619880fc999eaa53cf637f5a8fca1f96740b4966280aa83408d55a6b997ab78fc74cfcb213542116f8804bcf20d87b83590ef4dd0e44fdf9309fc9089a8cce
6
+ metadata.gz: 658da0f21342555a4d267bfa5bd7ee048dddd875fdc45dff82d22fb7f00e19d1f735afed15549adf94f135ccae3ed327636a86b37a5b6a252de0821f4049a5a9
7
+ data.tar.gz: 7888c38856e8e3816eb819bc1b12f33852634882c7e40b68496ad5700cf7c1a202174e097cfeb1589f6847752e768d40fdfc9ef5d5465b698708882488c9aeeb
data/README.md CHANGED
@@ -25,23 +25,31 @@ The single-file package is the simplest way to try Embulk. You can download the
25
25
 
26
26
  ### Linux & Mac & BSD
27
27
 
28
+ Embulk is a Java application. Please make sure that you have installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
29
+
28
30
  Following 4 commands install embulk to your home directory:
29
31
 
30
32
  ```
31
- curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar
33
+ curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar
32
34
  chmod +x ~/.embulk/bin/embulk
33
35
  echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
34
36
  source ~/.bashrc
35
37
  ```
36
38
 
39
+ Next step: [Trying examples](#trying-examples)
40
+
37
41
  ### Windows
38
42
 
43
+ Embulk is a Java application. Please make sure that you have installed [Java](http://www.oracle.com/technetwork/java/javase/downloads/index.html).
44
+
39
45
  You can assume the jar file is a .bat file.
40
46
 
41
47
  ```
42
- PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar -OutFile embulk.bat}"
48
+ PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -OutFile embulk.bat}"
43
49
  ```
44
50
 
51
+ Next step: [Trying examples](#trying-examples)
52
+
45
53
  ### Trying examples
46
54
 
47
55
  Let's load a CSV file, for example. `embulk example` subcommand generates a csv file and config file for you.
@@ -53,6 +61,8 @@ embulk preview config.yml
53
61
  embulk run config.yml
54
62
  ```
55
63
 
64
+ Next step: [Using plugins](#using-plugins)
65
+
56
66
  ### Using plugins
57
67
 
58
68
  You can use plugins to load data from/to various systems and file formats.
@@ -63,14 +73,14 @@ embulk gem install embulk-output-postgres-json
63
73
  embulk gem list
64
74
  ```
65
75
 
66
- You can search plugins on RubyGems: [search for "embulk"](https://rubygems.org/search?utf8=%E2%9C%93&query=embulk).
76
+ You can find plugins at the [list of plugins by category](http://www.embulk.org/plugins/).
67
77
 
68
78
  ### Using plugin bundle
69
79
 
70
- `embulk bundle` subcommand creates (or updates if already exists) a *plugin bundle* directory.
80
+ `embulk bundle` subcommand creates (or updates if already exists) a private (isolated) bundle of plugins.
71
81
  You can use the bundle using `-b <bundle_dir>` option. `embulk bundle` also generates some example plugins to \<bundle_dir>/embulk/\*.rb directory.
72
82
 
73
- See generated \<bundle_dir>/Gemfile file how to plugin bundles work.
83
+ See the generated \<bundle_dir>/Gemfile file to see how plugin bundles work.
74
84
 
75
85
  ```
76
86
  embulk bundle ./embulk_bundle
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
12
12
 
13
13
  allprojects {
14
14
  group = 'org.embulk'
15
- version = '0.5.1'
15
+ version = '0.5.2'
16
16
 
17
17
  apply plugin: 'java'
18
18
  apply plugin: 'maven-publish'
@@ -35,7 +35,7 @@ public class PooledBufferAllocator
35
35
  extends Buffer
36
36
  {
37
37
  private ByteBuf buf;
38
- private Exception doubleFreeCheck;
38
+ private BufferReleasedBeforeAt doubleFreeCheck;
39
39
 
40
40
  public NettyByteBufBuffer(ByteBuf buf)
41
41
  {
@@ -46,13 +46,26 @@ public class PooledBufferAllocator
46
46
  public void release()
47
47
  {
48
48
  if (doubleFreeCheck != null) {
49
- doubleFreeCheck.printStackTrace();
49
+ new BufferDoubleReleasedException(doubleFreeCheck).printStackTrace();
50
50
  }
51
51
  if (buf != null) {
52
52
  buf.release();
53
53
  buf = null;
54
- doubleFreeCheck = new NullPointerException();
54
+ doubleFreeCheck = new BufferReleasedBeforeAt();
55
55
  }
56
56
  }
57
57
  }
58
+
59
+ static class BufferReleasedBeforeAt
60
+ extends Throwable
61
+ { }
62
+
63
+ static class BufferDoubleReleasedException
64
+ extends IllegalStateException
65
+ {
66
+ public BufferDoubleReleasedException(BufferReleasedBeforeAt releasedAt)
67
+ {
68
+ super("Detected double release() call of a buffer", releasedAt);
69
+ }
70
+ }
58
71
  }
@@ -98,12 +98,13 @@ public class PreviewExecutor
98
98
  {
99
99
  InputPlugin input = newInputPlugin(task);
100
100
  List<FilterPlugin> filterPlugins = newFilterPlugins(task);
101
- Schema filteredSchema = filterSchemas.get(filterSchemas.size() - 1);
101
+ Schema inputSchema = filterSchemas.get(0);
102
+ Schema outputSchema = filterSchemas.get(filterSchemas.size() - 1);
102
103
 
103
- PageOutput out = new SamplingPageOutput(task.getSampleRows(), filteredSchema);
104
+ PageOutput out = new SamplingPageOutput(task.getSampleRows(), outputSchema);
104
105
  try {
105
106
  out = Filters.open(filterPlugins, filterTasks, filterSchemas, out);
106
- input.run(inputTask, filteredSchema, 0, out);
107
+ input.run(inputTask, inputSchema, 0, out);
107
108
  } finally {
108
109
  out.close();
109
110
  }
@@ -46,17 +46,17 @@ public class SamplingParserPlugin
46
46
  throw new SampledNoticeError(buffer);
47
47
  }
48
48
 
49
- public static Buffer runFileInputSampling(final FileInputRunner input, ConfigSource inputConfig)
49
+ public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig)
50
50
  {
51
51
  // override in.parser.type so that FileInputRunner creates GuessParserPlugin
52
52
  ConfigSource samplingInputConfig = inputConfig.deepCopy();
53
53
  samplingInputConfig.getNestedOrSetEmpty("parser").set("type", "system_sampling");
54
54
 
55
55
  try {
56
- input.transaction(samplingInputConfig, new InputPlugin.Control() {
56
+ runner.transaction(samplingInputConfig, new InputPlugin.Control() {
57
57
  public List<CommitReport> run(TaskSource taskSource, Schema schema, int taskCount)
58
58
  {
59
- input.run(taskSource, schema, 0, new PageOutput() {
59
+ runner.run(taskSource, schema, 0, new PageOutput() {
60
60
  @Override
61
61
  public void add(Page page)
62
62
  {
@@ -8,6 +8,12 @@ Embulk documentation
8
8
 
9
9
  https://github.com/embulk/embulk
10
10
 
11
+ * `Quick Start <https://github.com/embulk/embulk#quick-start>`_
12
+
13
+ * `Linux and Mac OS X <https://github.com/embulk/embulk#linux--mac--bsd>`_
14
+
15
+ * `Windows <https://github.com/embulk/embulk#windows>`_
16
+
11
17
  * `List of Plugins by Category <http://www.embulk.org/plugins/>`_
12
18
 
13
19
  .. toctree::
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
56
56
 
57
57
  .. code-block:: console
58
58
 
59
- $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.1.jar -O /usr/local/bin/embulk
59
+ $ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.2.jar -O /usr/local/bin/embulk
60
60
  $ sudo chmod +x /usr/local/bin/embulk
61
61
 
62
62
  Step 2. Install Elasticsearch plugin
@@ -23,4 +23,5 @@ Release Notes
23
23
  release/release-0.4.10
24
24
  release/release-0.5.0
25
25
  release/release-0.5.1
26
+ release/release-0.5.2
26
27
 
@@ -0,0 +1,30 @@
1
+ Release 0.5.2
2
+ ==================================
3
+
4
+ Built-in plugins
5
+ ------------------
6
+
7
+ * ``parser-csv`` plugin supports the ``skip_header_lines`` parameter to skip the first few lines.
8
+
9
+ * ``header_line`` parameter is obsolete. Although the parameter still works for backward compatibility, setting both ``header_line`` and ``skip_header_lines`` is a configuration error.
10
+
11
+ * ``guess-csv`` plugin guesses first ignorable lines and sets ``skip_header_lines`` parameter automatically.
12
+
13
+ * ``guess-csv`` plugin guesses quoted column names correctly.
14
+
15
+ * ``formatter-csv`` plugin supports ``delimiter`` parameter (@hiroyuki-sato++).
16
+
17
+ * ``output-stdout`` fixed warning messages due to double-release of buffers.
18
+
19
+
20
+ General Changes
21
+ ------------------
22
+
23
+ * Improved error message when double-release of a ``spi.Buffer`` is detected.
24
+ * Fixed ``preview`` when a filter plugin changes schema (@llibra++).
25
+ * Fixed infinite loop at ``Embulk::FileOutput#flush`` (@goronao++). It happened if a formatter plugin written in Ruby writes more than 32KB of data.
26
+
27
+
28
+ Release Date
29
+ ------------------
30
+ 2015-03-11
@@ -31,6 +31,10 @@ public class CsvFormatterPlugin
31
31
  @Config("header_line")
32
32
  @ConfigDefault("true")
33
33
  public boolean getHeaderLine();
34
+
35
+ @Config("delimiter")
36
+ @ConfigDefault("\",\"")
37
+ public String getDelimiterChar();
34
38
  }
35
39
 
36
40
  @Override
@@ -62,13 +66,14 @@ public class CsvFormatterPlugin
62
66
  final LineEncoder encoder = new LineEncoder(output, task);
63
67
  final Map<Integer, TimestampFormatter> timestampFormatters =
64
68
  newTimestampFormatters(task, schema);
69
+ final String delimiter = task.getDelimiterChar();
65
70
 
66
71
  // create a file
67
72
  encoder.nextFile();
68
73
 
69
74
  // write header
70
75
  if (task.getHeaderLine()) {
71
- writeHeader(schema, encoder);
76
+ writeHeader(schema, encoder, delimiter);
72
77
  }
73
78
 
74
79
  return new PageOutput() {
@@ -124,7 +129,7 @@ public class CsvFormatterPlugin
124
129
  private void addDelimiter(Column column)
125
130
  {
126
131
  if (column.getIndex() != 0) {
127
- encoder.addText(",");
132
+ encoder.addText(delimiter);
128
133
  }
129
134
  }
130
135
  });
@@ -145,11 +150,11 @@ public class CsvFormatterPlugin
145
150
  };
146
151
  }
147
152
 
148
- private void writeHeader(Schema schema, LineEncoder encoder)
153
+ private void writeHeader(Schema schema, LineEncoder encoder, String delimiter)
149
154
  {
150
155
  for (Column column : schema.getColumns()) {
151
156
  if (column.getIndex() != 0) {
152
- encoder.addText(",");
157
+ encoder.addText(delimiter);
153
158
  }
154
159
  encoder.addText(column.getName());
155
160
  }
@@ -1,13 +1,13 @@
1
1
  package org.embulk.standards;
2
2
 
3
3
  import com.google.common.base.Preconditions;
4
- import com.google.common.collect.ImmutableMap;
5
4
  import com.google.common.base.Optional;
6
5
  import com.google.common.collect.ImmutableSet;
7
6
  import org.embulk.config.Task;
8
7
  import org.embulk.config.Config;
9
8
  import org.embulk.config.ConfigDefault;
10
9
  import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.ConfigException;
11
11
  import org.embulk.config.TaskSource;
12
12
  import org.embulk.spi.type.TimestampType;
13
13
  import org.embulk.spi.time.TimestampParser;
@@ -25,8 +25,6 @@ import org.embulk.spi.BufferAllocator;
25
25
  import org.embulk.spi.util.LineDecoder;
26
26
  import org.slf4j.Logger;
27
27
 
28
- import java.util.Map;
29
-
30
28
  public class CsvParserPlugin
31
29
  implements ParserPlugin
32
30
  {
@@ -44,9 +42,14 @@ public class CsvParserPlugin
44
42
  @Config("columns")
45
43
  public SchemaConfig getSchemaConfig();
46
44
 
47
- @Config("header_line") // how to set default value?? TODO @Default("true")
48
- @ConfigDefault("false")
49
- public boolean getHeaderLine();
45
+ @Config("header_line")
46
+ @ConfigDefault("null")
47
+ public Optional<Boolean> getHeaderLine();
48
+
49
+ @Config("skip_header_lines")
50
+ @ConfigDefault("0")
51
+ public int getSkipHeaderLines();
52
+ public void setSkipHeaderLines(int n);
50
53
 
51
54
  @Config("delimiter")
52
55
  @ConfigDefault("\",\"")
@@ -86,20 +89,33 @@ public class CsvParserPlugin
86
89
  public void transaction(ConfigSource config, ParserPlugin.Control control)
87
90
  {
88
91
  PluginTask task = config.loadConfig(PluginTask.class);
92
+
93
+ // backward compatibility
94
+ if (task.getHeaderLine().isPresent()) {
95
+ if (task.getSkipHeaderLines() > 0) {
96
+ throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
97
+ }
98
+ if (task.getHeaderLine().get()) {
99
+ task.setSkipHeaderLines(1);
100
+ } else {
101
+ task.setSkipHeaderLines(0);
102
+ }
103
+ }
104
+
89
105
  control.run(task.dump(), task.getSchemaConfig().toSchema());
90
106
  }
91
107
 
92
- private Map<Integer, TimestampParser> newTimestampParsers(
108
+ private TimestampParser[] newTimestampParsers(
93
109
  TimestampParser.ParserTask task, Schema schema)
94
110
  {
95
- ImmutableMap.Builder<Integer, TimestampParser> builder = new ImmutableMap.Builder<>();
111
+ TimestampParser[] parsers = new TimestampParser[schema.getColumnCount()];
96
112
  for (Column column : schema.getColumns()) {
97
113
  if (column.getType() instanceof TimestampType) {
98
114
  TimestampType tt = (TimestampType) column.getType();
99
- builder.put(column.getIndex(), new TimestampParser(tt.getFormat(), task));
115
+ parsers[column.getIndex()] = new TimestampParser(tt.getFormat(), task);
100
116
  }
101
117
  }
102
- return builder.build();
118
+ return parsers;
103
119
  }
104
120
 
105
121
  @Override
@@ -107,19 +123,18 @@ public class CsvParserPlugin
107
123
  FileInput input, PageOutput output)
108
124
  {
109
125
  PluginTask task = taskSource.loadTask(PluginTask.class);
110
- final Map<Integer, TimestampParser> timestampFormatters = newTimestampParsers(task, schema);
111
- final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
126
+ final TimestampParser[] timestampFormatters = newTimestampParsers(task, schema);
127
+ LineDecoder lineDecoder = new LineDecoder(input, task);
128
+ final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
112
129
  final String nullStringOrNull = task.getNullString().orNull();
113
- boolean skipHeaderLine = task.getHeaderLine();
130
+ int skipHeaderLines = task.getSkipHeaderLines();
114
131
 
115
132
  try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
116
133
  while (tokenizer.nextFile()) {
117
- if (skipHeaderLine) {
118
- // skip the first line
119
- if (tokenizer.nextRecord()) {
120
- for (int i=0; i < schema.getColumnCount(); i++) {
121
- tokenizer.nextColumn(); // TODO check return value?
122
- }
134
+ // skip the header lines for each file
135
+ for (; skipHeaderLines > 0; skipHeaderLines--) {
136
+ if (lineDecoder.poll() == null) {
137
+ break;
123
138
  }
124
139
  }
125
140
 
@@ -187,7 +202,7 @@ public class CsvParserPlugin
187
202
  pageBuilder.setNull(column);
188
203
  } else {
189
204
  try {
190
- pageBuilder.setTimestamp(column, (timestampFormatters.get(column.getIndex()).parse(v)));
205
+ pageBuilder.setTimestamp(column, timestampFormatters[column.getIndex()].parse(v));
191
206
  } catch (TimestampParseException e) {
192
207
  // TODO support default value
193
208
  throw new CsvRecordValidateException(e);
@@ -115,9 +115,14 @@ public class CsvTokenizer
115
115
  }
116
116
  }
117
117
 
118
+ public boolean hasNextColumn()
119
+ {
120
+ return recordState == RecordState.NOT_END;
121
+ }
122
+
118
123
  public String nextColumn()
119
124
  {
120
- Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
125
+ Preconditions.checkState(hasNextColumn(), "doesn't have enough columns"); // TODO exception class
121
126
 
122
127
  // reset last state
123
128
  wasQuotedColumn = false;
@@ -64,7 +64,6 @@ public class StdoutOutputPlugin
64
64
  while (reader.nextRecord()) {
65
65
  System.out.println(printer.printRecord(reader, ","));
66
66
  }
67
- page.release();
68
67
  }
69
68
 
70
69
  public void finish()
@@ -72,7 +71,10 @@ public class StdoutOutputPlugin
72
71
  System.out.flush();
73
72
  }
74
73
 
75
- public void close() { }
74
+ public void close()
75
+ {
76
+ reader.close();
77
+ }
76
78
 
77
79
  public void abort() { }
78
80
 
@@ -31,7 +31,7 @@ public class TestCsvParserPlugin
31
31
  CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
32
32
  assertEquals(Charset.forName("utf-8"), task.getCharset());
33
33
  assertEquals(Newline.CRLF, task.getNewline());
34
- assertEquals(false, task.getHeaderLine());
34
+ assertEquals(false, task.getHeaderLine().or(false));
35
35
  assertEquals(',', task.getDelimiterChar());
36
36
  assertEquals('\"', task.getQuoteChar());
37
37
  }
@@ -62,7 +62,7 @@ public class TestCsvParserPlugin
62
62
  CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
63
63
  assertEquals(Charset.forName("utf-16"), task.getCharset());
64
64
  assertEquals(Newline.LF, task.getNewline());
65
- assertEquals(true, task.getHeaderLine());
65
+ assertEquals(true, task.getHeaderLine().or(false));
66
66
  assertEquals('\t', task.getDelimiterChar());
67
67
  assertEquals('\\', task.getQuoteChar());
68
68
  }
@@ -376,7 +376,7 @@ examples:
376
376
  end
377
377
 
378
378
  def self.print_exception(ex)
379
- if ex.respond_to?(:to_java)
379
+ if ex.respond_to?(:to_java) && ex.is_a?(java.lang.Throwable)
380
380
  ex.to_java.printStackTrace(java.lang.System.out)
381
381
  else
382
382
  puts "#{ex.to_s}"
@@ -34,6 +34,7 @@ module Embulk
34
34
  def flush
35
35
  unless @buffer.empty?
36
36
  @java_file_output.add(@buffer.to_java)
37
+ @buffer.clear
37
38
  end
38
39
  nil
39
40
  end
@@ -44,7 +45,7 @@ module Embulk
44
45
  end
45
46
 
46
47
  def close
47
- @java_file_output.finish
48
+ @java_file_output.close
48
49
  end
49
50
  end
50
51
 
@@ -24,6 +24,9 @@ module Embulk
24
24
  "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
25
  ]
26
26
 
27
+ MAX_SKIP_LINES = 10
28
+ NO_SKIP_DETECT_LINES = 10
29
+
27
30
  def guess_lines(config, sample_lines)
28
31
  delim = guess_delimiter(sample_lines)
29
32
  unless delim
@@ -32,7 +35,7 @@ module Embulk
32
35
  end
33
36
 
34
37
  parser_config = config["parser"] || {}
35
- parser_guessed = {"type" => "csv", "delimiter" => delim}
38
+ parser_guessed = DataSource.new.merge({"type" => "csv", "delimiter" => delim})
36
39
 
37
40
  quote = guess_quote(sample_lines, delim)
38
41
  parser_guessed["quote"] = quote ? quote : ''
@@ -44,7 +47,10 @@ module Embulk
44
47
  parser_guessed["null_string"] = null_string if null_string
45
48
  # don't even set null_string to avoid confusion of null and 'null' in YAML format
46
49
 
47
- sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
50
+ sample_records = split_lines(parser_guessed, sample_lines, delim)
51
+ skip_header_lines = guess_skip_header_lines(sample_records)
52
+ sample_records = sample_records[skip_header_lines..-1]
53
+
48
54
  first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
49
55
  other_types = SchemaGuess.types_from_array_records(sample_records[1..-1])
50
56
 
@@ -53,12 +59,16 @@ module Embulk
53
59
  return {}
54
60
  end
55
61
 
56
- unless parser_config.has_key?("header_line")
57
- parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != "string" })
62
+ header_line = (first_types != other_types && !first_types.any? {|t| t != "string" })
63
+
64
+ if header_line
65
+ parser_guessed["skip_header_lines"] = skip_header_lines + 1
66
+ else
67
+ parser_guessed["skip_header_lines"] = skip_header_lines
58
68
  end
59
69
 
60
70
  unless parser_config.has_key?("columns")
61
- if parser_guessed["header_line"] || parser_config["header_line"]
71
+ if header_line
62
72
  column_names = sample_records.first
63
73
  else
64
74
  column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
@@ -81,6 +91,32 @@ module Embulk
81
91
 
82
92
  private
83
93
 
94
+ def split_lines(parser_config, sample_lines, delim)
95
+ parser_task = parser_config.merge({"columns" => []}).load_config(org.embulk.standards.CsvParserPlugin::PluginTask)
96
+ data = sample_lines.map {|x| x.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
97
+ sample = Buffer.from_ruby_string(data)
98
+ decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
99
+ tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)
100
+ rows = []
101
+ while tokenizer.nextFile
102
+ while tokenizer.nextRecord
103
+ columns = []
104
+ while true
105
+ begin
106
+ columns << tokenizer.nextColumn
107
+ rescue java.lang.IllegalStateException # TODO exception class
108
+ rows << columns
109
+ break
110
+ end
111
+ end
112
+ end
113
+ end
114
+ return rows
115
+ rescue
116
+ # TODO warning if fallback to this ad-hoc implementation
117
+ sample_lines.map {|line| line.split(delim) }
118
+ end
119
+
84
120
  def guess_delimiter(sample_lines)
85
121
  delim_weights = DELIMITER_CANDIDATES.map do |d|
86
122
  counts = sample_lines.map {|line| line.count(d) }
@@ -154,6 +190,17 @@ module Embulk
154
190
  return found ? found[0] : nil
155
191
  end
156
192
 
193
+ def guess_skip_header_lines(sample_records)
194
+ counts = sample_records.map {|records| records.size }
195
+ (1..[MAX_SKIP_LINES, counts.length - 1].min).each do |i|
196
+ check_row_count = counts[i-1]
197
+ if counts[i, NO_SKIP_DETECT_LINES].all? {|c| c == check_row_count }
198
+ return i - 1
199
+ end
200
+ end
201
+ return 0
202
+ end
203
+
157
204
  def array_sum(array)
158
205
  array.inject(0) {|r,i| r += i }
159
206
  end
@@ -115,6 +115,7 @@ module Embulk
115
115
  while line = decoder.poll
116
116
  sample_lines << line
117
117
  end
118
+ sample_lines.pop unless sample_lines.empty? # last line can be partial
118
119
  end
119
120
 
120
121
  return guess_lines(config, sample_lines);
@@ -38,8 +38,6 @@ module Embulk
38
38
  return true
39
39
  rescue LoadError => e
40
40
  # catch LoadError but don't catch ClassNotFoundException
41
- # TODO: the best code here is to raise exception only if
42
- # `name` file is not in $LOAD_PATH.
43
41
  raise e if e.to_s =~ /java.lang.ClassNotFoundException/
44
42
  raise e if $LOAD_PATH.any? {|dir| File.exists? File.join(dir, "#{name}.rb") }
45
43
  end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.5.1'
2
+ VERSION = '0.5.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-05 00:00:00.000000000 Z
11
+ date: 2015-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -273,6 +273,7 @@ files:
273
273
  - embulk-docs/src/release/release-0.4.9.rst
274
274
  - embulk-docs/src/release/release-0.5.0.rst
275
275
  - embulk-docs/src/release/release-0.5.1.rst
276
+ - embulk-docs/src/release/release-0.5.2.rst
276
277
  - embulk-standards/build.gradle
277
278
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
278
279
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -377,8 +378,8 @@ files:
377
378
  - classpath/bval-jsr303-0.5.jar
378
379
  - classpath/commons-beanutils-core-1.8.3.jar
379
380
  - classpath/commons-lang3-3.1.jar
380
- - classpath/embulk-core-0.5.1.jar
381
- - classpath/embulk-standards-0.5.1.jar
381
+ - classpath/embulk-core-0.5.2.jar
382
+ - classpath/embulk-standards-0.5.2.jar
382
383
  - classpath/guava-18.0.jar
383
384
  - classpath/guice-3.0.jar
384
385
  - classpath/guice-multibindings-3.0.jar