embulk-parser-csv_guessable 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
4
- data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
3
+ metadata.gz: 1bd20694daee60a0018828d263039d4fcbdc28bf
4
+ data.tar.gz: 34b6b8c95c8e8e375059d7ff87dd46314fb3ef55
5
5
  SHA512:
6
- metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
7
- data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
6
+ metadata.gz: 844084c1b76193789c12821bf9eb76a3fef969b33e42e2ea9ff6b9935848276412c3fee3af558d04aef3414ba1e93e72bfbab35393cc0a935d994b9f98ff8e4e
7
+ data.tar.gz: 57c246a6c2d551a55de98e8f420354da9c51970ef1e53184d87c0f6a22dd1dde47e9f49e3df4e1a6987a76fd71aac7ee84590e2c06be50a5a1bf2296bcd26398
data/README.md CHANGED
@@ -1,8 +1,11 @@
1
- # Csv Guessable parser plugin for Embulk
2
- **embulk-parser-csv_gussable** (runtime) guesses and parses csv which has schema in header.
1
+ # Guessable csv parser plugin for Embulk
2
+ **embulk-parser-csv_guessable** (runtime) guesses and parses CSV files that have a schema in the header.
3
+
4
+ CSV files sometimes have a schema in the header.
5
+ **embulk-parser-csv_guessable** parses such a CSV file by using its header as the column names.
3
6
  This plugin is useful in case of target csv schema changes frequently.
4
7
 
5
- Also it can behave as original csv parser without **embulk-parser-csv_guessable** specified configs.
8
+ It behaves like the original csv parser when the **embulk-parser-csv_guessable** configs (`schema_file` and `schema_line`) are not defined.
6
9
 
7
10
  ## Overview
8
11
 
@@ -12,12 +15,19 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
12
15
  ## Configuration
13
16
 
14
17
  - **schema_file**: filename which has schema.(string, default: `null`)
15
- - **schema_line**: schema line in header. (integer default: `"1"`)
16
- - **(TODO)columns**: Columns attributes for parse. `embulk-parser-csv_guessable` use this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is same as original csv parser's `"columns"`. (hash, default: `null`)
18
+ - **schema_line**: schema line in header. (integer, default: `1`)
19
+ - **columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `schema_file` is set. If `schema_file` isn't set, this is the same as the original csv parser's `columns`. (hash, default: `null`)
20
+ - **value_name**: Name of the column in the header. The column is renamed to `name`.
21
+ - **name**: Name of the column
22
+ - **type**: Type of the column
23
+ - **format**: Format of the timestamp if type is timestamp
24
+ - **date**: Set date part if the format doesn't include date part
17
25
  - any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
18
26
 
27
+ The `columns`
28
+
19
29
  ## Example
20
- test.csv
30
+ test.csv (There is a schema at the first line.)
21
31
 
22
32
  ```csv
23
33
  id, title, description
@@ -32,11 +42,11 @@ in:
32
42
  type: any file input plugin type
33
43
  parser:
34
44
  type: csv_guessable
35
- schema_file: data/test.csv
45
+ schema_file: test.csv
36
46
  schema_line: 1
37
47
  ```
38
48
 
39
- (To explain)
49
+ (For explanation)
40
50
  In case original csv parser
41
51
  config.yml
42
52
  ```yaml
@@ -51,6 +61,22 @@ in:
51
61
  - {name: description, type: string}
52
62
  ```
53
63
 
64
+ ## Example2
65
+ Example of renaming columns and setting their types
66
+
67
+ ```yaml
68
+ in:
69
+ type: any file input plugin type
70
+ parser:
71
+ type: csv_guessable
72
+ schema_file: test.csv
73
+ schema_line: 1
74
+ columns:
75
+ - {value_name: 'id', name: 'number', type: long}
76
+ - {value_name: 'title', name: 'description', type: string}
77
+ - {value_name: 'status', name: 'ok?', type: string}
78
+ ```
79
+
54
80
  <!--
55
81
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
56
82
  -->
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.1.3"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
@@ -25,6 +25,7 @@ dependencies {
25
25
  provided "org.embulk:embulk-core:0.8.22"
26
26
  testCompile "junit:junit:4.+"
27
27
  testCompile "org.embulk:embulk-core:0.8.+:tests"
28
+ testCompile "org.embulk:embulk-standards:0.8.+:tests"
28
29
  }
29
30
 
30
31
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -78,8 +79,8 @@ Gem::Specification.new do |spec|
78
79
  spec.name = "${project.name}"
79
80
  spec.version = "${project.version}"
80
81
  spec.authors = ["koooge"]
81
- spec.summary = %[Csv Guessable parser plugin for Embulk]
82
- spec.description = %[Parses Csv Guessable files read by other file input plugins.]
82
+ spec.summary = %[Guessable Csv parser plugin for Embulk]
83
+ spec.description = %[Parses Guessable Csv files read by other file input plugins.]
83
84
  spec.email = ["koooooge@gmail.com"]
84
85
  spec.licenses = ["MIT"]
85
86
  spec.homepage = "https://github.com/koooge/embulk-parser-csv_guessable"
@@ -40,6 +40,7 @@ import java.nio.charset.StandardCharsets;
40
40
  import java.nio.file.Files;
41
41
  import java.nio.file.Path;
42
42
  import java.util.ArrayList;
43
+ import java.util.List;
43
44
 
44
45
  public class CsvGuessableParserPlugin
45
46
  extends CsvParserPlugin
@@ -132,23 +133,38 @@ public class CsvGuessableParserPlugin
132
133
  PluginTask task = config.loadConfig(PluginTask.class);
133
134
  SchemaConfig schemaConfig = null;
134
135
 
135
- if (task.getSchemaFile().isPresent()) { /* embulk-parser-csv_guessable */
136
- if (task.getHeaderLine().isPresent()) {
137
- // TODO: use 'columns' as hints for guess
138
- throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
139
- }
140
- else { /* guess from header */
141
- int schemaLine = task.getSchemaLine();
142
- task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
143
-
144
- String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
145
- log.debug(header);
146
- ArrayList<ColumnConfig> columns = newColumns(header, config);
147
- log.debug(columns.toString());
148
- schemaConfig = new SchemaConfig(columns);
136
+ if (task.getSchemaFile().isPresent()) {
137
+ int schemaLine = task.getSchemaLine();
138
+ task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
139
+
140
+ String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
141
+ log.debug(header);
142
+ ArrayList<ColumnConfig> schema = newColumns(header, config);
143
+
144
+ /* alias and set type */
145
+ if (task.getSchemaConfig().isPresent()) {
146
+ List<ColumnConfig> columns = task.getSchemaConfig().get().getColumns();
147
+ for (ColumnConfig column : columns) {
148
+ String name = column.getName();
149
+ try {
150
+ name = column.getConfigSource().get(String.class, "value_name");
151
+ }
152
+ catch (ConfigException e) {
153
+ /* only setType */
154
+ }
155
+ for (int i = 0; i < schema.size(); ++i) {
156
+ ColumnConfig c = schema.get(i);
157
+ if (c.getName().equals(name)) {
158
+ schema.set(i, new ColumnConfig(name, column.getType(), column.getOption()));
159
+ }
160
+ }
161
+ }
149
162
  }
163
+
164
+ log.debug(schema.toString());
165
+ schemaConfig = new SchemaConfig(schema);
150
166
  }
151
- else { /* embulk-parser-csv embulk */
167
+ else if (task.getSchemaConfig().isPresent()) { /* original CsvParserPlugin */
152
168
  // backward compatibility
153
169
  if (task.getHeaderLine().isPresent()) {
154
170
  if (task.getSkipHeaderLines() > 0) {
@@ -163,6 +179,9 @@ public class CsvGuessableParserPlugin
163
179
  }
164
180
  schemaConfig = task.getSchemaConfig().get();
165
181
  }
182
+ else {
183
+ throw new ConfigException("Field 'columns' or 'schema_file' is required but not set");
184
+ }
166
185
 
167
186
  control.run(task.dump(), schemaConfig.toSchema());
168
187
  }
@@ -377,7 +396,7 @@ public class CsvGuessableParserPlugin
377
396
 
378
397
  private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
379
398
  {
380
- ArrayList columns = new ArrayList<ArrayList>();
399
+ ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
381
400
  PluginTask task = config.loadConfig(PluginTask.class);
382
401
 
383
402
  try (CSVReader reader = new CSVReader(new StringReader(header))) {
@@ -1,18 +1,34 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
+ import com.google.common.collect.ImmutableList;
3
4
  import org.embulk.EmbulkTestRuntime;
4
5
  import org.embulk.config.ConfigException;
5
6
  import org.embulk.config.ConfigLoader;
6
7
  import org.embulk.config.ConfigSource;
8
+ import org.embulk.config.TaskSource;
7
9
  import org.embulk.spi.Exec;
10
+ import org.embulk.spi.FileInput;
11
+ import org.embulk.spi.ParserPlugin;
12
+ import org.embulk.spi.Schema;
13
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
14
+ import org.embulk.spi.util.InputStreamFileInput;
15
+ import org.embulk.standards.TestCsvParserPlugin;
16
+ import org.junit.Before;
8
17
  import org.junit.Rule;
9
18
  import org.junit.Test;
10
19
  import org.junit.rules.ExpectedException;
11
20
 
21
+ import java.io.File;
22
+ import java.io.FileInputStream;
23
+ import java.io.IOException;
24
+ import java.io.InputStream;
25
+
12
26
  import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
13
- import static org.junit.Assert.assertFalse;
27
+ import static org.junit.Assert.assertEquals;
28
+ import static org.junit.Assert.assertNull;
14
29
 
15
30
  public class TestCsvGuessableParserPlugin
31
+ extends TestCsvParserPlugin
16
32
  {
17
33
  @Rule
18
34
  public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
@@ -20,61 +36,118 @@ public class TestCsvGuessableParserPlugin
20
36
  @Rule
21
37
  public ExpectedException exception = ExpectedException.none();
22
38
 
39
+ private CsvGuessableParserPlugin plugin;
40
+ private MockPageOutput output;
41
+
42
+ @Before
43
+ public void createResouce()
44
+ {
45
+ plugin = new CsvGuessableParserPlugin();
46
+ output = new MockPageOutput();
47
+ }
48
+
23
49
  private ConfigSource getConfigFromYaml(String yaml)
24
50
  {
25
51
  ConfigLoader loader = new ConfigLoader(Exec.getModelManager());
26
52
  return loader.fromYamlString(yaml);
27
53
  }
28
54
 
29
- /*
30
- @Test
31
- public void throwExceptionWithoutRequisite()
55
+ @Test(expected = ConfigException.class)
56
+ public void checkColumnsRequired()
32
57
  {
33
58
  String configYaml = "" +
34
- "type: csv_with_header";
35
-
59
+ "type: csv_guessable";
36
60
  ConfigSource config = getConfigFromYaml(configYaml);
61
+ PluginTask task = config.loadConfig(PluginTask.class);
37
62
 
38
- exception.expect(ConfigException.class);
39
- exception.expectMessage("Field either 'columns' or 'schema_file' is required but not set");
63
+ if (!task.getSchemaConfig().isPresent() && !task.getSchemaFile().isPresent()) {
64
+ throw new ConfigException("Field 'columns' or 'schema_line' is required but not set");
65
+ }
40
66
  }
41
- */
42
67
 
43
68
  @Test
44
69
  public void defaultValue()
45
70
  {
46
71
  String configYaml = "" +
47
- "type: csv_with_header\n" +
48
- "schema_line: 2";
72
+ "type: csv_guessable";
49
73
  ConfigSource config = getConfigFromYaml(configYaml);
50
74
  PluginTask task = config.loadConfig(PluginTask.class);
51
75
 
52
- assertFalse(task.getSchemaConfig().isPresent());
53
- assertFalse(task.getSchemaFile().isPresent());
76
+ assertNull(task.getSchemaConfig().orNull());
77
+ assertNull(task.getSchemaFile().orNull());
78
+ assertEquals(1, task.getSchemaLine());
54
79
  }
55
80
 
56
- /*
57
81
  @Test
58
82
  public void originalCsvParserPlugin()
59
83
  {
60
84
  String configYaml = "" +
61
- "type: csv_with_header\n" +
85
+ "type: csv_guessable\n" +
62
86
  "columns:\n" +
63
87
  " - {name: id, type: long}\n" +
64
88
  " - {name: title, type: string}\n" +
65
89
  " - {name: status, type: string}";
66
90
  ConfigSource config = getConfigFromYaml(configYaml);
67
91
  PluginTask task = config.loadConfig(PluginTask.class);
92
+
93
+ // TODO: impl or extends
68
94
  }
69
95
 
70
96
  @Test
71
- public void csvGuessable()
97
+ public void guessableCsv()
98
+ throws Exception
72
99
  {
100
+ String configYaml = "" +
101
+ "type: csv_guessable\n" +
102
+ "schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
103
+ "schema_line: 1";
104
+ ConfigSource config = getConfigFromYaml(configYaml);
105
+ PluginTask task = config.loadConfig(PluginTask.class);
106
+ // transaction(config, fileInput(new File(this.getClass().getResource("data/test.csv").getPath())));
107
+
108
+ // TODO: impl
73
109
  }
74
110
 
75
111
  @Test
76
- public void replaceColumnsName()
112
+ public void replaceColumnsMetadata()
113
+ throws Exception
114
+ {
115
+ String configYaml = "" +
116
+ "type: csv_guessable\n" +
117
+ "schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
118
+ "schema_line: 1\n" +
119
+ "columns:\n" +
120
+ "- {value_name: '#', name: 'number', type: long}\n" +
121
+ "- {value_name: 'title', name: 'description', type: string}\n" +
122
+ "- {value_name: 'status', name: 'ok?', type: string}";
123
+ ConfigSource config = getConfigFromYaml(configYaml);
124
+
125
+ // TODO: impl
126
+ }
127
+
128
+ private void transaction(ConfigSource config, final FileInput input)
129
+ {
130
+ plugin.transaction(config, new ParserPlugin.Control()
131
+ {
132
+ @Override
133
+ public void run(TaskSource taskSource, Schema schema)
134
+ {
135
+ plugin.run(taskSource, schema, input, output);
136
+ }
137
+ });
138
+ }
139
+
140
+ private FileInput fileInput(File file)
141
+ throws Exception
142
+ {
143
+ FileInputStream in = new FileInputStream(file);
144
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
145
+ }
146
+
147
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
148
+ throws IOException
77
149
  {
150
+ return new InputStreamFileInput.IteratorProvider(
151
+ ImmutableList.copyOf(inputStreams));
78
152
  }
79
- */
80
153
  }
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ schema_file: data/test.csv
7
+ schema_line: 1
8
+ columns:
9
+ - {name: 'id', type: long}
10
+ - {name: 'title', type: string}
11
+ - {name: 'status', type: string}
12
+ out:
13
+ type: stdout
@@ -0,0 +1,12 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ skip_header_lines: 1
7
+ columns:
8
+ - {name: id, type: long}
9
+ - {name: title, type: string}
10
+ - {name: status, type: string}
11
+ out:
12
+ type: stdout
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ schema_file: data/test.csv
7
+ schema_line: 1
8
+ columns:
9
+ - {value_name: 'id', name: 'number', type: long}
10
+ - {value_name: 'title', name: 'description', type: string}
11
+ - {value_name: 'status', name: 'ok?', type: string}
12
+ out:
13
+ type: stdout
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-csv_guessable
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - koooge
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-02 00:00:00.000000000 Z
11
+ date: 2017-06-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -38,7 +38,7 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: Parses Csv Guessable files read by other file input plugins.
41
+ description: Parses Guessable Csv files read by other file input plugins.
42
42
  email:
43
43
  - koooooge@gmail.com
44
44
  executables: []
@@ -60,16 +60,17 @@ files:
60
60
  - src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
61
61
  - src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
62
62
  - src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
63
- - src/test/resources/data/test.csv
64
- - src/test/resources/data/test_alias.yml
65
- - src/test/resources/yml/guess_from_header.yml
66
- - src/test/resources/yml/original-csv.yml
67
- - src/test/resources/yml/replace_column_name.yml
63
+ - src/test/resources/org/embulk/parser/csv_guessable/data/test.csv
64
+ - src/test/resources/org/embulk/parser/csv_guessable/data/test_alias.yml
65
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml
66
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/guess_from_header.yml
67
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml
68
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml
68
69
  - classpath/commons-lang3-3.5.jar
70
+ - classpath/embulk-parser-csv_guessable-0.1.3.jar
69
71
  - classpath/opencsv-3.9.jar
70
72
  - classpath/commons-beanutils-1.9.3.jar
71
73
  - classpath/commons-compress-1.10.jar
72
- - classpath/embulk-parser-csv_guessable-0.1.2.jar
73
74
  - classpath/embulk-standards-0.8.22.jar
74
75
  - classpath/commons-collections-3.2.2.jar
75
76
  - classpath/commons-logging-1.2.jar
@@ -96,5 +97,5 @@ rubyforge_project:
96
97
  rubygems_version: 2.1.9
97
98
  signing_key:
98
99
  specification_version: 4
99
- summary: Csv Guessable parser plugin for Embulk
100
+ summary: Guessable Csv parser plugin for Embulk
100
101
  test_files: []
@@ -1,12 +0,0 @@
1
- in:
2
- type: file
3
- path_prefix: data/test
4
- parser:
5
- type: csv_guessable
6
- skip_header_line: 1
7
- columns:
8
- - {name: id, type: long}
9
- - {name: title, type: string}
10
- - {name: status, type: string}
11
- out:
12
- type: stdout
@@ -1,13 +0,0 @@
1
- in:
2
- type: file
3
- path_prefix: data/test
4
- parser:
5
- type: csv_guessable
6
- schema_file: data/test.csv
7
- schema_line: 1
8
- columns:
9
- - {value_name: '#', name: number, type: long}
10
- - {value_name: 'title', name: description, type: string}
11
- - {value_name: 'status', name: ok?, type: string}
12
- out:
13
- type: stdout