embulk-parser-csv_guessable 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90dc39f04076979425a69d11b3177e1e4b1d5e7a
4
- data.tar.gz: 5dca8965baaeb7fbe51f5f9df63f385b09d9bdb1
3
+ metadata.gz: 1bd20694daee60a0018828d263039d4fcbdc28bf
4
+ data.tar.gz: 34b6b8c95c8e8e375059d7ff87dd46314fb3ef55
5
5
  SHA512:
6
- metadata.gz: 61bd54ee36352ab6667654f6dfdfcccd77de37efdd49f3325924538ba3d921737a1e9139691cc8aa8617f47893cef8219f0f50bff454a456df01309ed4668617
7
- data.tar.gz: d6ab6e9d35ae8932ee5aa1a035c8f1440e6c4d1cf41565010278ca88d13a91cd85e2444476106a29cae6bd4312c5b2987d1211dfb45c26e7b6ae9bf511e12b75
6
+ metadata.gz: 844084c1b76193789c12821bf9eb76a3fef969b33e42e2ea9ff6b9935848276412c3fee3af558d04aef3414ba1e93e72bfbab35393cc0a935d994b9f98ff8e4e
7
+ data.tar.gz: 57c246a6c2d551a55de98e8f420354da9c51970ef1e53184d87c0f6a22dd1dde47e9f49e3df4e1a6987a76fd71aac7ee84590e2c06be50a5a1bf2296bcd26398
data/README.md CHANGED
@@ -1,8 +1,11 @@
1
- # Csv Guessable parser plugin for Embulk
2
- **embulk-parser-csv_gussable** (runtime) guesses and parses csv which has schema in header.
1
+ # Guessable csv parser plugin for Embulk
2
+ **embulk-parser-csv_guessable** (runtime) guesses and parses CSV files that have a schema in the header.
3
+
4
+ A CSV file sometimes has a schema in its header.
5
+ **embulk-parser-csv_guessable** parses such a CSV file by using its header fields as column names.
3
6
  This plugin is useful when the target CSV schema changes frequently.
4
7
 
5
- Also it can behave as original csv parser without **embulk-parser-csv_guessable** specified configs.
8
+ It behaves as the original csv parser when the **embulk-parser-csv_guessable** configs (`schema_file` and `schema_line`) are not defined.
6
9
 
7
10
  ## Overview
8
11
 
@@ -12,12 +15,19 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
12
15
  ## Configuration
13
16
 
14
17
  - **schema_file**: filename which has schema.(string, default: `null`)
15
- - **schema_line**: schema line in header. (integer default: `"1"`)
16
- - **(TODO)columns**: Columns attributes for parse. `embulk-parser-csv_guessable` use this config only when `"schema_file"` is set. If `"schema_file"` isn't set, this is same as original csv parser's `"columns"`. (hash, default: `null`)
18
+ - **schema_line**: schema line in header. (integer, default: `1`)
19
+ - **columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `schema_file` is set. If `schema_file` isn't set, this is the same as the original csv parser's `columns`. (hash, default: `null`)
20
+ - **value_name**: Name of the column in the header. The column is renamed to `name`.
21
+ - **name**: Name of the column
22
+ - **type**: Type of the column
23
+ - **format**: Format of the timestamp if type is timestamp
24
+ - **date**: Set date part if the format doesn't include date part
17
25
  - any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
18
26
 
27
+ The `columns`
28
+
19
29
  ## Example
20
- test.csv
30
+ test.csv (There is a schema at the first line.)
21
31
 
22
32
  ```csv
23
33
  id, title, description
@@ -32,11 +42,11 @@ in:
32
42
  type: any file input plugin type
33
43
  parser:
34
44
  type: csv_guessable
35
- schema_file: data/test.csv
45
+ schema_file: test.csv
36
46
  schema_line: 1
37
47
  ```
38
48
 
39
- (To explain)
49
+ (For explanation)
40
50
  In case original csv parser
41
51
  config.yml
42
52
  ```yaml
@@ -51,6 +61,22 @@ in:
51
61
  - {name: description, type: string}
52
62
  ```
53
63
 
64
+ ## Example2
65
+ Example of renaming columns and setting their types.
66
+
67
+ ```yaml
68
+ in:
69
+ type: any file input plugin type
70
+ parser:
71
+ type: csv_guessable
72
+ schema_file: test.csv
73
+ schema_line: 1
74
+ columns:
75
+ - {value_name: 'id', name: 'number', type: long}
76
+ - {value_name: 'title', name: 'description', type: string}
77
+ - {value_name: 'status', name: 'ok?', type: string}
78
+ ```
79
+
54
80
  <!--
55
81
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
56
82
  -->
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.1.3"
17
17
 
18
18
  sourceCompatibility = 1.7
19
19
  targetCompatibility = 1.7
@@ -25,6 +25,7 @@ dependencies {
25
25
  provided "org.embulk:embulk-core:0.8.22"
26
26
  testCompile "junit:junit:4.+"
27
27
  testCompile "org.embulk:embulk-core:0.8.+:tests"
28
+ testCompile "org.embulk:embulk-standards:0.8.+:tests"
28
29
  }
29
30
 
30
31
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -78,8 +79,8 @@ Gem::Specification.new do |spec|
78
79
  spec.name = "${project.name}"
79
80
  spec.version = "${project.version}"
80
81
  spec.authors = ["koooge"]
81
- spec.summary = %[Csv Guessable parser plugin for Embulk]
82
- spec.description = %[Parses Csv Guessable files read by other file input plugins.]
82
+ spec.summary = %[Guessable Csv parser plugin for Embulk]
83
+ spec.description = %[Parses Guessable Csv files read by other file input plugins.]
83
84
  spec.email = ["koooooge@gmail.com"]
84
85
  spec.licenses = ["MIT"]
85
86
  spec.homepage = "https://github.com/koooge/embulk-parser-csv_guessable"
@@ -40,6 +40,7 @@ import java.nio.charset.StandardCharsets;
40
40
  import java.nio.file.Files;
41
41
  import java.nio.file.Path;
42
42
  import java.util.ArrayList;
43
+ import java.util.List;
43
44
 
44
45
  public class CsvGuessableParserPlugin
45
46
  extends CsvParserPlugin
@@ -132,23 +133,38 @@ public class CsvGuessableParserPlugin
132
133
  PluginTask task = config.loadConfig(PluginTask.class);
133
134
  SchemaConfig schemaConfig = null;
134
135
 
135
- if (task.getSchemaFile().isPresent()) { /* embulk-parser-csv_guessable */
136
- if (task.getHeaderLine().isPresent()) {
137
- // TODO: use 'columns' as hints for guess
138
- throw new ConfigException("embulk-parsre-csv_gussable will use 'columnes' as hints for guess as hints for guess. Please delete 'columnes' now.");
139
- }
140
- else { /* guess from header */
141
- int schemaLine = task.getSchemaLine();
142
- task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
143
-
144
- String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
145
- log.debug(header);
146
- ArrayList<ColumnConfig> columns = newColumns(header, config);
147
- log.debug(columns.toString());
148
- schemaConfig = new SchemaConfig(columns);
136
+ if (task.getSchemaFile().isPresent()) {
137
+ int schemaLine = task.getSchemaLine();
138
+ task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
139
+
140
+ String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
141
+ log.debug(header);
142
+ ArrayList<ColumnConfig> schema = newColumns(header, config);
143
+
144
+ /* alias and set type */
145
+ if (task.getSchemaConfig().isPresent()) {
146
+ List<ColumnConfig> columns = task.getSchemaConfig().get().getColumns();
147
+ for (ColumnConfig column : columns) {
148
+ String name = column.getName();
149
+ try {
150
+ name = column.getConfigSource().get(String.class, "value_name");
151
+ }
152
+ catch (ConfigException e) {
153
+ /* only setType */
154
+ }
155
+ for (int i = 0; i < schema.size(); ++i) {
156
+ ColumnConfig c = schema.get(i);
157
+ if (c.getName().equals(name)) {
158
+ schema.set(i, new ColumnConfig(name, column.getType(), column.getOption()));
159
+ }
160
+ }
161
+ }
149
162
  }
163
+
164
+ log.debug(schema.toString());
165
+ schemaConfig = new SchemaConfig(schema);
150
166
  }
151
- else { /* embulk-parser-csv embulk */
167
+ else if (task.getSchemaConfig().isPresent()) { /* original CsvParserPlugin */
152
168
  // backward compatibility
153
169
  if (task.getHeaderLine().isPresent()) {
154
170
  if (task.getSkipHeaderLines() > 0) {
@@ -163,6 +179,9 @@ public class CsvGuessableParserPlugin
163
179
  }
164
180
  schemaConfig = task.getSchemaConfig().get();
165
181
  }
182
+ else {
183
+ throw new ConfigException("Field 'columns' or 'schema_file' is required but not set");
184
+ }
166
185
 
167
186
  control.run(task.dump(), schemaConfig.toSchema());
168
187
  }
@@ -377,7 +396,7 @@ public class CsvGuessableParserPlugin
377
396
 
378
397
  private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
379
398
  {
380
- ArrayList columns = new ArrayList<ArrayList>();
399
+ ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
381
400
  PluginTask task = config.loadConfig(PluginTask.class);
382
401
 
383
402
  try (CSVReader reader = new CSVReader(new StringReader(header))) {
@@ -1,18 +1,34 @@
1
1
  package org.embulk.parser.csv_guessable;
2
2
 
3
+ import com.google.common.collect.ImmutableList;
3
4
  import org.embulk.EmbulkTestRuntime;
4
5
  import org.embulk.config.ConfigException;
5
6
  import org.embulk.config.ConfigLoader;
6
7
  import org.embulk.config.ConfigSource;
8
+ import org.embulk.config.TaskSource;
7
9
  import org.embulk.spi.Exec;
10
+ import org.embulk.spi.FileInput;
11
+ import org.embulk.spi.ParserPlugin;
12
+ import org.embulk.spi.Schema;
13
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
14
+ import org.embulk.spi.util.InputStreamFileInput;
15
+ import org.embulk.standards.TestCsvParserPlugin;
16
+ import org.junit.Before;
8
17
  import org.junit.Rule;
9
18
  import org.junit.Test;
10
19
  import org.junit.rules.ExpectedException;
11
20
 
21
+ import java.io.File;
22
+ import java.io.FileInputStream;
23
+ import java.io.IOException;
24
+ import java.io.InputStream;
25
+
12
26
  import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
13
- import static org.junit.Assert.assertFalse;
27
+ import static org.junit.Assert.assertEquals;
28
+ import static org.junit.Assert.assertNull;
14
29
 
15
30
  public class TestCsvGuessableParserPlugin
31
+ extends TestCsvParserPlugin
16
32
  {
17
33
  @Rule
18
34
  public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
@@ -20,61 +36,118 @@ public class TestCsvGuessableParserPlugin
20
36
  @Rule
21
37
  public ExpectedException exception = ExpectedException.none();
22
38
 
39
+ private CsvGuessableParserPlugin plugin;
40
+ private MockPageOutput output;
41
+
42
+ @Before
43
+ public void createResouce()
44
+ {
45
+ plugin = new CsvGuessableParserPlugin();
46
+ output = new MockPageOutput();
47
+ }
48
+
23
49
  private ConfigSource getConfigFromYaml(String yaml)
24
50
  {
25
51
  ConfigLoader loader = new ConfigLoader(Exec.getModelManager());
26
52
  return loader.fromYamlString(yaml);
27
53
  }
28
54
 
29
- /*
30
- @Test
31
- public void throwExceptionWithoutRequisite()
55
+ @Test(expected = ConfigException.class)
56
+ public void checkColumnsRequired()
32
57
  {
33
58
  String configYaml = "" +
34
- "type: csv_with_header";
35
-
59
+ "type: csv_guessable";
36
60
  ConfigSource config = getConfigFromYaml(configYaml);
61
+ PluginTask task = config.loadConfig(PluginTask.class);
37
62
 
38
- exception.expect(ConfigException.class);
39
- exception.expectMessage("Field either 'columns' or 'schema_file' is required but not set");
63
+ if (!task.getSchemaConfig().isPresent() && !task.getSchemaFile().isPresent()) {
64
+ throw new ConfigException("Field 'columns' or 'schema_line' is required but not set");
65
+ }
40
66
  }
41
- */
42
67
 
43
68
  @Test
44
69
  public void defaultValue()
45
70
  {
46
71
  String configYaml = "" +
47
- "type: csv_with_header\n" +
48
- "schema_line: 2";
72
+ "type: csv_guessable";
49
73
  ConfigSource config = getConfigFromYaml(configYaml);
50
74
  PluginTask task = config.loadConfig(PluginTask.class);
51
75
 
52
- assertFalse(task.getSchemaConfig().isPresent());
53
- assertFalse(task.getSchemaFile().isPresent());
76
+ assertNull(task.getSchemaConfig().orNull());
77
+ assertNull(task.getSchemaFile().orNull());
78
+ assertEquals(1, task.getSchemaLine());
54
79
  }
55
80
 
56
- /*
57
81
  @Test
58
82
  public void originalCsvParserPlugin()
59
83
  {
60
84
  String configYaml = "" +
61
- "type: csv_with_header\n" +
85
+ "type: csv_guessable\n" +
62
86
  "columns:\n" +
63
87
  " - {name: id, type: long}\n" +
64
88
  " - {name: title, type: string}\n" +
65
89
  " - {name: status, type: string}";
66
90
  ConfigSource config = getConfigFromYaml(configYaml);
67
91
  PluginTask task = config.loadConfig(PluginTask.class);
92
+
93
+ // TODO: impl or extends
68
94
  }
69
95
 
70
96
  @Test
71
- public void csvGuessable()
97
+ public void guessableCsv()
98
+ throws Exception
72
99
  {
100
+ String configYaml = "" +
101
+ "type: csv_guessable\n" +
102
+ "schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
103
+ "schema_line: 1";
104
+ ConfigSource config = getConfigFromYaml(configYaml);
105
+ PluginTask task = config.loadConfig(PluginTask.class);
106
+ // transaction(config, fileInput(new File(this.getClass().getResource("data/test.csv").getPath())));
107
+
108
+ // TODO: impl
73
109
  }
74
110
 
75
111
  @Test
76
- public void replaceColumnsName()
112
+ public void replaceColumnsMetadata()
113
+ throws Exception
114
+ {
115
+ String configYaml = "" +
116
+ "type: csv_guessable\n" +
117
+ "schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
118
+ "schema_line: 1\n" +
119
+ "columns:\n" +
120
+ "- {value_name: '#', name: 'number', type: long}\n" +
121
+ "- {value_name: 'title', name: 'description', type: string}\n" +
122
+ "- {value_name: 'status', name: 'ok?', type: string}";
123
+ ConfigSource config = getConfigFromYaml(configYaml);
124
+
125
+ // TODO: impl
126
+ }
127
+
128
+ private void transaction(ConfigSource config, final FileInput input)
129
+ {
130
+ plugin.transaction(config, new ParserPlugin.Control()
131
+ {
132
+ @Override
133
+ public void run(TaskSource taskSource, Schema schema)
134
+ {
135
+ plugin.run(taskSource, schema, input, output);
136
+ }
137
+ });
138
+ }
139
+
140
+ private FileInput fileInput(File file)
141
+ throws Exception
142
+ {
143
+ FileInputStream in = new FileInputStream(file);
144
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
145
+ }
146
+
147
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
148
+ throws IOException
77
149
  {
150
+ return new InputStreamFileInput.IteratorProvider(
151
+ ImmutableList.copyOf(inputStreams));
78
152
  }
79
- */
80
153
  }
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ schema_file: data/test.csv
7
+ schema_line: 1
8
+ columns:
9
+ - {name: 'id', type: long}
10
+ - {name: 'title', type: string}
11
+ - {name: 'status', type: string}
12
+ out:
13
+ type: stdout
@@ -0,0 +1,12 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ skip_header_lines: 1
7
+ columns:
8
+ - {name: id, type: long}
9
+ - {name: title, type: string}
10
+ - {name: status, type: string}
11
+ out:
12
+ type: stdout
@@ -0,0 +1,13 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data/test
4
+ parser:
5
+ type: csv_guessable
6
+ schema_file: data/test.csv
7
+ schema_line: 1
8
+ columns:
9
+ - {value_name: 'id', name: 'number', type: long}
10
+ - {value_name: 'title', name: 'description', type: string}
11
+ - {value_name: 'status', name: 'ok?', type: string}
12
+ out:
13
+ type: stdout
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-csv_guessable
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - koooge
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-02 00:00:00.000000000 Z
11
+ date: 2017-06-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -38,7 +38,7 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
- description: Parses Csv Guessable files read by other file input plugins.
41
+ description: Parses Guessable Csv files read by other file input plugins.
42
42
  email:
43
43
  - koooooge@gmail.com
44
44
  executables: []
@@ -60,16 +60,17 @@ files:
60
60
  - src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
61
61
  - src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
62
62
  - src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
63
- - src/test/resources/data/test.csv
64
- - src/test/resources/data/test_alias.yml
65
- - src/test/resources/yml/guess_from_header.yml
66
- - src/test/resources/yml/original-csv.yml
67
- - src/test/resources/yml/replace_column_name.yml
63
+ - src/test/resources/org/embulk/parser/csv_guessable/data/test.csv
64
+ - src/test/resources/org/embulk/parser/csv_guessable/data/test_alias.yml
65
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml
66
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/guess_from_header.yml
67
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml
68
+ - src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml
68
69
  - classpath/commons-lang3-3.5.jar
70
+ - classpath/embulk-parser-csv_guessable-0.1.3.jar
69
71
  - classpath/opencsv-3.9.jar
70
72
  - classpath/commons-beanutils-1.9.3.jar
71
73
  - classpath/commons-compress-1.10.jar
72
- - classpath/embulk-parser-csv_guessable-0.1.2.jar
73
74
  - classpath/embulk-standards-0.8.22.jar
74
75
  - classpath/commons-collections-3.2.2.jar
75
76
  - classpath/commons-logging-1.2.jar
@@ -96,5 +97,5 @@ rubyforge_project:
96
97
  rubygems_version: 2.1.9
97
98
  signing_key:
98
99
  specification_version: 4
99
- summary: Csv Guessable parser plugin for Embulk
100
+ summary: Guessable Csv parser plugin for Embulk
100
101
  test_files: []
@@ -1,12 +0,0 @@
1
- in:
2
- type: file
3
- path_prefix: data/test
4
- parser:
5
- type: csv_guessable
6
- skip_header_line: 1
7
- columns:
8
- - {name: id, type: long}
9
- - {name: title, type: string}
10
- - {name: status, type: string}
11
- out:
12
- type: stdout
@@ -1,13 +0,0 @@
1
- in:
2
- type: file
3
- path_prefix: data/test
4
- parser:
5
- type: csv_guessable
6
- schema_file: data/test.csv
7
- schema_line: 1
8
- columns:
9
- - {value_name: '#', name: number, type: long}
10
- - {value_name: 'title', name: description, type: string}
11
- - {value_name: 'status', name: ok?, type: string}
12
- out:
13
- type: stdout