embulk-parser-csv_guessable 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +34 -8
- data/build.gradle +4 -3
- data/src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java +35 -16
- data/src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java +91 -18
- data/src/test/resources/{data → org/embulk/parser/csv_guessable/data}/test.csv +0 -0
- data/src/test/resources/{data → org/embulk/parser/csv_guessable/data}/test_alias.yml +0 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml +13 -0
- data/src/test/resources/{yml → org/embulk/parser/csv_guessable/yml}/guess_from_header.yml +0 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml +12 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml +13 -0
- metadata +11 -10
- data/src/test/resources/yml/original-csv.yml +0 -12
- data/src/test/resources/yml/replace_column_name.yml +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1bd20694daee60a0018828d263039d4fcbdc28bf
|
4
|
+
data.tar.gz: 34b6b8c95c8e8e375059d7ff87dd46314fb3ef55
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 844084c1b76193789c12821bf9eb76a3fef969b33e42e2ea9ff6b9935848276412c3fee3af558d04aef3414ba1e93e72bfbab35393cc0a935d994b9f98ff8e4e
|
7
|
+
data.tar.gz: 57c246a6c2d551a55de98e8f420354da9c51970ef1e53184d87c0f6a22dd1dde47e9f49e3df4e1a6987a76fd71aac7ee84590e2c06be50a5a1bf2296bcd26398
|
data/README.md
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
#
|
2
|
-
**embulk-parser-
|
1
|
+
# Guessable csv parser plugin for Embulk
|
2
|
+
**embulk-parser-csv_guessable** guesses (at runtime) and parses CSV files that have a schema in the header.
|
3
|
+
|
4
|
+
CSV files sometimes have a schema in the header.
|
5
|
+
**embulk-parser-csv_guessable** parses such a CSV file by using its header as the column names.
|
3
6
|
This plugin is useful when the target CSV schema changes frequently.
|
4
7
|
|
5
|
-
|
8
|
+
It behaves as the original csv parser when the **embulk-parser-csv_guessable** configs (`schema_file` and `schema_line`) are not defined.
|
6
9
|
|
7
10
|
## Overview
|
8
11
|
|
@@ -12,12 +15,19 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
|
|
12
15
|
## Configuration
|
13
16
|
|
14
17
|
- **schema_file**: Name of the file that contains the schema. (string, default: `null`)
|
15
|
-
- **schema_line**: schema line in header. (integer default: `
|
16
|
-
- **
|
18
|
+
- **schema_line**: Line number of the schema in the header. (integer, default: `1`)
|
19
|
+
- **columns**: Column attributes used for parsing. `embulk-parser-csv_guessable` uses this config only when `schema_file` is set. If `schema_file` isn't set, this is the same as the original csv parser's `columns`. (hash, default: `null`)
|
20
|
+
- **value_name**: Name of the column as it appears in the header; the column is renamed to `name`.
|
21
|
+
- **name**: Name of the column
|
22
|
+
- **type**: Type of the column
|
23
|
+
- **format**: Format of the timestamp if type is timestamp
|
24
|
+
- **date**: Set date part if the format doesn't include date part
|
17
25
|
- any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
|
18
26
|
|
27
|
+
The `columns`
|
28
|
+
|
19
29
|
## Example
|
20
|
-
test.csv
|
30
|
+
test.csv (There is a schema at the first line.)
|
21
31
|
|
22
32
|
```csv
|
23
33
|
id, title, description
|
@@ -32,11 +42,11 @@ in:
|
|
32
42
|
type: any file input plugin type
|
33
43
|
parser:
|
34
44
|
type: csv_guessable
|
35
|
-
schema_file:
|
45
|
+
schema_file: test.csv
|
36
46
|
schema_line: 1
|
37
47
|
```
|
38
48
|
|
39
|
-
(
|
49
|
+
(For explanation)
|
40
50
|
In case original csv parser
|
41
51
|
config.yml
|
42
52
|
```yaml
|
@@ -51,6 +61,22 @@ in:
|
|
51
61
|
- {name: description, type: string}
|
52
62
|
```
|
53
63
|
|
64
|
+
## Example2
|
65
|
+
Example of renaming columns and setting their types:
|
66
|
+
|
67
|
+
```yaml
|
68
|
+
in:
|
69
|
+
type: any file input plugin type
|
70
|
+
parser:
|
71
|
+
type: csv_guessable
|
72
|
+
schema_file: test.csv
|
73
|
+
schema_line: 1
|
74
|
+
columns:
|
75
|
+
- {value_name: 'id', name: 'number', type: long}
|
76
|
+
- {value_name: 'title', name: 'description', type: string}
|
77
|
+
- {value_name: 'status', name: 'ok?', type: string}
|
78
|
+
```
|
79
|
+
|
54
80
|
<!--
|
55
81
|
(If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
|
56
82
|
-->
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.3"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
@@ -25,6 +25,7 @@ dependencies {
|
|
25
25
|
provided "org.embulk:embulk-core:0.8.22"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
27
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
28
|
+
testCompile "org.embulk:embulk-standards:0.8.+:tests"
|
28
29
|
}
|
29
30
|
|
30
31
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
@@ -78,8 +79,8 @@ Gem::Specification.new do |spec|
|
|
78
79
|
spec.name = "${project.name}"
|
79
80
|
spec.version = "${project.version}"
|
80
81
|
spec.authors = ["koooge"]
|
81
|
-
spec.summary = %[Csv
|
82
|
-
spec.description = %[Parses Csv
|
82
|
+
spec.summary = %[Guessable Csv parser plugin for Embulk]
|
83
|
+
spec.description = %[Parses Guessable Csv files read by other file input plugins.]
|
83
84
|
spec.email = ["koooooge@gmail.com"]
|
84
85
|
spec.licenses = ["MIT"]
|
85
86
|
spec.homepage = "https://github.com/koooge/embulk-parser-csv_guessable"
|
@@ -40,6 +40,7 @@ import java.nio.charset.StandardCharsets;
|
|
40
40
|
import java.nio.file.Files;
|
41
41
|
import java.nio.file.Path;
|
42
42
|
import java.util.ArrayList;
|
43
|
+
import java.util.List;
|
43
44
|
|
44
45
|
public class CsvGuessableParserPlugin
|
45
46
|
extends CsvParserPlugin
|
@@ -132,23 +133,38 @@ public class CsvGuessableParserPlugin
|
|
132
133
|
PluginTask task = config.loadConfig(PluginTask.class);
|
133
134
|
SchemaConfig schemaConfig = null;
|
134
135
|
|
135
|
-
if (task.getSchemaFile().isPresent()) {
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
136
|
+
if (task.getSchemaFile().isPresent()) {
|
137
|
+
int schemaLine = task.getSchemaLine();
|
138
|
+
task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
|
139
|
+
|
140
|
+
String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
|
141
|
+
log.debug(header);
|
142
|
+
ArrayList<ColumnConfig> schema = newColumns(header, config);
|
143
|
+
|
144
|
+
/* alias and set type */
|
145
|
+
if (task.getSchemaConfig().isPresent()) {
|
146
|
+
List<ColumnConfig> columns = task.getSchemaConfig().get().getColumns();
|
147
|
+
for (ColumnConfig column : columns) {
|
148
|
+
String name = column.getName();
|
149
|
+
try {
|
150
|
+
name = column.getConfigSource().get(String.class, "value_name");
|
151
|
+
}
|
152
|
+
catch (ConfigException e) {
|
153
|
+
/* only setType */
|
154
|
+
}
|
155
|
+
for (int i = 0; i < schema.size(); ++i) {
|
156
|
+
ColumnConfig c = schema.get(i);
|
157
|
+
if (c.getName().equals(name)) {
|
158
|
+
schema.set(i, new ColumnConfig(name, column.getType(), column.getOption()));
|
159
|
+
}
|
160
|
+
}
|
161
|
+
}
|
149
162
|
}
|
163
|
+
|
164
|
+
log.debug(schema.toString());
|
165
|
+
schemaConfig = new SchemaConfig(schema);
|
150
166
|
}
|
151
|
-
else { /*
|
167
|
+
else if (task.getSchemaConfig().isPresent()) { /* original CsvParserPlugin */
|
152
168
|
// backward compatibility
|
153
169
|
if (task.getHeaderLine().isPresent()) {
|
154
170
|
if (task.getSkipHeaderLines() > 0) {
|
@@ -163,6 +179,9 @@ public class CsvGuessableParserPlugin
|
|
163
179
|
}
|
164
180
|
schemaConfig = task.getSchemaConfig().get();
|
165
181
|
}
|
182
|
+
else {
|
183
|
+
throw new ConfigException("Field 'columns' or 'schema_file' is required but not set");
|
184
|
+
}
|
166
185
|
|
167
186
|
control.run(task.dump(), schemaConfig.toSchema());
|
168
187
|
}
|
@@ -377,7 +396,7 @@ public class CsvGuessableParserPlugin
|
|
377
396
|
|
378
397
|
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
379
398
|
{
|
380
|
-
ArrayList columns = new ArrayList<
|
399
|
+
ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
|
381
400
|
PluginTask task = config.loadConfig(PluginTask.class);
|
382
401
|
|
383
402
|
try (CSVReader reader = new CSVReader(new StringReader(header))) {
|
@@ -1,18 +1,34 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
|
+
import com.google.common.collect.ImmutableList;
|
3
4
|
import org.embulk.EmbulkTestRuntime;
|
4
5
|
import org.embulk.config.ConfigException;
|
5
6
|
import org.embulk.config.ConfigLoader;
|
6
7
|
import org.embulk.config.ConfigSource;
|
8
|
+
import org.embulk.config.TaskSource;
|
7
9
|
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.FileInput;
|
11
|
+
import org.embulk.spi.ParserPlugin;
|
12
|
+
import org.embulk.spi.Schema;
|
13
|
+
import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
|
14
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
15
|
+
import org.embulk.standards.TestCsvParserPlugin;
|
16
|
+
import org.junit.Before;
|
8
17
|
import org.junit.Rule;
|
9
18
|
import org.junit.Test;
|
10
19
|
import org.junit.rules.ExpectedException;
|
11
20
|
|
21
|
+
import java.io.File;
|
22
|
+
import java.io.FileInputStream;
|
23
|
+
import java.io.IOException;
|
24
|
+
import java.io.InputStream;
|
25
|
+
|
12
26
|
import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
|
13
|
-
import static org.junit.Assert.
|
27
|
+
import static org.junit.Assert.assertEquals;
|
28
|
+
import static org.junit.Assert.assertNull;
|
14
29
|
|
15
30
|
public class TestCsvGuessableParserPlugin
|
31
|
+
extends TestCsvParserPlugin
|
16
32
|
{
|
17
33
|
@Rule
|
18
34
|
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
@@ -20,61 +36,118 @@ public class TestCsvGuessableParserPlugin
|
|
20
36
|
@Rule
|
21
37
|
public ExpectedException exception = ExpectedException.none();
|
22
38
|
|
39
|
+
private CsvGuessableParserPlugin plugin;
|
40
|
+
private MockPageOutput output;
|
41
|
+
|
42
|
+
@Before
|
43
|
+
public void createResouce()
|
44
|
+
{
|
45
|
+
plugin = new CsvGuessableParserPlugin();
|
46
|
+
output = new MockPageOutput();
|
47
|
+
}
|
48
|
+
|
23
49
|
private ConfigSource getConfigFromYaml(String yaml)
|
24
50
|
{
|
25
51
|
ConfigLoader loader = new ConfigLoader(Exec.getModelManager());
|
26
52
|
return loader.fromYamlString(yaml);
|
27
53
|
}
|
28
54
|
|
29
|
-
|
30
|
-
|
31
|
-
public void throwExceptionWithoutRequisite()
|
55
|
+
@Test(expected = ConfigException.class)
|
56
|
+
public void checkColumnsRequired()
|
32
57
|
{
|
33
58
|
String configYaml = "" +
|
34
|
-
"type:
|
35
|
-
|
59
|
+
"type: csv_guessable";
|
36
60
|
ConfigSource config = getConfigFromYaml(configYaml);
|
61
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
37
62
|
|
38
|
-
|
39
|
-
|
63
|
+
if (!task.getSchemaConfig().isPresent() && !task.getSchemaFile().isPresent()) {
|
64
|
+
throw new ConfigException("Field 'columns' or 'schema_line' is required but not set");
|
65
|
+
}
|
40
66
|
}
|
41
|
-
*/
|
42
67
|
|
43
68
|
@Test
|
44
69
|
public void defaultValue()
|
45
70
|
{
|
46
71
|
String configYaml = "" +
|
47
|
-
"type:
|
48
|
-
"schema_line: 2";
|
72
|
+
"type: csv_guessable";
|
49
73
|
ConfigSource config = getConfigFromYaml(configYaml);
|
50
74
|
PluginTask task = config.loadConfig(PluginTask.class);
|
51
75
|
|
52
|
-
|
53
|
-
|
76
|
+
assertNull(task.getSchemaConfig().orNull());
|
77
|
+
assertNull(task.getSchemaFile().orNull());
|
78
|
+
assertEquals(1, task.getSchemaLine());
|
54
79
|
}
|
55
80
|
|
56
|
-
/*
|
57
81
|
@Test
|
58
82
|
public void originalCsvParserPlugin()
|
59
83
|
{
|
60
84
|
String configYaml = "" +
|
61
|
-
"type:
|
85
|
+
"type: csv_guessable\n" +
|
62
86
|
"columns:\n" +
|
63
87
|
" - {name: id, type: long}\n" +
|
64
88
|
" - {name: title, type: string}\n" +
|
65
89
|
" - {name: status, type: string}";
|
66
90
|
ConfigSource config = getConfigFromYaml(configYaml);
|
67
91
|
PluginTask task = config.loadConfig(PluginTask.class);
|
92
|
+
|
93
|
+
// TODO: impl or extends
|
68
94
|
}
|
69
95
|
|
70
96
|
@Test
|
71
|
-
public void
|
97
|
+
public void guessableCsv()
|
98
|
+
throws Exception
|
72
99
|
{
|
100
|
+
String configYaml = "" +
|
101
|
+
"type: csv_guessable\n" +
|
102
|
+
"schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
|
103
|
+
"schema_line: 1";
|
104
|
+
ConfigSource config = getConfigFromYaml(configYaml);
|
105
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
106
|
+
// transaction(config, fileInput(new File(this.getClass().getResource("data/test.csv").getPath())));
|
107
|
+
|
108
|
+
// TODO: impl
|
73
109
|
}
|
74
110
|
|
75
111
|
@Test
|
76
|
-
public void
|
112
|
+
public void replaceColumnsMetadata()
|
113
|
+
throws Exception
|
114
|
+
{
|
115
|
+
String configYaml = "" +
|
116
|
+
"type: csv_guessable\n" +
|
117
|
+
"schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
|
118
|
+
"schema_line: 1\n" +
|
119
|
+
"columns:\n" +
|
120
|
+
"- {value_name: '#', name: 'number', type: long}\n" +
|
121
|
+
"- {value_name: 'title', name: 'description', type: string}\n" +
|
122
|
+
"- {value_name: 'status', name: 'ok?', type: string}";
|
123
|
+
ConfigSource config = getConfigFromYaml(configYaml);
|
124
|
+
|
125
|
+
// TODO: impl
|
126
|
+
}
|
127
|
+
|
128
|
+
private void transaction(ConfigSource config, final FileInput input)
|
129
|
+
{
|
130
|
+
plugin.transaction(config, new ParserPlugin.Control()
|
131
|
+
{
|
132
|
+
@Override
|
133
|
+
public void run(TaskSource taskSource, Schema schema)
|
134
|
+
{
|
135
|
+
plugin.run(taskSource, schema, input, output);
|
136
|
+
}
|
137
|
+
});
|
138
|
+
}
|
139
|
+
|
140
|
+
private FileInput fileInput(File file)
|
141
|
+
throws Exception
|
142
|
+
{
|
143
|
+
FileInputStream in = new FileInputStream(file);
|
144
|
+
return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
|
145
|
+
}
|
146
|
+
|
147
|
+
private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
|
148
|
+
throws IOException
|
77
149
|
{
|
150
|
+
return new InputStreamFileInput.IteratorProvider(
|
151
|
+
ImmutableList.copyOf(inputStreams));
|
78
152
|
}
|
79
|
-
*/
|
80
153
|
}
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data/test
|
4
|
+
parser:
|
5
|
+
type: csv_guessable
|
6
|
+
schema_file: data/test.csv
|
7
|
+
schema_line: 1
|
8
|
+
columns:
|
9
|
+
- {value_name: 'id', name: 'number', type: long}
|
10
|
+
- {value_name: 'title', name: 'description', type: string}
|
11
|
+
- {value_name: 'status', name: 'ok?', type: string}
|
12
|
+
out:
|
13
|
+
type: stdout
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-csv_guessable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- koooge
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description: Parses Csv
|
41
|
+
description: Parses Guessable Csv files read by other file input plugins.
|
42
42
|
email:
|
43
43
|
- koooooge@gmail.com
|
44
44
|
executables: []
|
@@ -60,16 +60,17 @@ files:
|
|
60
60
|
- src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
|
61
61
|
- src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
|
62
62
|
- src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
|
63
|
-
- src/test/resources/data/test.csv
|
64
|
-
- src/test/resources/data/test_alias.yml
|
65
|
-
- src/test/resources/yml/
|
66
|
-
- src/test/resources/yml/
|
67
|
-
- src/test/resources/yml/
|
63
|
+
- src/test/resources/org/embulk/parser/csv_guessable/data/test.csv
|
64
|
+
- src/test/resources/org/embulk/parser/csv_guessable/data/test_alias.yml
|
65
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml
|
66
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/guess_from_header.yml
|
67
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml
|
68
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml
|
68
69
|
- classpath/commons-lang3-3.5.jar
|
70
|
+
- classpath/embulk-parser-csv_guessable-0.1.3.jar
|
69
71
|
- classpath/opencsv-3.9.jar
|
70
72
|
- classpath/commons-beanutils-1.9.3.jar
|
71
73
|
- classpath/commons-compress-1.10.jar
|
72
|
-
- classpath/embulk-parser-csv_guessable-0.1.2.jar
|
73
74
|
- classpath/embulk-standards-0.8.22.jar
|
74
75
|
- classpath/commons-collections-3.2.2.jar
|
75
76
|
- classpath/commons-logging-1.2.jar
|
@@ -96,5 +97,5 @@ rubyforge_project:
|
|
96
97
|
rubygems_version: 2.1.9
|
97
98
|
signing_key:
|
98
99
|
specification_version: 4
|
99
|
-
summary: Csv
|
100
|
+
summary: Guessable Csv parser plugin for Embulk
|
100
101
|
test_files: []
|
@@ -1,13 +0,0 @@
|
|
1
|
-
in:
|
2
|
-
type: file
|
3
|
-
path_prefix: data/test
|
4
|
-
parser:
|
5
|
-
type: csv_guessable
|
6
|
-
schema_file: data/test.csv
|
7
|
-
schema_line: 1
|
8
|
-
columns:
|
9
|
-
- {value_name: '#', name: number, type: long}
|
10
|
-
- {value_name: 'title', name: description, type: string}
|
11
|
-
- {value_name: 'status', name: ok?, type: string}
|
12
|
-
out:
|
13
|
-
type: stdout
|