embulk-parser-csv_guessable 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +34 -8
- data/build.gradle +4 -3
- data/src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java +35 -16
- data/src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java +91 -18
- data/src/test/resources/{data → org/embulk/parser/csv_guessable/data}/test.csv +0 -0
- data/src/test/resources/{data → org/embulk/parser/csv_guessable/data}/test_alias.yml +0 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml +13 -0
- data/src/test/resources/{yml → org/embulk/parser/csv_guessable/yml}/guess_from_header.yml +0 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml +12 -0
- data/src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml +13 -0
- metadata +11 -10
- data/src/test/resources/yml/original-csv.yml +0 -12
- data/src/test/resources/yml/replace_column_name.yml +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1bd20694daee60a0018828d263039d4fcbdc28bf
|
4
|
+
data.tar.gz: 34b6b8c95c8e8e375059d7ff87dd46314fb3ef55
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 844084c1b76193789c12821bf9eb76a3fef969b33e42e2ea9ff6b9935848276412c3fee3af558d04aef3414ba1e93e72bfbab35393cc0a935d994b9f98ff8e4e
|
7
|
+
data.tar.gz: 57c246a6c2d551a55de98e8f420354da9c51970ef1e53184d87c0f6a22dd1dde47e9f49e3df4e1a6987a76fd71aac7ee84590e2c06be50a5a1bf2296bcd26398
|
data/README.md
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
#
|
2
|
-
**embulk-parser-
|
1
|
+
# Guessable csv parser plugin for Embulk
|
2
|
+
**embulk-parser-csv_guessable** (runtime) guesses and parses a csv which has a schema in its header.
|
3
|
+
|
4
|
+
Csv file sometimes has a schema in the header.
|
5
|
+
**embulk-parser-csv_guessable** parses such a csv by using its header as the column names.
|
3
6
|
This plugin is useful when the target csv schema changes frequently.
|
4
7
|
|
5
|
-
|
8
|
+
It behaves as the original csv parser when the **embulk-parser-csv_guessable** configs (`schema_file` and `schema_line`) are not defined.
|
6
9
|
|
7
10
|
## Overview
|
8
11
|
|
@@ -12,12 +15,19 @@ Also it can behave as original csv parser without **embulk-parser-csv_guessable*
|
|
12
15
|
## Configuration
|
13
16
|
|
14
17
|
- **schema_file**: filename which has schema. (string, default: `null`)
|
15
|
-
- **schema_line**: schema line in header. (integer default: `
|
16
|
-
- **
|
18
|
+
- **schema_line**: schema line in header. (integer, default: `1`)
|
19
|
+
- **columns**: Column attributes for parsing. `embulk-parser-csv_guessable` uses this config only when `schema_file` is set. If `schema_file` isn't set, this is the same as the original csv parser's `columns`. (hash, default: `null`)
|
20
|
+
- **value_name**: Name of the column in the header. rename to `name`
|
21
|
+
- **name**: Name of the column
|
22
|
+
- **type**: Type of the column
|
23
|
+
- **format**: Format of the timestamp if type is timestamp
|
24
|
+
- **date**: Set date part if the format doesn't include date part
|
17
25
|
- any other csv configs: see [www.embulk.org](http://www.embulk.org/docs/built-in.html#csv-parser-plugin)
|
18
26
|
|
27
|
+
The `columns`
|
28
|
+
|
19
29
|
## Example
|
20
|
-
test.csv
|
30
|
+
test.csv (There is a schema at the first line.)
|
21
31
|
|
22
32
|
```csv
|
23
33
|
id, title, description
|
@@ -32,11 +42,11 @@ in:
|
|
32
42
|
type: any file input plugin type
|
33
43
|
parser:
|
34
44
|
type: csv_guessable
|
35
|
-
schema_file:
|
45
|
+
schema_file: test.csv
|
36
46
|
schema_line: 1
|
37
47
|
```
|
38
48
|
|
39
|
-
(
|
49
|
+
(For explanation)
|
40
50
|
In case original csv parser
|
41
51
|
config.yml
|
42
52
|
```yaml
|
@@ -51,6 +61,22 @@ in:
|
|
51
61
|
- {name: description, type: string}
|
52
62
|
```
|
53
63
|
|
64
|
+
## Example2
|
65
|
+
rename column name and set type Example
|
66
|
+
|
67
|
+
```yaml
|
68
|
+
in:
|
69
|
+
type: any file input plugin type
|
70
|
+
parser:
|
71
|
+
type: csv_guessable
|
72
|
+
schema_file: test.csv
|
73
|
+
schema_line: 1
|
74
|
+
columns:
|
75
|
+
- {value_name: 'id', name: 'number', type: long}
|
76
|
+
- {value_name: 'title', name: 'description', type: string}
|
77
|
+
- {value_name: 'status', name: 'ok?', type: string}
|
78
|
+
```
|
79
|
+
|
54
80
|
<!--
|
55
81
|
(If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
|
56
82
|
-->
|
data/build.gradle
CHANGED
@@ -13,7 +13,7 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.
|
16
|
+
version = "0.1.3"
|
17
17
|
|
18
18
|
sourceCompatibility = 1.7
|
19
19
|
targetCompatibility = 1.7
|
@@ -25,6 +25,7 @@ dependencies {
|
|
25
25
|
provided "org.embulk:embulk-core:0.8.22"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
27
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
28
|
+
testCompile "org.embulk:embulk-standards:0.8.+:tests"
|
28
29
|
}
|
29
30
|
|
30
31
|
task classpath(type: Copy, dependsOn: ["jar"]) {
|
@@ -78,8 +79,8 @@ Gem::Specification.new do |spec|
|
|
78
79
|
spec.name = "${project.name}"
|
79
80
|
spec.version = "${project.version}"
|
80
81
|
spec.authors = ["koooge"]
|
81
|
-
spec.summary = %[Csv
|
82
|
-
spec.description = %[Parses Csv
|
82
|
+
spec.summary = %[Guessable Csv parser plugin for Embulk]
|
83
|
+
spec.description = %[Parses Guessable Csv files read by other file input plugins.]
|
83
84
|
spec.email = ["koooooge@gmail.com"]
|
84
85
|
spec.licenses = ["MIT"]
|
85
86
|
spec.homepage = "https://github.com/koooge/embulk-parser-csv_guessable"
|
@@ -40,6 +40,7 @@ import java.nio.charset.StandardCharsets;
|
|
40
40
|
import java.nio.file.Files;
|
41
41
|
import java.nio.file.Path;
|
42
42
|
import java.util.ArrayList;
|
43
|
+
import java.util.List;
|
43
44
|
|
44
45
|
public class CsvGuessableParserPlugin
|
45
46
|
extends CsvParserPlugin
|
@@ -132,23 +133,38 @@ public class CsvGuessableParserPlugin
|
|
132
133
|
PluginTask task = config.loadConfig(PluginTask.class);
|
133
134
|
SchemaConfig schemaConfig = null;
|
134
135
|
|
135
|
-
if (task.getSchemaFile().isPresent()) {
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
136
|
+
if (task.getSchemaFile().isPresent()) {
|
137
|
+
int schemaLine = task.getSchemaLine();
|
138
|
+
task.setSkipHeaderLines(schemaLine); // TODO: use 'skip_header_line'
|
139
|
+
|
140
|
+
String header = readHeader(task.getSchemaFile().get().getPath(), schemaLine);
|
141
|
+
log.debug(header);
|
142
|
+
ArrayList<ColumnConfig> schema = newColumns(header, config);
|
143
|
+
|
144
|
+
/* alias and set type */
|
145
|
+
if (task.getSchemaConfig().isPresent()) {
|
146
|
+
List<ColumnConfig> columns = task.getSchemaConfig().get().getColumns();
|
147
|
+
for (ColumnConfig column : columns) {
|
148
|
+
String name = column.getName();
|
149
|
+
try {
|
150
|
+
name = column.getConfigSource().get(String.class, "value_name");
|
151
|
+
}
|
152
|
+
catch (ConfigException e) {
|
153
|
+
/* only setType */
|
154
|
+
}
|
155
|
+
for (int i = 0; i < schema.size(); ++i) {
|
156
|
+
ColumnConfig c = schema.get(i);
|
157
|
+
if (c.getName().equals(name)) {
|
158
|
+
schema.set(i, new ColumnConfig(name, column.getType(), column.getOption()));
|
159
|
+
}
|
160
|
+
}
|
161
|
+
}
|
149
162
|
}
|
163
|
+
|
164
|
+
log.debug(schema.toString());
|
165
|
+
schemaConfig = new SchemaConfig(schema);
|
150
166
|
}
|
151
|
-
else { /*
|
167
|
+
else if (task.getSchemaConfig().isPresent()) { /* original CsvParserPlugin */
|
152
168
|
// backward compatibility
|
153
169
|
if (task.getHeaderLine().isPresent()) {
|
154
170
|
if (task.getSkipHeaderLines() > 0) {
|
@@ -163,6 +179,9 @@ public class CsvGuessableParserPlugin
|
|
163
179
|
}
|
164
180
|
schemaConfig = task.getSchemaConfig().get();
|
165
181
|
}
|
182
|
+
else {
|
183
|
+
throw new ConfigException("Field 'columns' or 'schema_file' is required but not set");
|
184
|
+
}
|
166
185
|
|
167
186
|
control.run(task.dump(), schemaConfig.toSchema());
|
168
187
|
}
|
@@ -377,7 +396,7 @@ public class CsvGuessableParserPlugin
|
|
377
396
|
|
378
397
|
private ArrayList<ColumnConfig> newColumns(String header, ConfigSource config)
|
379
398
|
{
|
380
|
-
ArrayList columns = new ArrayList<
|
399
|
+
ArrayList<ColumnConfig> columns = new ArrayList<ColumnConfig>();
|
381
400
|
PluginTask task = config.loadConfig(PluginTask.class);
|
382
401
|
|
383
402
|
try (CSVReader reader = new CSVReader(new StringReader(header))) {
|
@@ -1,18 +1,34 @@
|
|
1
1
|
package org.embulk.parser.csv_guessable;
|
2
2
|
|
3
|
+
import com.google.common.collect.ImmutableList;
|
3
4
|
import org.embulk.EmbulkTestRuntime;
|
4
5
|
import org.embulk.config.ConfigException;
|
5
6
|
import org.embulk.config.ConfigLoader;
|
6
7
|
import org.embulk.config.ConfigSource;
|
8
|
+
import org.embulk.config.TaskSource;
|
7
9
|
import org.embulk.spi.Exec;
|
10
|
+
import org.embulk.spi.FileInput;
|
11
|
+
import org.embulk.spi.ParserPlugin;
|
12
|
+
import org.embulk.spi.Schema;
|
13
|
+
import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
|
14
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
15
|
+
import org.embulk.standards.TestCsvParserPlugin;
|
16
|
+
import org.junit.Before;
|
8
17
|
import org.junit.Rule;
|
9
18
|
import org.junit.Test;
|
10
19
|
import org.junit.rules.ExpectedException;
|
11
20
|
|
21
|
+
import java.io.File;
|
22
|
+
import java.io.FileInputStream;
|
23
|
+
import java.io.IOException;
|
24
|
+
import java.io.InputStream;
|
25
|
+
|
12
26
|
import static org.embulk.parser.csv_guessable.CsvGuessableParserPlugin.PluginTask;
|
13
|
-
import static org.junit.Assert.
|
27
|
+
import static org.junit.Assert.assertEquals;
|
28
|
+
import static org.junit.Assert.assertNull;
|
14
29
|
|
15
30
|
public class TestCsvGuessableParserPlugin
|
31
|
+
extends TestCsvParserPlugin
|
16
32
|
{
|
17
33
|
@Rule
|
18
34
|
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
@@ -20,61 +36,118 @@ public class TestCsvGuessableParserPlugin
|
|
20
36
|
@Rule
|
21
37
|
public ExpectedException exception = ExpectedException.none();
|
22
38
|
|
39
|
+
private CsvGuessableParserPlugin plugin;
|
40
|
+
private MockPageOutput output;
|
41
|
+
|
42
|
+
@Before
|
43
|
+
public void createResouce()
|
44
|
+
{
|
45
|
+
plugin = new CsvGuessableParserPlugin();
|
46
|
+
output = new MockPageOutput();
|
47
|
+
}
|
48
|
+
|
23
49
|
private ConfigSource getConfigFromYaml(String yaml)
|
24
50
|
{
|
25
51
|
ConfigLoader loader = new ConfigLoader(Exec.getModelManager());
|
26
52
|
return loader.fromYamlString(yaml);
|
27
53
|
}
|
28
54
|
|
29
|
-
|
30
|
-
|
31
|
-
public void throwExceptionWithoutRequisite()
|
55
|
+
@Test(expected = ConfigException.class)
|
56
|
+
public void checkColumnsRequired()
|
32
57
|
{
|
33
58
|
String configYaml = "" +
|
34
|
-
"type:
|
35
|
-
|
59
|
+
"type: csv_guessable";
|
36
60
|
ConfigSource config = getConfigFromYaml(configYaml);
|
61
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
37
62
|
|
38
|
-
|
39
|
-
|
63
|
+
if (!task.getSchemaConfig().isPresent() && !task.getSchemaFile().isPresent()) {
|
64
|
+
throw new ConfigException("Field 'columns' or 'schema_line' is required but not set");
|
65
|
+
}
|
40
66
|
}
|
41
|
-
*/
|
42
67
|
|
43
68
|
@Test
|
44
69
|
public void defaultValue()
|
45
70
|
{
|
46
71
|
String configYaml = "" +
|
47
|
-
"type:
|
48
|
-
"schema_line: 2";
|
72
|
+
"type: csv_guessable";
|
49
73
|
ConfigSource config = getConfigFromYaml(configYaml);
|
50
74
|
PluginTask task = config.loadConfig(PluginTask.class);
|
51
75
|
|
52
|
-
|
53
|
-
|
76
|
+
assertNull(task.getSchemaConfig().orNull());
|
77
|
+
assertNull(task.getSchemaFile().orNull());
|
78
|
+
assertEquals(1, task.getSchemaLine());
|
54
79
|
}
|
55
80
|
|
56
|
-
/*
|
57
81
|
@Test
|
58
82
|
public void originalCsvParserPlugin()
|
59
83
|
{
|
60
84
|
String configYaml = "" +
|
61
|
-
"type:
|
85
|
+
"type: csv_guessable\n" +
|
62
86
|
"columns:\n" +
|
63
87
|
" - {name: id, type: long}\n" +
|
64
88
|
" - {name: title, type: string}\n" +
|
65
89
|
" - {name: status, type: string}";
|
66
90
|
ConfigSource config = getConfigFromYaml(configYaml);
|
67
91
|
PluginTask task = config.loadConfig(PluginTask.class);
|
92
|
+
|
93
|
+
// TODO: impl or extends
|
68
94
|
}
|
69
95
|
|
70
96
|
@Test
|
71
|
-
public void
|
97
|
+
public void guessableCsv()
|
98
|
+
throws Exception
|
72
99
|
{
|
100
|
+
String configYaml = "" +
|
101
|
+
"type: csv_guessable\n" +
|
102
|
+
"schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
|
103
|
+
"schema_line: 1";
|
104
|
+
ConfigSource config = getConfigFromYaml(configYaml);
|
105
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
106
|
+
// transaction(config, fileInput(new File(this.getClass().getResource("data/test.csv").getPath())));
|
107
|
+
|
108
|
+
// TODO: impl
|
73
109
|
}
|
74
110
|
|
75
111
|
@Test
|
76
|
-
public void
|
112
|
+
public void replaceColumnsMetadata()
|
113
|
+
throws Exception
|
114
|
+
{
|
115
|
+
String configYaml = "" +
|
116
|
+
"type: csv_guessable\n" +
|
117
|
+
"schema_file: src/test/resources/org/embulk/parser/csv_guessable/data/test.csv\n" + // TODO: FIX PATH
|
118
|
+
"schema_line: 1\n" +
|
119
|
+
"columns:\n" +
|
120
|
+
"- {value_name: '#', name: 'number', type: long}\n" +
|
121
|
+
"- {value_name: 'title', name: 'description', type: string}\n" +
|
122
|
+
"- {value_name: 'status', name: 'ok?', type: string}";
|
123
|
+
ConfigSource config = getConfigFromYaml(configYaml);
|
124
|
+
|
125
|
+
// TODO: impl
|
126
|
+
}
|
127
|
+
|
128
|
+
private void transaction(ConfigSource config, final FileInput input)
|
129
|
+
{
|
130
|
+
plugin.transaction(config, new ParserPlugin.Control()
|
131
|
+
{
|
132
|
+
@Override
|
133
|
+
public void run(TaskSource taskSource, Schema schema)
|
134
|
+
{
|
135
|
+
plugin.run(taskSource, schema, input, output);
|
136
|
+
}
|
137
|
+
});
|
138
|
+
}
|
139
|
+
|
140
|
+
private FileInput fileInput(File file)
|
141
|
+
throws Exception
|
142
|
+
{
|
143
|
+
FileInputStream in = new FileInputStream(file);
|
144
|
+
return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
|
145
|
+
}
|
146
|
+
|
147
|
+
private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
|
148
|
+
throws IOException
|
77
149
|
{
|
150
|
+
return new InputStreamFileInput.IteratorProvider(
|
151
|
+
ImmutableList.copyOf(inputStreams));
|
78
152
|
}
|
79
|
-
*/
|
80
153
|
}
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data/test
|
4
|
+
parser:
|
5
|
+
type: csv_guessable
|
6
|
+
schema_file: data/test.csv
|
7
|
+
schema_line: 1
|
8
|
+
columns:
|
9
|
+
- {value_name: 'id', name: 'number', type: long}
|
10
|
+
- {value_name: 'title', name: 'description', type: string}
|
11
|
+
- {value_name: 'status', name: 'ok?', type: string}
|
12
|
+
out:
|
13
|
+
type: stdout
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-csv_guessable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- koooge
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description: Parses Csv
|
41
|
+
description: Parses Guessable Csv files read by other file input plugins.
|
42
42
|
email:
|
43
43
|
- koooooge@gmail.com
|
44
44
|
executables: []
|
@@ -60,16 +60,17 @@ files:
|
|
60
60
|
- src/main/java/org/embulk/parser/csv_guessable/CsvGuessableParserPlugin.java
|
61
61
|
- src/main/java/org/embulk/parser/csv_guessable/CsvTokenizer.java
|
62
62
|
- src/test/java/org/embulk/parser/csv_guessable/TestCsvGuessableParserPlugin.java
|
63
|
-
- src/test/resources/data/test.csv
|
64
|
-
- src/test/resources/data/test_alias.yml
|
65
|
-
- src/test/resources/yml/
|
66
|
-
- src/test/resources/yml/
|
67
|
-
- src/test/resources/yml/
|
63
|
+
- src/test/resources/org/embulk/parser/csv_guessable/data/test.csv
|
64
|
+
- src/test/resources/org/embulk/parser/csv_guessable/data/test_alias.yml
|
65
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/guess_and_set_type.yml
|
66
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/guess_from_header.yml
|
67
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/original-csv.yml
|
68
|
+
- src/test/resources/org/embulk/parser/csv_guessable/yml/replace_column_name.yml
|
68
69
|
- classpath/commons-lang3-3.5.jar
|
70
|
+
- classpath/embulk-parser-csv_guessable-0.1.3.jar
|
69
71
|
- classpath/opencsv-3.9.jar
|
70
72
|
- classpath/commons-beanutils-1.9.3.jar
|
71
73
|
- classpath/commons-compress-1.10.jar
|
72
|
-
- classpath/embulk-parser-csv_guessable-0.1.2.jar
|
73
74
|
- classpath/embulk-standards-0.8.22.jar
|
74
75
|
- classpath/commons-collections-3.2.2.jar
|
75
76
|
- classpath/commons-logging-1.2.jar
|
@@ -96,5 +97,5 @@ rubyforge_project:
|
|
96
97
|
rubygems_version: 2.1.9
|
97
98
|
signing_key:
|
98
99
|
specification_version: 4
|
99
|
-
summary: Csv
|
100
|
+
summary: Guessable Csv parser plugin for Embulk
|
100
101
|
test_files: []
|
@@ -1,13 +0,0 @@
|
|
1
|
-
in:
|
2
|
-
type: file
|
3
|
-
path_prefix: data/test
|
4
|
-
parser:
|
5
|
-
type: csv_guessable
|
6
|
-
schema_file: data/test.csv
|
7
|
-
schema_line: 1
|
8
|
-
columns:
|
9
|
-
- {value_name: '#', name: number, type: long}
|
10
|
-
- {value_name: 'title', name: description, type: string}
|
11
|
-
- {value_name: 'status', name: ok?, type: string}
|
12
|
-
out:
|
13
|
-
type: stdout
|