embulk-filter-hash 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ded1ac8ae36443e9654c2192980f2032b692e11
4
- data.tar.gz: 379c360184e10e28c39eeaf1e7eccd25f582200f
3
+ metadata.gz: c0f52e09e73a986928c5c653de319a33aa57beec
4
+ data.tar.gz: cd8bd547ea9d23c0a844e000a468ea0e897202dd
5
5
  SHA512:
6
- metadata.gz: 72cfce290a43b1ff0df9f036482647fea65c589fc4ef2db8abfdb4f625c95902efb6216140b83f01546251ad2191406962ad04ea12ad1a3c22bfb20457ebe6f0
7
- data.tar.gz: c333b812cc4807dd514682f8930a07400fc4d23851ee4010eaf6274021a83ce408c6db7d8b15cf819c7d9d1654508a668aaac3e54dbf551eaba799aad7bf2030
6
+ metadata.gz: 8f747472a2553c25acdac402c4087479ae1905623a603fdb16eee22af386598ebc641b3070f56e246cc04bf66818c2923b0634673073eb6d7f7916774c8653b0
7
+ data.tar.gz: 350eb6742331943747ad15f694698520a10c9cf2d360b9ef13389e386385ff96ad8adb2b4ae634eb2029602324e543916ce9e643e5184df76f48577ba076f79d
@@ -1,8 +1,20 @@
1
+ buildscript {
2
+ ext.kotlin_version = '1.0.6'
3
+ repositories {
4
+ mavenCentral()
5
+ }
6
+ dependencies {
7
+ classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
8
+ }
9
+ }
10
+
1
11
  plugins {
2
12
  id "com.jfrog.bintray" version "1.1"
3
13
  id "com.github.jruby-gradle.base" version "0.1.5"
4
14
  id "java"
5
15
  }
16
+ apply plugin: "kotlin"
17
+
6
18
  import com.github.jrubygradle.JRubyExec
7
19
  repositories {
8
20
  mavenCentral()
@@ -13,12 +25,15 @@ configurations {
13
25
  provided
14
26
  }
15
27
 
16
- version = "0.2.0"
28
+ version = "0.3.0"
29
+ sourceCompatibility = 1.7
30
+ targetCompatibility = 1.7
17
31
 
18
32
  dependencies {
19
33
  compile "org.embulk:embulk-core:0.8.16"
20
34
  provided "org.embulk:embulk-core:0.8.16"
21
- testCompile "com.kamatama41:embulk-test-helpers:0.1.1"
35
+ compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
36
+ testCompile "com.kamatama41:embulk-test-helpers:0.1.3"
22
37
  // Uncomment when using local embulk-test-helpers (and settings.gradle as well)
23
38
  //testCompile project(':embulk-test-helpers')
24
39
  }
@@ -0,0 +1,140 @@
1
+ package org.embulk.filter.hash
2
+
3
+ import com.google.common.base.Optional
4
+ import org.embulk.config.Config
5
+ import org.embulk.config.ConfigDefault
6
+ import org.embulk.config.ConfigSource
7
+ import org.embulk.config.Task
8
+ import org.embulk.config.TaskSource
9
+ import org.embulk.spi.Column
10
+ import org.embulk.spi.DataException
11
+ import org.embulk.spi.Exec
12
+ import org.embulk.spi.FilterPlugin
13
+ import org.embulk.spi.Page
14
+ import org.embulk.spi.PageBuilder
15
+ import org.embulk.spi.PageOutput
16
+ import org.embulk.spi.PageReader
17
+ import org.embulk.spi.Schema
18
+ import org.embulk.spi.type.Types
19
+ import java.security.MessageDigest
20
+
21
+ class HashFilterPlugin : FilterPlugin {
22
+
23
+ interface PluginTask : Task {
24
+ @Config("columns")
25
+ fun getColumns(): List<HashColumn>
26
+ }
27
+
28
+ interface HashColumn : Task {
29
+ @Config("name")
30
+ fun getName(): String
31
+
32
+ @Config("algorithm")
33
+ @ConfigDefault("\"SHA-256\"")
34
+ fun getAlgorithm(): Optional<String>
35
+
36
+ @Config("new_name")
37
+ @ConfigDefault("null")
38
+ fun getNewName(): Optional<String>
39
+ }
40
+
41
+ override fun transaction(config: ConfigSource, inputSchema: Schema, control: FilterPlugin.Control) {
42
+
43
+ val task = config.loadConfig(PluginTask::class.java)
44
+ val hashColumnMap = convertHashColumnListToMap(task.getColumns())
45
+
46
+ val builder = Schema.builder()
47
+ inputSchema.columns.forEach { column ->
48
+ val hashColumn = hashColumnMap[column.name]
49
+ if (hashColumn != null) {
50
+ builder.add(hashColumn.getNewName().or(column.name), Types.STRING)
51
+ } else {
52
+ builder.add(column.name, column.type)
53
+ }
54
+ }
55
+ control.run(task.dump(), builder.build())
56
+ }
57
+
58
+ override fun open(taskSource: TaskSource, inputSchema: Schema,
59
+ outputSchema: Schema, output: PageOutput): PageOutput {
60
+
61
+ val task = taskSource.loadTask(PluginTask::class.java)
62
+ val hashColumnMap = convertHashColumnListToMap(task.getColumns())
63
+ val outputColumnMap = convertColumnListToMap(outputSchema.columns)
64
+
65
+ return object : PageOutput {
66
+ private val reader = PageReader(inputSchema)
67
+ private val builder = PageBuilder(Exec.getBufferAllocator(), outputSchema, output)
68
+
69
+ override fun add(page: Page) {
70
+ reader.setPage(page)
71
+ while (reader.nextRecord()) {
72
+ setValue()
73
+ builder.addRecord()
74
+ }
75
+ }
76
+
77
+ private fun setValue() {
78
+ for (inputColumn in inputSchema.columns) {
79
+ if (reader.isNull(inputColumn)) {
80
+ builder.setNull(inputColumn)
81
+ continue
82
+ }
83
+
84
+ // Write the original data
85
+ val inputValue : Any = when (inputColumn.type) {
86
+ Types.STRING -> {
87
+ reader.getString(inputColumn).apply { builder.setString(inputColumn, this) }
88
+ }
89
+ Types.BOOLEAN -> {
90
+ reader.getBoolean(inputColumn).apply { builder.setBoolean(inputColumn, this) }
91
+ }
92
+ Types.DOUBLE -> {
93
+ reader.getDouble(inputColumn).apply { builder.setDouble(inputColumn, this) }
94
+ }
95
+ Types.LONG -> {
96
+ reader.getLong(inputColumn).apply { builder.setLong(inputColumn, this) }
97
+ }
98
+ Types.TIMESTAMP -> {
99
+ reader.getTimestamp(inputColumn).apply { builder.setTimestamp(inputColumn, this) }
100
+ }
101
+ Types.JSON -> {
102
+ reader.getJson(inputColumn).apply { builder.setJson(inputColumn, this) }
103
+ } else -> {
104
+ throw DataException("Unexpected type:" + inputColumn.type)
105
+ }
106
+ }
107
+
108
+ // Overwrite the column if it's hash column.
109
+ hashColumnMap[inputColumn.name]?.let { hashColumn ->
110
+ val outputColumn = outputColumnMap[hashColumn.getNewName().or(inputColumn.name)]
111
+ val hashedValue = generateHash(inputValue.toString(), hashColumn.getAlgorithm().get())
112
+ builder.setString(outputColumn, hashedValue)
113
+ }
114
+ }
115
+ }
116
+
117
+ private fun generateHash(value: String, algorithm: String): String {
118
+ val md = MessageDigest.getInstance(algorithm)
119
+ md.update(value.toByteArray())
120
+ return md.digest().joinToString("") { "%02x".format(it) }
121
+ }
122
+
123
+ override fun finish() {
124
+ builder.finish()
125
+ }
126
+
127
+ override fun close() {
128
+ builder.close()
129
+ }
130
+ }
131
+ }
132
+
133
+ private fun convertHashColumnListToMap(hashColumns: List<HashColumn>?): Map<String, HashColumn> {
134
+ return hashColumns!!.associate { Pair(it.getName(), it) }
135
+ }
136
+
137
+ private fun convertColumnListToMap(columns: List<Column>?): Map<String, Column> {
138
+ return columns!!.associate { Pair(it.name, it) }
139
+ }
140
+ }
@@ -0,0 +1,96 @@
1
+ package org.embulk.filter.hash
2
+
3
+ import org.embulk.spi.FilterPlugin
4
+ import org.embulk.test.EmbulkPluginTest
5
+ import org.embulk.test.TestingEmbulk
6
+ import org.junit.Test
7
+
8
+ import org.embulk.spi.type.Types.STRING
9
+ import org.embulk.test.TestOutputPlugin.assertRecords
10
+ import org.embulk.test.TestOutputPlugin.assertSchema
11
+ import org.embulk.test.Utils.column
12
+ import org.embulk.test.Utils.record
13
+
14
+ class TestHashFilterPlugin : EmbulkPluginTest() {
15
+
16
+ override fun setup(builder: TestingEmbulk.Builder) {
17
+ builder.registerPlugin(FilterPlugin::class.java, "hash", HashFilterPlugin::class.java)
18
+ }
19
+
20
+ @Test fun specifiedColumnIsHashedAndRenamed() {
21
+ val inConfigPath = "yaml/input_basic.yml"
22
+
23
+ val config = newConfig()
24
+ .set("type", "hash")
25
+ .set("columns", listOf(newConfig()
26
+ .set("name", "age")
27
+ .set("algorithm", "MD5")
28
+ .set("new_name", "hashed_age")
29
+ ))
30
+
31
+ runFilter(config, inConfigPath)
32
+
33
+ assertSchema(
34
+ column("username", STRING),
35
+ column("hashed_age", STRING)
36
+ )
37
+
38
+ assertRecords(
39
+ record("user1", "98f13708210194c475687be6106a3b84")
40
+ )
41
+ }
42
+
43
+ @Test fun allColumnTypesAreHashed() {
44
+ val inConfigPath = "yaml/input_column_types.yml"
45
+
46
+ val config = newConfig()
47
+ .set("type", "hash")
48
+ .set("columns", listOf(
49
+ newConfig().set("name", "username"),
50
+ newConfig().set("name", "age"),
51
+ newConfig().set("name", "weight"),
52
+ newConfig().set("name", "active"),
53
+ newConfig().set("name", "created_at"),
54
+ newConfig().set("name", "options")
55
+ ))
56
+
57
+ runFilter(config, inConfigPath)
58
+
59
+ assertSchema(
60
+ column("username", STRING),
61
+ column("age", STRING),
62
+ column("weight", STRING),
63
+ column("active", STRING),
64
+ column("created_at", STRING),
65
+ column("options", STRING)
66
+ )
67
+
68
+ assertRecords(
69
+ record(
70
+ "0a041b9462caa4a31bac3567e0b6e6fd9100787db2ab433d96f6d178cabfce90",
71
+ "6f4b6612125fb3a0daecd2799dfd6c9c299424fd920f9b308110a2c1fbd8f443",
72
+ "70822ecbef5bee37d162492107a3127fc0a4de0564f34ce92713a7baaeb582b0",
73
+ "b5bea41b6c623f7c09f1bf24dcae58ebab3c0cdd90ad966bc43a45b44867e12b",
74
+ "9673fe7b67d880e2c9071428c63f6e1bea9dde98283297277a20b92ea0acdc72",
75
+ "3ff0e331ca59a2a1194bac0e36359ed4540a97383e1cdf6eb95c7de9309143fc"
76
+ )
77
+ )
78
+ }
79
+
80
+ @Test fun columnIsNull() {
81
+ val inConfigPath = "yaml/input_null_column.yml"
82
+
83
+ val config = newConfig()
84
+ .set("type", "hash")
85
+ .set("columns", listOf(
86
+ newConfig().set("name", "username"),
87
+ newConfig().set("name", "age")
88
+ ))
89
+
90
+ runFilter(config, inConfigPath)
91
+
92
+ assertRecords(
93
+ record(null, "f5ca38f748a1d6eaf726b8a42fb575c3c71f1864a8143301782de13da2d9202b")
94
+ )
95
+ }
96
+ }
@@ -0,0 +1,9 @@
1
+ type: test
2
+ data:
3
+ - null,20
4
+ parser:
5
+ type: csv
6
+ null_string: 'null'
7
+ columns:
8
+ - {name: username, type: string}
9
+ - {name: age, type: long}
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-hash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shinichi Ishimura
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-29 00:00:00.000000000 Z
11
+ date: 2017-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -56,11 +56,14 @@ files:
56
56
  - gradlew.bat
57
57
  - lib/embulk/filter/hash.rb
58
58
  - settings.gradle
59
- - src/main/java/org/embulk/filter/hash/HashFilterPlugin.java
60
- - src/test/java/org/embulk/filter/hash/TestHashFilterPlugin.java
59
+ - src/main/kotlin/org/embulk/filter/hash/HashFilterPlugin.kt
60
+ - src/test/kotlin/org/embulk/filter/hash/TestHashFilterPlugin.kt
61
61
  - src/test/resources/yaml/input_basic.yml
62
62
  - src/test/resources/yaml/input_column_types.yml
63
- - classpath/embulk-filter-hash-0.2.0.jar
63
+ - src/test/resources/yaml/input_null_column.yml
64
+ - classpath/embulk-filter-hash-0.3.0.jar
65
+ - classpath/kotlin-runtime-1.0.6.jar
66
+ - classpath/kotlin-stdlib-1.0.6.jar
64
67
  homepage: https://github.com/kamatama41/embulk-filter-hash
65
68
  licenses:
66
69
  - MIT
@@ -1,181 +0,0 @@
1
- package org.embulk.filter.hash;
2
-
3
- import com.google.common.base.Optional;
4
- import com.google.common.base.Throwables;
5
- import org.embulk.config.Config;
6
- import org.embulk.config.ConfigDefault;
7
- import org.embulk.config.ConfigSource;
8
- import org.embulk.config.Task;
9
- import org.embulk.config.TaskSource;
10
- import org.embulk.spi.Column;
11
- import org.embulk.spi.DataException;
12
- import org.embulk.spi.Exec;
13
- import org.embulk.spi.FilterPlugin;
14
- import org.embulk.spi.Page;
15
- import org.embulk.spi.PageBuilder;
16
- import org.embulk.spi.PageOutput;
17
- import org.embulk.spi.PageReader;
18
- import org.embulk.spi.Schema;
19
- import org.embulk.spi.time.Timestamp;
20
- import org.embulk.spi.type.Types;
21
- import org.msgpack.value.Value;
22
-
23
- import java.security.MessageDigest;
24
- import java.security.NoSuchAlgorithmException;
25
- import java.util.HashMap;
26
- import java.util.List;
27
- import java.util.Map;
28
-
29
- public class HashFilterPlugin implements FilterPlugin {
30
-
31
- public interface PluginTask extends Task {
32
- @Config("columns")
33
- List<HashColumn> getColumns();
34
- }
35
-
36
- public interface HashColumn extends Task {
37
- @Config("name")
38
- String getName();
39
-
40
- @Config("algorithm")
41
- @ConfigDefault("\"SHA-256\"")
42
- Optional<String> getAlgorithm();
43
-
44
- @Config("new_name")
45
- @ConfigDefault("null")
46
- Optional<String> getNewName();
47
- }
48
-
49
- @Override
50
- public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control) {
51
-
52
- PluginTask task = config.loadConfig(PluginTask.class);
53
- Map<String, HashColumn> hashColumnMap = convertHashColumnListToMap(task.getColumns());
54
-
55
- Schema.Builder builder = Schema.builder();
56
- for (Column column : inputSchema.getColumns()) {
57
-
58
- HashColumn hashColumn = hashColumnMap.get(column.getName());
59
-
60
- if (hashColumn != null) {
61
- builder.add(hashColumn.getNewName().or(column.getName()), Types.STRING);
62
- } else {
63
- builder.add(column.getName(), column.getType());
64
- }
65
- }
66
- control.run(task.dump(), builder.build());
67
- }
68
-
69
- @Override
70
- public PageOutput open(final TaskSource taskSource, final Schema inputSchema,
71
- final Schema outputSchema, final PageOutput output) {
72
-
73
- final PluginTask task = taskSource.loadTask(PluginTask.class);
74
- final Map<String, HashColumn> hashColumnMap = convertHashColumnListToMap(task.getColumns());
75
- final Map<String, Column> outputColumnMap = convertColumnListToMap(outputSchema.getColumns());
76
-
77
- return new PageOutput() {
78
- private PageReader reader = new PageReader(inputSchema);
79
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
80
-
81
- @Override
82
- public void add(Page page) {
83
- reader.setPage(page);
84
- while (reader.nextRecord()) {
85
- setValue();
86
- builder.addRecord();
87
- }
88
- }
89
-
90
- private void setValue() {
91
- for (Column inputColumn : inputSchema.getColumns()) {
92
- if (reader.isNull(inputColumn)) {
93
- builder.setNull(inputColumn);
94
- continue;
95
- }
96
-
97
- // Write the original data
98
- Object inputValue;
99
- if (Types.STRING.equals(inputColumn.getType())) {
100
- final String value = reader.getString(inputColumn);
101
- inputValue = value;
102
- builder.setString(inputColumn, value);
103
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
104
- final boolean value = reader.getBoolean(inputColumn);
105
- inputValue = value;
106
- builder.setBoolean(inputColumn, value);
107
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
108
- final double value = reader.getDouble(inputColumn);
109
- inputValue = value;
110
- builder.setDouble(inputColumn, value);
111
- } else if (Types.LONG.equals(inputColumn.getType())) {
112
- final long value = reader.getLong(inputColumn);
113
- inputValue = value;
114
- builder.setLong(inputColumn, value);
115
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
116
- final Timestamp value = reader.getTimestamp(inputColumn);
117
- inputValue = value;
118
- builder.setTimestamp(inputColumn, value);
119
- } else if (Types.JSON.equals(inputColumn.getType())) {
120
- final Value value = reader.getJson(inputColumn);
121
- inputValue = value;
122
- builder.setJson(inputColumn, value);
123
- } else {
124
- throw new DataException("Unexpected type:" + inputColumn.getType());
125
- }
126
-
127
- // Overwrite the column if it's hash column.
128
- HashColumn hashColumn = hashColumnMap.get(inputColumn.getName());
129
- if (hashColumn != null) {
130
- final Column outputColumn = outputColumnMap.get(hashColumn.getNewName().or(inputColumn.getName()));
131
- final String hashedValue = generateHash(inputValue.toString(), hashColumn.getAlgorithm().get());
132
- builder.setString(outputColumn, hashedValue);
133
- }
134
- }
135
- }
136
-
137
- private String generateHash(String value, String algorithm) {
138
- String result = null;
139
- try {
140
- MessageDigest md = MessageDigest.getInstance(algorithm);
141
- md.update(value.getBytes());
142
-
143
- StringBuilder sb = new StringBuilder();
144
- for (byte b : md.digest()) {
145
- sb.append(String.format("%02x", b));
146
- }
147
- result = sb.toString();
148
- } catch (NoSuchAlgorithmException e) {
149
- Throwables.propagate(e);
150
- }
151
- return result;
152
- }
153
-
154
- @Override
155
- public void finish() {
156
- builder.finish();
157
- }
158
-
159
- @Override
160
- public void close() {
161
- builder.close();
162
- }
163
- };
164
- }
165
-
166
- private static Map<String, HashColumn> convertHashColumnListToMap(List<HashColumn> hashColumns) {
167
- Map<String, HashColumn> result = new HashMap<>();
168
- for (HashColumn hashColumn : hashColumns) {
169
- result.put(hashColumn.getName(), hashColumn);
170
- }
171
- return result;
172
- }
173
-
174
- private static Map<String, Column> convertColumnListToMap(List<Column> columns) {
175
- Map<String, Column> result = new HashMap<>();
176
- for (Column column : columns) {
177
- result.put(column.getName(), column);
178
- }
179
- return result;
180
- }
181
- }
@@ -1,90 +0,0 @@
1
- package org.embulk.filter.hash;
2
-
3
- import org.embulk.config.ConfigSource;
4
- import org.embulk.spi.FilterPlugin;
5
- import org.embulk.test.ExtendedTestingEmbulk;
6
- import org.junit.Rule;
7
- import org.junit.Test;
8
-
9
- import java.util.Arrays;
10
- import java.util.Collections;
11
-
12
- import static org.embulk.spi.type.Types.STRING;
13
- import static org.embulk.test.TestOutputPlugin.assertRecords;
14
- import static org.embulk.test.TestOutputPlugin.assertSchema;
15
- import static org.embulk.test.Utils.column;
16
- import static org.embulk.test.Utils.record;
17
-
18
- public class TestHashFilterPlugin {
19
- @Rule
20
- public ExtendedTestingEmbulk embulk = (ExtendedTestingEmbulk) ExtendedTestingEmbulk
21
- .builder()
22
- .registerPlugin(FilterPlugin.class, "hash", HashFilterPlugin.class)
23
- .build();
24
-
25
- @Test
26
- public void specifiedColumnIsHashedAndRenamed() {
27
- final String inConfigPath = "yaml/input_basic.yml";
28
-
29
- ConfigSource config = embulk.newConfig()
30
- .set("type", "hash")
31
- .set("columns", Collections.singletonList(
32
- config().set("name", "age").set("algorithm", "MD5").set("new_name", "hashed_age")
33
- )
34
- );
35
-
36
- embulk.runFilter(config, inConfigPath);
37
-
38
- assertSchema(
39
- column("username", STRING),
40
- column("hashed_age", STRING)
41
- );
42
-
43
- assertRecords(
44
- record("user1", "98f13708210194c475687be6106a3b84")
45
- );
46
- }
47
-
48
- @Test
49
- public void allColumnTypesAreHashed() {
50
- final String inConfigPath = "yaml/input_column_types.yml";
51
-
52
- ConfigSource config = embulk.newConfig()
53
- .set("type", "hash")
54
- .set("columns", Arrays.asList(
55
- config().set("name", "username"),
56
- config().set("name", "age"),
57
- config().set("name", "weight"),
58
- config().set("name", "active"),
59
- config().set("name", "created_at"),
60
- config().set("name", "options")
61
- )
62
- );
63
-
64
- embulk.runFilter(config, inConfigPath);
65
-
66
- assertSchema(
67
- column("username", STRING),
68
- column("age", STRING),
69
- column("weight", STRING),
70
- column("active", STRING),
71
- column("created_at", STRING),
72
- column("options", STRING)
73
- );
74
-
75
- assertRecords(
76
- record(
77
- "0a041b9462caa4a31bac3567e0b6e6fd9100787db2ab433d96f6d178cabfce90",
78
- "6f4b6612125fb3a0daecd2799dfd6c9c299424fd920f9b308110a2c1fbd8f443",
79
- "70822ecbef5bee37d162492107a3127fc0a4de0564f34ce92713a7baaeb582b0",
80
- "b5bea41b6c623f7c09f1bf24dcae58ebab3c0cdd90ad966bc43a45b44867e12b",
81
- "9673fe7b67d880e2c9071428c63f6e1bea9dde98283297277a20b92ea0acdc72",
82
- "3ff0e331ca59a2a1194bac0e36359ed4540a97383e1cdf6eb95c7de9309143fc"
83
- )
84
- );
85
- }
86
-
87
- private ConfigSource config() {
88
- return embulk.newConfig();
89
- }
90
- }