embulk-filter-hash 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ded1ac8ae36443e9654c2192980f2032b692e11
4
- data.tar.gz: 379c360184e10e28c39eeaf1e7eccd25f582200f
3
+ metadata.gz: c0f52e09e73a986928c5c653de319a33aa57beec
4
+ data.tar.gz: cd8bd547ea9d23c0a844e000a468ea0e897202dd
5
5
  SHA512:
6
- metadata.gz: 72cfce290a43b1ff0df9f036482647fea65c589fc4ef2db8abfdb4f625c95902efb6216140b83f01546251ad2191406962ad04ea12ad1a3c22bfb20457ebe6f0
7
- data.tar.gz: c333b812cc4807dd514682f8930a07400fc4d23851ee4010eaf6274021a83ce408c6db7d8b15cf819c7d9d1654508a668aaac3e54dbf551eaba799aad7bf2030
6
+ metadata.gz: 8f747472a2553c25acdac402c4087479ae1905623a603fdb16eee22af386598ebc641b3070f56e246cc04bf66818c2923b0634673073eb6d7f7916774c8653b0
7
+ data.tar.gz: 350eb6742331943747ad15f694698520a10c9cf2d360b9ef13389e386385ff96ad8adb2b4ae634eb2029602324e543916ce9e643e5184df76f48577ba076f79d
@@ -1,8 +1,20 @@
1
+ buildscript {
2
+ ext.kotlin_version = '1.0.6'
3
+ repositories {
4
+ mavenCentral()
5
+ }
6
+ dependencies {
7
+ classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
8
+ }
9
+ }
10
+
1
11
  plugins {
2
12
  id "com.jfrog.bintray" version "1.1"
3
13
  id "com.github.jruby-gradle.base" version "0.1.5"
4
14
  id "java"
5
15
  }
16
+ apply plugin: "kotlin"
17
+
6
18
  import com.github.jrubygradle.JRubyExec
7
19
  repositories {
8
20
  mavenCentral()
@@ -13,12 +25,15 @@ configurations {
13
25
  provided
14
26
  }
15
27
 
16
- version = "0.2.0"
28
+ version = "0.3.0"
29
+ sourceCompatibility = 1.7
30
+ targetCompatibility = 1.7
17
31
 
18
32
  dependencies {
19
33
  compile "org.embulk:embulk-core:0.8.16"
20
34
  provided "org.embulk:embulk-core:0.8.16"
21
- testCompile "com.kamatama41:embulk-test-helpers:0.1.1"
35
+ compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
36
+ testCompile "com.kamatama41:embulk-test-helpers:0.1.3"
22
37
  // Uncomment when using local embulk-test-helpers (and settings.gradle as well)
23
38
  //testCompile project(':embulk-test-helpers')
24
39
  }
@@ -0,0 +1,140 @@
1
+ package org.embulk.filter.hash
2
+
3
+ import com.google.common.base.Optional
4
+ import org.embulk.config.Config
5
+ import org.embulk.config.ConfigDefault
6
+ import org.embulk.config.ConfigSource
7
+ import org.embulk.config.Task
8
+ import org.embulk.config.TaskSource
9
+ import org.embulk.spi.Column
10
+ import org.embulk.spi.DataException
11
+ import org.embulk.spi.Exec
12
+ import org.embulk.spi.FilterPlugin
13
+ import org.embulk.spi.Page
14
+ import org.embulk.spi.PageBuilder
15
+ import org.embulk.spi.PageOutput
16
+ import org.embulk.spi.PageReader
17
+ import org.embulk.spi.Schema
18
+ import org.embulk.spi.type.Types
19
+ import java.security.MessageDigest
20
+
21
+ class HashFilterPlugin : FilterPlugin {
22
+
23
+ interface PluginTask : Task {
24
+ @Config("columns")
25
+ fun getColumns(): List<HashColumn>
26
+ }
27
+
28
+ interface HashColumn : Task {
29
+ @Config("name")
30
+ fun getName(): String
31
+
32
+ @Config("algorithm")
33
+ @ConfigDefault("\"SHA-256\"")
34
+ fun getAlgorithm(): Optional<String>
35
+
36
+ @Config("new_name")
37
+ @ConfigDefault("null")
38
+ fun getNewName(): Optional<String>
39
+ }
40
+
41
+ override fun transaction(config: ConfigSource, inputSchema: Schema, control: FilterPlugin.Control) {
42
+
43
+ val task = config.loadConfig(PluginTask::class.java)
44
+ val hashColumnMap = convertHashColumnListToMap(task.getColumns())
45
+
46
+ val builder = Schema.builder()
47
+ inputSchema.columns.forEach { column ->
48
+ val hashColumn = hashColumnMap[column.name]
49
+ if (hashColumn != null) {
50
+ builder.add(hashColumn.getNewName().or(column.name), Types.STRING)
51
+ } else {
52
+ builder.add(column.name, column.type)
53
+ }
54
+ }
55
+ control.run(task.dump(), builder.build())
56
+ }
57
+
58
+ override fun open(taskSource: TaskSource, inputSchema: Schema,
59
+ outputSchema: Schema, output: PageOutput): PageOutput {
60
+
61
+ val task = taskSource.loadTask(PluginTask::class.java)
62
+ val hashColumnMap = convertHashColumnListToMap(task.getColumns())
63
+ val outputColumnMap = convertColumnListToMap(outputSchema.columns)
64
+
65
+ return object : PageOutput {
66
+ private val reader = PageReader(inputSchema)
67
+ private val builder = PageBuilder(Exec.getBufferAllocator(), outputSchema, output)
68
+
69
+ override fun add(page: Page) {
70
+ reader.setPage(page)
71
+ while (reader.nextRecord()) {
72
+ setValue()
73
+ builder.addRecord()
74
+ }
75
+ }
76
+
77
+ private fun setValue() {
78
+ for (inputColumn in inputSchema.columns) {
79
+ if (reader.isNull(inputColumn)) {
80
+ builder.setNull(inputColumn)
81
+ continue
82
+ }
83
+
84
+ // Write the original data
85
+ val inputValue : Any = when (inputColumn.type) {
86
+ Types.STRING -> {
87
+ reader.getString(inputColumn).apply { builder.setString(inputColumn, this) }
88
+ }
89
+ Types.BOOLEAN -> {
90
+ reader.getBoolean(inputColumn).apply { builder.setBoolean(inputColumn, this) }
91
+ }
92
+ Types.DOUBLE -> {
93
+ reader.getDouble(inputColumn).apply { builder.setDouble(inputColumn, this) }
94
+ }
95
+ Types.LONG -> {
96
+ reader.getLong(inputColumn).apply { builder.setLong(inputColumn, this) }
97
+ }
98
+ Types.TIMESTAMP -> {
99
+ reader.getTimestamp(inputColumn).apply { builder.setTimestamp(inputColumn, this) }
100
+ }
101
+ Types.JSON -> {
102
+ reader.getJson(inputColumn).apply { builder.setJson(inputColumn, this) }
103
+ } else -> {
104
+ throw DataException("Unexpected type:" + inputColumn.type)
105
+ }
106
+ }
107
+
108
+ // Overwrite the column if it's hash column.
109
+ hashColumnMap[inputColumn.name]?.let { hashColumn ->
110
+ val outputColumn = outputColumnMap[hashColumn.getNewName().or(inputColumn.name)]
111
+ val hashedValue = generateHash(inputValue.toString(), hashColumn.getAlgorithm().get())
112
+ builder.setString(outputColumn, hashedValue)
113
+ }
114
+ }
115
+ }
116
+
117
+ private fun generateHash(value: String, algorithm: String): String {
118
+ val md = MessageDigest.getInstance(algorithm)
119
+ md.update(value.toByteArray())
120
+ return md.digest().joinToString("") { "%02x".format(it) }
121
+ }
122
+
123
+ override fun finish() {
124
+ builder.finish()
125
+ }
126
+
127
+ override fun close() {
128
+ builder.close()
129
+ }
130
+ }
131
+ }
132
+
133
+ private fun convertHashColumnListToMap(hashColumns: List<HashColumn>?): Map<String, HashColumn> {
134
+ return hashColumns!!.associate { Pair(it.getName(), it) }
135
+ }
136
+
137
+ private fun convertColumnListToMap(columns: List<Column>?): Map<String, Column> {
138
+ return columns!!.associate { Pair(it.name, it) }
139
+ }
140
+ }
@@ -0,0 +1,96 @@
1
+ package org.embulk.filter.hash
2
+
3
+ import org.embulk.spi.FilterPlugin
4
+ import org.embulk.test.EmbulkPluginTest
5
+ import org.embulk.test.TestingEmbulk
6
+ import org.junit.Test
7
+
8
+ import org.embulk.spi.type.Types.STRING
9
+ import org.embulk.test.TestOutputPlugin.assertRecords
10
+ import org.embulk.test.TestOutputPlugin.assertSchema
11
+ import org.embulk.test.Utils.column
12
+ import org.embulk.test.Utils.record
13
+
14
+ class TestHashFilterPlugin : EmbulkPluginTest() {
15
+
16
+ override fun setup(builder: TestingEmbulk.Builder) {
17
+ builder.registerPlugin(FilterPlugin::class.java, "hash", HashFilterPlugin::class.java)
18
+ }
19
+
20
+ @Test fun specifiedColumnIsHashedAndRenamed() {
21
+ val inConfigPath = "yaml/input_basic.yml"
22
+
23
+ val config = newConfig()
24
+ .set("type", "hash")
25
+ .set("columns", listOf(newConfig()
26
+ .set("name", "age")
27
+ .set("algorithm", "MD5")
28
+ .set("new_name", "hashed_age")
29
+ ))
30
+
31
+ runFilter(config, inConfigPath)
32
+
33
+ assertSchema(
34
+ column("username", STRING),
35
+ column("hashed_age", STRING)
36
+ )
37
+
38
+ assertRecords(
39
+ record("user1", "98f13708210194c475687be6106a3b84")
40
+ )
41
+ }
42
+
43
+ @Test fun allColumnTypesAreHashed() {
44
+ val inConfigPath = "yaml/input_column_types.yml"
45
+
46
+ val config = newConfig()
47
+ .set("type", "hash")
48
+ .set("columns", listOf(
49
+ newConfig().set("name", "username"),
50
+ newConfig().set("name", "age"),
51
+ newConfig().set("name", "weight"),
52
+ newConfig().set("name", "active"),
53
+ newConfig().set("name", "created_at"),
54
+ newConfig().set("name", "options")
55
+ ))
56
+
57
+ runFilter(config, inConfigPath)
58
+
59
+ assertSchema(
60
+ column("username", STRING),
61
+ column("age", STRING),
62
+ column("weight", STRING),
63
+ column("active", STRING),
64
+ column("created_at", STRING),
65
+ column("options", STRING)
66
+ )
67
+
68
+ assertRecords(
69
+ record(
70
+ "0a041b9462caa4a31bac3567e0b6e6fd9100787db2ab433d96f6d178cabfce90",
71
+ "6f4b6612125fb3a0daecd2799dfd6c9c299424fd920f9b308110a2c1fbd8f443",
72
+ "70822ecbef5bee37d162492107a3127fc0a4de0564f34ce92713a7baaeb582b0",
73
+ "b5bea41b6c623f7c09f1bf24dcae58ebab3c0cdd90ad966bc43a45b44867e12b",
74
+ "9673fe7b67d880e2c9071428c63f6e1bea9dde98283297277a20b92ea0acdc72",
75
+ "3ff0e331ca59a2a1194bac0e36359ed4540a97383e1cdf6eb95c7de9309143fc"
76
+ )
77
+ )
78
+ }
79
+
80
+ @Test fun columnIsNull() {
81
+ val inConfigPath = "yaml/input_null_column.yml"
82
+
83
+ val config = newConfig()
84
+ .set("type", "hash")
85
+ .set("columns", listOf(
86
+ newConfig().set("name", "username"),
87
+ newConfig().set("name", "age")
88
+ ))
89
+
90
+ runFilter(config, inConfigPath)
91
+
92
+ assertRecords(
93
+ record(null, "f5ca38f748a1d6eaf726b8a42fb575c3c71f1864a8143301782de13da2d9202b")
94
+ )
95
+ }
96
+ }
@@ -0,0 +1,9 @@
1
+ type: test
2
+ data:
3
+ - null,20
4
+ parser:
5
+ type: csv
6
+ null_string: 'null'
7
+ columns:
8
+ - {name: username, type: string}
9
+ - {name: age, type: long}
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-hash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shinichi Ishimura
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-29 00:00:00.000000000 Z
11
+ date: 2017-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -56,11 +56,14 @@ files:
56
56
  - gradlew.bat
57
57
  - lib/embulk/filter/hash.rb
58
58
  - settings.gradle
59
- - src/main/java/org/embulk/filter/hash/HashFilterPlugin.java
60
- - src/test/java/org/embulk/filter/hash/TestHashFilterPlugin.java
59
+ - src/main/kotlin/org/embulk/filter/hash/HashFilterPlugin.kt
60
+ - src/test/kotlin/org/embulk/filter/hash/TestHashFilterPlugin.kt
61
61
  - src/test/resources/yaml/input_basic.yml
62
62
  - src/test/resources/yaml/input_column_types.yml
63
- - classpath/embulk-filter-hash-0.2.0.jar
63
+ - src/test/resources/yaml/input_null_column.yml
64
+ - classpath/embulk-filter-hash-0.3.0.jar
65
+ - classpath/kotlin-runtime-1.0.6.jar
66
+ - classpath/kotlin-stdlib-1.0.6.jar
64
67
  homepage: https://github.com/kamatama41/embulk-filter-hash
65
68
  licenses:
66
69
  - MIT
@@ -1,181 +0,0 @@
1
- package org.embulk.filter.hash;
2
-
3
- import com.google.common.base.Optional;
4
- import com.google.common.base.Throwables;
5
- import org.embulk.config.Config;
6
- import org.embulk.config.ConfigDefault;
7
- import org.embulk.config.ConfigSource;
8
- import org.embulk.config.Task;
9
- import org.embulk.config.TaskSource;
10
- import org.embulk.spi.Column;
11
- import org.embulk.spi.DataException;
12
- import org.embulk.spi.Exec;
13
- import org.embulk.spi.FilterPlugin;
14
- import org.embulk.spi.Page;
15
- import org.embulk.spi.PageBuilder;
16
- import org.embulk.spi.PageOutput;
17
- import org.embulk.spi.PageReader;
18
- import org.embulk.spi.Schema;
19
- import org.embulk.spi.time.Timestamp;
20
- import org.embulk.spi.type.Types;
21
- import org.msgpack.value.Value;
22
-
23
- import java.security.MessageDigest;
24
- import java.security.NoSuchAlgorithmException;
25
- import java.util.HashMap;
26
- import java.util.List;
27
- import java.util.Map;
28
-
29
- public class HashFilterPlugin implements FilterPlugin {
30
-
31
- public interface PluginTask extends Task {
32
- @Config("columns")
33
- List<HashColumn> getColumns();
34
- }
35
-
36
- public interface HashColumn extends Task {
37
- @Config("name")
38
- String getName();
39
-
40
- @Config("algorithm")
41
- @ConfigDefault("\"SHA-256\"")
42
- Optional<String> getAlgorithm();
43
-
44
- @Config("new_name")
45
- @ConfigDefault("null")
46
- Optional<String> getNewName();
47
- }
48
-
49
- @Override
50
- public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control) {
51
-
52
- PluginTask task = config.loadConfig(PluginTask.class);
53
- Map<String, HashColumn> hashColumnMap = convertHashColumnListToMap(task.getColumns());
54
-
55
- Schema.Builder builder = Schema.builder();
56
- for (Column column : inputSchema.getColumns()) {
57
-
58
- HashColumn hashColumn = hashColumnMap.get(column.getName());
59
-
60
- if (hashColumn != null) {
61
- builder.add(hashColumn.getNewName().or(column.getName()), Types.STRING);
62
- } else {
63
- builder.add(column.getName(), column.getType());
64
- }
65
- }
66
- control.run(task.dump(), builder.build());
67
- }
68
-
69
- @Override
70
- public PageOutput open(final TaskSource taskSource, final Schema inputSchema,
71
- final Schema outputSchema, final PageOutput output) {
72
-
73
- final PluginTask task = taskSource.loadTask(PluginTask.class);
74
- final Map<String, HashColumn> hashColumnMap = convertHashColumnListToMap(task.getColumns());
75
- final Map<String, Column> outputColumnMap = convertColumnListToMap(outputSchema.getColumns());
76
-
77
- return new PageOutput() {
78
- private PageReader reader = new PageReader(inputSchema);
79
- private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
80
-
81
- @Override
82
- public void add(Page page) {
83
- reader.setPage(page);
84
- while (reader.nextRecord()) {
85
- setValue();
86
- builder.addRecord();
87
- }
88
- }
89
-
90
- private void setValue() {
91
- for (Column inputColumn : inputSchema.getColumns()) {
92
- if (reader.isNull(inputColumn)) {
93
- builder.setNull(inputColumn);
94
- continue;
95
- }
96
-
97
- // Write the original data
98
- Object inputValue;
99
- if (Types.STRING.equals(inputColumn.getType())) {
100
- final String value = reader.getString(inputColumn);
101
- inputValue = value;
102
- builder.setString(inputColumn, value);
103
- } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
104
- final boolean value = reader.getBoolean(inputColumn);
105
- inputValue = value;
106
- builder.setBoolean(inputColumn, value);
107
- } else if (Types.DOUBLE.equals(inputColumn.getType())) {
108
- final double value = reader.getDouble(inputColumn);
109
- inputValue = value;
110
- builder.setDouble(inputColumn, value);
111
- } else if (Types.LONG.equals(inputColumn.getType())) {
112
- final long value = reader.getLong(inputColumn);
113
- inputValue = value;
114
- builder.setLong(inputColumn, value);
115
- } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
116
- final Timestamp value = reader.getTimestamp(inputColumn);
117
- inputValue = value;
118
- builder.setTimestamp(inputColumn, value);
119
- } else if (Types.JSON.equals(inputColumn.getType())) {
120
- final Value value = reader.getJson(inputColumn);
121
- inputValue = value;
122
- builder.setJson(inputColumn, value);
123
- } else {
124
- throw new DataException("Unexpected type:" + inputColumn.getType());
125
- }
126
-
127
- // Overwrite the column if it's hash column.
128
- HashColumn hashColumn = hashColumnMap.get(inputColumn.getName());
129
- if (hashColumn != null) {
130
- final Column outputColumn = outputColumnMap.get(hashColumn.getNewName().or(inputColumn.getName()));
131
- final String hashedValue = generateHash(inputValue.toString(), hashColumn.getAlgorithm().get());
132
- builder.setString(outputColumn, hashedValue);
133
- }
134
- }
135
- }
136
-
137
- private String generateHash(String value, String algorithm) {
138
- String result = null;
139
- try {
140
- MessageDigest md = MessageDigest.getInstance(algorithm);
141
- md.update(value.getBytes());
142
-
143
- StringBuilder sb = new StringBuilder();
144
- for (byte b : md.digest()) {
145
- sb.append(String.format("%02x", b));
146
- }
147
- result = sb.toString();
148
- } catch (NoSuchAlgorithmException e) {
149
- Throwables.propagate(e);
150
- }
151
- return result;
152
- }
153
-
154
- @Override
155
- public void finish() {
156
- builder.finish();
157
- }
158
-
159
- @Override
160
- public void close() {
161
- builder.close();
162
- }
163
- };
164
- }
165
-
166
- private static Map<String, HashColumn> convertHashColumnListToMap(List<HashColumn> hashColumns) {
167
- Map<String, HashColumn> result = new HashMap<>();
168
- for (HashColumn hashColumn : hashColumns) {
169
- result.put(hashColumn.getName(), hashColumn);
170
- }
171
- return result;
172
- }
173
-
174
- private static Map<String, Column> convertColumnListToMap(List<Column> columns) {
175
- Map<String, Column> result = new HashMap<>();
176
- for (Column column : columns) {
177
- result.put(column.getName(), column);
178
- }
179
- return result;
180
- }
181
- }
@@ -1,90 +0,0 @@
1
- package org.embulk.filter.hash;
2
-
3
- import org.embulk.config.ConfigSource;
4
- import org.embulk.spi.FilterPlugin;
5
- import org.embulk.test.ExtendedTestingEmbulk;
6
- import org.junit.Rule;
7
- import org.junit.Test;
8
-
9
- import java.util.Arrays;
10
- import java.util.Collections;
11
-
12
- import static org.embulk.spi.type.Types.STRING;
13
- import static org.embulk.test.TestOutputPlugin.assertRecords;
14
- import static org.embulk.test.TestOutputPlugin.assertSchema;
15
- import static org.embulk.test.Utils.column;
16
- import static org.embulk.test.Utils.record;
17
-
18
- public class TestHashFilterPlugin {
19
- @Rule
20
- public ExtendedTestingEmbulk embulk = (ExtendedTestingEmbulk) ExtendedTestingEmbulk
21
- .builder()
22
- .registerPlugin(FilterPlugin.class, "hash", HashFilterPlugin.class)
23
- .build();
24
-
25
- @Test
26
- public void specifiedColumnIsHashedAndRenamed() {
27
- final String inConfigPath = "yaml/input_basic.yml";
28
-
29
- ConfigSource config = embulk.newConfig()
30
- .set("type", "hash")
31
- .set("columns", Collections.singletonList(
32
- config().set("name", "age").set("algorithm", "MD5").set("new_name", "hashed_age")
33
- )
34
- );
35
-
36
- embulk.runFilter(config, inConfigPath);
37
-
38
- assertSchema(
39
- column("username", STRING),
40
- column("hashed_age", STRING)
41
- );
42
-
43
- assertRecords(
44
- record("user1", "98f13708210194c475687be6106a3b84")
45
- );
46
- }
47
-
48
- @Test
49
- public void allColumnTypesAreHashed() {
50
- final String inConfigPath = "yaml/input_column_types.yml";
51
-
52
- ConfigSource config = embulk.newConfig()
53
- .set("type", "hash")
54
- .set("columns", Arrays.asList(
55
- config().set("name", "username"),
56
- config().set("name", "age"),
57
- config().set("name", "weight"),
58
- config().set("name", "active"),
59
- config().set("name", "created_at"),
60
- config().set("name", "options")
61
- )
62
- );
63
-
64
- embulk.runFilter(config, inConfigPath);
65
-
66
- assertSchema(
67
- column("username", STRING),
68
- column("age", STRING),
69
- column("weight", STRING),
70
- column("active", STRING),
71
- column("created_at", STRING),
72
- column("options", STRING)
73
- );
74
-
75
- assertRecords(
76
- record(
77
- "0a041b9462caa4a31bac3567e0b6e6fd9100787db2ab433d96f6d178cabfce90",
78
- "6f4b6612125fb3a0daecd2799dfd6c9c299424fd920f9b308110a2c1fbd8f443",
79
- "70822ecbef5bee37d162492107a3127fc0a4de0564f34ce92713a7baaeb582b0",
80
- "b5bea41b6c623f7c09f1bf24dcae58ebab3c0cdd90ad966bc43a45b44867e12b",
81
- "9673fe7b67d880e2c9071428c63f6e1bea9dde98283297277a20b92ea0acdc72",
82
- "3ff0e331ca59a2a1194bac0e36359ed4540a97383e1cdf6eb95c7de9309143fc"
83
- )
84
- );
85
- }
86
-
87
- private ConfigSource config() {
88
- return embulk.newConfig();
89
- }
90
- }