embulk-output-kafka 0.1.3 → 0.1.8

Files changed (29)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +44 -0
  3. data/.github/dependabot.yml +11 -0
  4. data/README.md +6 -1
  5. data/build.gradle +29 -12
  6. data/docker-compose.yml +1 -1
  7. data/src/main/java/org/embulk/output/kafka/AvroFormatColumnVisitor.java +26 -10
  8. data/src/main/java/org/embulk/output/kafka/AvroFormatTransactionalPageOutput.java +13 -0
  9. data/src/main/java/org/embulk/output/kafka/JsonFormatColumnVisitor.java +27 -5
  10. data/src/main/java/org/embulk/output/kafka/JsonFormatTransactionalPageOutput.java +13 -0
  11. data/src/main/java/org/embulk/output/kafka/KafkaJsonSerializer.java +4 -0
  12. data/src/main/java/org/embulk/output/kafka/KafkaOutputColumnVisitor.java +62 -8
  13. data/src/main/java/org/embulk/output/kafka/KafkaOutputPlugin.java +82 -145
  14. data/src/main/java/org/embulk/output/kafka/KafkaTransactionalPageOutput.java +104 -0
  15. data/src/main/java/org/embulk/output/kafka/RecordProducerFactory.java +3 -3
  16. data/src/test/java/org/embulk/output/kafka/TestKafkaOutputPlugin.java +384 -0
  17. data/src/test/resources/config_complex.yml +9 -28
  18. data/src/test/resources/config_complex_avro.yml +23 -42
  19. data/src/test/resources/config_simple.yml +5 -22
  20. data/src/test/resources/config_simple_avro.yml +14 -32
  21. data/src/test/resources/config_simple_avro_avsc_file.yml +7 -25
  22. data/src/test/resources/config_with_column_for_deletion.yml +7 -0
  23. data/src/test/resources/config_with_column_for_deletion_avro.yml +18 -0
  24. data/src/test/resources/config_with_key_column.yml +6 -23
  25. data/src/test/resources/config_with_partition_column.yml +6 -0
  26. data/src/test/resources/in1.csv +4 -4
  27. data/src/test/resources/in_complex.csv +4 -4
  28. data/src/test/resources/in_with_deletion.csv +4 -0
  29. metadata +30 -24
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: fb1edf4ce79bd490f0f01662547d36c2e6d2dd6e
-  data.tar.gz: 44b9affb8e9d1385314274b5d1ac083fd5a4848e
+SHA256:
+  metadata.gz: cb0e6dfc4b8b49b93fb6966948e8930e5785cf5a2a89fc5a8f17c80a4e7865c1
+  data.tar.gz: 18cb578eba9423c490e7416c906054f8401301b977e796c271182f593dc08563
 SHA512:
-  metadata.gz: 5c76d2cca3d141b7f44208449ba092ee732993183de6e6f19ec4e5280f302f0e3a4b5746414332103aa492b0cb7ea7df55159fa936248839bb68a8280ecfc060
-  data.tar.gz: 28dcd4f04aceec97e78c427b2b859266b9adb5147ae8cd28a63c5df86840d5c4051af049b225eefe01d410007afdd17222ec96fc3525cf0cd28e60c30414f6a2
+  metadata.gz: 84a080d3cc49d30ad04802162e94de0072aac96d2562649c34cb1623e7734f783e96deeb8082c81ce965b36520c21be122cdf0e292a55697c9d2c89fd767f551
+  data.tar.gz: 4eb1b72cb705b2eb473ae76f02085987b4f55a37d9ba7048789ea3133e8056850813d39cfb86b2d4f7482b31fbbb67f5387b2c3a730dfb0bffd2c5f9015d16d0
data/.circleci/config.yml ADDED
@@ -0,0 +1,44 @@
+# Java Gradle CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-java/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/openjdk:8-jdk
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    environment:
+      # Customize the JVM maximum heap limit
+      JVM_OPTS: -Xmx3200m
+      TERM: dumb
+
+    steps:
+      - checkout
+
+      # Download and cache dependencies
+      - restore_cache:
+          keys:
+            - v1-dependencies-{{ checksum "build.gradle" }}
+            # fallback to using the latest cache if no exact match is found
+            - v1-dependencies-
+
+      - run: ./gradlew dependencies
+
+      - save_cache:
+          paths:
+            - ~/.gradle
+          key: v1-dependencies-{{ checksum "build.gradle" }}
+
+      # run tests!
+      - run: ./gradlew test
+      - store_test_results:
+          path: build/test-results
data/.github/dependabot.yml ADDED
@@ -0,0 +1,11 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "gradle" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
data/README.md CHANGED
@@ -1,4 +1,5 @@
 # Kafka output plugin for Embulk
+[![CircleCI](https://circleci.com/gh/joker1007/embulk-output-kafka.svg?style=svg)](https://circleci.com/gh/joker1007/embulk-output-kafka)
 
 ## Overview
 
@@ -18,6 +19,8 @@
 - **avsc**: inline avro schema config (json, default: `null`)
 - **ignore_columns**: remove columns from output (array(string), default: `[]`)
 - **key_column_name**: use column value as record key (string, default: `null`; if null, a random number is used as the record key; this column may also be listed in `ignore_columns`)
+- **partition_column_name**: use column value as partition id (string, default: `null`; takes precedence over `key_column_name`; if the partition column value is null, the key column is used for partitioning)
+- **column_for_deletion**: column that marks a record for deletion (string, default: `null`; the column must be boolean, and if its value is `true`, KafkaProducer sends a `null` value to the Kafka broker)
 - **record_batch_size**: kafka producer record batch size (integer, default: `1000`)
 - **acks**: kafka producer required acks (string, default: `"1"`)
 - **retries**: kafka producer max retry count (integer, default: `1`)
@@ -27,7 +30,9 @@
 If you use the `avro_with_schema_registry` format, the following configs are required.
 
 - **schema_registry_url**
-- **avsc** or **avsc_file**
+
+If `avsc` and `avsc_file` are null, embulk-output-kafka fetches the schema from the schema registry.
+Currently, only TopicNameStrategy is supported.
 
 ## Example
 
data/build.gradle CHANGED
@@ -1,6 +1,6 @@
 plugins {
-    id "com.jfrog.bintray" version "1.1"
-    id "com.github.jruby-gradle.base" version "1.5.0"
+    id "com.jfrog.bintray" version "1.8.5"
+    id "com.github.jruby-gradle.base" version "1.6.0"
     id "java"
     id "checkstyle"
 }
@@ -17,26 +17,43 @@ configurations {
     provided
 }
 
-version = "0.1.3"
+version = "0.1.8"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.17"
-    provided "org.embulk:embulk-core:0.9.17"
+    compile "org.embulk:embulk-core:0.9.22"
+    provided "org.embulk:embulk-core:0.9.22"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
-    testCompile "junit:junit:4.12"
-    testCompile "org.embulk:embulk-test:0.9.17"
-    testCompile "org.embulk:embulk-standards:0.9.17"
+    testCompile "junit:junit:4.13"
+    testCompile "org.embulk:embulk-test:0.9.22"
+    testCompile "org.embulk:embulk-standards:0.9.22"
+    testCompile "org.embulk:embulk-deps-buffer:0.9.22"
 
-    compile "org.apache.kafka:kafka-clients:2.3.0"
-    compile("org.apache.avro:avro:1.9.0") {
+    compile("org.apache.kafka:kafka-clients:2.5.1") {
+        exclude group: "org.slf4j", module: "slf4j-api"
     }
-    compile("io.confluent:kafka-avro-serializer:5.3.0") {
+    compile("org.apache.avro:avro:1.10.0") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        exclude group: "org.slf4j", module: "slf4j-api"
+    }
+    compile("io.confluent:kafka-avro-serializer:5.5.1") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        exclude group: "org.slf4j", module: "slf4j-api"
     }
 
-    testCompile("com.github.charithe:kafka-junit:4.1.6") {
+    testCompile "com.salesforce.kafka.test:kafka-junit4:3.+"
+    testCompile("org.apache.kafka:kafka_2.12:2.5.+") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        // exclude group: "com.fasterxml.jackson.dataformat", module: "jackson-dataformat-csv"
+        exclude group: "com.fasterxml.jackson.datatype", module: "jackson-datatype-jdk8"
    }
 }
 
data/docker-compose.yml CHANGED
@@ -23,7 +23,7 @@ services:
       KAFKA_BROKER_ID: 1
       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
-      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://broker:9092
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
 
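Note on the advertised-listener change above: containers on the compose network still reach the broker as PLAINTEXT://broker:29092, while processes on the Docker host (such as the new integration tests) connect as PLAINTEXT_HOST://localhost:9092. A minimal host-side sketch, assuming the compose file above is running (the topic name and serializers are illustrative, not taken from this diff):

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

// Host-side smoke test against the compose broker; "test-topic" is an
// arbitrary example, not something defined in this diff.
public class HostSideProducerExample
{
    public static void main(String[] args)
    {
        Properties props = new Properties();
        // Matches the PLAINTEXT_HOST advertised listener above
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            producer.send(new ProducerRecord<>("test-topic", "key-1", "{\"id\":1}"));
            producer.flush();
        }
    }
}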
data/src/main/java/org/embulk/output/kafka/AvroFormatColumnVisitor.java CHANGED
@@ -14,21 +14,39 @@ import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 
-public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
+public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor<GenericRecord>
 {
     private Schema avroSchema;
-    public GenericRecord genericRecord;
+    private GenericRecord genericRecord;
 
-    public AvroFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, Schema avroSchema, GenericRecord genericRecord)
+    AvroFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, Schema avroSchema)
     {
         super(task, pageReader);
         this.avroSchema = avroSchema;
-        this.genericRecord = genericRecord;
+    }
+
+    @Override
+    public GenericRecord getRecord()
+    {
+        if (isDeletion()) {
+            return null;
+        }
+
+        return genericRecord;
+    }
+
+    @Override
+    void reset()
+    {
+        super.reset();
+        this.genericRecord = new GenericData.Record(avroSchema);
     }
 
     @Override
     public void booleanColumn(Column column)
     {
+        super.booleanColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -134,9 +152,8 @@ public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
         switch (avroSchema.getType()) {
             case ARRAY:
                 if (value.isArrayValue()) {
-                    return value.asArrayValue().list().stream().map(item -> {
-                        return convertMsgPackValueToAvroValue(avroSchema.getElementType(), item);
-                    }).filter(Objects::nonNull).collect(Collectors.toList());
+                    return value.asArrayValue().list().stream().map(item ->
+                            convertMsgPackValueToAvroValue(avroSchema.getElementType(), item)).filter(Objects::nonNull).collect(Collectors.toList());
                 }
                 throw new RuntimeException(String.format("Schema mismatch: avro: %s, msgpack: %s", avroSchema.getType().getName(), value.getValueType().name()));
             case MAP:
@@ -155,9 +172,8 @@ public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
             GenericRecord record = new GenericData.Record(avroSchema);
             Map<Value, Value> valueMap = value.asMapValue().map();
             for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
-                Optional.ofNullable(valueMap.get(ValueFactory.newString(field.name()))).ifPresent(v -> {
-                    record.put(field.name(), convertMsgPackValueToAvroValue(field.schema(), v));
-                });
+                Optional.ofNullable(valueMap.get(ValueFactory.newString(field.name()))).ifPresent(v ->
+                        record.put(field.name(), convertMsgPackValueToAvroValue(field.schema(), v)));
             }
             return record;
         }
data/src/main/java/org/embulk/output/kafka/AvroFormatTransactionalPageOutput.java ADDED
@@ -0,0 +1,13 @@
+package org.embulk.output.kafka;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.embulk.spi.PageReader;
+
+public class AvroFormatTransactionalPageOutput extends KafkaTransactionalPageOutput<Object, GenericRecord>
+{
+    public AvroFormatTransactionalPageOutput(KafkaProducer<Object, Object> producer, PageReader pageReader, KafkaOutputColumnVisitor<GenericRecord> columnVisitor, String topic, int taskIndex)
+    {
+        super(producer, pageReader, columnVisitor, topic, taskIndex);
+    }
+};
data/src/main/java/org/embulk/output/kafka/JsonFormatColumnVisitor.java CHANGED
@@ -10,23 +10,41 @@ import org.msgpack.value.Value;
 import java.io.IOException;
 import java.time.format.DateTimeFormatter;
 
-public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
+public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor<ObjectNode>
 {
     private ObjectMapper objectMapper;
-    public ObjectNode jsonNode;
+    private ObjectNode jsonNode;
 
     private static DateTimeFormatter timestampFormatter = DateTimeFormatter.ISO_INSTANT;
 
-    public JsonFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, ObjectMapper objectMapper)
+    JsonFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, ObjectMapper objectMapper)
     {
         super(task, pageReader);
         this.objectMapper = objectMapper;
+    }
+
+    @Override
+    public ObjectNode getRecord()
+    {
+        if (isDeletion()) {
+            return null;
+        }
+
+        return jsonNode;
+    }
+
+    @Override
+    void reset()
+    {
+        super.reset();
         this.jsonNode = objectMapper.createObjectNode();
     }
 
     @Override
     public void booleanColumn(Column column)
     {
+        super.booleanColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -42,6 +60,8 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
     @Override
     public void longColumn(Column column)
     {
+        super.longColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -58,6 +78,8 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
     @Override
     public void doubleColumn(Column column)
     {
+        super.doubleColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -68,12 +90,13 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
         }
 
         jsonNode.put(column.getName(), pageReader.getDouble(column));
-        super.doubleColumn(column);
     }
 
     @Override
     public void stringColumn(Column column)
     {
+        super.stringColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -84,7 +107,6 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
         }
 
         jsonNode.put(column.getName(), pageReader.getString(column));
-        super.stringColumn(column);
     }
 
     @Override
data/src/main/java/org/embulk/output/kafka/JsonFormatTransactionalPageOutput.java ADDED
@@ -0,0 +1,13 @@
+package org.embulk.output.kafka;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.embulk.spi.PageReader;
+
+public class JsonFormatTransactionalPageOutput extends KafkaTransactionalPageOutput<ObjectNode, ObjectNode>
+{
+    public JsonFormatTransactionalPageOutput(KafkaProducer<Object, ObjectNode> producer, PageReader pageReader, KafkaOutputColumnVisitor<ObjectNode> columnVisitor, String topic, int taskIndex)
+    {
+        super(producer, pageReader, columnVisitor, topic, taskIndex);
+    }
+};
data/src/main/java/org/embulk/output/kafka/KafkaJsonSerializer.java CHANGED
@@ -12,6 +12,10 @@ public class KafkaJsonSerializer implements Serializer<ObjectNode>
     @Override
     public byte[] serialize(String topic, ObjectNode data)
     {
+        if (data == null) {
+            return null;
+        }
+
         try {
             return objectMapper.writeValueAsBytes(data);
         }
data/src/main/java/org/embulk/output/kafka/KafkaOutputColumnVisitor.java CHANGED
@@ -4,44 +4,98 @@ import org.embulk.spi.Column;
 import org.embulk.spi.ColumnVisitor;
 import org.embulk.spi.PageReader;
 
-public abstract class KafkaOutputColumnVisitor implements ColumnVisitor
+public abstract class KafkaOutputColumnVisitor<T> implements ColumnVisitor
 {
-    KafkaOutputPlugin.PluginTask task;
+    private KafkaOutputPlugin.PluginTask task;
     PageReader pageReader;
+    private String partitionColumnName;
 
-    public Object recordKey = null;
-    public String topicName = null;
+    private Object recordKey = null;
+    private String topicName = null;
+    private Integer partition = null;
+    private boolean deletion = false;
 
-    public KafkaOutputColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader)
+    KafkaOutputColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader)
     {
         this.task = task;
         this.pageReader = pageReader;
+        this.partitionColumnName = task.getPartitionColumnName().orElse(null);
     }
 
-    void setRecordKey(Column column, Object value)
+    public abstract T getRecord();
+
+    Object getRecordKey()
+    {
+        return recordKey;
+    }
+
+    private void setRecordKey(Column column, Object value)
     {
         if (task.getKeyColumnName().isPresent() && task.getKeyColumnName().get().equals(column.getName())) {
             recordKey = value;
         }
     }
 
-    void setTopicName(Column column, String value)
+    String getTopicName()
+    {
+        return topicName;
+    }
+
+    private void setTopicName(Column column, String value)
     {
         if (task.getTopicColumn().isPresent() && task.getTopicColumn().get().equals(column.getName())) {
             topicName = value;
         }
     }
 
+    Integer getPartition()
+    {
+        return partition;
+    }
+
+    boolean isDeletion()
+    {
+        return deletion;
+    }
+
+    void reset()
+    {
+        this.recordKey = null;
+        this.topicName = null;
+        this.partition = null;
+        this.deletion = false;
+    }
+
     boolean isIgnoreColumn(Column column)
     {
         return task.getIgnoreColumns().stream().anyMatch(name -> name.equals(column.getName()));
     }
 
+    boolean isColumnForDeletion(Column column)
+    {
+        return task.getColumnForDeletion().map(name -> name.equals(column.getName())).orElse(false);
+    }
+
+    @Override
+    public void booleanColumn(Column column)
+    {
+        if (!pageReader.isNull(column)) {
+            if (isColumnForDeletion(column)) {
+                deletion = pageReader.getBoolean(column);
+            }
+        }
+    }
+
     @Override
     public void longColumn(Column column)
     {
         if (!pageReader.isNull(column)) {
-            setRecordKey(column, pageReader.getLong(column));
+            long value = pageReader.getLong(column);
+            setRecordKey(column, value);
+
+            if (partitionColumnName != null && partitionColumnName.equals(column.getName())) {
+                partition = Long.valueOf(value).intValue();
+            }
         }
     }
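Taken together, the new accessors define a per-record lifecycle: `reset()` clears the key/topic/partition/deletion state, visiting the columns fills it in, and `getRecord()` returns the payload (or null for a deletion). The driving loop lives in the new `KafkaTransactionalPageOutput`, whose body is not shown in this excerpt; the sketch below is only an illustrative approximation of that loop, not the actual implementation:

package org.embulk.output.kafka; // needed for the package-private accessors

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.embulk.spi.PageReader;

// Illustrative approximation of how KafkaTransactionalPageOutput is expected
// to drive a visitor; the real class (added in this release) is not shown here.
class VisitorLoopSketch<T>
{
    void sendPage(PageReader pageReader, KafkaOutputColumnVisitor<T> visitor,
                  KafkaProducer<Object, T> producer, String defaultTopic)
    {
        while (pageReader.nextRecord()) {
            visitor.reset();                              // clear key, topic, partition, deletion flag
            pageReader.getSchema().visitColumns(visitor); // fills the record and side state
            String topic = visitor.getTopicName() != null ? visitor.getTopicName() : defaultTopic;
            // getRecord() returns null when the deletion column was true -> tombstone
            producer.send(new ProducerRecord<>(topic, visitor.getPartition(),
                    visitor.getRecordKey(), visitor.getRecord()));
        }
    }
}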