embulk-output-kafka 0.1.3 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +44 -0
  3. data/.github/dependabot.yml +11 -0
  4. data/README.md +6 -1
  5. data/build.gradle +29 -12
  6. data/docker-compose.yml +1 -1
  7. data/src/main/java/org/embulk/output/kafka/AvroFormatColumnVisitor.java +26 -10
  8. data/src/main/java/org/embulk/output/kafka/AvroFormatTransactionalPageOutput.java +13 -0
  9. data/src/main/java/org/embulk/output/kafka/JsonFormatColumnVisitor.java +27 -5
  10. data/src/main/java/org/embulk/output/kafka/JsonFormatTransactionalPageOutput.java +13 -0
  11. data/src/main/java/org/embulk/output/kafka/KafkaJsonSerializer.java +4 -0
  12. data/src/main/java/org/embulk/output/kafka/KafkaOutputColumnVisitor.java +62 -8
  13. data/src/main/java/org/embulk/output/kafka/KafkaOutputPlugin.java +82 -145
  14. data/src/main/java/org/embulk/output/kafka/KafkaTransactionalPageOutput.java +104 -0
  15. data/src/main/java/org/embulk/output/kafka/RecordProducerFactory.java +3 -3
  16. data/src/test/java/org/embulk/output/kafka/TestKafkaOutputPlugin.java +384 -0
  17. data/src/test/resources/config_complex.yml +9 -28
  18. data/src/test/resources/config_complex_avro.yml +23 -42
  19. data/src/test/resources/config_simple.yml +5 -22
  20. data/src/test/resources/config_simple_avro.yml +14 -32
  21. data/src/test/resources/config_simple_avro_avsc_file.yml +7 -25
  22. data/src/test/resources/config_with_column_for_deletion.yml +7 -0
  23. data/src/test/resources/config_with_column_for_deletion_avro.yml +18 -0
  24. data/src/test/resources/config_with_key_column.yml +6 -23
  25. data/src/test/resources/config_with_partition_column.yml +6 -0
  26. data/src/test/resources/in1.csv +4 -4
  27. data/src/test/resources/in_complex.csv +4 -4
  28. data/src/test/resources/in_with_deletion.csv +4 -0
  29. metadata +30 -24
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: fb1edf4ce79bd490f0f01662547d36c2e6d2dd6e
-  data.tar.gz: 44b9affb8e9d1385314274b5d1ac083fd5a4848e
+SHA256:
+  metadata.gz: cb0e6dfc4b8b49b93fb6966948e8930e5785cf5a2a89fc5a8f17c80a4e7865c1
+  data.tar.gz: 18cb578eba9423c490e7416c906054f8401301b977e796c271182f593dc08563
 SHA512:
-  metadata.gz: 5c76d2cca3d141b7f44208449ba092ee732993183de6e6f19ec4e5280f302f0e3a4b5746414332103aa492b0cb7ea7df55159fa936248839bb68a8280ecfc060
-  data.tar.gz: 28dcd4f04aceec97e78c427b2b859266b9adb5147ae8cd28a63c5df86840d5c4051af049b225eefe01d410007afdd17222ec96fc3525cf0cd28e60c30414f6a2
+  metadata.gz: 84a080d3cc49d30ad04802162e94de0072aac96d2562649c34cb1623e7734f783e96deeb8082c81ce965b36520c21be122cdf0e292a55697c9d2c89fd767f551
+  data.tar.gz: 4eb1b72cb705b2eb473ae76f02085987b4f55a37d9ba7048789ea3133e8056850813d39cfb86b2d4f7482b31fbbb67f5387b2c3a730dfb0bffd2c5f9015d16d0
data/.circleci/config.yml ADDED
@@ -0,0 +1,44 @@
+# Java Gradle CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-java/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/openjdk:8-jdk
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    environment:
+      # Customize the JVM maximum heap limit
+      JVM_OPTS: -Xmx3200m
+      TERM: dumb
+
+    steps:
+      - checkout
+
+      # Download and cache dependencies
+      - restore_cache:
+          keys:
+            - v1-dependencies-{{ checksum "build.gradle" }}
+            # fallback to using the latest cache if no exact match is found
+            - v1-dependencies-
+
+      - run: ./gradlew dependencies
+
+      - save_cache:
+          paths:
+            - ~/.gradle
+          key: v1-dependencies-{{ checksum "build.gradle" }}
+
+      # run tests!
+      - run: ./gradlew test
+      - store_test_results:
+          path: build/test-results
data/.github/dependabot.yml ADDED
@@ -0,0 +1,11 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "gradle" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
data/README.md CHANGED
@@ -1,4 +1,5 @@
 # Kafka output plugin for Embulk
+[![CircleCI](https://circleci.com/gh/joker1007/embulk-output-kafka.svg?style=svg)](https://circleci.com/gh/joker1007/embulk-output-kafka)
 
 ## Overview
 
@@ -18,6 +19,8 @@
 - **avsc**: inline avro schema config (json, default: `null`)
 - **ignore_columns**: remove columns from output (array(string), default: `[]`)
 - **key_column_name**: use column value as record key (string, default: `null`, if this parameter is null, set random number as record key, and it can use column in `ignore_columns`)
+- **partition_column_name**: use column value as the partition id (string, default: `null`; this option takes precedence over `key_column_name`, and if the partition column value is null, the key column is used for partitioning instead)
+- **column_for_deletion**: use a boolean column as a deletion flag (string, default: `null`; the named column must be boolean, and if its value is `true`, KafkaProducer sends a `null` value, i.e. a tombstone, to the Kafka broker; see the sketch below this hunk)
 - **record_batch_size**: kafka producer record batch size (integer, default: `1000`)
 - **acks**: kafka producer require acks (string, default: `"1"`)
 - **retries**: kafka producer max retry count (integer, default: `1`)
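The two new options map directly onto Kafka's `ProducerRecord`. As a hedged sketch (the class and values below are illustrative, not the plugin's actual code):

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

class PartitionAndDeletionSketch
{
    // "partition" stands in for the partition_column_name value, "key" for
    // key_column_name, and "deletion" for the column_for_deletion flag.
    static void sendRow(KafkaProducer<Object, ObjectNode> producer,
                        Integer partition, Object key, boolean deletion)
    {
        ObjectNode value = deletion
                ? null  // a null value is a tombstone on log-compacted topics
                : new ObjectMapper().createObjectNode().put("id", 1);
        // An explicit non-null partition bypasses partitioning by key hash.
        producer.send(new ProducerRecord<>("my-topic", partition, key, value));
    }
}
```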
@@ -27,7 +30,9 @@
 If use `avro_with_schema_registry` format, following configs are required.
 
 - **schema_registry_url**
-- **avsc** or **avsc_file**
+
+If `avsc` and `avsc_file` are null, embulk-output-kafka fetches the schema from the schema registry.
+Currently, however, embulk-output-kafka supports only TopicNameStrategy (see the sketch below).
 
 ## Example
 
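Under TopicNameStrategy, the Schema Registry subject for record values is derived from the topic name alone (`<topic>-value`). A minimal sketch of the lookup this implies, using the Confluent client the plugin already depends on (the wrapper class and parameters are illustrative):

```java
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.SchemaMetadata;

class SchemaLookupSketch
{
    static String latestValueSchema(String registryUrl, String topic) throws Exception
    {
        // TopicNameStrategy: the value subject is always "<topic>-value".
        String subject = topic + "-value";
        CachedSchemaRegistryClient client = new CachedSchemaRegistryClient(registryUrl, 10);
        SchemaMetadata metadata = client.getLatestSchemaMetadata(subject);
        return metadata.getSchema();  // the Avro schema as a JSON string
    }
}
```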
data/build.gradle CHANGED
@@ -1,6 +1,6 @@
 plugins {
-    id "com.jfrog.bintray" version "1.1"
-    id "com.github.jruby-gradle.base" version "1.5.0"
+    id "com.jfrog.bintray" version "1.8.5"
+    id "com.github.jruby-gradle.base" version "1.6.0"
     id "java"
     id "checkstyle"
 }
@@ -17,26 +17,43 @@ configurations {
     provided
 }
 
-version = "0.1.3"
+version = "0.1.8"
 
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 
 dependencies {
-    compile "org.embulk:embulk-core:0.9.17"
-    provided "org.embulk:embulk-core:0.9.17"
+    compile "org.embulk:embulk-core:0.9.22"
+    provided "org.embulk:embulk-core:0.9.22"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
-    testCompile "junit:junit:4.12"
-    testCompile "org.embulk:embulk-test:0.9.17"
-    testCompile "org.embulk:embulk-standards:0.9.17"
+    testCompile "junit:junit:4.13"
+    testCompile "org.embulk:embulk-test:0.9.22"
+    testCompile "org.embulk:embulk-standards:0.9.22"
+    testCompile "org.embulk:embulk-deps-buffer:0.9.22"
 
-    compile "org.apache.kafka:kafka-clients:2.3.0"
-    compile("org.apache.avro:avro:1.9.0") {
+    compile("org.apache.kafka:kafka-clients:2.5.1") {
+        exclude group: "org.slf4j", module: "slf4j-api"
     }
-    compile("io.confluent:kafka-avro-serializer:5.3.0") {
+    compile("org.apache.avro:avro:1.10.0") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        exclude group: "org.slf4j", module: "slf4j-api"
+    }
+    compile("io.confluent:kafka-avro-serializer:5.5.1") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        exclude group: "org.slf4j", module: "slf4j-api"
     }
 
-    testCompile("com.github.charithe:kafka-junit:4.1.6") {
+    testCompile "com.salesforce.kafka.test:kafka-junit4:3.+"
+    testCompile("org.apache.kafka:kafka_2.12:2.5.+") {
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-databind"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-annotations"
+        exclude group: "com.fasterxml.jackson.core", module: "jackson-core"
+        // exclude group: "com.fasterxml.jackson.dataformat", module: "jackson-dataformat-csv"
+        exclude group: "com.fasterxml.jackson.datatype", module: "jackson-datatype-jdk8"
     }
 }
 
data/docker-compose.yml CHANGED
@@ -23,7 +23,7 @@ services:
       KAFKA_BROKER_ID: 1
       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
-      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://broker:9092
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
       KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
 
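This fix matters because Kafka clients reconnect to whatever address the broker advertises for the listener they hit: a client running on the host bootstraps via port 9092 and must be handed back an address it can resolve, hence `localhost:9092` rather than `broker:9092`. A hedged host-side client sketch (configuration only, not part of the plugin):

```java
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

class HostClientSketch
{
    public static void main(String[] args)
    {
        Properties props = new Properties();
        // Bootstrap via the PLAINTEXT_HOST listener; the advertised address the
        // broker returns (localhost:9092) is resolvable from the host machine.
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            producer.send(new ProducerRecord<>("test-topic", "k", "v"));
        }
    }
}
```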
data/src/main/java/org/embulk/output/kafka/AvroFormatColumnVisitor.java CHANGED
@@ -14,21 +14,39 @@ import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 
-public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
+public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor<GenericRecord>
 {
     private Schema avroSchema;
-    public GenericRecord genericRecord;
+    private GenericRecord genericRecord;
 
-    public AvroFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, Schema avroSchema, GenericRecord genericRecord)
+    AvroFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, Schema avroSchema)
     {
         super(task, pageReader);
         this.avroSchema = avroSchema;
-        this.genericRecord = genericRecord;
+    }
+
+    @Override
+    public GenericRecord getRecord()
+    {
+        if (isDeletion()) {
+            return null;
+        }
+
+        return genericRecord;
+    }
+
+    @Override
+    void reset()
+    {
+        super.reset();
+        this.genericRecord = new GenericData.Record(avroSchema);
     }
 
     @Override
     public void booleanColumn(Column column)
     {
+        super.booleanColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -134,9 +152,8 @@ public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
         switch (avroSchema.getType()) {
             case ARRAY:
                 if (value.isArrayValue()) {
-                    return value.asArrayValue().list().stream().map(item -> {
-                        return convertMsgPackValueToAvroValue(avroSchema.getElementType(), item);
-                    }).filter(Objects::nonNull).collect(Collectors.toList());
+                    return value.asArrayValue().list().stream().map(item ->
+                            convertMsgPackValueToAvroValue(avroSchema.getElementType(), item)).filter(Objects::nonNull).collect(Collectors.toList());
                 }
                 throw new RuntimeException(String.format("Schema mismatch: avro: %s, msgpack: %s", avroSchema.getType().getName(), value.getValueType().name()));
             case MAP:
@@ -155,9 +172,8 @@ public class AvroFormatColumnVisitor extends KafkaOutputColumnVisitor
         GenericRecord record = new GenericData.Record(avroSchema);
         Map<Value, Value> valueMap = value.asMapValue().map();
         for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
-            Optional.ofNullable(valueMap.get(ValueFactory.newString(field.name()))).ifPresent(v -> {
-                record.put(field.name(), convertMsgPackValueToAvroValue(field.schema(), v));
-            });
+            Optional.ofNullable(valueMap.get(ValueFactory.newString(field.name()))).ifPresent(v ->
+                    record.put(field.name(), convertMsgPackValueToAvroValue(field.schema(), v)));
         }
         return record;
     }
data/src/main/java/org/embulk/output/kafka/AvroFormatTransactionalPageOutput.java ADDED
@@ -0,0 +1,13 @@
+package org.embulk.output.kafka;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.embulk.spi.PageReader;
+
+public class AvroFormatTransactionalPageOutput extends KafkaTransactionalPageOutput<Object, GenericRecord>
+{
+    public AvroFormatTransactionalPageOutput(KafkaProducer<Object, Object> producer, PageReader pageReader, KafkaOutputColumnVisitor<GenericRecord> columnVisitor, String topic, int taskIndex)
+    {
+        super(producer, pageReader, columnVisitor, topic, taskIndex);
+    }
+};
data/src/main/java/org/embulk/output/kafka/JsonFormatColumnVisitor.java CHANGED
@@ -10,23 +10,41 @@ import org.msgpack.value.Value;
 import java.io.IOException;
 import java.time.format.DateTimeFormatter;
 
-public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
+public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor<ObjectNode>
 {
     private ObjectMapper objectMapper;
-    public ObjectNode jsonNode;
+    private ObjectNode jsonNode;
 
     private static DateTimeFormatter timestampFormatter = DateTimeFormatter.ISO_INSTANT;
 
-    public JsonFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, ObjectMapper objectMapper)
+    JsonFormatColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader, ObjectMapper objectMapper)
     {
         super(task, pageReader);
         this.objectMapper = objectMapper;
+    }
+
+    @Override
+    public ObjectNode getRecord()
+    {
+        if (isDeletion()) {
+            return null;
+        }
+
+        return jsonNode;
+    }
+
+    @Override
+    void reset()
+    {
+        super.reset();
         this.jsonNode = objectMapper.createObjectNode();
     }
 
     @Override
     public void booleanColumn(Column column)
     {
+        super.booleanColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -42,6 +60,8 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
     @Override
     public void longColumn(Column column)
     {
+        super.longColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -58,6 +78,8 @@ public class JsonFormatColumnVisitor extends KafkaOutputColumnVisitor
     @Override
     public void doubleColumn(Column column)
     {
+        super.doubleColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -68,12 +90,13 @@
         }
 
         jsonNode.put(column.getName(), pageReader.getDouble(column));
-        super.doubleColumn(column);
     }
 
     @Override
     public void stringColumn(Column column)
     {
+        super.stringColumn(column);
+
         if (isIgnoreColumn(column)) {
             return;
         }
@@ -84,7 +107,6 @@
         }
 
         jsonNode.put(column.getName(), pageReader.getString(column));
-        super.stringColumn(column);
     }
 
     @Override
data/src/main/java/org/embulk/output/kafka/JsonFormatTransactionalPageOutput.java ADDED
@@ -0,0 +1,13 @@
+package org.embulk.output.kafka;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.embulk.spi.PageReader;
+
+public class JsonFormatTransactionalPageOutput extends KafkaTransactionalPageOutput<ObjectNode, ObjectNode>
+{
+    public JsonFormatTransactionalPageOutput(KafkaProducer<Object, ObjectNode> producer, PageReader pageReader, KafkaOutputColumnVisitor<ObjectNode> columnVisitor, String topic, int taskIndex)
+    {
+        super(producer, pageReader, columnVisitor, topic, taskIndex);
+    }
+};
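Both wrappers just bind the generic parameters of `KafkaTransactionalPageOutput`, a new 104-line base class whose diff is not reproduced in this section. The skeleton below is an assumption reconstructed from the constructor signature and the visitor API, not the actual file; the role of the first type parameter is not visible here, and error handling and task reporting are omitted:

```java
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.embulk.config.TaskReport;
import org.embulk.spi.Exec;
import org.embulk.spi.Page;
import org.embulk.spi.PageReader;
import org.embulk.spi.TransactionalPageOutput;

// Hypothetical skeleton only; the real class is not shown in this diff.
abstract class KafkaTransactionalPageOutputSketch<K, V> implements TransactionalPageOutput
{
    private final KafkaProducer<Object, V> producer;
    private final PageReader pageReader;
    private final KafkaOutputColumnVisitor<V> columnVisitor;
    private final String topic;

    KafkaTransactionalPageOutputSketch(KafkaProducer<Object, V> producer, PageReader pageReader,
            KafkaOutputColumnVisitor<V> columnVisitor, String topic, int taskIndex)
    {
        this.producer = producer;
        this.pageReader = pageReader;
        this.columnVisitor = columnVisitor;
        this.topic = topic;
    }

    @Override
    public void add(Page page)
    {
        pageReader.setPage(page);
        while (pageReader.nextRecord()) {
            columnVisitor.reset();                              // clear per-row state
            pageReader.getSchema().visitColumns(columnVisitor); // fill record, key, partition, deletion
            String target = columnVisitor.getTopicName() != null ? columnVisitor.getTopicName() : topic;
            producer.send(new ProducerRecord<>(target, columnVisitor.getPartition(),
                    columnVisitor.getRecordKey(), columnVisitor.getRecord()));
        }
    }

    @Override
    public void finish()
    {
        producer.flush();
    }

    @Override
    public void close()
    {
        producer.close();
    }

    @Override
    public void abort()
    {
    }

    @Override
    public TaskReport commit()
    {
        return Exec.newTaskReport();
    }
}
```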
data/src/main/java/org/embulk/output/kafka/KafkaJsonSerializer.java CHANGED
@@ -12,6 +12,10 @@ public class KafkaJsonSerializer implements Serializer<ObjectNode>
     @Override
     public byte[] serialize(String topic, ObjectNode data)
    {
+        if (data == null) {
+            return null;
+        }
+
         try {
             return objectMapper.writeValueAsBytes(data);
         }
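KafkaProducer invokes the value serializer even when a record's value is null, so the serializer itself must pass null through for `column_for_deletion` tombstones to reach the broker. A quick illustrative check (assuming the class keeps a public no-arg constructor):

```java
import org.embulk.output.kafka.KafkaJsonSerializer;

class TombstonePassThroughCheck
{
    public static void main(String[] args)
    {
        KafkaJsonSerializer serializer = new KafkaJsonSerializer();
        byte[] bytes = serializer.serialize("any-topic", null);
        System.out.println(bytes == null);  // true: the null value survives serialization
    }
}
```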
data/src/main/java/org/embulk/output/kafka/KafkaOutputColumnVisitor.java CHANGED
@@ -4,44 +4,98 @@ import org.embulk.spi.Column;
 import org.embulk.spi.ColumnVisitor;
 import org.embulk.spi.PageReader;
 
-public abstract class KafkaOutputColumnVisitor implements ColumnVisitor
+public abstract class KafkaOutputColumnVisitor<T> implements ColumnVisitor
 {
-    KafkaOutputPlugin.PluginTask task;
+    private KafkaOutputPlugin.PluginTask task;
     PageReader pageReader;
+    private String partitionColumnName;
 
-    public Object recordKey = null;
-    public String topicName = null;
+    private Object recordKey = null;
+    private String topicName = null;
+    private Integer partition = null;
+    private boolean deletion = false;
 
-    public KafkaOutputColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader)
+    KafkaOutputColumnVisitor(KafkaOutputPlugin.PluginTask task, PageReader pageReader)
    {
         this.task = task;
         this.pageReader = pageReader;
+        this.partitionColumnName = task.getPartitionColumnName().orElse(null);
     }
 
-    void setRecordKey(Column column, Object value)
+    public abstract T getRecord();
+
+    Object getRecordKey()
+    {
+        return recordKey;
+    }
+
+    private void setRecordKey(Column column, Object value)
     {
         if (task.getKeyColumnName().isPresent() && task.getKeyColumnName().get().equals(column.getName())) {
             recordKey = value;
         }
     }
 
-    void setTopicName(Column column, String value)
+    String getTopicName()
+    {
+        return topicName;
+    }
+
+    private void setTopicName(Column column, String value)
     {
         if (task.getTopicColumn().isPresent() && task.getTopicColumn().get().equals(column.getName())) {
             topicName = value;
         }
     }
 
+    Integer getPartition()
+    {
+        return partition;
+    }
+
+    boolean isDeletion()
+    {
+        return deletion;
+    }
+
+    void reset()
+    {
+        this.recordKey = null;
+        this.topicName = null;
+        this.partition = null;
+        this.deletion = false;
+    }
+
     boolean isIgnoreColumn(Column column)
     {
         return task.getIgnoreColumns().stream().anyMatch(name -> name.equals(column.getName()));
     }
 
+    boolean isColumnForDeletion(Column column)
+    {
+        return task.getColumnForDeletion().map(name -> name.equals(column.getName())).orElse(false);
+    }
+
+    @Override
+    public void booleanColumn(Column column)
+    {
+        if (!pageReader.isNull(column)) {
+            if (isColumnForDeletion(column)) {
+                deletion = pageReader.getBoolean(column);
+            }
+        }
+    }
+
     @Override
     public void longColumn(Column column)
     {
         if (!pageReader.isNull(column)) {
-            setRecordKey(column, pageReader.getLong(column));
+            long value = pageReader.getLong(column);
+            setRecordKey(column, value);
+
+            if (partitionColumnName != null && partitionColumnName.equals(column.getName())) {
+                partition = Long.valueOf(value).intValue();
+            }
         }
     }
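A worked example of the partition extraction above (values are illustrative): a row whose partition column holds `2` is sent explicitly to partition 2 of the topic.

```java
class PartitionExtractionExample
{
    public static void main(String[] args)
    {
        long partitionColumnValue = 2L;  // long value read from the partition column
        Integer partition = Long.valueOf(partitionColumnValue).intValue();
        System.out.println(partition);   // 2: the ProducerRecord targets partition 2
    }
}
```

Note that the long is narrowed with `intValue()`, so values outside the int range would wrap rather than fail.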