embulk-guess-csv_verify 0.10.29-java → 0.10.30-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/aopalliance-1.0.jar +0 -0
- data/classpath/bval-core-0.5.jar +0 -0
- data/classpath/bval-jsr303-0.5.jar +0 -0
- data/classpath/commons-beanutils-core-1.8.3.jar +0 -0
- data/classpath/commons-lang3-3.4.jar +0 -0
- data/classpath/embulk-api-0.10.30.jar +0 -0
- data/classpath/embulk-core-0.10.30.jar +0 -0
- data/classpath/embulk-guess-csv-0.10.30.jar +0 -0
- data/classpath/embulk-guess-csv_verify-0.10.30.jar +0 -0
- data/classpath/{embulk-parser-csv-0.10.29.jar → embulk-parser-csv-0.10.30.jar} +0 -0
- data/classpath/embulk-spi-0.10.30.jar +0 -0
- data/classpath/guava-18.0.jar +0 -0
- data/classpath/guice-4.0.jar +0 -0
- data/classpath/guice-multibindings-4.0.jar +0 -0
- data/classpath/jackson-datatype-guava-2.6.7.jar +0 -0
- data/classpath/jackson-module-guice-2.6.7.jar +0 -0
- data/classpath/javax.inject-1.jar +0 -0
- data/classpath/msgpack-core-0.8.11.jar +0 -0
- data/classpath/slf4j-api-1.7.30.jar +0 -0
- data/lib/embulk/guess/csv_verify.rb +20 -30
- metadata +24 -6
- data/classpath/embulk-guess-csv-0.10.29.jar +0 -0
- data/classpath/embulk-guess-csv_verify-0.10.29.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c980881f2e59cd43b69eebe56691108f9a9ff2436bd9d2446b2b22330bf8f88c
|
4
|
+
data.tar.gz: da1cf6e385c0b77d8b3d1aca9076e5ab5f2a44e81ceb357207837bcfd255de0f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a3c223339d066fd2a4241232cdd6362e0836d051eb7a209353bbb4879c800fcc561ede18005b4c257e062119271e000f36cf3972afbce164f37fb7b6e51fad86
|
7
|
+
data.tar.gz: 61a444758472772ff192f932dabe6ed9fea417c413ec03203782931a0efe0d44f270686cb546870ec7bdc6a732edd89df27d97605e23a2d81838c08eb4e95511
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -25,13 +25,13 @@ module Embulk
|
|
25
25
|
CONFIG_MAPPER_FACTORY_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.ConfigMapperFactory").ruby_class
|
26
26
|
TYPE_MODULE_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.modules.TypeModule").ruby_class
|
27
27
|
CONFIG_MAPPER_FACTORY = CONFIG_MAPPER_FACTORY_CLASS.builder.addDefaultModules.addModule(TYPE_MODULE_CLASS.new).build
|
28
|
-
|
28
|
+
LEGACY_PLUGIN_TASK_CLASS = CLASSLOADER.loadClass("org.embulk.standards.CsvParserPlugin$PluginTask")
|
29
29
|
LIST_FILE_INPUT_CLASS = CLASSLOADER.loadClass("org.embulk.util.file.ListFileInput").ruby_class
|
30
30
|
LINE_DECODER_CLASS = CLASSLOADER.loadClass("org.embulk.util.text.LineDecoder").ruby_class
|
31
31
|
CSV_GUESS_PLUGIN_CLASS = CLASSLOADER.loadClass("org.embulk.guess.csv.CsvGuessPlugin").ruby_class
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
LEGACY_CSV_TOKENIZER_CLASS = CLASSLOADER.loadClass("org.embulk.standards.CsvTokenizer").ruby_class
|
33
|
+
LEGACY_TOO_FEW_COLUMNS_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$TooFewColumnsException").ruby_class
|
34
|
+
LEGACY_INVALID_VALUE_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$InvalidValueException").ruby_class
|
35
35
|
|
36
36
|
DELIMITER_CANDIDATES = [
|
37
37
|
",", "\t", "|", ";"
|
@@ -71,7 +71,9 @@ module Embulk
|
|
71
71
|
end
|
72
72
|
guessed_ruby_converted = config_to_java(guessed_ruby)
|
73
73
|
if !guessed_java.equals(guessed_ruby_converted)
|
74
|
-
|
74
|
+
log_guess_diff(guessed_ruby, guessed_java, "decoders")
|
75
|
+
log_guess_diff(guessed_ruby, guessed_java, "parser")
|
76
|
+
raise "embulk-guess-csv has difference between Java/Ruby."
|
75
77
|
end
|
76
78
|
rescue Exception => e
|
77
79
|
# Any error from the Java-based guess plugin should pass-through just with logging.
|
@@ -237,34 +239,23 @@ module Embulk
|
|
237
239
|
|
238
240
|
private
|
239
241
|
|
240
|
-
def
|
241
|
-
guessed_ruby = guessed_ruby_entire[
|
242
|
-
guessed_java = guessed_java_entire.getNestedOrGetEmpty(
|
243
|
-
|
244
|
-
require 'set'
|
245
|
-
keys = Set.new(guessed_ruby.keys) + Set.new(guessed_java.getAttributeNames)
|
242
|
+
def log_guess_diff(guessed_ruby_entire, guessed_java_entire, key)
|
243
|
+
guessed_ruby = guessed_ruby_entire[key] || {}
|
244
|
+
guessed_java = guessed_java_entire.getNestedOrGetEmpty(key)
|
246
245
|
|
247
246
|
begin
|
248
247
|
require 'json'
|
249
248
|
rescue LoadError
|
250
|
-
|
251
|
-
guessed_java_hash = nil
|
249
|
+
raise "The 'json' gem is not installed. No details compared."
|
252
250
|
else
|
253
251
|
guessed_java_hash = JSON.parse(guessed_java.toJson)
|
254
252
|
end
|
255
253
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
elsif !guessed_java.has(key.to_java)
|
261
|
-
diffs << "Only embulk-guess-csv (Ruby) has: \"#{key}\""
|
262
|
-
elsif guessed_java_hash && guessed_ruby[key] != guessed_java_hash[key]
|
263
|
-
diffs << "embulk-guess-csv has difference between Java/Ruby: \"#{key}\""
|
264
|
-
end
|
254
|
+
if guessed_java_hash && guessed_ruby != guessed_java_hash
|
255
|
+
Embulk.logger.error "[Embulk CSV guess verify] '#{key}' has difference."
|
256
|
+
Embulk.logger.error "[Embulk CSV guess verify] Java => #{guessed_java_hash.to_json}"
|
257
|
+
Embulk.logger.error "[Embulk CSV guess verify] Ruby => #{guessed_ruby.to_json}"
|
265
258
|
end
|
266
|
-
|
267
|
-
raise "embulk-guess-csv has difference between Java/Ruby: #{diffs.inspect}"
|
268
259
|
end
|
269
260
|
|
270
261
|
def config_to_java(config_ruby)
|
@@ -289,12 +280,11 @@ module Embulk
|
|
289
280
|
def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
|
290
281
|
null_string = parser_config["null_string"]
|
291
282
|
config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
|
292
|
-
parser_task =
|
283
|
+
parser_task = config.load_config(LEGACY_PLUGIN_TASK_CLASS)
|
293
284
|
data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
|
294
285
|
sample = Buffer.from_ruby_string(data)
|
295
|
-
decoder =
|
296
|
-
|
297
|
-
tokenizer = CSV_TOKENIZER_CLASS.new(decoder, parser_task)
|
286
|
+
decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
|
287
|
+
tokenizer = LEGACY_CSV_TOKENIZER_CLASS.new(decoder, parser_task)
|
298
288
|
rows = []
|
299
289
|
while tokenizer.nextFile
|
300
290
|
while tokenizer.nextRecord(skip_empty_lines)
|
@@ -308,12 +298,12 @@ module Embulk
|
|
308
298
|
column = nil
|
309
299
|
end
|
310
300
|
columns << column
|
311
|
-
rescue
|
301
|
+
rescue LEGACY_TOO_FEW_COLUMNS_EXCEPTION_CLASS
|
312
302
|
rows << columns
|
313
303
|
break
|
314
304
|
end
|
315
305
|
end
|
316
|
-
rescue
|
306
|
+
rescue LEGACY_INVALID_VALUE_EXCEPTION_CLASS
|
317
307
|
# TODO warning
|
318
308
|
tokenizer.skipCurrentLine
|
319
309
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-guess-csv_verify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.
|
4
|
+
version: 0.10.30
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
@@ -10,19 +10,29 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-04-
|
13
|
+
date: 2021-04-15 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: Verification-purpose Embulk CSV guess plugin to compare the old Ruby-based
|
16
|
-
one and the new Java-based one (not for your production use
|
16
|
+
one and the new Java-based one (not for your production use; note that 'decoders'
|
17
|
+
and 'parser' sections in your configuration can be logged even if they contain confidential
|
18
|
+
information)
|
17
19
|
email:
|
18
20
|
- dmikurube@treasure-data.com
|
19
21
|
executables: []
|
20
22
|
extensions: []
|
21
23
|
extra_rdoc_files: []
|
22
24
|
files:
|
23
|
-
- classpath/
|
24
|
-
- classpath/
|
25
|
-
- classpath/
|
25
|
+
- classpath/aopalliance-1.0.jar
|
26
|
+
- classpath/bval-core-0.5.jar
|
27
|
+
- classpath/bval-jsr303-0.5.jar
|
28
|
+
- classpath/commons-beanutils-core-1.8.3.jar
|
29
|
+
- classpath/commons-lang3-3.4.jar
|
30
|
+
- classpath/embulk-api-0.10.30.jar
|
31
|
+
- classpath/embulk-core-0.10.30.jar
|
32
|
+
- classpath/embulk-guess-csv-0.10.30.jar
|
33
|
+
- classpath/embulk-guess-csv_verify-0.10.30.jar
|
34
|
+
- classpath/embulk-parser-csv-0.10.30.jar
|
35
|
+
- classpath/embulk-spi-0.10.30.jar
|
26
36
|
- classpath/embulk-util-config-0.2.1.jar
|
27
37
|
- classpath/embulk-util-file-0.1.3.jar
|
28
38
|
- classpath/embulk-util-guess-0.1.1.jar
|
@@ -30,11 +40,19 @@ files:
|
|
30
40
|
- classpath/embulk-util-rubytime-0.3.2.jar
|
31
41
|
- classpath/embulk-util-text-0.1.0.jar
|
32
42
|
- classpath/embulk-util-timestamp-0.2.1.jar
|
43
|
+
- classpath/guava-18.0.jar
|
44
|
+
- classpath/guice-4.0.jar
|
45
|
+
- classpath/guice-multibindings-4.0.jar
|
33
46
|
- classpath/icu4j-54.1.1.jar
|
34
47
|
- classpath/jackson-annotations-2.6.7.jar
|
35
48
|
- classpath/jackson-core-2.6.7.jar
|
36
49
|
- classpath/jackson-databind-2.6.7.jar
|
50
|
+
- classpath/jackson-datatype-guava-2.6.7.jar
|
37
51
|
- classpath/jackson-datatype-jdk8-2.6.7.jar
|
52
|
+
- classpath/jackson-module-guice-2.6.7.jar
|
53
|
+
- classpath/javax.inject-1.jar
|
54
|
+
- classpath/msgpack-core-0.8.11.jar
|
55
|
+
- classpath/slf4j-api-1.7.30.jar
|
38
56
|
- classpath/validation-api-1.1.0.Final.jar
|
39
57
|
- lib/embulk/guess/csv_verify.rb
|
40
58
|
homepage: https://github.com/embulk/embulk
|
Binary file
|
Binary file
|