embulk-guess-csv_verify 0.10.29-java → 0.10.30-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/classpath/aopalliance-1.0.jar +0 -0
- data/classpath/bval-core-0.5.jar +0 -0
- data/classpath/bval-jsr303-0.5.jar +0 -0
- data/classpath/commons-beanutils-core-1.8.3.jar +0 -0
- data/classpath/commons-lang3-3.4.jar +0 -0
- data/classpath/embulk-api-0.10.30.jar +0 -0
- data/classpath/embulk-core-0.10.30.jar +0 -0
- data/classpath/embulk-guess-csv-0.10.30.jar +0 -0
- data/classpath/embulk-guess-csv_verify-0.10.30.jar +0 -0
- data/classpath/{embulk-parser-csv-0.10.29.jar → embulk-parser-csv-0.10.30.jar} +0 -0
- data/classpath/embulk-spi-0.10.30.jar +0 -0
- data/classpath/guava-18.0.jar +0 -0
- data/classpath/guice-4.0.jar +0 -0
- data/classpath/guice-multibindings-4.0.jar +0 -0
- data/classpath/jackson-datatype-guava-2.6.7.jar +0 -0
- data/classpath/jackson-module-guice-2.6.7.jar +0 -0
- data/classpath/javax.inject-1.jar +0 -0
- data/classpath/msgpack-core-0.8.11.jar +0 -0
- data/classpath/slf4j-api-1.7.30.jar +0 -0
- data/lib/embulk/guess/csv_verify.rb +20 -30
- metadata +24 -6
- data/classpath/embulk-guess-csv-0.10.29.jar +0 -0
- data/classpath/embulk-guess-csv_verify-0.10.29.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c980881f2e59cd43b69eebe56691108f9a9ff2436bd9d2446b2b22330bf8f88c
|
4
|
+
data.tar.gz: da1cf6e385c0b77d8b3d1aca9076e5ab5f2a44e81ceb357207837bcfd255de0f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a3c223339d066fd2a4241232cdd6362e0836d051eb7a209353bbb4879c800fcc561ede18005b4c257e062119271e000f36cf3972afbce164f37fb7b6e51fad86
|
7
|
+
data.tar.gz: 61a444758472772ff192f932dabe6ed9fea417c413ec03203782931a0efe0d44f270686cb546870ec7bdc6a732edd89df27d97605e23a2d81838c08eb4e95511
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -25,13 +25,13 @@ module Embulk
|
|
25
25
|
CONFIG_MAPPER_FACTORY_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.ConfigMapperFactory").ruby_class
|
26
26
|
TYPE_MODULE_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.modules.TypeModule").ruby_class
|
27
27
|
CONFIG_MAPPER_FACTORY = CONFIG_MAPPER_FACTORY_CLASS.builder.addDefaultModules.addModule(TYPE_MODULE_CLASS.new).build
|
28
|
-
|
28
|
+
LEGACY_PLUGIN_TASK_CLASS = CLASSLOADER.loadClass("org.embulk.standards.CsvParserPlugin$PluginTask")
|
29
29
|
LIST_FILE_INPUT_CLASS = CLASSLOADER.loadClass("org.embulk.util.file.ListFileInput").ruby_class
|
30
30
|
LINE_DECODER_CLASS = CLASSLOADER.loadClass("org.embulk.util.text.LineDecoder").ruby_class
|
31
31
|
CSV_GUESS_PLUGIN_CLASS = CLASSLOADER.loadClass("org.embulk.guess.csv.CsvGuessPlugin").ruby_class
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
LEGACY_CSV_TOKENIZER_CLASS = CLASSLOADER.loadClass("org.embulk.standards.CsvTokenizer").ruby_class
|
33
|
+
LEGACY_TOO_FEW_COLUMNS_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$TooFewColumnsException").ruby_class
|
34
|
+
LEGACY_INVALID_VALUE_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$InvalidValueException").ruby_class
|
35
35
|
|
36
36
|
DELIMITER_CANDIDATES = [
|
37
37
|
",", "\t", "|", ";"
|
@@ -71,7 +71,9 @@ module Embulk
|
|
71
71
|
end
|
72
72
|
guessed_ruby_converted = config_to_java(guessed_ruby)
|
73
73
|
if !guessed_java.equals(guessed_ruby_converted)
|
74
|
-
|
74
|
+
log_guess_diff(guessed_ruby, guessed_java, "decoders")
|
75
|
+
log_guess_diff(guessed_ruby, guessed_java, "parser")
|
76
|
+
raise "embulk-guess-csv has difference between Java/Ruby."
|
75
77
|
end
|
76
78
|
rescue Exception => e
|
77
79
|
# Any error from the Java-based guess plugin should pass-through just with logging.
|
@@ -237,34 +239,23 @@ module Embulk
|
|
237
239
|
|
238
240
|
private
|
239
241
|
|
240
|
-
def
|
241
|
-
guessed_ruby = guessed_ruby_entire[
|
242
|
-
guessed_java = guessed_java_entire.getNestedOrGetEmpty(
|
243
|
-
|
244
|
-
require 'set'
|
245
|
-
keys = Set.new(guessed_ruby.keys) + Set.new(guessed_java.getAttributeNames)
|
242
|
+
def log_guess_diff(guessed_ruby_entire, guessed_java_entire, key)
|
243
|
+
guessed_ruby = guessed_ruby_entire[key] || {}
|
244
|
+
guessed_java = guessed_java_entire.getNestedOrGetEmpty(key)
|
246
245
|
|
247
246
|
begin
|
248
247
|
require 'json'
|
249
248
|
rescue LoadError
|
250
|
-
|
251
|
-
guessed_java_hash = nil
|
249
|
+
raise "The 'json' gem is not installed. No details compared."
|
252
250
|
else
|
253
251
|
guessed_java_hash = JSON.parse(guessed_java.toJson)
|
254
252
|
end
|
255
253
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
elsif !guessed_java.has(key.to_java)
|
261
|
-
diffs << "Only embulk-guess-csv (Ruby) has: \"#{key}\""
|
262
|
-
elsif guessed_java_hash && guessed_ruby[key] != guessed_java_hash[key]
|
263
|
-
diffs << "embulk-guess-csv has difference between Java/Ruby: \"#{key}\""
|
264
|
-
end
|
254
|
+
if guessed_java_hash && guessed_ruby != guessed_java_hash
|
255
|
+
Embulk.logger.error "[Embulk CSV guess verify] '#{key}' has difference."
|
256
|
+
Embulk.logger.error "[Embulk CSV guess verify] Java => #{guessed_java_hash.to_json}"
|
257
|
+
Embulk.logger.error "[Embulk CSV guess verify] Ruby => #{guessed_ruby.to_json}"
|
265
258
|
end
|
266
|
-
|
267
|
-
raise "embulk-guess-csv has difference between Java/Ruby: #{diffs.inspect}"
|
268
259
|
end
|
269
260
|
|
270
261
|
def config_to_java(config_ruby)
|
@@ -289,12 +280,11 @@ module Embulk
|
|
289
280
|
def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
|
290
281
|
null_string = parser_config["null_string"]
|
291
282
|
config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
|
292
|
-
parser_task =
|
283
|
+
parser_task = config.load_config(LEGACY_PLUGIN_TASK_CLASS)
|
293
284
|
data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
|
294
285
|
sample = Buffer.from_ruby_string(data)
|
295
|
-
decoder =
|
296
|
-
|
297
|
-
tokenizer = CSV_TOKENIZER_CLASS.new(decoder, parser_task)
|
286
|
+
decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
|
287
|
+
tokenizer = LEGACY_CSV_TOKENIZER_CLASS.new(decoder, parser_task)
|
298
288
|
rows = []
|
299
289
|
while tokenizer.nextFile
|
300
290
|
while tokenizer.nextRecord(skip_empty_lines)
|
@@ -308,12 +298,12 @@ module Embulk
|
|
308
298
|
column = nil
|
309
299
|
end
|
310
300
|
columns << column
|
311
|
-
rescue
|
301
|
+
rescue LEGACY_TOO_FEW_COLUMNS_EXCEPTION_CLASS
|
312
302
|
rows << columns
|
313
303
|
break
|
314
304
|
end
|
315
305
|
end
|
316
|
-
rescue
|
306
|
+
rescue LEGACY_INVALID_VALUE_EXCEPTION_CLASS
|
317
307
|
# TODO warning
|
318
308
|
tokenizer.skipCurrentLine
|
319
309
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-guess-csv_verify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.29
|
4
|
+
version: 0.10.30
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
@@ -10,19 +10,29 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-04-
|
13
|
+
date: 2021-04-15 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: Verification-purpose Embulk CSV guess plugin to compare the old Ruby-based
|
16
|
-
one and the new Java-based one (not for your production use
|
16
|
+
one and the new Java-based one (not for your production use; note that 'decoders'
|
17
|
+
and 'parser' sections in your configuration can be logged even if they contain confidential
|
18
|
+
information)
|
17
19
|
email:
|
18
20
|
- dmikurube@treasure-data.com
|
19
21
|
executables: []
|
20
22
|
extensions: []
|
21
23
|
extra_rdoc_files: []
|
22
24
|
files:
|
23
|
-
- classpath/
|
24
|
-
- classpath/
|
25
|
-
- classpath/
|
25
|
+
- classpath/aopalliance-1.0.jar
|
26
|
+
- classpath/bval-core-0.5.jar
|
27
|
+
- classpath/bval-jsr303-0.5.jar
|
28
|
+
- classpath/commons-beanutils-core-1.8.3.jar
|
29
|
+
- classpath/commons-lang3-3.4.jar
|
30
|
+
- classpath/embulk-api-0.10.30.jar
|
31
|
+
- classpath/embulk-core-0.10.30.jar
|
32
|
+
- classpath/embulk-guess-csv-0.10.30.jar
|
33
|
+
- classpath/embulk-guess-csv_verify-0.10.30.jar
|
34
|
+
- classpath/embulk-parser-csv-0.10.30.jar
|
35
|
+
- classpath/embulk-spi-0.10.30.jar
|
26
36
|
- classpath/embulk-util-config-0.2.1.jar
|
27
37
|
- classpath/embulk-util-file-0.1.3.jar
|
28
38
|
- classpath/embulk-util-guess-0.1.1.jar
|
@@ -30,11 +40,19 @@ files:
|
|
30
40
|
- classpath/embulk-util-rubytime-0.3.2.jar
|
31
41
|
- classpath/embulk-util-text-0.1.0.jar
|
32
42
|
- classpath/embulk-util-timestamp-0.2.1.jar
|
43
|
+
- classpath/guava-18.0.jar
|
44
|
+
- classpath/guice-4.0.jar
|
45
|
+
- classpath/guice-multibindings-4.0.jar
|
33
46
|
- classpath/icu4j-54.1.1.jar
|
34
47
|
- classpath/jackson-annotations-2.6.7.jar
|
35
48
|
- classpath/jackson-core-2.6.7.jar
|
36
49
|
- classpath/jackson-databind-2.6.7.jar
|
50
|
+
- classpath/jackson-datatype-guava-2.6.7.jar
|
37
51
|
- classpath/jackson-datatype-jdk8-2.6.7.jar
|
52
|
+
- classpath/jackson-module-guice-2.6.7.jar
|
53
|
+
- classpath/javax.inject-1.jar
|
54
|
+
- classpath/msgpack-core-0.8.11.jar
|
55
|
+
- classpath/slf4j-api-1.7.30.jar
|
38
56
|
- classpath/validation-api-1.1.0.Final.jar
|
39
57
|
- lib/embulk/guess/csv_verify.rb
|
40
58
|
homepage: https://github.com/embulk/embulk
|
Binary file
|
Binary file
|