embulk 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +8 -2
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +5 -8
- data/embulk-docs/push-gh-pages.sh +1 -1
- data/embulk-docs/src/release.rst +2 -0
- data/embulk-docs/src/release/release-0.4.1.rst +2 -2
- data/embulk-docs/src/release/release-0.4.2.rst +18 -0
- data/lib/embulk/command/embulk_new_plugin.rb +9 -9
- data/lib/embulk/data/new/java/plugin_loader.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/filter.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/gemspec.erb +2 -2
- data/lib/embulk/data/new/ruby/input.rb.erb +1 -1
- data/lib/embulk/guess/charset.rb +28 -0
- data/lib/embulk/guess/csv.rb +206 -0
- data/lib/embulk/guess/gzip.rb +18 -0
- data/lib/embulk/guess/newline.rb +22 -0
- data/lib/embulk/guess/time_format_guess.rb +333 -0
- data/lib/embulk/version.rb +1 -1
- metadata +9 -8
- data/lib/embulk/guess_charset.rb +0 -26
- data/lib/embulk/guess_csv.rb +0 -204
- data/lib/embulk/guess_gzip.rb +0 -16
- data/lib/embulk/guess_newline.rb +0 -20
- data/lib/embulk/time_format_guess.rb +0 -331
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d63d08592793b691be7f691d842f73d18a202d45
|
4
|
+
data.tar.gz: 5a14e328213b2a97fe6e0abaa044428bb5caa0fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e447c2ae251e6ef3309c0862344d10074797389ea2c55bce2e2302d82eea2425345316efdf7dae9adcd9f08de5b0132c04e6cad43edaef29ac3cf635ad640f8
|
7
|
+
data.tar.gz: 0b33b0a4f2dc7911d1432828e303c464c8b25bda00dc9eb3df8c9fb905c85011c30132292c581163fb557d2e2eaeb3a32a8bbd61bd486509361b4464aa2477ed
|
data/README.md
CHANGED
@@ -24,7 +24,7 @@ You can release plugins to share your efforts of data cleaning, error handling,
|
|
24
24
|
The single-file package is the simplest way to try Embulk. You can download the latest embulk-VERSION.jar from [the releases page](https://bintray.com/embulk/maven/embulk/view#files) and run it with java:
|
25
25
|
|
26
26
|
```
|
27
|
-
wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.1.jar -O embulk.jar
|
27
|
+
wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.2.jar -O embulk.jar
|
28
28
|
java -jar embulk.jar --help
|
29
29
|
```
|
30
30
|
|
data/build.gradle
CHANGED
@@ -9,7 +9,7 @@ plugins {
|
|
9
9
|
|
10
10
|
allprojects {
|
11
11
|
group = 'org.embulk'
|
12
|
-
version = '0.4.1'
|
12
|
+
version = '0.4.2'
|
13
13
|
|
14
14
|
apply plugin: 'maven' // install jar files to the local repo: $ gradle install
|
15
15
|
apply plugin: 'maven-publish'
|
@@ -205,9 +205,15 @@ task releaseCheck << {
|
|
205
205
|
if (!file("lib/embulk/version.rb").getText().contains("${project.version}")) {
|
206
206
|
throw new GradleException("lib/embulk/version.rb doesn't include ${project.version}")
|
207
207
|
}
|
208
|
+
if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains("${project.version}")) {
|
209
|
+
throw new GradleException("Release note for ${project.version} doesn't exist")
|
210
|
+
}
|
211
|
+
if (!file("embulk-docs/src/release.rst").getText().contains("release-${project.version}")) {
|
212
|
+
throw new GradleException("embulk-docs/src/release.rst doesn't include release-${project.version}")
|
213
|
+
}
|
208
214
|
String date = new Date().format("yyyy-MM-dd")
|
209
215
|
if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains(date)) {
|
210
|
-
throw new GradleException("
|
216
|
+
throw new GradleException("embulk-docs/src/release/release-${project.version}.rst doesn't include today's release date")
|
211
217
|
}
|
212
218
|
println "Ready. Run ./gradlew release"
|
213
219
|
}
|
@@ -169,6 +169,7 @@ public class Runner
|
|
169
169
|
|
170
170
|
String yml = writeNextConfig(options.getNextConfigOutputPath(), config, configDiff);
|
171
171
|
System.err.println(yml);
|
172
|
+
System.out.println("Created "+options.getNextConfigOutputPath());
|
172
173
|
}
|
173
174
|
|
174
175
|
private void checkFileWritable(String path)
|
@@ -191,14 +192,10 @@ public class Runner
|
|
191
192
|
{
|
192
193
|
String yml = dumpYaml(obj);
|
193
194
|
if (path != null) {
|
194
|
-
|
195
|
-
|
196
|
-
}
|
197
|
-
|
198
|
-
writer.write(yml);
|
199
|
-
} catch (IOException ex) {
|
200
|
-
throw new RuntimeException(ex);
|
201
|
-
}
|
195
|
+
try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
|
196
|
+
writer.write(yml);
|
197
|
+
} catch (IOException ex) {
|
198
|
+
throw new RuntimeException(ex);
|
202
199
|
}
|
203
200
|
}
|
204
201
|
return yml;
|
@@ -13,7 +13,7 @@ function r() {
|
|
13
13
|
}
|
14
14
|
|
15
15
|
[ "$TRAVIS_PULL_REQUEST" != "false" ] && exit 0
|
16
|
-
[ "$TRAVIS_BRANCH" != "master" ] && exit 0
|
16
|
+
[ "$TRAVIS_BRANCH" != "master" -a "$TRAVIS_BRANCH" != "$(git describe --tags --always HEAD)" ] && exit 0
|
17
17
|
|
18
18
|
revision="$(git rev-parse HEAD)"
|
19
19
|
remote="$(git config remote.origin.url | sed "s+^git:+https:+")"
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
Release 0.4.2
|
2
|
+
==================================
|
3
|
+
|
4
|
+
CLI
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Removed support for setting ``-`` to ``-o, --output PATH`` option
|
8
|
+
|
9
|
+
General Changes
|
10
|
+
------------------
|
11
|
+
|
12
|
+
* Fixed guess plugin paths (@takei-yuya++)
|
13
|
+
* Fixed syntax error in generated gemspec file
|
14
|
+
* Fixed syntax error in generated ruby filter and input plugins (@takei-yuya++, also reported by hiroyuki-sato)
|
15
|
+
|
16
|
+
Release Date
|
17
|
+
------------------
|
18
|
+
2015-02-16
|
@@ -34,23 +34,23 @@ module Embulk
|
|
34
34
|
description =
|
35
35
|
case category
|
36
36
|
when :input
|
37
|
-
%[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by
|
37
|
+
%[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by 'embulk-output' keyword.]
|
38
38
|
when :file_input
|
39
|
-
%[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by
|
39
|
+
%[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by 'embulk-parser' keyword.]
|
40
40
|
when :parser
|
41
|
-
%[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by
|
41
|
+
%[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
|
42
42
|
when :decoder
|
43
|
-
%[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by
|
43
|
+
%[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
|
44
44
|
when :output
|
45
|
-
%[that loads records to #{display_name} read by any input plugins. Search the input plugins by
|
45
|
+
%[that loads records to #{display_name} read by any input plugins. Search the input plugins by 'embulk-input' keyword.]
|
46
46
|
when :file_output
|
47
|
-
%[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by
|
47
|
+
%[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by 'embulk-formatter' keyword.]
|
48
48
|
when :formtter
|
49
|
-
%[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by
|
49
|
+
%[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
|
50
50
|
when :encoder
|
51
|
-
%[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by
|
51
|
+
%[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
|
52
52
|
when :filter
|
53
|
-
%[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by
|
53
|
+
%[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by 'embulk-input' and 'embulk-output' plugins.]
|
54
54
|
end
|
55
55
|
|
56
56
|
pkg = Embulk::PackageData.new("new", project_name, binding())
|
@@ -1,3 +1,3 @@
|
|
1
1
|
Embulk::JavaPlugin.register_<%= embulk_category %>(
|
2
|
-
<%= name.
|
2
|
+
<%= name.dump %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
|
3
3
|
File.expand_path('../../../../classpath', __FILE__))
|
@@ -3,8 +3,8 @@ Gem::Specification.new do |spec|
|
|
3
3
|
spec.name = "<%= project_name %>"
|
4
4
|
spec.version = "0.1.0"
|
5
5
|
spec.authors = [<%= author.dump %>]
|
6
|
-
spec.summary =
|
7
|
-
spec.description =
|
6
|
+
spec.summary = <%= "#{display_name} #{display_category} plugin for Embulk".dump %>
|
7
|
+
spec.description = <%= "#{display_name} #{display_category} plugin is an Embulk plugin #{description}".dump %>
|
8
8
|
spec.email = [<%= email.dump %>]
|
9
9
|
spec.licenses = ["MIT"]
|
10
10
|
# TODO: spec.homepage = "https://github.com/<%= email[/([^@]*)/] %>/<%= project_name %>"
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
class CharsetGuessPlugin < GuessPlugin
|
5
|
+
Plugin.register_guess('charset', self)
|
6
|
+
|
7
|
+
def guess(config, sample_buffer)
|
8
|
+
# ICU4J
|
9
|
+
detector = com.ibm.icu.text.CharsetDetector.new
|
10
|
+
detector.setText(sample_buffer.to_java_bytes)
|
11
|
+
best_match = detector.detect
|
12
|
+
if best_match.getConfidence < 50
|
13
|
+
name = "UTF-8"
|
14
|
+
else
|
15
|
+
name = best_match.getName
|
16
|
+
if name == "ISO-8859-1"
|
17
|
+
# ISO-8859-1 means ASCII which is a subset
|
18
|
+
# of UTF-8 in most of cases due to lack of
|
19
|
+
# sample data set
|
20
|
+
name = "UTF-8"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
return {"parser" => {"charset" => name}}
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
require_relative 'time_format_guess'
|
4
|
+
|
5
|
+
class CsvGuessPlugin < LineGuessPlugin
|
6
|
+
Plugin.register_guess('csv', self)
|
7
|
+
|
8
|
+
DELIMITER_CANDIDATES = [
|
9
|
+
",", "\t", "|"
|
10
|
+
]
|
11
|
+
|
12
|
+
QUOTE_CANDIDATES = [
|
13
|
+
"\"", "'"
|
14
|
+
]
|
15
|
+
|
16
|
+
# CsvParserPlugin.TRUE_STRINGS
|
17
|
+
TRUE_STRINGS = Hash[*%w[
|
18
|
+
true True TRUE
|
19
|
+
yes Yes YES
|
20
|
+
y Y
|
21
|
+
on On ON
|
22
|
+
1
|
23
|
+
].map {|k| [k, true] }]
|
24
|
+
|
25
|
+
def guess_lines(config, sample_lines)
|
26
|
+
delim = guess_delimiter(sample_lines)
|
27
|
+
unless delim
|
28
|
+
# not CSV file
|
29
|
+
return {}
|
30
|
+
end
|
31
|
+
|
32
|
+
parser_config = config["parser"] || {}
|
33
|
+
parser_guessed = {"type" => "csv", "delimiter" => delim}
|
34
|
+
|
35
|
+
quote = guess_quote(sample_lines, delim)
|
36
|
+
parser_guessed["quote"] = quote ? quote : ''
|
37
|
+
|
38
|
+
sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
|
39
|
+
first_types = guess_field_types(sample_records[0, 1])
|
40
|
+
other_types = guess_field_types(sample_records[1..-1])
|
41
|
+
|
42
|
+
if first_types.size <= 1 || other_types.size <= 1
|
43
|
+
# guess failed
|
44
|
+
return {}
|
45
|
+
end
|
46
|
+
|
47
|
+
unless parser_config.has_key?("header_line")
|
48
|
+
parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
|
49
|
+
end
|
50
|
+
|
51
|
+
unless parser_config.has_key?("columns")
|
52
|
+
if parser_guessed["header_line"] || parser_config["header_line"]
|
53
|
+
column_names = sample_records.first
|
54
|
+
else
|
55
|
+
column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
|
56
|
+
end
|
57
|
+
schema = []
|
58
|
+
column_names.zip(other_types).each do |name,(type,format)|
|
59
|
+
if name && type
|
60
|
+
if format
|
61
|
+
schema << {"name" => name, "type" => type, "format" => format}
|
62
|
+
else
|
63
|
+
schema << {"name" => name, "type" => type}
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
parser_guessed["columns"] = schema
|
68
|
+
end
|
69
|
+
|
70
|
+
return {"parser" => parser_guessed}
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def guess_delimiter(sample_lines)
|
76
|
+
delim_weights = DELIMITER_CANDIDATES.map do |d|
|
77
|
+
counts = sample_lines.map {|line| line.count(d) }
|
78
|
+
total = array_sum(counts)
|
79
|
+
if total > 0
|
80
|
+
stddev = array_standard_deviation(counts)
|
81
|
+
stddev = 0.000000001 if stddev == 0.0
|
82
|
+
weight = total / stddev
|
83
|
+
[d, weight]
|
84
|
+
else
|
85
|
+
[nil, 0]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
|
90
|
+
if delim != nil && weight > 1
|
91
|
+
return delim
|
92
|
+
else
|
93
|
+
return nil
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def guess_quote(sample_lines, delim)
|
98
|
+
delim_regexp = Regexp.escape(delim)
|
99
|
+
quote_weights = QUOTE_CANDIDATES.map do |q|
|
100
|
+
weights = sample_lines.map do |line|
|
101
|
+
q_regexp = Regexp.escape(q)
|
102
|
+
count = line.count(q)
|
103
|
+
if count > 0
|
104
|
+
weight = count
|
105
|
+
weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
|
106
|
+
weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
|
107
|
+
weight
|
108
|
+
else
|
109
|
+
nil
|
110
|
+
end
|
111
|
+
end.compact
|
112
|
+
weights.empty? ? 0 : array_avg(weights)
|
113
|
+
end
|
114
|
+
quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
|
115
|
+
if weight >= 10.0
|
116
|
+
return quote
|
117
|
+
else
|
118
|
+
return nil
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
def guess_field_types(field_lines)
|
123
|
+
column_lines = []
|
124
|
+
field_lines.each do |fields|
|
125
|
+
fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
|
126
|
+
end
|
127
|
+
columns = column_lines.map do |types|
|
128
|
+
t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
|
129
|
+
if t.is_a?(TimestampMatch)
|
130
|
+
format = TimeFormatGuess.guess(types.map {|type| type.text })
|
131
|
+
["timestamp", format]
|
132
|
+
else
|
133
|
+
[t]
|
134
|
+
end
|
135
|
+
end
|
136
|
+
return columns
|
137
|
+
end
|
138
|
+
|
139
|
+
TYPE_COALESCE = Hash[{
|
140
|
+
long: :double,
|
141
|
+
boolean: :long,
|
142
|
+
}.map {|k,v|
|
143
|
+
[[k.to_s, v.to_s].sort, v.to_s]
|
144
|
+
}]
|
145
|
+
|
146
|
+
def merge_type(type1, type2)
|
147
|
+
if type1 == type2
|
148
|
+
type1
|
149
|
+
elsif type1.nil? || type2.nil?
|
150
|
+
type1 || type2
|
151
|
+
else
|
152
|
+
TYPE_COALESCE[[type1, type2].sort] || "string"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
class TimestampMatch < String
|
157
|
+
def initialize(text)
|
158
|
+
super("timestamp")
|
159
|
+
@text = text
|
160
|
+
end
|
161
|
+
attr_reader :text
|
162
|
+
end
|
163
|
+
|
164
|
+
def guess_type(str)
|
165
|
+
if TRUE_STRINGS[str]
|
166
|
+
return "boolean"
|
167
|
+
end
|
168
|
+
|
169
|
+
if TimeFormatGuess.guess(str)
|
170
|
+
return TimestampMatch.new(str)
|
171
|
+
end
|
172
|
+
|
173
|
+
if str.to_i.to_s == str
|
174
|
+
return "long"
|
175
|
+
end
|
176
|
+
|
177
|
+
if str.include?('.')
|
178
|
+
a, b = str.split(".", 2)
|
179
|
+
if a.to_i.to_s == a && b.to_i.to_s == b
|
180
|
+
return "double"
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
return "string"
|
185
|
+
end
|
186
|
+
|
187
|
+
def array_sum(array)
|
188
|
+
array.inject(0) {|r,i| r += i }
|
189
|
+
end
|
190
|
+
|
191
|
+
def array_avg(array)
|
192
|
+
array.inject(0.0) {|r,i| r += i } / array.size
|
193
|
+
end
|
194
|
+
|
195
|
+
def array_variance(array)
|
196
|
+
avg = array_avg(array)
|
197
|
+
array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
|
198
|
+
end
|
199
|
+
|
200
|
+
def array_standard_deviation(array)
|
201
|
+
Math.sqrt(array_variance(array))
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
class GzipGuessPlugin < GuessPlugin
|
5
|
+
Plugin.register_guess('gzip', self)
|
6
|
+
|
7
|
+
GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
|
8
|
+
|
9
|
+
def guess(config, sample_buffer)
|
10
|
+
if sample_buffer[0,2] == GZIP_HEADER
|
11
|
+
return {"decoders" => [{"type" => "gzip"}]}
|
12
|
+
end
|
13
|
+
return {}
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|