embulk 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +8 -2
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +5 -8
- data/embulk-docs/push-gh-pages.sh +1 -1
- data/embulk-docs/src/release.rst +2 -0
- data/embulk-docs/src/release/release-0.4.1.rst +2 -2
- data/embulk-docs/src/release/release-0.4.2.rst +18 -0
- data/lib/embulk/command/embulk_new_plugin.rb +9 -9
- data/lib/embulk/data/new/java/plugin_loader.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/filter.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/gemspec.erb +2 -2
- data/lib/embulk/data/new/ruby/input.rb.erb +1 -1
- data/lib/embulk/guess/charset.rb +28 -0
- data/lib/embulk/guess/csv.rb +206 -0
- data/lib/embulk/guess/gzip.rb +18 -0
- data/lib/embulk/guess/newline.rb +22 -0
- data/lib/embulk/guess/time_format_guess.rb +333 -0
- data/lib/embulk/version.rb +1 -1
- metadata +9 -8
- data/lib/embulk/guess_charset.rb +0 -26
- data/lib/embulk/guess_csv.rb +0 -204
- data/lib/embulk/guess_gzip.rb +0 -16
- data/lib/embulk/guess_newline.rb +0 -20
- data/lib/embulk/time_format_guess.rb +0 -331
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d63d08592793b691be7f691d842f73d18a202d45
|
4
|
+
data.tar.gz: 5a14e328213b2a97fe6e0abaa044428bb5caa0fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e447c2ae251e6ef3309c0862344d10074797389ea2c55bce2e2302d82eea2425345316efdf7dae9adcd9f08de5b0132c04e6cad43edaef29ac3cf635ad640f8
|
7
|
+
data.tar.gz: 0b33b0a4f2dc7911d1432828e303c464c8b25bda00dc9eb3df8c9fb905c85011c30132292c581163fb557d2e2eaeb3a32a8bbd61bd486509361b4464aa2477ed
|
data/README.md
CHANGED
@@ -24,7 +24,7 @@ You can release plugins to share your efforts of data cleaning, error handling,
|
|
24
24
|
The single-file package is the simplest way to try Embulk. You can download the latest embulk-VERSION.jar from [the releases page](https://bintray.com/embulk/maven/embulk/view#files) and run it with java:
|
25
25
|
|
26
26
|
```
|
27
|
-
wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.1.jar -O embulk.jar
|
27
|
+
wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.2.jar -O embulk.jar
|
28
28
|
java -jar embulk.jar --help
|
29
29
|
```
|
30
30
|
|
data/build.gradle
CHANGED
@@ -9,7 +9,7 @@ plugins {
|
|
9
9
|
|
10
10
|
allprojects {
|
11
11
|
group = 'org.embulk'
|
12
|
-
version = '0.4.1'
|
12
|
+
version = '0.4.2'
|
13
13
|
|
14
14
|
apply plugin: 'maven' // install jar files to the local repo: $ gradle install
|
15
15
|
apply plugin: 'maven-publish'
|
@@ -205,9 +205,15 @@ task releaseCheck << {
|
|
205
205
|
if (!file("lib/embulk/version.rb").getText().contains("${project.version}")) {
|
206
206
|
throw new GradleException("lib/embulk/version.rb doesn't include ${project.version}")
|
207
207
|
}
|
208
|
+
if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains("${project.version}")) {
|
209
|
+
throw new GradleException("Release note for ${project.version} doesn't exist")
|
210
|
+
}
|
211
|
+
if (!file("embulk-docs/src/release.rst").getText().contains("release-${project.version}")) {
|
212
|
+
throw new GradleException("embulk-docs/src/release.rst doesn't include release-${project.version}")
|
213
|
+
}
|
208
214
|
String date = new Date().format("yyyy-MM-dd")
|
209
215
|
if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains(date)) {
|
210
|
-
throw new GradleException("
|
216
|
+
throw new GradleException("embulk-docs/src/release/release-${project.version}.rst doesn't include today's release date")
|
211
217
|
}
|
212
218
|
println "Ready. Run ./gradlew release"
|
213
219
|
}
|
@@ -169,6 +169,7 @@ public class Runner
|
|
169
169
|
|
170
170
|
String yml = writeNextConfig(options.getNextConfigOutputPath(), config, configDiff);
|
171
171
|
System.err.println(yml);
|
172
|
+
System.out.println("Created "+options.getNextConfigOutputPath());
|
172
173
|
}
|
173
174
|
|
174
175
|
private void checkFileWritable(String path)
|
@@ -191,14 +192,10 @@ public class Runner
|
|
191
192
|
{
|
192
193
|
String yml = dumpYaml(obj);
|
193
194
|
if (path != null) {
|
194
|
-
|
195
|
-
|
196
|
-
}
|
197
|
-
|
198
|
-
writer.write(yml);
|
199
|
-
} catch (IOException ex) {
|
200
|
-
throw new RuntimeException(ex);
|
201
|
-
}
|
195
|
+
try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
|
196
|
+
writer.write(yml);
|
197
|
+
} catch (IOException ex) {
|
198
|
+
throw new RuntimeException(ex);
|
202
199
|
}
|
203
200
|
}
|
204
201
|
return yml;
|
@@ -13,7 +13,7 @@ function r() {
|
|
13
13
|
}
|
14
14
|
|
15
15
|
[ "$TRAVIS_PULL_REQUEST" != "false" ] && exit 0
|
16
|
-
[ "$TRAVIS_BRANCH" != "master" ] && exit 0
|
16
|
+
[ "$TRAVIS_BRANCH" != "master" -a "$TRAVIS_BRANCH" != "$(git describe --tags --always HEAD)" ] && exit 0
|
17
17
|
|
18
18
|
revision="$(git rev-parse HEAD)"
|
19
19
|
remote="$(git config remote.origin.url | sed "s+^git:+https:+")"
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
Release 0.4.2
|
2
|
+
==================================
|
3
|
+
|
4
|
+
CLI
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Removed support for setting ``-`` to ``-o, --output PATH`` option
|
8
|
+
|
9
|
+
General Changes
|
10
|
+
------------------
|
11
|
+
|
12
|
+
* Fixed guess plugin paths (@takei-yuya++)
|
13
|
+
* Fixed syntax error in generated gemspec file
|
14
|
+
* Fixed syntax error in generated ruby filter and input plugins (@takei-yuya++, also reported by hiroyuki-sato)
|
15
|
+
|
16
|
+
Release Date
|
17
|
+
------------------
|
18
|
+
2015-02-16
|
@@ -34,23 +34,23 @@ module Embulk
|
|
34
34
|
description =
|
35
35
|
case category
|
36
36
|
when :input
|
37
|
-
%[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by
|
37
|
+
%[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by 'embulk-output' keyword.]
|
38
38
|
when :file_input
|
39
|
-
%[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by
|
39
|
+
%[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by 'embulk-parser' keyword.]
|
40
40
|
when :parser
|
41
|
-
%[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by
|
41
|
+
%[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
|
42
42
|
when :decoder
|
43
|
-
%[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by
|
43
|
+
%[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
|
44
44
|
when :output
|
45
|
-
%[that loads records to #{display_name} read by any input plugins. Search the input plugins by
|
45
|
+
%[that loads records to #{display_name} read by any input plugins. Search the input plugins by 'embulk-input' keyword.]
|
46
46
|
when :file_output
|
47
|
-
%[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by
|
47
|
+
%[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by 'embulk-formatter' keyword.]
|
48
48
|
when :formtter
|
49
|
-
%[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by
|
49
|
+
%[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
|
50
50
|
when :encoder
|
51
|
-
%[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by
|
51
|
+
%[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
|
52
52
|
when :filter
|
53
|
-
%[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by
|
53
|
+
%[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by 'embulk-input' and 'embulk-output' plugins.]
|
54
54
|
end
|
55
55
|
|
56
56
|
pkg = Embulk::PackageData.new("new", project_name, binding())
|
@@ -1,3 +1,3 @@
|
|
1
1
|
Embulk::JavaPlugin.register_<%= embulk_category %>(
|
2
|
-
<%= name.
|
2
|
+
<%= name.dump %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
|
3
3
|
File.expand_path('../../../../classpath', __FILE__))
|
@@ -3,8 +3,8 @@ Gem::Specification.new do |spec|
|
|
3
3
|
spec.name = "<%= project_name %>"
|
4
4
|
spec.version = "0.1.0"
|
5
5
|
spec.authors = [<%= author.dump %>]
|
6
|
-
spec.summary =
|
7
|
-
spec.description =
|
6
|
+
spec.summary = <%= "#{display_name} #{display_category} plugin for Embulk".dump %>
|
7
|
+
spec.description = <%= "#{display_name} #{display_category} plugin is an Embulk plugin #{description}".dump %>
|
8
8
|
spec.email = [<%= email.dump %>]
|
9
9
|
spec.licenses = ["MIT"]
|
10
10
|
# TODO: spec.homepage = "https://github.com/<%= email[/([^@]*)/] %>/<%= project_name %>"
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
# Guess plugin registered under the name 'charset'. It runs ICU4J's
# CharsetDetector over the sample bytes and proposes a parser charset.
class CharsetGuessPlugin < GuessPlugin
  Plugin.register_guess('charset', self)

  # Charset proposed whenever detection is missing, weak, or reports
  # ISO-8859-1.
  FALLBACK_CHARSET = "UTF-8"

  # Detector confidence below this value is treated as "no reliable guess".
  MIN_CONFIDENCE = 50

  # Guesses the character set of the sample.
  #
  # config        - guess configuration (not read by this plugin).
  # sample_buffer - String holding the sample bytes; converted with
  #                 to_java_bytes before being handed to ICU4J.
  #
  # Returns {"parser" => {"charset" => name}}.
  def guess(config, sample_buffer)
    # ICU4J
    detector = com.ibm.icu.text.CharsetDetector.new
    detector.setText(sample_buffer.to_java_bytes)
    best_match = detector.detect

    # CharsetDetector#detect returns null when nothing matches at all, so
    # guard against nil before asking for the confidence.
    if best_match.nil? || best_match.getConfidence < MIN_CONFIDENCE
      name = FALLBACK_CHARSET
    else
      name = best_match.getName
      if name == "ISO-8859-1"
        # ISO-8859-1 means ASCII which is a subset
        # of UTF-8 in most of cases due to lack of
        # sample data set
        name = FALLBACK_CHARSET
      end
    end

    {"parser" => {"charset" => name}}
  end
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
require_relative 'time_format_guess'
|
4
|
+
|
5
|
+
# Guess plugin registered under the name 'csv'. Given sample lines it
# proposes a CSV parser configuration: delimiter, quote character, whether
# the first line is a header, and a column schema with guessed types.
class CsvGuessPlugin < LineGuessPlugin
  Plugin.register_guess('csv', self)

  DELIMITER_CANDIDATES = [
    ",", "\t", "|"
  ]

  QUOTE_CANDIDATES = [
    "\"", "'"
  ]

  # CsvParserPlugin.TRUE_STRINGS
  #
  # Hash[] must receive the array of [key, value] pairs itself. Splatting
  # the pairs (Hash[*pairs]) makes Hash[] treat consecutive pairs as
  # key => value, producing array-keyed entries so that no string ever
  # matches.
  TRUE_STRINGS = Hash[%w[
    true True TRUE
    yes Yes YES
    y Y
    on On ON
    1
  ].map {|k| [k, true] }]

  # Maps a sorted pair of differing type names to the type that can hold
  # both: long + double => double, boolean + long => long.
  TYPE_COALESCE = Hash[{
    long: :double,
    boolean: :long,
  }.map {|k,v|
    [[k.to_s, v.to_s].sort, v.to_s]
  }]

  # Decimal literal: optional minus sign, integer part without leading
  # zeros, a dot, and at least one fractional digit (leading zeros in the
  # fraction, as in "1.05", are allowed).
  DOUBLE_PATTERN = /\A-?(?:0|[1-9][0-9]*)\.[0-9]+\z/

  # A String equal to "timestamp" that also remembers the field text it
  # was guessed from, so the time format can be guessed per column later.
  class TimestampMatch < String
    def initialize(text)
      super("timestamp")
      @text = text
    end
    attr_reader :text
  end

  # Builds the guessed parser configuration.
  #
  # config       - existing configuration; only config["parser"] is read.
  #                "header_line" and "columns" already present there are
  #                not overwritten by a guess.
  # sample_lines - Array of String lines.
  #
  # Returns {"parser" => {...}}, or {} when the sample does not look like
  # CSV or the types could not be guessed.
  def guess_lines(config, sample_lines)
    delim = guess_delimiter(sample_lines)
    # not CSV file
    return {} unless delim

    parser_config = config["parser"] || {}
    parser_guessed = {"type" => "csv", "delimiter" => delim}

    parser_guessed["quote"] = guess_quote(sample_lines, delim) || ''

    sample_records = sample_lines.map {|line| line.split(delim) }  # TODO use CsvTokenizer
    first_types = guess_field_types(sample_records[0, 1])
    other_types = guess_field_types(sample_records[1..-1])

    # guess failed
    return {} if first_types.size <= 1 || other_types.size <= 1

    unless parser_config.has_key?("header_line")
      # The first line is a header when it is all strings while the
      # remaining lines have a different type signature.
      parser_guessed["header_line"] =
        first_types != other_types && first_types.all? {|t| t == ["string"] }
    end

    unless parser_config.has_key?("columns")
      if parser_guessed["header_line"] || parser_config["header_line"]
        column_names = sample_records.first
      else
        # Exclusive range: exactly one generated name per guessed column.
        column_names = (0...other_types.size).map {|i| "c#{i}" }
      end
      parser_guessed["columns"] = build_schema(column_names, other_types)
    end

    {"parser" => parser_guessed}
  end

  private

  # Pairs column names with guessed [type, format] entries. Pairs missing
  # either a name or a type are dropped; "format" is included only when
  # one was guessed.
  def build_schema(column_names, column_types)
    schema = []
    column_names.zip(column_types).each do |name, (type, format)|
      next unless name && type
      column = {"name" => name, "type" => type}
      column["format"] = format if format
      schema << column
    end
    schema
  end

  # Scores each delimiter candidate by (total occurrences) / (standard
  # deviation of per-line occurrences): frequent and evenly distributed
  # characters win. Returns the best candidate, or nil when no candidate
  # occurs or the best weight is not greater than 1.
  def guess_delimiter(sample_lines)
    delim_weights = DELIMITER_CANDIDATES.map do |d|
      counts = sample_lines.map {|line| line.count(d) }
      total = array_sum(counts)
      if total > 0
        stddev = array_standard_deviation(counts)
        # Avoid division by zero when every line has the same count.
        stddev = 0.000000001 if stddev == 0.0
        [d, total / stddev]
      else
        [nil, 0]
      end
    end

    delim, weight = delim_weights.sort_by {|_, w| w }.last
    delim if delim && weight > 1
  end

  # Scores each quote candidate per line that contains it: one point per
  # occurrence, plus 20 per field wrapped in the quote with no quote
  # inside, plus 40 per field wrapped in the quote with no delimiter
  # inside. The per-candidate score is the average over those lines.
  # Returns the best candidate when its score is at least 10.0, else nil.
  def guess_quote(sample_lines, delim)
    delim_regexp = Regexp.escape(delim)
    quote_weights = QUOTE_CANDIDATES.map do |q|
      q_regexp = Regexp.escape(q)
      weights = sample_lines.map do |line|
        count = line.count(q)
        if count > 0
          weight = count
          weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
          weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
          weight
        else
          nil
        end
      end.compact
      weights.empty? ? 0 : array_avg(weights)
    end

    quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
    quote if weight >= 10.0
  end

  # Guesses one type per column from rows of already-split fields.
  #
  # field_lines - Array of rows (each an Array of String fields); nil is
  #               treated as no rows.
  #
  # Returns an Array with one entry per column: ["timestamp", format] for
  # timestamp columns (format comes from TimeFormatGuess.guess and may be
  # nil), otherwise [type_name].
  def guess_field_types(field_lines)
    column_lines = []
    (field_lines || []).each do |fields|
      fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
    end

    column_lines.map do |types|
      merged = types.inject(nil) {|acc,type| merge_type(acc, type) } || "string"
      if merged.is_a?(TimestampMatch)
        # A TimestampMatch survives merging only if every value in the
        # column was one, so each element responds to #text here.
        format = TimeFormatGuess.guess(types.map {|type| type.text })
        ["timestamp", format]
      else
        [merged]
      end
    end
  end

  # Merges two guessed types: equal types stay, nil yields the other one,
  # known pairs are widened through TYPE_COALESCE, anything else becomes
  # "string".
  def merge_type(type1, type2)
    if type1 == type2
      type1
    elsif type1.nil? || type2.nil?
      type1 || type2
    else
      TYPE_COALESCE[[type1, type2].sort] || "string"
    end
  end

  # Guesses the type of a single field value. Checks run in this order:
  # boolean literal, timestamp (whatever TimeFormatGuess.guess accepts),
  # canonical integer, decimal literal; everything else is "string".
  def guess_type(str)
    return "boolean" if TRUE_STRINGS[str]

    return TimestampMatch.new(str) if TimeFormatGuess.guess(str)

    # Only canonical integers (no sign prefix "+", no leading zeros).
    return "long" if str.to_i.to_s == str

    # The fraction must be plain digits, so "1.05" is a double while
    # "1.-5" is not.
    return "double" if str =~ DOUBLE_PATTERN

    "string"
  end

  # Sum of the elements (0 for an empty array).
  def array_sum(array)
    array.inject(0) {|sum,i| sum + i }
  end

  # Arithmetic mean as a Float (NaN for an empty array).
  def array_avg(array)
    array.inject(0.0) {|sum,i| sum + i } / array.size
  end

  # Population variance (divides by the number of elements).
  def array_variance(array)
    avg = array_avg(array)
    array.inject(0.0) {|sum,i| sum + (i - avg) ** 2 } / array.size
  end

  # Population standard deviation.
  def array_standard_deviation(array)
    Math.sqrt(array_variance(array))
  end
end
|
204
|
+
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
# Guess plugin registered under the name 'gzip'. It proposes a gzip decoder
# when the sample starts with the two-byte gzip magic number.
class GzipGuessPlugin < GuessPlugin
  Plugin.register_guess('gzip', self)

  GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

  # config        - guess configuration (not read by this plugin).
  # sample_buffer - String holding the sample bytes.
  #
  # Returns {"decoders" => [{"type" => "gzip"}]} when the sample begins
  # with GZIP_HEADER, otherwise {}.
  def guess(config, sample_buffer)
    # Compare raw bytes: String#[] slices by character and String#== is
    # false for non-ASCII strings with incompatible encodings, so a buffer
    # that is not tagged as binary would never match the header.
    # byteslice returns a new String, so retagging it does not touch the
    # caller's buffer.
    magic = sample_buffer.byteslice(0, 2)
    if magic && magic.force_encoding('ASCII-8BIT') == GZIP_HEADER
      return {"decoders" => [{"type" => "gzip"}]}
    end
    {}
  end
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|