embulk 0.4.1 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2aebc5b785b1d7e303dd69fe4b7922abdce2954e
4
- data.tar.gz: 99189534152199abb9b280b7f7ce5511691098e7
3
+ metadata.gz: d63d08592793b691be7f691d842f73d18a202d45
4
+ data.tar.gz: 5a14e328213b2a97fe6e0abaa044428bb5caa0fd
5
5
  SHA512:
6
- metadata.gz: fafcf2fcaadaa263619c3f56e90f7a3abf5fa6b257c4ab8338ab0e84130e47ecec5fb3bc09c45b4ccbb7ec207123b1f4517324605000de2525a7632f32ae187f
7
- data.tar.gz: 7b72ad6d02f613583624265608ebda2201159b28aaefeed8d5fd9e1a553c2ff361078b4e68d5a2a7f242dc4329536c58925ee4b20739274f9dafb2ba81761d93
6
+ metadata.gz: 8e447c2ae251e6ef3309c0862344d10074797389ea2c55bce2e2302d82eea2425345316efdf7dae9adcd9f08de5b0132c04e6cad43edaef29ac3cf635ad640f8
7
+ data.tar.gz: 0b33b0a4f2dc7911d1432828e303c464c8b25bda00dc9eb3df8c9fb905c85011c30132292c581163fb557d2e2eaeb3a32a8bbd61bd486509361b4464aa2477ed
data/README.md CHANGED
@@ -24,7 +24,7 @@ You can release plugins to share your efforts of data cleaning, error handling,
24
24
  The single-file package is the simplest way to try Embulk. You can download the latest embulk-VERSION.jar from [the releases page](https://bintray.com/embulk/maven/embulk/view#files) and run it with java:
25
25
 
26
26
  ```
27
- wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.1.jar -O embulk.jar
27
+ wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.2.jar -O embulk.jar
28
28
  java -jar embulk.jar --help
29
29
  ```
30
30
 
@@ -9,7 +9,7 @@ plugins {
9
9
 
10
10
  allprojects {
11
11
  group = 'org.embulk'
12
- version = '0.4.1'
12
+ version = '0.4.2'
13
13
 
14
14
  apply plugin: 'maven' // install jar files to the local repo: $ gradle install
15
15
  apply plugin: 'maven-publish'
@@ -205,9 +205,15 @@ task releaseCheck << {
205
205
  if (!file("lib/embulk/version.rb").getText().contains("${project.version}")) {
206
206
  throw new GradleException("lib/embulk/version.rb doesn't include ${project.version}")
207
207
  }
208
+ if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains("${project.version}")) {
209
+ throw new GradleException("Release note for ${project.version} doesn't exist")
210
+ }
211
+ if (!file("embulk-docs/src/release.rst").getText().contains("release-${project.version}")) {
212
+ throw new GradleException("embulk-docs/src/release.rst doesn't include release-${project.version}")
213
+ }
208
214
  String date = new Date().format("yyyy-MM-dd")
209
215
  if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains(date)) {
210
- throw new GradleException("ChangeLog doesn't include entry for ${project.version}")
216
+ throw new GradleException("embulk-docs/src/release/release-${project.version}.rst doesn't include today's release date")
211
217
  }
212
218
  println "Ready. Run ./gradlew release"
213
219
  }
@@ -169,6 +169,7 @@ public class Runner
169
169
 
170
170
  String yml = writeNextConfig(options.getNextConfigOutputPath(), config, configDiff);
171
171
  System.err.println(yml);
172
+ System.out.println("Created "+options.getNextConfigOutputPath());
172
173
  }
173
174
 
174
175
  private void checkFileWritable(String path)
@@ -191,14 +192,10 @@ public class Runner
191
192
  {
192
193
  String yml = dumpYaml(obj);
193
194
  if (path != null) {
194
- if (path.equals("-")) {
195
- System.out.print(yml);
196
- } else {
197
- try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
198
- writer.write(yml);
199
- } catch (IOException ex) {
200
- throw new RuntimeException(ex);
201
- }
195
+ try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
196
+ writer.write(yml);
197
+ } catch (IOException ex) {
198
+ throw new RuntimeException(ex);
202
199
  }
203
200
  }
204
201
  return yml;
@@ -13,7 +13,7 @@ function r() {
13
13
  }
14
14
 
15
15
  [ "$TRAVIS_PULL_REQUEST" != "false" ] && exit 0
16
- [ "$TRAVIS_BRANCH" != "master" ] && exit 0
16
+ [ "$TRAVIS_BRANCH" != "master" -a "$TRAVIS_BRANCH" != "$(git describe --tags --always HEAD)" ] && exit 0
17
17
 
18
18
  revision="$(git rev-parse HEAD)"
19
19
  remote="$(git config remote.origin.url | sed "s+^git:+https:+")"
@@ -11,4 +11,6 @@ Release Notes
11
11
  release/release-0.3.1
12
12
  release/release-0.3.2
13
13
  release/release-0.4.0
14
+ release/release-0.4.1
15
+ release/release-0.4.2
14
16
 
@@ -1,4 +1,4 @@
1
- Release 0.4.0
1
+ Release 0.4.1
2
2
  ==================================
3
3
 
4
4
  CLI
@@ -15,4 +15,4 @@ General Changes
15
15
 
16
16
  Release Date
17
17
  ------------------
18
- 2015-02-15
18
+ 2015-02-16
@@ -0,0 +1,18 @@
1
+ Release 0.4.2
2
+ ==================================
3
+
4
+ CLI
5
+ ------------------
6
+
7
+ * Removed support for setting ``-`` to ``-o, --output PATH`` option
8
+
9
+ General Changes
10
+ ------------------
11
+
12
+ * Fixed guess plugin paths (@takei-yuya++)
13
+ * Fixed syntax error in generated gemspec file
14
+ * Fixed syntax error in generated ruby filter and input plugins (@takei-yuya++, also reported by hiroyuki-sato)
15
+
16
+ Release Date
17
+ ------------------
18
+ 2015-02-16
@@ -34,23 +34,23 @@ module Embulk
34
34
  description =
35
35
  case category
36
36
  when :input
37
- %[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by "embulk-output" keyword.]
37
+ %[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by 'embulk-output' keyword.]
38
38
  when :file_input
39
- %[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by "embulk-parser" keyword.]
39
+ %[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by 'embulk-parser' keyword.]
40
40
  when :parser
41
- %[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by "embulk-input file" keywords.]
41
+ %[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
42
42
  when :decoder
43
- %[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by "embulk-input file" keywords.]
43
+ %[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
44
44
  when :output
45
- %[that loads records to #{display_name} read by any input plugins. Search the input plugins by "embulk-input" keyword.]
45
+ %[that loads records to #{display_name} read by any input plugins. Search the input plugins by 'embulk-input' keyword.]
46
46
  when :file_output
47
- %[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by "embulk-formatter" keyword.]
47
+ %[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by 'embulk-formatter' keyword.]
48
48
  when :formtter
49
- %[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by "embulk-output file" keywords.]
49
+ %[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
50
50
  when :encoder
51
- %[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by "embulk-output file" keywords.]
51
+ %[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
52
52
  when :filter
53
- %[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by "embulk-input" and "embulk-output" plugins.]
53
+ %[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by 'embulk-input' and 'embulk-output' plugins.]
54
54
  end
55
55
 
56
56
  pkg = Embulk::PackageData.new("new", project_name, binding())
@@ -1,3 +1,3 @@
1
1
  Embulk::JavaPlugin.register_<%= embulk_category %>(
2
- <%= name.to_sym.inspect %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
2
+ <%= name.dump %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
3
3
  File.expand_path('../../../../classpath', __FILE__))
@@ -2,7 +2,7 @@ module Embulk
2
2
  module Filter
3
3
 
4
4
  class <%= ruby_class_name %> < FilterPlugin
5
- Plugin.register_filter(<%= name %>, self)
5
+ Plugin.register_filter(<%= name.dump %>, self)
6
6
 
7
7
  def self.transaction(config, in_schema, &control)
8
8
  # configuration code:
@@ -3,8 +3,8 @@ Gem::Specification.new do |spec|
3
3
  spec.name = "<%= project_name %>"
4
4
  spec.version = "0.1.0"
5
5
  spec.authors = [<%= author.dump %>]
6
- spec.summary = "<%= display_name %> <%= display_category %> plugin for Embulk"
7
- spec.description = "<%= display_name %> <%= display_category %> plugin is an Embulk plugin <%= description %>"
6
+ spec.summary = <%= "#{display_name} #{display_category} plugin for Embulk".dump %>
7
+ spec.description = <%= "#{display_name} #{display_category} plugin is an Embulk plugin #{description}".dump %>
8
8
  spec.email = [<%= email.dump %>]
9
9
  spec.licenses = ["MIT"]
10
10
  # TODO: spec.homepage = "https://github.com/<%= email[/([^@]*)/] %>/<%= project_name %>"
@@ -2,7 +2,7 @@ module Embulk
2
2
  module Input
3
3
 
4
4
  class <%= ruby_class_name %> < InputPlugin
5
- Plugin.register_input(<%= name %>, self)
5
+ Plugin.register_input(<%= name.dump %>, self)
6
6
 
7
7
  def self.transaction(config, &control)
8
8
  # configuration code:
@@ -0,0 +1,28 @@
1
module Embulk
  module Guess

    # Guess plugin that detects the character set of the sample data with
    # ICU4J's CharsetDetector and reports it as the parser's "charset" option.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Charset reported whenever detection is missing or not trustworthy.
      DEFAULT_CHARSET = "UTF-8"

      # Matches whose ICU4J confidence is below this value are ignored.
      MINIMUM_CONFIDENCE = 50

      # Guesses the charset of +sample_buffer+.
      #
      # config        - current configuration (not consulted by this plugin)
      # sample_buffer - String holding the sampled bytes
      #
      # Returns a Hash of the form {"parser" => {"charset" => name}}.
      def guess(config, sample_buffer)
        # ICU4J
        detector = com.ibm.icu.text.CharsetDetector.new
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        # CharsetDetector#detect returns null (nil on JRuby) when no charset
        # matches at all, so the nil case must be handled before asking for
        # the confidence.
        name =
          if best_match.nil? || best_match.getConfidence < MINIMUM_CONFIDENCE
            DEFAULT_CHARSET
          else
            best_match.getName
          end

        # ISO-8859-1 means ASCII which is a subset of UTF-8 in most of cases
        # due to lack of sample data set
        name = DEFAULT_CHARSET if name == "ISO-8859-1"

        return {"parser" => {"charset" => name}}
      end
    end

  end
end
@@ -0,0 +1,206 @@
1
module Embulk
  module Guess
    require_relative 'time_format_guess'

    # Guess plugin that inspects sample lines and, when they look like CSV,
    # proposes parser settings: delimiter, quote character, header_line and
    # the column schema.
    class CsvGuessPlugin < LineGuessPlugin
      Plugin.register_guess('csv', self)

      DELIMITER_CANDIDATES = [
        ",", "\t", "|"
      ]

      QUOTE_CANDIDATES = [
        "\"", "'"
      ]

      # CsvParserPlugin.TRUE_STRINGS
      #
      # Hash.[] must receive the list of [key, value] pairs as ONE argument.
      # Splatting the pairs would make Hash.[] treat them as alternating keys
      # and values, producing array keys, so lookups by string would always
      # miss and booleans would never be guessed.
      TRUE_STRINGS = Hash[%w[
        true True TRUE
        yes Yes YES
        y Y
        on On ON
        1
      ].map {|k| [k, true] }]

      # Guesses CSV parser options from the sample lines.
      #
      # config       - current configuration; config["parser"] may already
      #                contain "header_line" and/or "columns", in which case
      #                those keys are not guessed again
      # sample_lines - Array of String lines taken from the input
      #
      # Returns {"parser" => {...}} on success, or {} when the sample does not
      # look like CSV or the column types could not be guessed.
      def guess_lines(config, sample_lines)
        delim = guess_delimiter(sample_lines)
        unless delim
          # not CSV file
          return {}
        end

        parser_config = config["parser"] || {}
        parser_guessed = {"type" => "csv", "delimiter" => delim}

        quote = guess_quote(sample_lines, delim)
        parser_guessed["quote"] = quote ? quote : ''

        sample_records = sample_lines.map {|line| line.split(delim) }  # TODO use CsvTokenizer
        # The first record is typed separately so it can be compared with the
        # remaining records to decide whether it is a header line.
        first_types = guess_field_types(sample_records[0, 1])
        other_types = guess_field_types(sample_records[1..-1])

        if first_types.size <= 1 || other_types.size <= 1
          # guess failed
          return {}
        end

        unless parser_config.has_key?("header_line")
          # A header is assumed when the first line is typed differently from
          # the rest and every one of its fields is a plain string.
          parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
        end

        unless parser_config.has_key?("columns")
          if parser_guessed["header_line"] || parser_config["header_line"]
            column_names = sample_records.first
          else
            # exclusive range: exactly one generated name per guessed column
            column_names = (0...other_types.size).map {|i| "c#{i}" }
          end
          schema = []
          column_names.zip(other_types).each do |name,(type,format)|
            if name && type
              if format
                schema << {"name" => name, "type" => type, "format" => format}
              else
                schema << {"name" => name, "type" => type}
              end
            end
          end
          parser_guessed["columns"] = schema
        end

        return {"parser" => parser_guessed}
      end

      private

      # Picks the delimiter candidate that occurs often and with the most
      # uniform per-line count (weight = total occurrences / standard
      # deviation of the per-line counts).
      #
      # Returns the delimiter String, or nil when no candidate is convincing.
      def guess_delimiter(sample_lines)
        delim_weights = DELIMITER_CANDIDATES.map do |d|
          counts = sample_lines.map {|line| line.count(d) }
          total = array_sum(counts)
          if total > 0
            stddev = array_standard_deviation(counts)
            # avoid division by zero when every line has the same count
            stddev = 0.000000001 if stddev == 0.0
            weight = total / stddev
            [d, weight]
          else
            [nil, 0]
          end
        end

        delim, weight = *delim_weights.sort_by {|_, w| w }.last
        if delim != nil && weight > 1
          return delim
        else
          return nil
        end
      end

      # Scores each quote candidate over the lines that contain it and returns
      # the best one when its average score reaches 10.0; returns nil
      # otherwise.
      def guess_quote(sample_lines, delim)
        delim_regexp = Regexp.escape(delim)
        quote_weights = QUOTE_CANDIDATES.map do |q|
          weights = sample_lines.map do |line|
            q_regexp = Regexp.escape(q)
            count = line.count(q)
            if count > 0
              weight = count
              # whole fields wrapped in the quote character score extra
              weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
              weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
              weight
            else
              nil
            end
          end.compact
          weights.empty? ? 0 : array_avg(weights)
        end
        quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
        if weight >= 10.0
          return quote
        else
          return nil
        end
      end

      # Guesses one type per column from an Array of records (each record an
      # Array of field strings).
      #
      # Returns an Array with one entry per column: [type] or, for timestamp
      # columns, ["timestamp", format].
      def guess_field_types(field_lines)
        column_lines = []
        field_lines.each do |fields|
          fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
        end
        columns = column_lines.map do |types|
          merged = types.inject(nil) {|r,type| merge_type(r, type) } || "string"
          if merged.is_a?(TimestampMatch)
            format = TimeFormatGuess.guess(types.map {|type| type.text })
            ["timestamp", format]
          else
            [merged]
          end
        end
        return columns
      end

      # Pairs of types that may be merged into the wider one. Keys are the
      # sorted pair of type names, values the resulting type. Because "1" is
      # also a boolean literal, boolean has to coalesce with both numeric
      # types; without the boolean/double rule the merged type would depend
      # on the order in which the values appear.
      TYPE_COALESCE = Hash[[
        ["long", "double"],
        ["boolean", "long"],
        ["boolean", "double"],
      ].map {|narrow, wide|
        [[narrow, wide].sort, wide]
      }]

      # Merges two guessed types; nil means "no type seen yet". Incompatible
      # types fall back to "string".
      def merge_type(type1, type2)
        if type1 == type2
          type1
        elsif type1.nil? || type2.nil?
          type1 || type2
        else
          TYPE_COALESCE[[type1, type2].sort] || "string"
        end
      end

      # A String equal to "timestamp" that also remembers the original field
      # text so the time format can be guessed from all samples of a column.
      class TimestampMatch < String
        def initialize(text)
          super("timestamp")
          @text = text
        end
        attr_reader :text
      end

      # Guesses the type of a single field string.
      #
      # Returns "boolean", a TimestampMatch, "long", "double" or "string".
      def guess_type(str)
        if TRUE_STRINGS[str]
          return "boolean"
        end

        if TimeFormatGuess.guess(str)
          return TimestampMatch.new(str)
        end

        if str.to_i.to_s == str
          return "long"
        end

        if str.include?('.')
          a, b = str.split(".", 2)
          # The fractional part is a plain digit string: leading zeros are
          # valid ("1.05") while a sign is not ("1.-5").
          if a.to_i.to_s == a && b =~ /\A\d+\z/
            return "double"
          end
        end

        return "string"
      end

      # Sum of the elements (0 for an empty array).
      def array_sum(array)
        array.inject(0) {|r,i| r + i }
      end

      # Arithmetic mean as a Float.
      def array_avg(array)
        array.inject(0.0) {|r,i| r + i } / array.size
      end

      # Population variance of the elements.
      def array_variance(array)
        avg = array_avg(array)
        array.inject(0.0) {|r,i| r + (i - avg) ** 2 } / array.size
      end

      # Population standard deviation of the elements.
      def array_standard_deviation(array)
        Math.sqrt(array_variance(array))
      end
    end

  end
end
@@ -0,0 +1,18 @@
1
module Embulk
  module Guess

    # Guess plugin that recognizes gzip-compressed input by its two-byte
    # magic number and proposes the gzip decoder.
    class GzipGuessPlugin < GuessPlugin
      Plugin.register_guess('gzip', self)

      GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

      # Checks whether +sample_buffer+ starts with the gzip magic number.
      #
      # config        - current configuration (not consulted by this plugin)
      # sample_buffer - String holding the sampled bytes
      #
      # Returns {"decoders" => [{"type" => "gzip"}]} for gzip data, {} otherwise.
      def guess(config, sample_buffer)
        # Compare raw bytes: slicing by index works on characters, and
        # String#== is false for non-ASCII strings whose encodings differ, so
        # a buffer not tagged ASCII-8BIT would never match the header.
        # byteslice returns a new String, so retagging it leaves the caller's
        # buffer untouched.
        header = sample_buffer.byteslice(0, GZIP_HEADER.bytesize)
        if header && header.force_encoding('ASCII-8BIT') == GZIP_HEADER
          return {"decoders" => [{"type" => "gzip"}]}
        end
        return {}
      end
    end

  end
end