embulk 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2aebc5b785b1d7e303dd69fe4b7922abdce2954e
4
- data.tar.gz: 99189534152199abb9b280b7f7ce5511691098e7
3
+ metadata.gz: d63d08592793b691be7f691d842f73d18a202d45
4
+ data.tar.gz: 5a14e328213b2a97fe6e0abaa044428bb5caa0fd
5
5
  SHA512:
6
- metadata.gz: fafcf2fcaadaa263619c3f56e90f7a3abf5fa6b257c4ab8338ab0e84130e47ecec5fb3bc09c45b4ccbb7ec207123b1f4517324605000de2525a7632f32ae187f
7
- data.tar.gz: 7b72ad6d02f613583624265608ebda2201159b28aaefeed8d5fd9e1a553c2ff361078b4e68d5a2a7f242dc4329536c58925ee4b20739274f9dafb2ba81761d93
6
+ metadata.gz: 8e447c2ae251e6ef3309c0862344d10074797389ea2c55bce2e2302d82eea2425345316efdf7dae9adcd9f08de5b0132c04e6cad43edaef29ac3cf635ad640f8
7
+ data.tar.gz: 0b33b0a4f2dc7911d1432828e303c464c8b25bda00dc9eb3df8c9fb905c85011c30132292c581163fb557d2e2eaeb3a32a8bbd61bd486509361b4464aa2477ed
data/README.md CHANGED
@@ -24,7 +24,7 @@ You can release plugins to share your efforts of data cleaning, error handling,
24
24
  The single-file package is the simplest way to try Embulk. You can download the latest embulk-VERSION.jar from [the releases page](https://bintray.com/embulk/maven/embulk/view#files) and run it with java:
25
25
 
26
26
  ```
27
- wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.1.jar -O embulk.jar
27
+ wget https://bintray.com/artifact/download/embulk/maven/embulk-0.4.2.jar -O embulk.jar
28
28
  java -jar embulk.jar --help
29
29
  ```
30
30
 
@@ -9,7 +9,7 @@ plugins {
9
9
 
10
10
  allprojects {
11
11
  group = 'org.embulk'
12
- version = '0.4.1'
12
+ version = '0.4.2'
13
13
 
14
14
  apply plugin: 'maven' // install jar files to the local repo: $ gradle install
15
15
  apply plugin: 'maven-publish'
@@ -205,9 +205,15 @@ task releaseCheck << {
205
205
  if (!file("lib/embulk/version.rb").getText().contains("${project.version}")) {
206
206
  throw new GradleException("lib/embulk/version.rb doesn't include ${project.version}")
207
207
  }
208
+ if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains("${project.version}")) {
209
+ throw new GradleException("Release note for ${project.version} doesn't exist")
210
+ }
211
+ if (!file("embulk-docs/src/release.rst").getText().contains("release-${project.version}")) {
212
+ throw new GradleException("embulk-docs/src/release.rst doesn't include release-${project.version}")
213
+ }
208
214
  String date = new Date().format("yyyy-MM-dd")
209
215
  if (!file("embulk-docs/src/release/release-${project.version}.rst").getText().contains(date)) {
210
- throw new GradleException("ChangeLog doesn't include entry for ${project.version}")
216
+ throw new GradleException("embulk-docs/src/release/release-${project.version}.rst doesn't include today's release date")
211
217
  }
212
218
  println "Ready. Run ./gradlew release"
213
219
  }
@@ -169,6 +169,7 @@ public class Runner
169
169
 
170
170
  String yml = writeNextConfig(options.getNextConfigOutputPath(), config, configDiff);
171
171
  System.err.println(yml);
172
+ System.out.println("Created "+options.getNextConfigOutputPath());
172
173
  }
173
174
 
174
175
  private void checkFileWritable(String path)
@@ -191,14 +192,10 @@ public class Runner
191
192
  {
192
193
  String yml = dumpYaml(obj);
193
194
  if (path != null) {
194
- if (path.equals("-")) {
195
- System.out.print(yml);
196
- } else {
197
- try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
198
- writer.write(yml);
199
- } catch (IOException ex) {
200
- throw new RuntimeException(ex);
201
- }
195
+ try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
196
+ writer.write(yml);
197
+ } catch (IOException ex) {
198
+ throw new RuntimeException(ex);
202
199
  }
203
200
  }
204
201
  return yml;
@@ -13,7 +13,7 @@ function r() {
13
13
  }
14
14
 
15
15
  [ "$TRAVIS_PULL_REQUEST" != "false" ] && exit 0
16
- [ "$TRAVIS_BRANCH" != "master" ] && exit 0
16
+ [ "$TRAVIS_BRANCH" != "master" -a "$TRAVIS_BRANCH" != "$(git describe --tags --always HEAD)" ] && exit 0
17
17
 
18
18
  revision="$(git rev-parse HEAD)"
19
19
  remote="$(git config remote.origin.url | sed "s+^git:+https:+")"
@@ -11,4 +11,6 @@ Release Notes
11
11
  release/release-0.3.1
12
12
  release/release-0.3.2
13
13
  release/release-0.4.0
14
+ release/release-0.4.1
15
+ release/release-0.4.2
14
16
 
@@ -1,4 +1,4 @@
1
- Release 0.4.0
1
+ Release 0.4.1
2
2
  ==================================
3
3
 
4
4
  CLI
@@ -15,4 +15,4 @@ General Changes
15
15
 
16
16
  Release Date
17
17
  ------------------
18
- 2015-02-15
18
+ 2015-02-16
@@ -0,0 +1,18 @@
1
+ Release 0.4.2
2
+ ==================================
3
+
4
+ CLI
5
+ ------------------
6
+
7
+ * Removed support for passing ``-`` as the value of the ``-o, --output PATH`` option
8
+
9
+ General Changes
10
+ ------------------
11
+
12
+ * Fixed guess plugin paths (@takei-yuya++)
13
+ * Fixed syntax error in generated gemspec file
14
+ * Fixed syntax error in generated ruby filter and input plugins (@takei-yuya++, also reported by hiroyuki-sato)
15
+
16
+ Release Date
17
+ ------------------
18
+ 2015-02-16
@@ -34,23 +34,23 @@ module Embulk
34
34
  description =
35
35
  case category
36
36
  when :input
37
- %[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by "embulk-output" keyword.]
37
+ %[that loads records from #{display_name} so that any output plugins can receive the records. Search the output plugins by 'embulk-output' keyword.]
38
38
  when :file_input
39
- %[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by "embulk-parser" keyword.]
39
+ %[that reads files from #{display_name} and parses the file using any parser plugins. Search the parser plugins by 'embulk-parser' keyword.]
40
40
  when :parser
41
- %[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by "embulk-input file" keywords.]
41
+ %[that parses #{display_name} file format read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
42
42
  when :decoder
43
- %[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by "embulk-input file" keywords.]
43
+ %[that decodes files encoded by #{display_name} read by any file input plugins. Search the file input plugins by 'embulk-input file' keywords.]
44
44
  when :output
45
- %[that loads records to #{display_name} read by any input plugins. Search the input plugins by "embulk-input" keyword.]
45
+ %[that loads records to #{display_name} read by any input plugins. Search the input plugins by 'embulk-input' keyword.]
46
46
  when :file_output
47
- %[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by "embulk-formatter" keyword.]
47
+ %[that stores files to #{display_name} formatted by any formatter plugins. Search the formatter plugins by 'embulk-formatter' keyword.]
48
48
  when :formtter
49
- %[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by "embulk-output file" keywords.]
49
+ %[that formats records using #{display_name} file format and so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
50
50
  when :encoder
51
- %[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by "embulk-output file" keywords.]
51
+ %[that encodes files using #{display_name} so that any file output plugins can store the files. Search the file output plugins by 'embulk-output file' keywords.]
52
52
  when :filter
53
- %[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by "embulk-input" and "embulk-output" plugins.]
53
+ %[that converts records read by an input plugin before passing it to an output plugins. Search the input and plugins by 'embulk-input' and 'embulk-output' plugins.]
54
54
  end
55
55
 
56
56
  pkg = Embulk::PackageData.new("new", project_name, binding())
@@ -1,3 +1,3 @@
1
1
  Embulk::JavaPlugin.register_<%= embulk_category %>(
2
- <%= name.to_sym.inspect %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
2
+ <%= name.dump %>, <%= "org.embulk.#{embulk_category}.#{java_class_name}".dump %>,
3
3
  File.expand_path('../../../../classpath', __FILE__))
@@ -2,7 +2,7 @@ module Embulk
2
2
  module Filter
3
3
 
4
4
  class <%= ruby_class_name %> < FilterPlugin
5
- Plugin.register_filter(<%= name %>, self)
5
+ Plugin.register_filter(<%= name.dump %>, self)
6
6
 
7
7
  def self.transaction(config, in_schema, &control)
8
8
  # configuration code:
@@ -3,8 +3,8 @@ Gem::Specification.new do |spec|
3
3
  spec.name = "<%= project_name %>"
4
4
  spec.version = "0.1.0"
5
5
  spec.authors = [<%= author.dump %>]
6
- spec.summary = "<%= display_name %> <%= display_category %> plugin for Embulk"
7
- spec.description = "<%= display_name %> <%= display_category %> plugin is an Embulk plugin <%= description %>"
6
+ spec.summary = <%= "#{display_name} #{display_category} plugin for Embulk".dump %>
7
+ spec.description = <%= "#{display_name} #{display_category} plugin is an Embulk plugin #{description}".dump %>
8
8
  spec.email = [<%= email.dump %>]
9
9
  spec.licenses = ["MIT"]
10
10
  # TODO: spec.homepage = "https://github.com/<%= email[/([^@]*)/] %>/<%= project_name %>"
@@ -2,7 +2,7 @@ module Embulk
2
2
  module Input
3
3
 
4
4
  class <%= ruby_class_name %> < InputPlugin
5
- Plugin.register_input(<%= name %>, self)
5
+ Plugin.register_input(<%= name.dump %>, self)
6
6
 
7
7
  def self.transaction(config, &control)
8
8
  # configuration code:
@@ -0,0 +1,28 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ class CharsetGuessPlugin < GuessPlugin
5
+ Plugin.register_guess('charset', self)
6
+
7
+ def guess(config, sample_buffer)
8
+ # ICU4J
9
+ detector = com.ibm.icu.text.CharsetDetector.new
10
+ detector.setText(sample_buffer.to_java_bytes)
11
+ best_match = detector.detect
12
+ if best_match.getConfidence < 50
13
+ name = "UTF-8"
14
+ else
15
+ name = best_match.getName
16
+ if name == "ISO-8859-1"
17
+ # ISO-8859-1 means ASCII which is a subset
18
+ # of UTF-8 in most of cases due to lack of
19
+ # sample data set
20
+ name = "UTF-8"
21
+ end
22
+ end
23
+ return {"parser" => {"charset" => name}}
24
+ end
25
+ end
26
+
27
+ end
28
+ end
@@ -0,0 +1,206 @@
1
+ module Embulk
2
+ module Guess
3
+ require_relative 'time_format_guess'
4
+
5
+ class CsvGuessPlugin < LineGuessPlugin
6
+ Plugin.register_guess('csv', self)
7
+
8
+ DELIMITER_CANDIDATES = [
9
+ ",", "\t", "|"
10
+ ]
11
+
12
+ QUOTE_CANDIDATES = [
13
+ "\"", "'"
14
+ ]
15
+
16
+ # CsvParserPlugin.TRUE_STRINGS
17
+ TRUE_STRINGS = Hash[*%w[
18
+ true True TRUE
19
+ yes Yes YES
20
+ y Y
21
+ on On ON
22
+ 1
23
+ ].map {|k| [k, true] }]
24
+
25
+ def guess_lines(config, sample_lines)
26
+ delim = guess_delimiter(sample_lines)
27
+ unless delim
28
+ # not CSV file
29
+ return {}
30
+ end
31
+
32
+ parser_config = config["parser"] || {}
33
+ parser_guessed = {"type" => "csv", "delimiter" => delim}
34
+
35
+ quote = guess_quote(sample_lines, delim)
36
+ parser_guessed["quote"] = quote ? quote : ''
37
+
38
+ sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
39
+ first_types = guess_field_types(sample_records[0, 1])
40
+ other_types = guess_field_types(sample_records[1..-1])
41
+
42
+ if first_types.size <= 1 || other_types.size <= 1
43
+ # guess failed
44
+ return {}
45
+ end
46
+
47
+ unless parser_config.has_key?("header_line")
48
+ parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
49
+ end
50
+
51
+ unless parser_config.has_key?("columns")
52
+ if parser_guessed["header_line"] || parser_config["header_line"]
53
+ column_names = sample_records.first
54
+ else
55
+ column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
56
+ end
57
+ schema = []
58
+ column_names.zip(other_types).each do |name,(type,format)|
59
+ if name && type
60
+ if format
61
+ schema << {"name" => name, "type" => type, "format" => format}
62
+ else
63
+ schema << {"name" => name, "type" => type}
64
+ end
65
+ end
66
+ end
67
+ parser_guessed["columns"] = schema
68
+ end
69
+
70
+ return {"parser" => parser_guessed}
71
+ end
72
+
73
+ private
74
+
75
+ def guess_delimiter(sample_lines)
76
+ delim_weights = DELIMITER_CANDIDATES.map do |d|
77
+ counts = sample_lines.map {|line| line.count(d) }
78
+ total = array_sum(counts)
79
+ if total > 0
80
+ stddev = array_standard_deviation(counts)
81
+ stddev = 0.000000001 if stddev == 0.0
82
+ weight = total / stddev
83
+ [d, weight]
84
+ else
85
+ [nil, 0]
86
+ end
87
+ end
88
+
89
+ delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
90
+ if delim != nil && weight > 1
91
+ return delim
92
+ else
93
+ return nil
94
+ end
95
+ end
96
+
97
+ def guess_quote(sample_lines, delim)
98
+ delim_regexp = Regexp.escape(delim)
99
+ quote_weights = QUOTE_CANDIDATES.map do |q|
100
+ weights = sample_lines.map do |line|
101
+ q_regexp = Regexp.escape(q)
102
+ count = line.count(q)
103
+ if count > 0
104
+ weight = count
105
+ weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
106
+ weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
107
+ weight
108
+ else
109
+ nil
110
+ end
111
+ end.compact
112
+ weights.empty? ? 0 : array_avg(weights)
113
+ end
114
+ quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
115
+ if weight >= 10.0
116
+ return quote
117
+ else
118
+ return nil
119
+ end
120
+ end
121
+
122
+ def guess_field_types(field_lines)
123
+ column_lines = []
124
+ field_lines.each do |fields|
125
+ fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
126
+ end
127
+ columns = column_lines.map do |types|
128
+ t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
129
+ if t.is_a?(TimestampMatch)
130
+ format = TimeFormatGuess.guess(types.map {|type| type.text })
131
+ ["timestamp", format]
132
+ else
133
+ [t]
134
+ end
135
+ end
136
+ return columns
137
+ end
138
+
139
+ TYPE_COALESCE = Hash[{
140
+ long: :double,
141
+ boolean: :long,
142
+ }.map {|k,v|
143
+ [[k.to_s, v.to_s].sort, v.to_s]
144
+ }]
145
+
146
+ def merge_type(type1, type2)
147
+ if type1 == type2
148
+ type1
149
+ elsif type1.nil? || type2.nil?
150
+ type1 || type2
151
+ else
152
+ TYPE_COALESCE[[type1, type2].sort] || "string"
153
+ end
154
+ end
155
+
156
+ class TimestampMatch < String
157
+ def initialize(text)
158
+ super("timestamp")
159
+ @text = text
160
+ end
161
+ attr_reader :text
162
+ end
163
+
164
+ def guess_type(str)
165
+ if TRUE_STRINGS[str]
166
+ return "boolean"
167
+ end
168
+
169
+ if TimeFormatGuess.guess(str)
170
+ return TimestampMatch.new(str)
171
+ end
172
+
173
+ if str.to_i.to_s == str
174
+ return "long"
175
+ end
176
+
177
+ if str.include?('.')
178
+ a, b = str.split(".", 2)
179
+ if a.to_i.to_s == a && b.to_i.to_s == b
180
+ return "double"
181
+ end
182
+ end
183
+
184
+ return "string"
185
+ end
186
+
187
+ def array_sum(array)
188
+ array.inject(0) {|r,i| r += i }
189
+ end
190
+
191
+ def array_avg(array)
192
+ array.inject(0.0) {|r,i| r += i } / array.size
193
+ end
194
+
195
+ def array_variance(array)
196
+ avg = array_avg(array)
197
+ array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
198
+ end
199
+
200
+ def array_standard_deviation(array)
201
+ Math.sqrt(array_variance(array))
202
+ end
203
+ end
204
+
205
+ end
206
+ end
@@ -0,0 +1,18 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ class GzipGuessPlugin < GuessPlugin
5
+ Plugin.register_guess('gzip', self)
6
+
7
+ GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze
8
+
9
+ def guess(config, sample_buffer)
10
+ if sample_buffer[0,2] == GZIP_HEADER
11
+ return {"decoders" => [{"type" => "gzip"}]}
12
+ end
13
+ return {}
14
+ end
15
+ end
16
+
17
+ end
18
+ end