embulk-filter-kuromoji 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4b4dca9e4715f6af3d568ca4013519d4f5351470
4
- data.tar.gz: 9acf950dff08f825309b721eb63fefc031977b99
3
+ metadata.gz: 3d706d852774bb54a883e1fad994b508213b0a12
4
+ data.tar.gz: c68d3372c01a5b273391f9c14e3b19dce7977aaa
5
5
  SHA512:
6
- metadata.gz: d5b3f8dda0b8ccc31be015bb839925640fdc109c9fe6de98ced99e9008d2df27dfa8c686148b55d7be0694ad19cf1110db11a3227a5941b69e19c5af9b693d17
7
- data.tar.gz: 2eb8b106940f6716b782ef1e5b2960d96d425b475c33d652a797620a40caef87d4157d916cb99d47e1b134766e35212a54a90a201aa8415f3fcb630fe19160e8
6
+ metadata.gz: 163708524757836fb4a45b5f88b5fb030e7e39d91079d5f2aff2ab7dad31c79d52ba6a199979bea0b7cb73ef32cd6d1abd7e28f7a1c7b4f988a8e0aea6d97b8a
7
+ data.tar.gz: 8e9a645c381a40a0fa9dfefe6103a4bbfb96347eccf110ba870a488f6dda136be2982bbf6004790d6790839648c15e5ca78b6d5960d084b0a637b4e54e9d7e05
data/build.gradle CHANGED
@@ -2,27 +2,27 @@ plugins {
2
2
  id "com.jfrog.bintray" version "1.1"
3
3
  id "com.github.jruby-gradle.base" version "0.1.5"
4
4
  id "java"
5
+ id "checkstyle"
5
6
  id "eclipse"
6
7
  }
7
8
  import com.github.jrubygradle.JRubyExec
8
9
  repositories {
9
10
  mavenCentral()
10
11
  jcenter()
11
- maven { url "http://www.atilika.org/nexus/content/repositories/atilika" }
12
12
  }
13
13
  configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.2.0"
17
+ version = "0.3.0"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
21
21
 
22
22
  dependencies {
23
- compile "org.embulk:embulk-core:0.7.4"
24
- compile 'org.atilika.kuromoji:kuromoji:0.7.7'
25
- provided "org.embulk:embulk-core:0.7.4"
23
+ compile "org.embulk:embulk-core:0.8.1"
24
+ compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
25
+ provided "org.embulk:embulk-core:0.8.1"
26
26
  testCompile "junit:junit:4.+"
27
27
  }
28
28
 
@@ -33,6 +33,23 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
33
33
  }
34
34
  clean { delete "classpath" }
35
35
 
36
+ checkstyle {
37
+ configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
38
+ toolVersion = '6.14.1'
39
+ }
40
+ checkstyleMain {
41
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
42
+ ignoreFailures = true
43
+ }
44
+ checkstyleTest {
45
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
46
+ ignoreFailures = true
47
+ }
48
+ task checkstyle(type: Checkstyle) {
49
+ classpath = sourceSets.main.output + sourceSets.test.output
50
+ source = sourceSets.main.allJava + sourceSets.test.allJava
51
+ }
52
+
36
53
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
37
54
  jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
38
55
  script "${project.name}.gemspec"
Binary file
Binary file
@@ -1,6 +1,6 @@
1
- #Tue Aug 11 00:26:20 PDT 2015
1
+ #Wed Jan 13 12:41:02 JST 2016
2
2
  distributionBase=GRADLE_USER_HOME
3
3
  distributionPath=wrapper/dists
4
4
  zipStoreBase=GRADLE_USER_HOME
5
5
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
@@ -5,9 +5,6 @@ import java.io.IOException;
5
5
  import java.util.List;
6
6
  import java.util.Map;
7
7
 
8
- import org.atilika.kuromoji.Token;
9
- import org.atilika.kuromoji.Tokenizer;
10
- import org.atilika.kuromoji.Tokenizer.Builder;
11
8
  import org.embulk.config.Config;
12
9
  import org.embulk.config.ConfigDefault;
13
10
  import org.embulk.config.ConfigSource;
@@ -21,8 +18,14 @@ import org.embulk.spi.PageBuilder;
21
18
  import org.embulk.spi.PageOutput;
22
19
  import org.embulk.spi.PageReader;
23
20
  import org.embulk.spi.Schema;
21
+ import org.embulk.spi.type.Type;
24
22
  import org.embulk.spi.type.Types;
23
+ import org.msgpack.value.Value;
24
+ import org.msgpack.value.ValueFactory;
25
25
 
26
+ import com.atilika.kuromoji.ipadic.Token;
27
+ import com.atilika.kuromoji.ipadic.Tokenizer;
28
+ import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
26
29
  import com.google.common.base.Joiner;
27
30
  import com.google.common.base.MoreObjects;
28
31
  import com.google.common.base.Optional;
@@ -43,7 +46,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
43
46
  @Config("ok_parts_of_speech")
44
47
  @ConfigDefault("null")
45
48
  public Optional<List<String>> getOkPartsOfSpeech();
46
-
49
+
47
50
  @Config("keep_input")
48
51
  @ConfigDefault("true")
49
52
  public boolean getKeepInput();
@@ -61,21 +64,22 @@ public class KuromojiFilterPlugin implements FilterPlugin
61
64
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
62
65
  int i = 0;
63
66
  if (task.getKeepInput()) {
64
- for (Column inputColumn: inputSchema.getColumns()) {
67
+ for (Column inputColumn : inputSchema.getColumns()) {
65
68
  Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
66
69
  builder.add(outputColumn);
67
70
  }
68
71
  }
69
72
 
70
- for (String key: task.getKeyNames()) {
73
+ for (String key : task.getKeyNames()) {
71
74
  for (Map<String, String> setting : task.getSettings()) {
72
75
  String keyName = key + MoreObjects.firstNonNull(setting.get("suffix"), "");
76
+ Type type = "array".equals(setting.get("type")) ? Types.JSON : Types.STRING;
73
77
  if (task.getKeepInput()) {
74
78
  if (setting.get("suffix") != null) {
75
- builder.add(new Column(i++, keyName, Types.STRING));
79
+ builder.add(new Column(i++, keyName, type));
76
80
  }
77
81
  } else {
78
- builder.add(new Column(i++, keyName, Types.STRING));
82
+ builder.add(new Column(i++, keyName, type));
79
83
  }
80
84
  }
81
85
  }
@@ -88,7 +92,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
88
92
  public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
89
93
  {
90
94
  final PluginTask task = taskSource.loadTask(PluginTask.class);
91
- Builder builder = Tokenizer.builder();
95
+
96
+ Builder builder = new Tokenizer.Builder();
92
97
  if (task.getDictionaryPath().isPresent()) {
93
98
  try {
94
99
  builder.userDictionary(task.getDictionaryPath().get());
@@ -102,7 +107,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
102
107
  final List<Column> keyNameColumns = Lists.newArrayList();
103
108
 
104
109
  for (String keyName : task.getKeyNames()) {
105
- keyNameColumns.add(inputSchema.lookupColumn(keyName));
110
+ keyNameColumns.add(outputSchema.lookupColumn(keyName));
106
111
  }
107
112
 
108
113
  return new PageOutput() {
@@ -133,7 +138,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
133
138
  */
134
139
  private void setValue(PageBuilder builder) {
135
140
  if (task.getKeepInput()) {
136
- for (Column inputColumn: inputSchema.getColumns()) {
141
+ for (Column inputColumn : inputSchema.getColumns()) {
137
142
  if (reader.isNull(inputColumn)) {
138
143
  builder.setNull(inputColumn);
139
144
  continue;
@@ -154,29 +159,40 @@ public class KuromojiFilterPlugin implements FilterPlugin
154
159
 
155
160
  for (Column column : keyNameColumns) {
156
161
  List<Token> tokens = tokenizer.tokenize(reader.getString(column));
157
- for (Map<String, String> setting: task.getSettings()) {
162
+ for (Map<String, String> setting : task.getSettings()) {
158
163
  String suffix = setting.get("suffix");
159
164
  String method = setting.get("method");
160
165
  Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
161
- List<String> outputs = Lists.newArrayList();
166
+ List<Value> outputs = Lists.newArrayList();
162
167
  for (Token token : tokens) {
163
- if (!isOkPartsOfSpeech(token)) continue;
168
+ System.err.println(token.getAllFeaturesArray().toString());
169
+ System.err.println(token.getPartOfSpeechLevel1());
170
+ System.err.println(token.getPartOfSpeechLevel2());
171
+ System.err.println(token.getPartOfSpeechLevel3());
172
+ System.err.println(token.getPartOfSpeechLevel4());
173
+ if (!isOkPartsOfSpeech(token)) { continue; }
174
+ String word = null;
164
175
  if ("base_form".equals(method)) {
165
- outputs.add(MoreObjects.firstNonNull(token.getBaseForm(), token.getSurfaceForm()));
176
+ word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
166
177
  } else if ("reading".equals(method)) {
167
- outputs.add(MoreObjects.firstNonNull(token.getReading(), token.getSurfaceForm()));
178
+ word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
168
179
  } else if ("surface_form".equals(method)) {
169
- outputs.add(token.getSurfaceForm());
180
+ word = token.getSurface();
170
181
  }
182
+ outputs.add(ValueFactory.newString(word));
183
+ }
184
+ if (outputColumn.getType().equals(Types.STRING)) {
185
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
186
+ builder.setString(outputColumn, joiner.join(outputs));
187
+ } else if (outputColumn.getType().equals(Types.JSON)) {
188
+ builder.setJson(outputColumn, ValueFactory.newArray(outputs));
171
189
  }
172
- Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
173
- builder.setString(outputColumn, joiner.join(outputs));
174
190
  }
175
191
  }
176
192
  }
177
193
 
178
194
  private boolean isOkPartsOfSpeech(Token token) {
179
- if (!task.getOkPartsOfSpeech().isPresent()) return true;
195
+ if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
180
196
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
181
197
  if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
182
198
  return true;
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-23 00:00:00.000000000 Z
11
+ date: 2016-02-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
15
- version_requirements: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.0'
20
14
  requirement: !ruby/object:Gem::Requirement
21
15
  requirements:
22
16
  - - ~>
23
17
  - !ruby/object:Gem::Version
24
18
  version: '1.0'
19
+ name: bundler
25
20
  prerelease: false
26
21
  type: :development
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
22
  version_requirements: !ruby/object:Gem::Requirement
30
23
  requirements:
31
- - - '>='
24
+ - - ~>
32
25
  - !ruby/object:Gem::Version
33
- version: '10.0'
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
34
28
  requirement: !ruby/object:Gem::Requirement
35
29
  requirements:
36
30
  - - '>='
37
31
  - !ruby/object:Gem::Version
38
32
  version: '10.0'
33
+ name: rake
39
34
  prerelease: false
40
35
  type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
41
  description: Kuromoji filter plugin for Embulk
42
42
  email:
43
43
  - toyama0919@gmail.com
@@ -56,8 +56,9 @@ files:
56
56
  - lib/embulk/filter/kuromoji.rb
57
57
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
58
58
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
59
- - classpath/embulk-filter-kuromoji-0.2.0.jar
60
- - classpath/kuromoji-0.7.7.jar
59
+ - classpath/embulk-filter-kuromoji-0.3.0.jar
60
+ - classpath/kuromoji-core-0.9.0.jar
61
+ - classpath/kuromoji-ipadic-0.9.0.jar
61
62
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
62
63
  licenses:
63
64
  - MIT
@@ -83,4 +84,3 @@ signing_key:
83
84
  specification_version: 4
84
85
  summary: Kuromoji filter plugin for Embulk
85
86
  test_files: []
86
- has_rdoc: