embulk-filter-kuromoji 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4b4dca9e4715f6af3d568ca4013519d4f5351470
4
- data.tar.gz: 9acf950dff08f825309b721eb63fefc031977b99
3
+ metadata.gz: 3d706d852774bb54a883e1fad994b508213b0a12
4
+ data.tar.gz: c68d3372c01a5b273391f9c14e3b19dce7977aaa
5
5
  SHA512:
6
- metadata.gz: d5b3f8dda0b8ccc31be015bb839925640fdc109c9fe6de98ced99e9008d2df27dfa8c686148b55d7be0694ad19cf1110db11a3227a5941b69e19c5af9b693d17
7
- data.tar.gz: 2eb8b106940f6716b782ef1e5b2960d96d425b475c33d652a797620a40caef87d4157d916cb99d47e1b134766e35212a54a90a201aa8415f3fcb630fe19160e8
6
+ metadata.gz: 163708524757836fb4a45b5f88b5fb030e7e39d91079d5f2aff2ab7dad31c79d52ba6a199979bea0b7cb73ef32cd6d1abd7e28f7a1c7b4f988a8e0aea6d97b8a
7
+ data.tar.gz: 8e9a645c381a40a0fa9dfefe6103a4bbfb96347eccf110ba870a488f6dda136be2982bbf6004790d6790839648c15e5ca78b6d5960d084b0a637b4e54e9d7e05
data/build.gradle CHANGED
@@ -2,27 +2,27 @@ plugins {
2
2
  id "com.jfrog.bintray" version "1.1"
3
3
  id "com.github.jruby-gradle.base" version "0.1.5"
4
4
  id "java"
5
+ id "checkstyle"
5
6
  id "eclipse"
6
7
  }
7
8
  import com.github.jrubygradle.JRubyExec
8
9
  repositories {
9
10
  mavenCentral()
10
11
  jcenter()
11
- maven { url "http://www.atilika.org/nexus/content/repositories/atilika" }
12
12
  }
13
13
  configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.2.0"
17
+ version = "0.3.0"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
21
21
 
22
22
  dependencies {
23
- compile "org.embulk:embulk-core:0.7.4"
24
- compile 'org.atilika.kuromoji:kuromoji:0.7.7'
25
- provided "org.embulk:embulk-core:0.7.4"
23
+ compile "org.embulk:embulk-core:0.8.1"
24
+ compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
25
+ provided "org.embulk:embulk-core:0.8.1"
26
26
  testCompile "junit:junit:4.+"
27
27
  }
28
28
 
@@ -33,6 +33,23 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
33
33
  }
34
34
  clean { delete "classpath" }
35
35
 
36
+ checkstyle {
37
+ configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
38
+ toolVersion = '6.14.1'
39
+ }
40
+ checkstyleMain {
41
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
42
+ ignoreFailures = true
43
+ }
44
+ checkstyleTest {
45
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
46
+ ignoreFailures = true
47
+ }
48
+ task checkstyle(type: Checkstyle) {
49
+ classpath = sourceSets.main.output + sourceSets.test.output
50
+ source = sourceSets.main.allJava + sourceSets.test.allJava
51
+ }
52
+
36
53
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
37
54
  jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
38
55
  script "${project.name}.gemspec"
Binary file
Binary file
@@ -1,6 +1,6 @@
1
- #Tue Aug 11 00:26:20 PDT 2015
1
+ #Wed Jan 13 12:41:02 JST 2016
2
2
  distributionBase=GRADLE_USER_HOME
3
3
  distributionPath=wrapper/dists
4
4
  zipStoreBase=GRADLE_USER_HOME
5
5
  zipStorePath=wrapper/dists
6
- distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
@@ -5,9 +5,6 @@ import java.io.IOException;
5
5
  import java.util.List;
6
6
  import java.util.Map;
7
7
 
8
- import org.atilika.kuromoji.Token;
9
- import org.atilika.kuromoji.Tokenizer;
10
- import org.atilika.kuromoji.Tokenizer.Builder;
11
8
  import org.embulk.config.Config;
12
9
  import org.embulk.config.ConfigDefault;
13
10
  import org.embulk.config.ConfigSource;
@@ -21,8 +18,14 @@ import org.embulk.spi.PageBuilder;
21
18
  import org.embulk.spi.PageOutput;
22
19
  import org.embulk.spi.PageReader;
23
20
  import org.embulk.spi.Schema;
21
+ import org.embulk.spi.type.Type;
24
22
  import org.embulk.spi.type.Types;
23
+ import org.msgpack.value.Value;
24
+ import org.msgpack.value.ValueFactory;
25
25
 
26
+ import com.atilika.kuromoji.ipadic.Token;
27
+ import com.atilika.kuromoji.ipadic.Tokenizer;
28
+ import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
26
29
  import com.google.common.base.Joiner;
27
30
  import com.google.common.base.MoreObjects;
28
31
  import com.google.common.base.Optional;
@@ -43,7 +46,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
43
46
  @Config("ok_parts_of_speech")
44
47
  @ConfigDefault("null")
45
48
  public Optional<List<String>> getOkPartsOfSpeech();
46
-
49
+
47
50
  @Config("keep_input")
48
51
  @ConfigDefault("true")
49
52
  public boolean getKeepInput();
@@ -61,21 +64,22 @@ public class KuromojiFilterPlugin implements FilterPlugin
61
64
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
62
65
  int i = 0;
63
66
  if (task.getKeepInput()) {
64
- for (Column inputColumn: inputSchema.getColumns()) {
67
+ for (Column inputColumn : inputSchema.getColumns()) {
65
68
  Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
66
69
  builder.add(outputColumn);
67
70
  }
68
71
  }
69
72
 
70
- for (String key: task.getKeyNames()) {
73
+ for (String key : task.getKeyNames()) {
71
74
  for (Map<String, String> setting : task.getSettings()) {
72
75
  String keyName = key + MoreObjects.firstNonNull(setting.get("suffix"), "");
76
+ Type type = "array".equals(setting.get("type")) ? Types.JSON : Types.STRING;
73
77
  if (task.getKeepInput()) {
74
78
  if (setting.get("suffix") != null) {
75
- builder.add(new Column(i++, keyName, Types.STRING));
79
+ builder.add(new Column(i++, keyName, type));
76
80
  }
77
81
  } else {
78
- builder.add(new Column(i++, keyName, Types.STRING));
82
+ builder.add(new Column(i++, keyName, type));
79
83
  }
80
84
  }
81
85
  }
@@ -88,7 +92,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
88
92
  public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
89
93
  {
90
94
  final PluginTask task = taskSource.loadTask(PluginTask.class);
91
- Builder builder = Tokenizer.builder();
95
+
96
+ Builder builder = new Tokenizer.Builder();
92
97
  if (task.getDictionaryPath().isPresent()) {
93
98
  try {
94
99
  builder.userDictionary(task.getDictionaryPath().get());
@@ -102,7 +107,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
102
107
  final List<Column> keyNameColumns = Lists.newArrayList();
103
108
 
104
109
  for (String keyName : task.getKeyNames()) {
105
- keyNameColumns.add(inputSchema.lookupColumn(keyName));
110
+ keyNameColumns.add(outputSchema.lookupColumn(keyName));
106
111
  }
107
112
 
108
113
  return new PageOutput() {
@@ -133,7 +138,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
133
138
  */
134
139
  private void setValue(PageBuilder builder) {
135
140
  if (task.getKeepInput()) {
136
- for (Column inputColumn: inputSchema.getColumns()) {
141
+ for (Column inputColumn : inputSchema.getColumns()) {
137
142
  if (reader.isNull(inputColumn)) {
138
143
  builder.setNull(inputColumn);
139
144
  continue;
@@ -154,29 +159,40 @@ public class KuromojiFilterPlugin implements FilterPlugin
154
159
 
155
160
  for (Column column : keyNameColumns) {
156
161
  List<Token> tokens = tokenizer.tokenize(reader.getString(column));
157
- for (Map<String, String> setting: task.getSettings()) {
162
+ for (Map<String, String> setting : task.getSettings()) {
158
163
  String suffix = setting.get("suffix");
159
164
  String method = setting.get("method");
160
165
  Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
161
- List<String> outputs = Lists.newArrayList();
166
+ List<Value> outputs = Lists.newArrayList();
162
167
  for (Token token : tokens) {
163
- if (!isOkPartsOfSpeech(token)) continue;
168
+ System.err.println(token.getAllFeaturesArray().toString());
169
+ System.err.println(token.getPartOfSpeechLevel1());
170
+ System.err.println(token.getPartOfSpeechLevel2());
171
+ System.err.println(token.getPartOfSpeechLevel3());
172
+ System.err.println(token.getPartOfSpeechLevel4());
173
+ if (!isOkPartsOfSpeech(token)) { continue; }
174
+ String word = null;
164
175
  if ("base_form".equals(method)) {
165
- outputs.add(MoreObjects.firstNonNull(token.getBaseForm(), token.getSurfaceForm()));
176
+ word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
166
177
  } else if ("reading".equals(method)) {
167
- outputs.add(MoreObjects.firstNonNull(token.getReading(), token.getSurfaceForm()));
178
+ word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
168
179
  } else if ("surface_form".equals(method)) {
169
- outputs.add(token.getSurfaceForm());
180
+ word = token.getSurface();
170
181
  }
182
+ outputs.add(ValueFactory.newString(word));
183
+ }
184
+ if (outputColumn.getType().equals(Types.STRING)) {
185
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
186
+ builder.setString(outputColumn, joiner.join(outputs));
187
+ } else if (outputColumn.getType().equals(Types.JSON)) {
188
+ builder.setJson(outputColumn, ValueFactory.newArray(outputs));
171
189
  }
172
- Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
173
- builder.setString(outputColumn, joiner.join(outputs));
174
190
  }
175
191
  }
176
192
  }
177
193
 
178
194
  private boolean isOkPartsOfSpeech(Token token) {
179
- if (!task.getOkPartsOfSpeech().isPresent()) return true;
195
+ if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
180
196
  for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
181
197
  if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
182
198
  return true;
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-23 00:00:00.000000000 Z
11
+ date: 2016-02-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
15
- version_requirements: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '1.0'
20
14
  requirement: !ruby/object:Gem::Requirement
21
15
  requirements:
22
16
  - - ~>
23
17
  - !ruby/object:Gem::Version
24
18
  version: '1.0'
19
+ name: bundler
25
20
  prerelease: false
26
21
  type: :development
27
- - !ruby/object:Gem::Dependency
28
- name: rake
29
22
  version_requirements: !ruby/object:Gem::Requirement
30
23
  requirements:
31
- - - '>='
24
+ - - ~>
32
25
  - !ruby/object:Gem::Version
33
- version: '10.0'
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
34
28
  requirement: !ruby/object:Gem::Requirement
35
29
  requirements:
36
30
  - - '>='
37
31
  - !ruby/object:Gem::Version
38
32
  version: '10.0'
33
+ name: rake
39
34
  prerelease: false
40
35
  type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
41
  description: Kuromoji filter plugin for Embulk
42
42
  email:
43
43
  - toyama0919@gmail.com
@@ -56,8 +56,9 @@ files:
56
56
  - lib/embulk/filter/kuromoji.rb
57
57
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
58
58
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
59
- - classpath/embulk-filter-kuromoji-0.2.0.jar
60
- - classpath/kuromoji-0.7.7.jar
59
+ - classpath/embulk-filter-kuromoji-0.3.0.jar
60
+ - classpath/kuromoji-core-0.9.0.jar
61
+ - classpath/kuromoji-ipadic-0.9.0.jar
61
62
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
62
63
  licenses:
63
64
  - MIT
@@ -83,4 +84,3 @@ signing_key:
83
84
  specification_version: 4
84
85
  summary: Kuromoji filter plugin for Embulk
85
86
  test_files: []
86
- has_rdoc: