embulk-filter-kuromoji 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/build.gradle +22 -5
- data/classpath/embulk-filter-kuromoji-0.3.0.jar +0 -0
- data/classpath/kuromoji-core-0.9.0.jar +0 -0
- data/classpath/{kuromoji-0.7.7.jar → kuromoji-ipadic-0.9.0.jar} +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java +36 -20
- metadata +15 -15
- data/classpath/embulk-filter-kuromoji-0.2.0.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d706d852774bb54a883e1fad994b508213b0a12
|
4
|
+
data.tar.gz: c68d3372c01a5b273391f9c14e3b19dce7977aaa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 163708524757836fb4a45b5f88b5fb030e7e39d91079d5f2aff2ab7dad31c79d52ba6a199979bea0b7cb73ef32cd6d1abd7e28f7a1c7b4f988a8e0aea6d97b8a
|
7
|
+
data.tar.gz: 8e9a645c381a40a0fa9dfefe6103a4bbfb96347eccf110ba870a488f6dda136be2982bbf6004790d6790839648c15e5ca78b6d5960d084b0a637b4e54e9d7e05
|
data/build.gradle
CHANGED
@@ -2,27 +2,27 @@ plugins {
|
|
2
2
|
id "com.jfrog.bintray" version "1.1"
|
3
3
|
id "com.github.jruby-gradle.base" version "0.1.5"
|
4
4
|
id "java"
|
5
|
+
id "checkstyle"
|
5
6
|
id "eclipse"
|
6
7
|
}
|
7
8
|
import com.github.jrubygradle.JRubyExec
|
8
9
|
repositories {
|
9
10
|
mavenCentral()
|
10
11
|
jcenter()
|
11
|
-
maven { url "http://www.atilika.org/nexus/content/repositories/atilika" }
|
12
12
|
}
|
13
13
|
configurations {
|
14
14
|
provided
|
15
15
|
}
|
16
16
|
|
17
|
-
version = "0.2.0"
|
17
|
+
version = "0.3.0"
|
18
18
|
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
21
21
|
|
22
22
|
dependencies {
|
23
|
-
compile "org.embulk:embulk-core:0.
|
24
|
-
compile '
|
25
|
-
provided "org.embulk:embulk-core:0.
|
23
|
+
compile "org.embulk:embulk-core:0.8.1"
|
24
|
+
compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
|
25
|
+
provided "org.embulk:embulk-core:0.8.1"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
27
|
}
|
28
28
|
|
@@ -33,6 +33,23 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
|
|
33
33
|
}
|
34
34
|
clean { delete "classpath" }
|
35
35
|
|
36
|
+
checkstyle {
|
37
|
+
configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
|
38
|
+
toolVersion = '6.14.1'
|
39
|
+
}
|
40
|
+
checkstyleMain {
|
41
|
+
configFile = file("${project.rootDir}/config/checkstyle/default.xml")
|
42
|
+
ignoreFailures = true
|
43
|
+
}
|
44
|
+
checkstyleTest {
|
45
|
+
configFile = file("${project.rootDir}/config/checkstyle/default.xml")
|
46
|
+
ignoreFailures = true
|
47
|
+
}
|
48
|
+
task checkstyle(type: Checkstyle) {
|
49
|
+
classpath = sourceSets.main.output + sourceSets.test.output
|
50
|
+
source = sourceSets.main.allJava + sourceSets.test.allJava
|
51
|
+
}
|
52
|
+
|
36
53
|
task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
|
37
54
|
jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
|
38
55
|
script "${project.name}.gemspec"
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,6 +1,6 @@
|
|
1
|
-
#
|
1
|
+
#Wed Jan 13 12:41:02 JST 2016
|
2
2
|
distributionBase=GRADLE_USER_HOME
|
3
3
|
distributionPath=wrapper/dists
|
4
4
|
zipStoreBase=GRADLE_USER_HOME
|
5
5
|
zipStorePath=wrapper/dists
|
6
|
-
distributionUrl=https\://services.gradle.org/distributions/gradle-2.
|
6
|
+
distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
|
@@ -5,9 +5,6 @@ import java.io.IOException;
|
|
5
5
|
import java.util.List;
|
6
6
|
import java.util.Map;
|
7
7
|
|
8
|
-
import org.atilika.kuromoji.Token;
|
9
|
-
import org.atilika.kuromoji.Tokenizer;
|
10
|
-
import org.atilika.kuromoji.Tokenizer.Builder;
|
11
8
|
import org.embulk.config.Config;
|
12
9
|
import org.embulk.config.ConfigDefault;
|
13
10
|
import org.embulk.config.ConfigSource;
|
@@ -21,8 +18,14 @@ import org.embulk.spi.PageBuilder;
|
|
21
18
|
import org.embulk.spi.PageOutput;
|
22
19
|
import org.embulk.spi.PageReader;
|
23
20
|
import org.embulk.spi.Schema;
|
21
|
+
import org.embulk.spi.type.Type;
|
24
22
|
import org.embulk.spi.type.Types;
|
23
|
+
import org.msgpack.value.Value;
|
24
|
+
import org.msgpack.value.ValueFactory;
|
25
25
|
|
26
|
+
import com.atilika.kuromoji.ipadic.Token;
|
27
|
+
import com.atilika.kuromoji.ipadic.Tokenizer;
|
28
|
+
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
26
29
|
import com.google.common.base.Joiner;
|
27
30
|
import com.google.common.base.MoreObjects;
|
28
31
|
import com.google.common.base.Optional;
|
@@ -43,7 +46,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
43
46
|
@Config("ok_parts_of_speech")
|
44
47
|
@ConfigDefault("null")
|
45
48
|
public Optional<List<String>> getOkPartsOfSpeech();
|
46
|
-
|
49
|
+
|
47
50
|
@Config("keep_input")
|
48
51
|
@ConfigDefault("true")
|
49
52
|
public boolean getKeepInput();
|
@@ -61,21 +64,22 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
61
64
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
62
65
|
int i = 0;
|
63
66
|
if (task.getKeepInput()) {
|
64
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
67
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
65
68
|
Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
|
66
69
|
builder.add(outputColumn);
|
67
70
|
}
|
68
71
|
}
|
69
72
|
|
70
|
-
for (String key: task.getKeyNames()) {
|
73
|
+
for (String key : task.getKeyNames()) {
|
71
74
|
for (Map<String, String> setting : task.getSettings()) {
|
72
75
|
String keyName = key + MoreObjects.firstNonNull(setting.get("suffix"), "");
|
76
|
+
Type type = "array".equals(setting.get("type")) ? Types.JSON : Types.STRING;
|
73
77
|
if (task.getKeepInput()) {
|
74
78
|
if (setting.get("suffix") != null) {
|
75
|
-
builder.add(new Column(i++, keyName, Types.STRING));
|
79
|
+
builder.add(new Column(i++, keyName, type));
|
76
80
|
}
|
77
81
|
} else {
|
78
|
-
builder.add(new Column(i++, keyName, Types.STRING));
|
82
|
+
builder.add(new Column(i++, keyName, type));
|
79
83
|
}
|
80
84
|
}
|
81
85
|
}
|
@@ -88,7 +92,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
88
92
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
89
93
|
{
|
90
94
|
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
91
|
-
|
95
|
+
|
96
|
+
Builder builder = new Tokenizer.Builder();
|
92
97
|
if (task.getDictionaryPath().isPresent()) {
|
93
98
|
try {
|
94
99
|
builder.userDictionary(task.getDictionaryPath().get());
|
@@ -102,7 +107,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
102
107
|
final List<Column> keyNameColumns = Lists.newArrayList();
|
103
108
|
|
104
109
|
for (String keyName : task.getKeyNames()) {
|
105
|
-
keyNameColumns.add(
|
110
|
+
keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
106
111
|
}
|
107
112
|
|
108
113
|
return new PageOutput() {
|
@@ -133,7 +138,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
133
138
|
*/
|
134
139
|
private void setValue(PageBuilder builder) {
|
135
140
|
if (task.getKeepInput()) {
|
136
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
141
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
137
142
|
if (reader.isNull(inputColumn)) {
|
138
143
|
builder.setNull(inputColumn);
|
139
144
|
continue;
|
@@ -154,29 +159,40 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
154
159
|
|
155
160
|
for (Column column : keyNameColumns) {
|
156
161
|
List<Token> tokens = tokenizer.tokenize(reader.getString(column));
|
157
|
-
for (Map<String, String> setting: task.getSettings()) {
|
162
|
+
for (Map<String, String> setting : task.getSettings()) {
|
158
163
|
String suffix = setting.get("suffix");
|
159
164
|
String method = setting.get("method");
|
160
165
|
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
161
|
-
List<
|
166
|
+
List<Value> outputs = Lists.newArrayList();
|
162
167
|
for (Token token : tokens) {
|
163
|
-
|
168
|
+
System.err.println(token.getAllFeaturesArray().toString());
|
169
|
+
System.err.println(token.getPartOfSpeechLevel1());
|
170
|
+
System.err.println(token.getPartOfSpeechLevel2());
|
171
|
+
System.err.println(token.getPartOfSpeechLevel3());
|
172
|
+
System.err.println(token.getPartOfSpeechLevel4());
|
173
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
174
|
+
String word = null;
|
164
175
|
if ("base_form".equals(method)) {
|
165
|
-
|
176
|
+
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
166
177
|
} else if ("reading".equals(method)) {
|
167
|
-
|
178
|
+
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
168
179
|
} else if ("surface_form".equals(method)) {
|
169
|
-
|
180
|
+
word = token.getSurface();
|
170
181
|
}
|
182
|
+
outputs.add(ValueFactory.newString(word));
|
183
|
+
}
|
184
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
185
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
186
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
187
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
188
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
171
189
|
}
|
172
|
-
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
173
|
-
builder.setString(outputColumn, joiner.join(outputs));
|
174
190
|
}
|
175
191
|
}
|
176
192
|
}
|
177
193
|
|
178
194
|
private boolean isOkPartsOfSpeech(Token token) {
|
179
|
-
if (!task.getOkPartsOfSpeech().isPresent()) return true;
|
195
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
180
196
|
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
181
197
|
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
182
198
|
return true;
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.0'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
16
|
- - ~>
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '1.0'
|
19
|
+
name: bundler
|
25
20
|
prerelease: false
|
26
21
|
type: :development
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: rake
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- -
|
24
|
+
- - ~>
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
30
|
- - '>='
|
37
31
|
- !ruby/object:Gem::Version
|
38
32
|
version: '10.0'
|
33
|
+
name: rake
|
39
34
|
prerelease: false
|
40
35
|
type: :development
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
41
|
description: Kuromoji filter plugin for Embulk
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
@@ -56,8 +56,9 @@ files:
|
|
56
56
|
- lib/embulk/filter/kuromoji.rb
|
57
57
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
58
58
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
59
|
-
- classpath/embulk-filter-kuromoji-0.2.0.jar
|
60
|
-
- classpath/kuromoji-0.7.7.jar
|
59
|
+
- classpath/embulk-filter-kuromoji-0.3.0.jar
|
60
|
+
- classpath/kuromoji-core-0.9.0.jar
|
61
|
+
- classpath/kuromoji-ipadic-0.9.0.jar
|
61
62
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|
62
63
|
licenses:
|
63
64
|
- MIT
|
@@ -83,4 +84,3 @@ signing_key:
|
|
83
84
|
specification_version: 4
|
84
85
|
summary: Kuromoji filter plugin for Embulk
|
85
86
|
test_files: []
|
86
|
-
has_rdoc:
|
Binary file
|