embulk-filter-kuromoji 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +22 -5
- data/classpath/embulk-filter-kuromoji-0.3.0.jar +0 -0
- data/classpath/kuromoji-core-0.9.0.jar +0 -0
- data/classpath/{kuromoji-0.7.7.jar → kuromoji-ipadic-0.9.0.jar} +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +2 -2
- data/src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java +36 -20
- metadata +15 -15
- data/classpath/embulk-filter-kuromoji-0.2.0.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d706d852774bb54a883e1fad994b508213b0a12
|
4
|
+
data.tar.gz: c68d3372c01a5b273391f9c14e3b19dce7977aaa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 163708524757836fb4a45b5f88b5fb030e7e39d91079d5f2aff2ab7dad31c79d52ba6a199979bea0b7cb73ef32cd6d1abd7e28f7a1c7b4f988a8e0aea6d97b8a
|
7
|
+
data.tar.gz: 8e9a645c381a40a0fa9dfefe6103a4bbfb96347eccf110ba870a488f6dda136be2982bbf6004790d6790839648c15e5ca78b6d5960d084b0a637b4e54e9d7e05
|
data/build.gradle
CHANGED
@@ -2,27 +2,27 @@ plugins {
|
|
2
2
|
id "com.jfrog.bintray" version "1.1"
|
3
3
|
id "com.github.jruby-gradle.base" version "0.1.5"
|
4
4
|
id "java"
|
5
|
+
id "checkstyle"
|
5
6
|
id "eclipse"
|
6
7
|
}
|
7
8
|
import com.github.jrubygradle.JRubyExec
|
8
9
|
repositories {
|
9
10
|
mavenCentral()
|
10
11
|
jcenter()
|
11
|
-
maven { url "http://www.atilika.org/nexus/content/repositories/atilika" }
|
12
12
|
}
|
13
13
|
configurations {
|
14
14
|
provided
|
15
15
|
}
|
16
16
|
|
17
|
-
version = "0.2.0"
|
17
|
+
version = "0.3.0"
|
18
18
|
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
21
21
|
|
22
22
|
dependencies {
|
23
|
-
compile "org.embulk:embulk-core:0.
|
24
|
-
compile 'org.atilika.kuromoji:kuromoji:0.7.7'
|
25
|
-
provided "org.embulk:embulk-core:0.
|
23
|
+
compile "org.embulk:embulk-core:0.8.1"
|
24
|
+
compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
|
25
|
+
provided "org.embulk:embulk-core:0.8.1"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
27
|
}
|
28
28
|
|
@@ -33,6 +33,23 @@ task classpath(type: Copy, dependsOn: ["jar"]) {
|
|
33
33
|
}
|
34
34
|
clean { delete "classpath" }
|
35
35
|
|
36
|
+
checkstyle {
|
37
|
+
configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
|
38
|
+
toolVersion = '6.14.1'
|
39
|
+
}
|
40
|
+
checkstyleMain {
|
41
|
+
configFile = file("${project.rootDir}/config/checkstyle/default.xml")
|
42
|
+
ignoreFailures = true
|
43
|
+
}
|
44
|
+
checkstyleTest {
|
45
|
+
configFile = file("${project.rootDir}/config/checkstyle/default.xml")
|
46
|
+
ignoreFailures = true
|
47
|
+
}
|
48
|
+
task checkstyle(type: Checkstyle) {
|
49
|
+
classpath = sourceSets.main.output + sourceSets.test.output
|
50
|
+
source = sourceSets.main.allJava + sourceSets.test.allJava
|
51
|
+
}
|
52
|
+
|
36
53
|
task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
|
37
54
|
jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
|
38
55
|
script "${project.name}.gemspec"
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,6 +1,6 @@
|
|
1
|
-
#
|
1
|
+
#Wed Jan 13 12:41:02 JST 2016
|
2
2
|
distributionBase=GRADLE_USER_HOME
|
3
3
|
distributionPath=wrapper/dists
|
4
4
|
zipStoreBase=GRADLE_USER_HOME
|
5
5
|
zipStorePath=wrapper/dists
|
6
|
-
distributionUrl=https\://services.gradle.org/distributions/gradle-2.
|
6
|
+
distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
|
@@ -5,9 +5,6 @@ import java.io.IOException;
|
|
5
5
|
import java.util.List;
|
6
6
|
import java.util.Map;
|
7
7
|
|
8
|
-
import org.atilika.kuromoji.Token;
|
9
|
-
import org.atilika.kuromoji.Tokenizer;
|
10
|
-
import org.atilika.kuromoji.Tokenizer.Builder;
|
11
8
|
import org.embulk.config.Config;
|
12
9
|
import org.embulk.config.ConfigDefault;
|
13
10
|
import org.embulk.config.ConfigSource;
|
@@ -21,8 +18,14 @@ import org.embulk.spi.PageBuilder;
|
|
21
18
|
import org.embulk.spi.PageOutput;
|
22
19
|
import org.embulk.spi.PageReader;
|
23
20
|
import org.embulk.spi.Schema;
|
21
|
+
import org.embulk.spi.type.Type;
|
24
22
|
import org.embulk.spi.type.Types;
|
23
|
+
import org.msgpack.value.Value;
|
24
|
+
import org.msgpack.value.ValueFactory;
|
25
25
|
|
26
|
+
import com.atilika.kuromoji.ipadic.Token;
|
27
|
+
import com.atilika.kuromoji.ipadic.Tokenizer;
|
28
|
+
import com.atilika.kuromoji.ipadic.Tokenizer.Builder;
|
26
29
|
import com.google.common.base.Joiner;
|
27
30
|
import com.google.common.base.MoreObjects;
|
28
31
|
import com.google.common.base.Optional;
|
@@ -43,7 +46,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
43
46
|
@Config("ok_parts_of_speech")
|
44
47
|
@ConfigDefault("null")
|
45
48
|
public Optional<List<String>> getOkPartsOfSpeech();
|
46
|
-
|
49
|
+
|
47
50
|
@Config("keep_input")
|
48
51
|
@ConfigDefault("true")
|
49
52
|
public boolean getKeepInput();
|
@@ -61,21 +64,22 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
61
64
|
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
62
65
|
int i = 0;
|
63
66
|
if (task.getKeepInput()) {
|
64
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
67
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
65
68
|
Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
|
66
69
|
builder.add(outputColumn);
|
67
70
|
}
|
68
71
|
}
|
69
72
|
|
70
|
-
for (String key: task.getKeyNames()) {
|
73
|
+
for (String key : task.getKeyNames()) {
|
71
74
|
for (Map<String, String> setting : task.getSettings()) {
|
72
75
|
String keyName = key + MoreObjects.firstNonNull(setting.get("suffix"), "");
|
76
|
+
Type type = "array".equals(setting.get("type")) ? Types.JSON : Types.STRING;
|
73
77
|
if (task.getKeepInput()) {
|
74
78
|
if (setting.get("suffix") != null) {
|
75
|
-
builder.add(new Column(i++, keyName,
|
79
|
+
builder.add(new Column(i++, keyName, type));
|
76
80
|
}
|
77
81
|
} else {
|
78
|
-
builder.add(new Column(i++, keyName,
|
82
|
+
builder.add(new Column(i++, keyName, type));
|
79
83
|
}
|
80
84
|
}
|
81
85
|
}
|
@@ -88,7 +92,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
88
92
|
public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
|
89
93
|
{
|
90
94
|
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
91
|
-
|
95
|
+
|
96
|
+
Builder builder = new Tokenizer.Builder();
|
92
97
|
if (task.getDictionaryPath().isPresent()) {
|
93
98
|
try {
|
94
99
|
builder.userDictionary(task.getDictionaryPath().get());
|
@@ -102,7 +107,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
102
107
|
final List<Column> keyNameColumns = Lists.newArrayList();
|
103
108
|
|
104
109
|
for (String keyName : task.getKeyNames()) {
|
105
|
-
keyNameColumns.add(
|
110
|
+
keyNameColumns.add(outputSchema.lookupColumn(keyName));
|
106
111
|
}
|
107
112
|
|
108
113
|
return new PageOutput() {
|
@@ -133,7 +138,7 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
133
138
|
*/
|
134
139
|
private void setValue(PageBuilder builder) {
|
135
140
|
if (task.getKeepInput()) {
|
136
|
-
for (Column inputColumn: inputSchema.getColumns()) {
|
141
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
137
142
|
if (reader.isNull(inputColumn)) {
|
138
143
|
builder.setNull(inputColumn);
|
139
144
|
continue;
|
@@ -154,29 +159,40 @@ public class KuromojiFilterPlugin implements FilterPlugin
|
|
154
159
|
|
155
160
|
for (Column column : keyNameColumns) {
|
156
161
|
List<Token> tokens = tokenizer.tokenize(reader.getString(column));
|
157
|
-
for (Map<String, String> setting: task.getSettings()) {
|
162
|
+
for (Map<String, String> setting : task.getSettings()) {
|
158
163
|
String suffix = setting.get("suffix");
|
159
164
|
String method = setting.get("method");
|
160
165
|
Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
|
161
|
-
List<
|
166
|
+
List<Value> outputs = Lists.newArrayList();
|
162
167
|
for (Token token : tokens) {
|
163
|
-
|
168
|
+
System.err.println(token.getAllFeaturesArray().toString());
|
169
|
+
System.err.println(token.getPartOfSpeechLevel1());
|
170
|
+
System.err.println(token.getPartOfSpeechLevel2());
|
171
|
+
System.err.println(token.getPartOfSpeechLevel3());
|
172
|
+
System.err.println(token.getPartOfSpeechLevel4());
|
173
|
+
if (!isOkPartsOfSpeech(token)) { continue; }
|
174
|
+
String word = null;
|
164
175
|
if ("base_form".equals(method)) {
|
165
|
-
|
176
|
+
word = MoreObjects.firstNonNull(token.getBaseForm(), token.getSurface());
|
166
177
|
} else if ("reading".equals(method)) {
|
167
|
-
|
178
|
+
word = MoreObjects.firstNonNull(token.getReading(), token.getSurface());
|
168
179
|
} else if ("surface_form".equals(method)) {
|
169
|
-
|
180
|
+
word = token.getSurface();
|
170
181
|
}
|
182
|
+
outputs.add(ValueFactory.newString(word));
|
183
|
+
}
|
184
|
+
if (outputColumn.getType().equals(Types.STRING)) {
|
185
|
+
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
186
|
+
builder.setString(outputColumn, joiner.join(outputs));
|
187
|
+
} else if (outputColumn.getType().equals(Types.JSON)) {
|
188
|
+
builder.setJson(outputColumn, ValueFactory.newArray(outputs));
|
171
189
|
}
|
172
|
-
Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
|
173
|
-
builder.setString(outputColumn, joiner.join(outputs));
|
174
190
|
}
|
175
191
|
}
|
176
192
|
}
|
177
193
|
|
178
194
|
private boolean isOkPartsOfSpeech(Token token) {
|
179
|
-
if (!task.getOkPartsOfSpeech().isPresent()) return true;
|
195
|
+
if (!task.getOkPartsOfSpeech().isPresent()) { return true; };
|
180
196
|
for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
|
181
197
|
if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
|
182
198
|
return true;
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-kuromoji
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.0'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
16
|
- - ~>
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '1.0'
|
19
|
+
name: bundler
|
25
20
|
prerelease: false
|
26
21
|
type: :development
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: rake
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- - '>='
|
24
|
+
- - ~>
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '10.0'
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
30
|
- - '>='
|
37
31
|
- !ruby/object:Gem::Version
|
38
32
|
version: '10.0'
|
33
|
+
name: rake
|
39
34
|
prerelease: false
|
40
35
|
type: :development
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
41
|
description: Kuromoji filter plugin for Embulk
|
42
42
|
email:
|
43
43
|
- toyama0919@gmail.com
|
@@ -56,8 +56,9 @@ files:
|
|
56
56
|
- lib/embulk/filter/kuromoji.rb
|
57
57
|
- src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
|
58
58
|
- src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
|
59
|
-
- classpath/embulk-filter-kuromoji-0.2.0.jar
|
60
|
-
- classpath/kuromoji-0.7.7.jar
|
59
|
+
- classpath/embulk-filter-kuromoji-0.3.0.jar
|
60
|
+
- classpath/kuromoji-core-0.9.0.jar
|
61
|
+
- classpath/kuromoji-ipadic-0.9.0.jar
|
61
62
|
homepage: https://github.com/toyama0919/embulk-filter-kuromoji
|
62
63
|
licenses:
|
63
64
|
- MIT
|
@@ -83,4 +84,3 @@ signing_key:
|
|
83
84
|
specification_version: 4
|
84
85
|
summary: Kuromoji filter plugin for Embulk
|
85
86
|
test_files: []
|
86
|
-
has_rdoc:
|
Binary file
|