embulk-filter-kuromoji 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b7f610ccf613acc14f51a51a498b8f6386088250
4
+ data.tar.gz: d1c85dacfa0ab721434f308f053257d48577928f
5
+ SHA512:
6
+ metadata.gz: 82d8485d8cff97a82edb0bc307ab4723f84b53be7027f6c27597d635e88fe2fd7a22184dbd38b5f5b77e9c59288c9720ba9442df6b2da357b4c17d72daba2b8e
7
+ data.tar.gz: a4712b4ef64eedb63577d74dcc6dbd963d17f7ad3a79fd7e0eedfbb60f64f01d1068cb8dd70dc464c84613329c601e5548000edc3c23962cb5b9b7698a878f82
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
9
+ /bin/
10
+ /.settings/
11
+ /.metadata/
12
+ .classpath
13
+ .project
14
+
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,79 @@
1
+ # Kuromoji filter plugin for Embulk
2
+
3
+ Kuromoji filter plugin for Embulk.
4
+
5
+ See [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/kuromoji.html) for details about Kuromoji.
6
+
7
+ ## Overview
8
+
9
+ * **Plugin type**: filter
10
+
11
+ ## Configuration
12
+
13
+ - **key_names**: names of the input string columns to tokenize. (list, required)
14
+ - **keep_input**: keep input columns. (bool, default: `true`)
15
+ - **ok_parts_of_speech**: parts of speech to keep; tokens whose part of speech is not listed are dropped. If null, all tokens are kept. (list, default: null)
16
+ - **dictionary_path**: user dictionary file path. (string, default: null)
17
+ - **settings**: output settings; each entry produces one output column per key name. (list, required)
18
+ - **suffix**: suffix appended to the key name to form the output column name. If null, the output column has the same name as the key column (overwrite). (string, default: null)
19
+ - **method**: token attribute to output: `surface_form`, `base_form` or `reading`. (string, required)
20
+ - **delimiter**: delimiter (string, default: ",")
21
+
22
+ ## Example
23
+
24
+ ```yaml
25
+ filters:
26
+ - type: kuromoji
27
+ keep_input: false
28
+ ok_parts_of_speech:
29
+ - 名詞
30
+ key_names:
31
+ - catchcopy
32
+ settings:
33
+ - { method: 'reading', delimiter: '' }
34
+ - { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
35
+ - { suffix: _base_form, method: 'base_form', delimiter: '###' }
36
+ - { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
37
+ ```
38
+
39
+ ### input
40
+
41
+ ```json
42
+ {
43
+ "catchcopy" : "安全・安心を追及した曲面ボディにデザインを一新しました。"
44
+ }
45
+ ```
46
+
47
+ The output is as follows:
48
+
49
+ ```json
50
+ {
51
+ "catchcopy" : "アンゼン・アンシンヲツイキュウシタキョクメンボディニデザインヲイッシン。",
52
+ "catchcopy_surface_form_no_delim" : "安全・安心を追及した曲面ボディにデザインを一新。",
53
+ "catchcopy_base_form" : "安全###・###安心###を###追及###する###た###曲面###ボディ###に###デザイン###を###一新###。",
54
+ "catchcopy_surface_form" : "安全###・###安心###を###追及###し###た###曲面###ボディ###に###デザイン###を###一新###。"
55
+ }
56
+ ```
57
+
58
+ ## Example2(use user dictionary)
59
+
60
+ ```yaml
61
+ - type: kuromoji
62
+ keep_input: false
63
+ dictionary_path: /tmp/kuromoji.txt
64
+ ok_parts_of_speech:
65
+ - 名詞
66
+ key_names:
67
+ - catchcopy
68
+ settings:
69
+ - { method: 'reading', delimiter: '#' }
70
+ - { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
71
+ - { suffix: _base_form, method: 'base_form', delimiter: '###' }
72
+ - { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
73
+ ```
74
+
75
+ ## Build
76
+
77
+ ```
78
+ $ ./gradlew gem # -t to watch change of files and rebuild continuously
79
+ ```
data/build.gradle ADDED
@@ -0,0 +1,78 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ id "eclipse"
6
+ }
7
+ import com.github.jrubygradle.JRubyExec
8
+ repositories {
9
+ mavenCentral()
10
+ jcenter()
11
+ maven { url "http://www.atilika.org/nexus/content/repositories/atilika" }
12
+ }
13
+ configurations {
14
+ provided
15
+ }
16
+
17
+ version = "0.1.0"
18
+
19
+ sourceCompatibility = 1.7
20
+ targetCompatibility = 1.7
21
+
22
+ dependencies {
23
+ compile "org.embulk:embulk-core:0.7.4"
24
+ compile 'org.atilika.kuromoji:kuromoji:0.7.7'
25
+ provided "org.embulk:embulk-core:0.7.4"
26
+ testCompile "junit:junit:4.+"
27
+ }
28
+
29
+ task classpath(type: Copy, dependsOn: ["jar"]) {
30
+ doFirst { file("classpath").deleteDir() }
31
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
32
+ into "classpath"
33
+ }
34
+ clean { delete "classpath" }
35
+
36
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
37
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
38
+ script "${project.name}.gemspec"
39
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
40
+ }
41
+
42
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
43
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
44
+ script "pkg/${project.name}-${project.version}.gem"
45
+ }
46
+
47
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
48
+ println "> Build succeeded."
49
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
50
+ }
51
+
52
+ task gemspec {
53
+ ext.gemspecFile = file("${project.name}.gemspec")
54
+ inputs.file "build.gradle"
55
+ outputs.file gemspecFile
56
+ doLast { gemspecFile.write($/
57
+ Gem::Specification.new do |spec|
58
+ spec.name = "${project.name}"
59
+ spec.version = "${project.version}"
60
+ spec.authors = ["toyama0919"]
61
+ spec.summary = %[Kuromoji filter plugin for Embulk]
62
+ spec.description = %[Kuromoji]
63
+ spec.email = ["toyama0919@gmail.com"]
64
+ spec.licenses = ["MIT"]
65
+ # TODO set this: spec.homepage = "https://github.com/toyama0919/embulk-filter-kuromoji"
66
+
67
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
68
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
69
+ spec.require_paths = ["lib"]
70
+
71
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
72
+ spec.add_development_dependency 'bundler', ['~> 1.0']
73
+ spec.add_development_dependency 'rake', ['>= 10.0']
74
+ end
75
+ /$)
76
+ }
77
+ }
78
+ clean { delete "${project.name}.gemspec" }
Binary file
Binary file
@@ -0,0 +1,6 @@
1
+ #Tue Aug 11 00:26:20 PDT 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_filter(
2
+ "kuromoji", "org.embulk.filter.kuromoji.KuromojiFilterPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,185 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ import java.io.FileNotFoundException;
4
+ import java.io.IOException;
5
+ import java.util.List;
6
+ import java.util.Map;
7
+
8
+ import org.atilika.kuromoji.Token;
9
+ import org.atilika.kuromoji.Tokenizer;
10
+ import org.atilika.kuromoji.Tokenizer.Builder;
11
+ import org.embulk.config.Config;
12
+ import org.embulk.config.ConfigDefault;
13
+ import org.embulk.config.ConfigSource;
14
+ import org.embulk.config.Task;
15
+ import org.embulk.config.TaskSource;
16
+ import org.embulk.spi.Column;
17
+ import org.embulk.spi.Exec;
18
+ import org.embulk.spi.FilterPlugin;
19
+ import org.embulk.spi.Page;
20
+ import org.embulk.spi.PageBuilder;
21
+ import org.embulk.spi.PageOutput;
22
+ import org.embulk.spi.PageReader;
23
+ import org.embulk.spi.Schema;
24
+ import org.embulk.spi.type.Types;
25
+
26
+ import com.google.common.base.Joiner;
27
+ import com.google.common.base.MoreObjects;
28
+ import com.google.common.base.Optional;
29
+ import com.google.common.collect.ImmutableList;
30
+ import com.google.common.collect.Lists;
31
+
32
+ public class KuromojiFilterPlugin implements FilterPlugin
33
+ {
34
+ public interface PluginTask extends Task
35
+ {
36
+ @Config("key_names")
37
+ public List<String> getKeyNames();
38
+
39
+ @Config("dictionary_path")
40
+ @ConfigDefault("null")
41
+ public Optional<String> getDictionaryPath();
42
+
43
+ @Config("ok_parts_of_speech")
44
+ @ConfigDefault("null")
45
+ public Optional<List<String>> getOkPartsOfSpeech();
46
+
47
+ @Config("keep_input")
48
+ @ConfigDefault("true")
49
+ public boolean getKeepInput();
50
+
51
+ @Config("settings")
52
+ public List<Map<String, String>> getSettings();
53
+ }
54
+
55
+ @Override
56
+ public void transaction(ConfigSource config, Schema inputSchema,
57
+ FilterPlugin.Control control)
58
+ {
59
+ PluginTask task = config.loadConfig(PluginTask.class);
60
+
61
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
62
+ int i = 0;
63
+ if (task.getKeepInput()) {
64
+ for (Column inputColumn: inputSchema.getColumns()) {
65
+ Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
66
+ builder.add(outputColumn);
67
+ }
68
+ }
69
+
70
+ for (String key: task.getKeyNames()) {
71
+ for (Map<String, String> setting : task.getSettings()) {
72
+ Column outputColumn = new Column(i++, key + MoreObjects.firstNonNull(setting.get("suffix"), ""), Types.STRING);
73
+ builder.add(outputColumn);
74
+ }
75
+ }
76
+
77
+ Schema outputSchema = new Schema(builder.build());
78
+ control.run(task.dump(), outputSchema);
79
+ }
80
+
81
+ @Override
82
+ public PageOutput open(TaskSource taskSource, final Schema inputSchema, final Schema outputSchema, final PageOutput output)
83
+ {
84
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
85
+ Builder builder = Tokenizer.builder();
86
+ if (task.getDictionaryPath().isPresent()) {
87
+ try {
88
+ builder.userDictionary(task.getDictionaryPath().get());
89
+ } catch (FileNotFoundException e) {
90
+ e.printStackTrace();
91
+ } catch (IOException e) {
92
+ e.printStackTrace();
93
+ }
94
+ }
95
+ final Tokenizer tokenizer = builder.build();
96
+ final List<Column> keyNameColumns = Lists.newArrayList();
97
+
98
+ for (String keyName : task.getKeyNames()) {
99
+ keyNameColumns.add(inputSchema.lookupColumn(keyName));
100
+ }
101
+
102
+ return new PageOutput() {
103
+ private PageReader reader = new PageReader(inputSchema);
104
+
105
+ @Override
106
+ public void finish() {
107
+ output.finish();
108
+ }
109
+
110
+ @Override
111
+ public void close() {
112
+ output.close();
113
+ }
114
+
115
+ @Override
116
+ public void add(Page page) {
117
+ reader.setPage(page);
118
+ try (final PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output)) {
119
+ while (reader.nextRecord()) {
120
+ setValue(builder);
121
+ builder.addRecord();
122
+ }
123
+ builder.finish();
124
+ }
125
+ }
126
+
127
+ /**
128
+ * @param builder
129
+ */
130
+ private void setValue(PageBuilder builder) {
131
+ if (task.getKeepInput()) {
132
+ for (Column inputColumn: inputSchema.getColumns()) {
133
+ if (reader.isNull(inputColumn)) {
134
+ builder.setNull(inputColumn);
135
+ continue;
136
+ }
137
+ if (Types.STRING.equals(inputColumn.getType())) {
138
+ builder.setString(inputColumn, reader.getString(inputColumn));
139
+ } else if (Types.BOOLEAN.equals(inputColumn.getType())) {
140
+ builder.setBoolean(inputColumn, reader.getBoolean(inputColumn));
141
+ } else if (Types.DOUBLE.equals(inputColumn.getType())) {
142
+ builder.setDouble(inputColumn, reader.getDouble(inputColumn));
143
+ } else if (Types.LONG.equals(inputColumn.getType())) {
144
+ builder.setLong(inputColumn, reader.getLong(inputColumn));
145
+ } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
146
+ builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
147
+ }
148
+ }
149
+ }
150
+
151
+ for (Column column : keyNameColumns) {
152
+ List<Token> tokens = tokenizer.tokenize(reader.getString(column));
153
+ for (Map<String, String> setting: task.getSettings()) {
154
+ String suffix = setting.get("suffix");
155
+ String method = setting.get("method");
156
+ Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
157
+ List<String> outputs = Lists.newArrayList();
158
+ for (Token token : tokens) {
159
+ if (!isOkPartsOfSpeech(token)) continue;
160
+ if ("base_form".equals(method)) {
161
+ outputs.add(MoreObjects.firstNonNull(token.getBaseForm(), token.getSurfaceForm()));
162
+ } else if ("reading".equals(method)) {
163
+ outputs.add(MoreObjects.firstNonNull(token.getReading(), token.getSurfaceForm()));
164
+ } else if ("surface_form".equals(method)) {
165
+ outputs.add(token.getSurfaceForm());
166
+ }
167
+ }
168
+ Joiner joiner = Joiner.on(MoreObjects.firstNonNull(setting.get("delimiter"), ",")).skipNulls();
169
+ builder.setString(outputColumn, joiner.join(outputs));
170
+ }
171
+ }
172
+ }
173
+
174
+ private boolean isOkPartsOfSpeech(Token token) {
175
+ if (!task.getOkPartsOfSpeech().isPresent()) return true;
176
+ for (String okPartsOfSpeech : task.getOkPartsOfSpeech().get()) {
177
+ if (token.getAllFeaturesArray()[0].equals(okPartsOfSpeech)) {
178
+ return true;
179
+ }
180
+ }
181
+ return false;
182
+ }
183
+ };
184
+ }
185
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.filter.kuromoji;
2
+
3
+ public class TestKuromojiFilterPlugin
4
+ {
5
+ }
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-kuromoji
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: '1.0'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '10.0'
39
+ prerelease: false
40
+ type: :development
41
+ description: Kuromoji
42
+ email:
43
+ - toyama0919@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - LICENSE.txt
50
+ - README.md
51
+ - build.gradle
52
+ - gradle/wrapper/gradle-wrapper.jar
53
+ - gradle/wrapper/gradle-wrapper.properties
54
+ - gradlew
55
+ - gradlew.bat
56
+ - lib/embulk/filter/kuromoji.rb
57
+ - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
58
+ - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
59
+ - classpath/embulk-filter-kuromoji-0.1.0.jar
60
+ - classpath/kuromoji-0.7.7.jar
61
+ homepage:
62
+ licenses:
63
+ - MIT
64
+ metadata: {}
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - '>='
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ requirements: []
80
+ rubyforge_project:
81
+ rubygems_version: 2.1.9
82
+ signing_key:
83
+ specification_version: 4
84
+ summary: Kuromoji filter plugin for Embulk
85
+ test_files: []
86
+ has_rdoc: