embulk-filter-kuromoji 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3d706d852774bb54a883e1fad994b508213b0a12
4
- data.tar.gz: c68d3372c01a5b273391f9c14e3b19dce7977aaa
3
+ metadata.gz: 39f5fef3dee7c66100e57b525ffa310c693cbad6
4
+ data.tar.gz: 40bc8e9a948364242d091ceb84e9b10b8ec57392
5
5
  SHA512:
6
- metadata.gz: 163708524757836fb4a45b5f88b5fb030e7e39d91079d5f2aff2ab7dad31c79d52ba6a199979bea0b7cb73ef32cd6d1abd7e28f7a1c7b4f988a8e0aea6d97b8a
7
- data.tar.gz: 8e9a645c381a40a0fa9dfefe6103a4bbfb96347eccf110ba870a488f6dda136be2982bbf6004790d6790839648c15e5ca78b6d5960d084b0a637b4e54e9d7e05
6
+ metadata.gz: af95448ce60356db7f65ad3d27c5fbb6441a46d0dc18df64e47f527ccc2fb92052acbb21ce292e94ab26f6b060f9ac0e4c60976a9764d85681f86b6b70bbf0d6
7
+ data.tar.gz: 4d0f23b0bee7ea2aff3cdc2785a2bfb4102aa9b390976902f2d0405e6c17ae6dc9cccffbc9efb3043fd97e7f8410f787158f55b6dd887effb8e6f874df5c532b
data/README.md CHANGED
@@ -18,6 +18,7 @@ see. [Atilika - Applied Search Innovation](http://www.atilika.com/en/products/ku
18
18
  - **suffix**: suffix appended to the output column name. If null, the original column is overwritten. (string, default: null)
19
19
  - **method**: which token form to output (string, required; one of surface_form, base_form, or reading)
20
20
  - **delimiter**: delimiter (string, default: ",")
21
+ - **type**: output data type, either array or string. With array, the tokens are emitted as a JSON array. (string, default: "string")
21
22
 
22
23
  ## Example
23
24
 
@@ -34,6 +35,7 @@ filters:
34
35
  - { suffix: _surface_form_no_delim, method: 'surface_form', delimiter: '' }
35
36
  - { suffix: _base_form, method: 'base_form', delimiter: '###' }
36
37
  - { suffix: _surface_form, method: 'surface_form', delimiter: '###' }
38
+ - { suffix: _array, method: 'surface_form', type: 'array' }
37
39
  ```
38
40
 
39
41
  ### input
@@ -51,7 +53,8 @@ As below
51
53
  "catchcopy" : "アンゼン・アンシンヲツイキュウシタキョクメンボディニデザインヲイッシン。",
52
54
  "catchcopy_surface_form_no_delim" : "安全・安心を追及した曲面ボディにデザインを一新。",
53
55
  "catchcopy_base_form" : "安全###・###安心###を###追及###する###た###曲面###ボディ###に###デザイン###を###一新###。",
54
- "catchcopy_surface_form" : "安全###・###安心###を###追及###し###た###曲面###ボディ###に###デザイン###を###一新###。"
56
+ "catchcopy_surface_form" : "安全###・###安心###を###追及###し###た###曲面###ボディ###に###デザイン###を###一新###。",
57
+ "catchcopy_array" : ["安全","・","安心","を","追及","し","た","曲面","ボディ","に","デザイン","を","一新","。"]
55
58
  }
56
59
  ```
57
60
 
data/build.gradle CHANGED
@@ -14,16 +14,17 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.3.0"
17
+ version = "0.3.1"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
21
21
 
22
22
  dependencies {
23
- compile "org.embulk:embulk-core:0.8.1"
23
+ compile "org.embulk:embulk-core:0.8.9"
24
24
  compile 'com.atilika.kuromoji:kuromoji-ipadic:0.9.0'
25
- provided "org.embulk:embulk-core:0.8.1"
25
+ provided "org.embulk:embulk-core:0.8.9"
26
26
  testCompile "junit:junit:4.+"
27
+ testCompile "org.embulk:embulk-core:0.8.9"
27
28
  }
28
29
 
29
30
  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -0,0 +1,128 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE module PUBLIC
3
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
4
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
5
+ <module name="Checker">
6
+ <!-- https://github.com/facebook/presto/blob/master/src/checkstyle/checks.xml -->
7
+ <module name="FileTabCharacter"/>
8
+ <module name="NewlineAtEndOfFile">
9
+ <property name="lineSeparator" value="lf"/>
10
+ </module>
11
+ <module name="RegexpMultiline">
12
+ <property name="format" value="\r"/>
13
+ <property name="message" value="Line contains carriage return"/>
14
+ </module>
15
+ <module name="RegexpMultiline">
16
+ <property name="format" value=" \n"/>
17
+ <property name="message" value="Line has trailing whitespace"/>
18
+ </module>
19
+ <module name="RegexpMultiline">
20
+ <property name="format" value="\{\n\n"/>
21
+ <property name="message" value="Blank line after opening brace"/>
22
+ </module>
23
+ <module name="RegexpMultiline">
24
+ <property name="format" value="\n\n\s*\}"/>
25
+ <property name="message" value="Blank line before closing brace"/>
26
+ </module>
27
+ <module name="RegexpMultiline">
28
+ <property name="format" value="\n\n\n"/>
29
+ <property name="message" value="Multiple consecutive blank lines"/>
30
+ </module>
31
+ <module name="RegexpMultiline">
32
+ <property name="format" value="\n\n\Z"/>
33
+ <property name="message" value="Blank line before end of file"/>
34
+ </module>
35
+ <module name="RegexpMultiline">
36
+ <property name="format" value="Preconditions\.checkNotNull"/>
37
+ <property name="message" value="Use of checkNotNull"/>
38
+ </module>
39
+
40
+ <module name="TreeWalker">
41
+ <module name="EmptyBlock">
42
+ <property name="option" value="text"/>
43
+ <property name="tokens" value="
44
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
45
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
46
+ </module>
47
+ <module name="EmptyStatement"/>
48
+ <module name="EmptyForInitializerPad"/>
49
+ <module name="EmptyForIteratorPad">
50
+ <property name="option" value="space"/>
51
+ </module>
52
+ <module name="MethodParamPad">
53
+ <property name="allowLineBreaks" value="true"/>
54
+ <property name="option" value="nospace"/>
55
+ </module>
56
+ <module name="ParenPad"/>
57
+ <module name="TypecastParenPad"/>
58
+ <module name="NeedBraces"/>
59
+ <module name="LeftCurly">
60
+ <property name="option" value="nl"/>
61
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
62
+ </module>
63
+ <module name="LeftCurly">
64
+ <property name="option" value="eol"/>
65
+ <property name="tokens" value="
66
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
67
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
68
+ </module>
69
+ <module name="RightCurly">
70
+ <property name="option" value="alone"/>
71
+ </module>
72
+ <module name="GenericWhitespace"/>
73
+ <module name="WhitespaceAfter"/>
74
+ <module name="NoWhitespaceBefore"/>
75
+
76
+ <module name="UpperEll"/>
77
+ <module name="DefaultComesLast"/>
78
+ <module name="ArrayTypeStyle"/>
79
+ <module name="MultipleVariableDeclarations"/>
80
+ <module name="ModifierOrder"/>
81
+ <module name="OneStatementPerLine"/>
82
+ <module name="StringLiteralEquality"/>
83
+ <module name="MutableException"/>
84
+ <module name="EqualsHashCode"/>
85
+ <module name="InnerAssignment"/>
86
+ <module name="InterfaceIsType"/>
87
+ <module name="HideUtilityClassConstructor"/>
88
+
89
+ <module name="MemberName"/>
90
+ <module name="LocalVariableName"/>
91
+ <module name="LocalFinalVariableName"/>
92
+ <module name="TypeName"/>
93
+ <module name="PackageName"/>
94
+ <module name="ParameterName"/>
95
+ <module name="StaticVariableName"/>
96
+ <module name="ClassTypeParameterName">
97
+ <property name="format" value="^[A-Z][0-9]?$"/>
98
+ </module>
99
+ <module name="MethodTypeParameterName">
100
+ <property name="format" value="^[A-Z][0-9]?$"/>
101
+ </module>
102
+
103
+ <module name="AvoidStarImport"/>
104
+ <module name="RedundantImport"/>
105
+ <module name="UnusedImports"/>
106
+ <module name="ImportOrder">
107
+ <property name="groups" value="*,javax,java"/>
108
+ <property name="separated" value="true"/>
109
+ <property name="option" value="bottom"/>
110
+ <property name="sortStaticImportsAlphabetically" value="true"/>
111
+ </module>
112
+
113
+ <module name="WhitespaceAround">
114
+ <property name="allowEmptyConstructors" value="true"/>
115
+ <property name="allowEmptyMethods" value="true"/>
116
+ <property name="ignoreEnhancedForColon" value="false"/>
117
+ <property name="tokens" value="
118
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
119
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
120
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
121
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
122
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
123
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
124
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
125
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
126
+ </module>
127
+ </module>
128
+ </module>
@@ -0,0 +1,108 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE module PUBLIC
3
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
4
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
5
+ <!--
6
+ This is a subset of ./checkstyle.xml which allows some loose styles
7
+ -->
8
+ <module name="Checker">
9
+ <module name="FileTabCharacter"/>
10
+ <module name="NewlineAtEndOfFile">
11
+ <property name="lineSeparator" value="lf"/>
12
+ </module>
13
+ <module name="RegexpMultiline">
14
+ <property name="format" value="\r"/>
15
+ <property name="message" value="Line contains carriage return"/>
16
+ </module>
17
+ <module name="RegexpMultiline">
18
+ <property name="format" value=" \n"/>
19
+ <property name="message" value="Line has trailing whitespace"/>
20
+ </module>
21
+ <module name="RegexpMultiline">
22
+ <property name="format" value="\n\n\n"/>
23
+ <property name="message" value="Multiple consecutive blank lines"/>
24
+ </module>
25
+ <module name="RegexpMultiline">
26
+ <property name="format" value="\n\n\Z"/>
27
+ <property name="message" value="Blank line before end of file"/>
28
+ </module>
29
+
30
+ <module name="TreeWalker">
31
+ <module name="EmptyBlock">
32
+ <property name="option" value="text"/>
33
+ <property name="tokens" value="
34
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
35
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
36
+ </module>
37
+ <module name="EmptyStatement"/>
38
+ <module name="EmptyForInitializerPad"/>
39
+ <module name="EmptyForIteratorPad">
40
+ <property name="option" value="space"/>
41
+ </module>
42
+ <module name="MethodParamPad">
43
+ <property name="allowLineBreaks" value="true"/>
44
+ <property name="option" value="nospace"/>
45
+ </module>
46
+ <module name="ParenPad"/>
47
+ <module name="TypecastParenPad"/>
48
+ <module name="NeedBraces"/>
49
+ <module name="LeftCurly">
50
+ <property name="option" value="nl"/>
51
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
52
+ </module>
53
+ <module name="LeftCurly">
54
+ <property name="option" value="eol"/>
55
+ <property name="tokens" value="
56
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
57
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
58
+ </module>
59
+ <module name="RightCurly">
60
+ <property name="option" value="alone"/>
61
+ </module>
62
+ <module name="GenericWhitespace"/>
63
+ <module name="WhitespaceAfter"/>
64
+ <module name="NoWhitespaceBefore"/>
65
+
66
+ <module name="UpperEll"/>
67
+ <module name="DefaultComesLast"/>
68
+ <module name="ArrayTypeStyle"/>
69
+ <module name="MultipleVariableDeclarations"/>
70
+ <module name="ModifierOrder"/>
71
+ <module name="OneStatementPerLine"/>
72
+ <module name="StringLiteralEquality"/>
73
+ <module name="MutableException"/>
74
+ <module name="EqualsHashCode"/>
75
+ <module name="InnerAssignment"/>
76
+ <module name="InterfaceIsType"/>
77
+ <module name="HideUtilityClassConstructor"/>
78
+
79
+ <module name="MemberName"/>
80
+ <module name="LocalVariableName"/>
81
+ <module name="LocalFinalVariableName"/>
82
+ <module name="TypeName"/>
83
+ <module name="PackageName"/>
84
+ <module name="ParameterName"/>
85
+ <module name="StaticVariableName"/>
86
+ <module name="ClassTypeParameterName">
87
+ <property name="format" value="^[A-Z][0-9]?$"/>
88
+ </module>
89
+ <module name="MethodTypeParameterName">
90
+ <property name="format" value="^[A-Z][0-9]?$"/>
91
+ </module>
92
+
93
+ <module name="WhitespaceAround">
94
+ <property name="allowEmptyConstructors" value="true"/>
95
+ <property name="allowEmptyMethods" value="true"/>
96
+ <property name="ignoreEnhancedForColon" value="false"/>
97
+ <property name="tokens" value="
98
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
99
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
100
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
101
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
102
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
103
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
104
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
105
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
106
+ </module>
107
+ </module>
108
+ </module>
@@ -31,6 +31,7 @@ import com.google.common.base.MoreObjects;
31
31
  import com.google.common.base.Optional;
32
32
  import com.google.common.collect.ImmutableList;
33
33
  import com.google.common.collect.Lists;
34
+ import com.google.common.collect.Maps;
34
35
 
35
36
  public class KuromojiFilterPlugin implements FilterPlugin
36
37
  {
@@ -62,11 +63,12 @@ public class KuromojiFilterPlugin implements FilterPlugin
62
63
  PluginTask task = config.loadConfig(PluginTask.class);
63
64
 
64
65
  ImmutableList.Builder<Column> builder = ImmutableList.builder();
66
+ Map<String, Column> map = Maps.newHashMap();
65
67
  int i = 0;
66
68
  if (task.getKeepInput()) {
67
69
  for (Column inputColumn : inputSchema.getColumns()) {
68
70
  Column outputColumn = new Column(i++, inputColumn.getName(), inputColumn.getType());
69
- builder.add(outputColumn);
71
+ map.put(inputColumn.getName(), outputColumn);
70
72
  }
71
73
  }
72
74
 
@@ -74,16 +76,16 @@ public class KuromojiFilterPlugin implements FilterPlugin
74
76
  for (Map<String, String> setting : task.getSettings()) {
75
77
  String keyName = key + MoreObjects.firstNonNull(setting.get("suffix"), "");
76
78
  Type type = "array".equals(setting.get("type")) ? Types.JSON : Types.STRING;
77
- if (task.getKeepInput()) {
78
- if (setting.get("suffix") != null) {
79
- builder.add(new Column(i++, keyName, type));
80
- }
81
- } else {
82
- builder.add(new Column(i++, keyName, type));
83
- }
79
+ map.put(keyName, new Column(i++, keyName, type));
84
80
  }
85
81
  }
86
82
 
83
+ i = 0;
84
+ for(Map.Entry<String, Column> e : map.entrySet()) {
85
+ final Column column = e.getValue();
86
+ builder.add(new Column(i++, column.getName(), column.getType()));
87
+ }
88
+
87
89
  Schema outputSchema = new Schema(builder.build());
88
90
  control.run(task.dump(), outputSchema);
89
91
  }
@@ -153,6 +155,8 @@ public class KuromojiFilterPlugin implements FilterPlugin
153
155
  builder.setLong(inputColumn, reader.getLong(inputColumn));
154
156
  } else if (Types.TIMESTAMP.equals(inputColumn.getType())) {
155
157
  builder.setTimestamp(inputColumn, reader.getTimestamp(inputColumn));
158
+ } else if (Types.JSON.equals(inputColumn.getType())) {
159
+ builder.setJson(inputColumn, reader.getJson(inputColumn));
156
160
  }
157
161
  }
158
162
  }
@@ -165,11 +169,6 @@ public class KuromojiFilterPlugin implements FilterPlugin
165
169
  Column outputColumn = outputSchema.lookupColumn(column.getName() + MoreObjects.firstNonNull(suffix, ""));
166
170
  List<Value> outputs = Lists.newArrayList();
167
171
  for (Token token : tokens) {
168
- System.err.println(token.getAllFeaturesArray().toString());
169
- System.err.println(token.getPartOfSpeechLevel1());
170
- System.err.println(token.getPartOfSpeechLevel2());
171
- System.err.println(token.getPartOfSpeechLevel3());
172
- System.err.println(token.getPartOfSpeechLevel4());
173
172
  if (!isOkPartsOfSpeech(token)) { continue; }
174
173
  String word = null;
175
174
  if ("base_form".equals(method)) {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-kuromoji
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-10 00:00:00.000000000 Z
11
+ date: 2016-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -49,6 +49,8 @@ files:
49
49
  - LICENSE.txt
50
50
  - README.md
51
51
  - build.gradle
52
+ - config/checkstyle/checkstyle.xml
53
+ - config/checkstyle/default.xml
52
54
  - gradle/wrapper/gradle-wrapper.jar
53
55
  - gradle/wrapper/gradle-wrapper.properties
54
56
  - gradlew
@@ -56,7 +58,7 @@ files:
56
58
  - lib/embulk/filter/kuromoji.rb
57
59
  - src/main/java/org/embulk/filter/kuromoji/KuromojiFilterPlugin.java
58
60
  - src/test/java/org/embulk/filter/kuromoji/TestKuromojiFilterPlugin.java
59
- - classpath/embulk-filter-kuromoji-0.3.0.jar
61
+ - classpath/embulk-filter-kuromoji-0.3.1.jar
60
62
  - classpath/kuromoji-core-0.9.0.jar
61
63
  - classpath/kuromoji-ipadic-0.9.0.jar
62
64
  homepage: https://github.com/toyama0919/embulk-filter-kuromoji
@@ -84,3 +86,4 @@ signing_key:
84
86
  specification_version: 4
85
87
  summary: Kuromoji filter plugin for Embulk
86
88
  test_files: []
89
+ has_rdoc: