ve 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +2 -0
- data/.travis.yml +3 -6
- data/Gemfile +8 -6
- data/Gemfile.lock +29 -19
- data/LICENSE.txt +21 -0
- data/Readme.md +42 -5
- data/java/.gitignore +4 -0
- data/java/build.gradle +38 -0
- data/java/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/java/gradle/wrapper/gradle-wrapper.properties +5 -0
- data/java/gradlew +185 -0
- data/java/gradlew.bat +104 -0
- data/java/pom.xml +56 -0
- data/java/readme.md +103 -0
- data/java/settings.gradle +1 -0
- data/java/src/main/java/ve/Grammar.java +10 -0
- data/java/src/main/java/ve/Parse.java +336 -0
- data/java/src/main/java/ve/Pos.java +27 -0
- data/java/src/main/java/ve/Word.java +104 -0
- data/java/src/test/java/ve/VeTest.java +41 -0
- data/lib/part_of_speech.rb +1 -1
- data/lib/providers/freeling_en.rb +29 -28
- data/lib/providers/japanese_transliterators.rb +14 -14
- data/lib/providers/mecab_ipadic.rb +10 -10
- data/lib/ve.rb +21 -15
- data/lib/word.rb +19 -12
- data/sinatra/server.rb +35 -2
- data/tests/japanese_transliterators_test.rb +8 -5
- data/tests/mecab_ipadic_parse_test.rb +12 -0
- data/tests/test_helper.rb +0 -1
- data/tests/ve_test.rb +0 -1
- data/ve.gemspec +9 -7
- metadata +24 -9
data/java/gradlew.bat
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
@rem
|
2
|
+
@rem Copyright 2015 the original author or authors.
|
3
|
+
@rem
|
4
|
+
@rem Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
@rem you may not use this file except in compliance with the License.
|
6
|
+
@rem You may obtain a copy of the License at
|
7
|
+
@rem
|
8
|
+
@rem https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
@rem
|
10
|
+
@rem Unless required by applicable law or agreed to in writing, software
|
11
|
+
@rem distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
@rem See the License for the specific language governing permissions and
|
14
|
+
@rem limitations under the License.
|
15
|
+
@rem
|
16
|
+
|
17
|
+
@if "%DEBUG%" == "" @echo off
|
18
|
+
@rem ##########################################################################
|
19
|
+
@rem
|
20
|
+
@rem Gradle startup script for Windows
|
21
|
+
@rem
|
22
|
+
@rem ##########################################################################
|
23
|
+
|
24
|
+
@rem Set local scope for the variables with windows NT shell
|
25
|
+
if "%OS%"=="Windows_NT" setlocal
|
26
|
+
|
27
|
+
set DIRNAME=%~dp0
|
28
|
+
if "%DIRNAME%" == "" set DIRNAME=.
|
29
|
+
set APP_BASE_NAME=%~n0
|
30
|
+
set APP_HOME=%DIRNAME%
|
31
|
+
|
32
|
+
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
|
33
|
+
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
|
34
|
+
|
35
|
+
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
36
|
+
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
|
37
|
+
|
38
|
+
@rem Find java.exe
|
39
|
+
if defined JAVA_HOME goto findJavaFromJavaHome
|
40
|
+
|
41
|
+
set JAVA_EXE=java.exe
|
42
|
+
%JAVA_EXE% -version >NUL 2>&1
|
43
|
+
if "%ERRORLEVEL%" == "0" goto init
|
44
|
+
|
45
|
+
echo.
|
46
|
+
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
47
|
+
echo.
|
48
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
49
|
+
echo location of your Java installation.
|
50
|
+
|
51
|
+
goto fail
|
52
|
+
|
53
|
+
:findJavaFromJavaHome
|
54
|
+
set JAVA_HOME=%JAVA_HOME:"=%
|
55
|
+
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
|
56
|
+
|
57
|
+
if exist "%JAVA_EXE%" goto init
|
58
|
+
|
59
|
+
echo.
|
60
|
+
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
|
61
|
+
echo.
|
62
|
+
echo Please set the JAVA_HOME variable in your environment to match the
|
63
|
+
echo location of your Java installation.
|
64
|
+
|
65
|
+
goto fail
|
66
|
+
|
67
|
+
:init
|
68
|
+
@rem Get command-line arguments, handling Windows variants
|
69
|
+
|
70
|
+
if not "%OS%" == "Windows_NT" goto win9xME_args
|
71
|
+
|
72
|
+
:win9xME_args
|
73
|
+
@rem Slurp the command line arguments.
|
74
|
+
set CMD_LINE_ARGS=
|
75
|
+
set _SKIP=2
|
76
|
+
|
77
|
+
:win9xME_args_slurp
|
78
|
+
if "x%~1" == "x" goto execute
|
79
|
+
|
80
|
+
set CMD_LINE_ARGS=%*
|
81
|
+
|
82
|
+
:execute
|
83
|
+
@rem Setup the command line
|
84
|
+
|
85
|
+
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
|
86
|
+
|
87
|
+
|
88
|
+
@rem Execute Gradle
|
89
|
+
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
|
90
|
+
|
91
|
+
:end
|
92
|
+
@rem End local scope for the variables with windows NT shell
|
93
|
+
if "%ERRORLEVEL%"=="0" goto mainEnd
|
94
|
+
|
95
|
+
:fail
|
96
|
+
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
|
97
|
+
rem the _cmd.exe /c_ return code!
|
98
|
+
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
|
99
|
+
exit /b 1
|
100
|
+
|
101
|
+
:mainEnd
|
102
|
+
if "%OS%"=="Windows_NT" endlocal
|
103
|
+
|
104
|
+
:omega
|
data/java/pom.xml
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
3
|
+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
4
|
+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
5
|
+
<modelVersion>4.0.0</modelVersion>
|
6
|
+
|
7
|
+
<groupId>uk.co.birchlabs.ve</groupId>
|
8
|
+
<artifactId>ve</artifactId>
|
9
|
+
<version>1.0-SNAPSHOT</version>
|
10
|
+
|
11
|
+
<name>main source</name>
|
12
|
+
<url>http://maven.apache.org</url>
|
13
|
+
|
14
|
+
<repositories>
|
15
|
+
<repository>
|
16
|
+
<id>Atilika Open Source repository</id>
|
17
|
+
<url>http://www.atilika.org/nexus/content/repositories/atilika</url>
|
18
|
+
</repository>
|
19
|
+
</repositories>
|
20
|
+
|
21
|
+
<dependencies>
|
22
|
+
<dependency>
|
23
|
+
<groupId>org.atilika.kuromoji</groupId>
|
24
|
+
<artifactId>kuromoji</artifactId>
|
25
|
+
<version>0.7.7</version>
|
26
|
+
<type>jar</type>
|
27
|
+
<scope>compile</scope>
|
28
|
+
</dependency>
|
29
|
+
<dependency>
|
30
|
+
<groupId>junit</groupId>
|
31
|
+
<artifactId>junit</artifactId>
|
32
|
+
<version>4.12</version>
|
33
|
+
<scope>test</scope>
|
34
|
+
</dependency>
|
35
|
+
</dependencies>
|
36
|
+
|
37
|
+
<properties>
|
38
|
+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
39
|
+
<slf4jVersion>1.7.21</slf4jVersion>
|
40
|
+
<asyncHttpClientVersion>2.0.6</asyncHttpClientVersion>
|
41
|
+
</properties>
|
42
|
+
|
43
|
+
<build>
|
44
|
+
<plugins>
|
45
|
+
<plugin>
|
46
|
+
<groupId>org.apache.maven.plugins</groupId>
|
47
|
+
<artifactId>maven-compiler-plugin</artifactId>
|
48
|
+
<version>3.5.1</version>
|
49
|
+
<configuration>
|
50
|
+
<source>1.8</source>
|
51
|
+
<target>1.8</target>
|
52
|
+
</configuration>
|
53
|
+
</plugin>
|
54
|
+
</plugins>
|
55
|
+
</build>
|
56
|
+
</project>
|
data/java/readme.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# Setup and usage
|
2
|
+
|
3
|
+
## Integrating into build
|
4
|
+
|
5
|
+
I've used Maven for managing the build. Unfortunately, I am no expert on Maven so I can't help much on integration (but will try my best). There's very little complexity to this project, thankfully. For starters, here is `ve`'s package description as listed in `pom.xml`:
|
6
|
+
|
7
|
+
```
|
8
|
+
<groupId>uk.co.birchlabs.ve</groupId>
|
9
|
+
<artifactId>ve</artifactId>
|
10
|
+
<version>1.0-SNAPSHOT</version>
|
11
|
+
```
|
12
|
+
|
13
|
+
You could either:
|
14
|
+
|
15
|
+
* Install this to your global Maven repository (see [the instructions I wrote](https://github.com/shirakaba/sen-mavenized/blob/master/README.md) for a similar project if you don't already know how), and add the `ve` package as a `<dependency>` to your project; or...
|
16
|
+
|
17
|
+
* ~~Give up on modularity and just drag the files straight into your project.~~
|
18
|
+
|
19
|
+
* Add this project as a git submodule and import it using gradle
|
20
|
+
```groovy
|
21
|
+
// settings.gradle
|
22
|
+
include ':ve'
|
23
|
+
project(':ve').projectDir = new File('ve/java')
|
24
|
+
...
|
25
|
+
// build.gradle
|
26
|
+
dependencies {
|
27
|
+
implementation project(':ve')
|
28
|
+
}
|
29
|
+
```
|
30
|
+
|
31
|
+
No matter how you go about integrating it, please adhere to the rules of the software license.
|
32
|
+
|
33
|
+
*Useful info: [official guide to POMs](https://maven.apache.org/guides/introduction/introduction-to-the-pom.html).*
|
34
|
+
*Note: I would be very interested if someone could teach me how to put this package on the Maven central repository, to make integration much easier for everyone.*
|
35
|
+
|
36
|
+
## Importing into source
|
37
|
+
|
38
|
+
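Import the classes from the `ve` package, e.g. `import ve.Parse;` and `import ve.Word;` (`uk.co.birchlabs.ve` is the Maven groupId, not the Java package).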
|
39
|
+
|
40
|
+
Ve has a dependency on a tokenizer. As [kuromoji](https://github.com/atilika/kuromoji) is the most convenient/portable Mecab interface for Java, I coded it to accept Tokens in the format of the Kuromoji Token class.
|
41
|
+
|
42
|
+
*Fun fact: An older (not-to-be-released) version of this project was hooked up with [Sen](https://github.com/shirakaba/sen-mavenized), but its installation turned out to be much, much harder than kuromoji's. If you really want to use Sen (or any other non-Kuromoji Mecab interface) with Ve, I advise you to make a simple tool to convert those incompatible Tokens to Kuromoji-style ones for input into Ve.*
|
43
|
+
|
44
|
+
In all, I import:
|
45
|
+
|
46
|
+
```
|
47
|
+
import ve.Parse;
import ve.Word;
|
48
|
+
import org.atilika.kuromoji.Token;
|
49
|
+
import org.atilika.kuromoji.Tokenizer;
|
50
|
+
```
|
51
|
+
|
52
|
+
## Usage
|
53
|
+
|
54
|
+
Here's a line-by-line explanation of the `coreUsage` test I provided, detailing `Ve`'s core feature: parsing collections of Tokens into Words (more lexically meaningful groupings of Tokens, using Kim's special recipe).
|
55
|
+
|
56
|
+
``` java
|
57
|
+
// Some nonsense Japanese with interesting word boundaries.
|
58
|
+
String stringOfJapanese = "お金がなければいけないです。";
|
59
|
+
// Output the Kuromoji-style Tokens as a List
|
60
|
+
List<Token> tokensList = Tokenizer.builder().build().tokenize(stringOfJapanese);
|
61
|
+
// Convert to a basic Token array (I haven't adapted Ve to accept Lists of Tokens)
|
62
|
+
Token[] tokensArray = tokensList.toArray(new Token[tokensList.size()]);
|
63
|
+
// Create a parser instance from the array of Kuromoji-style Tokens.
|
64
|
+
Parse parser = new Parse(tokensArray);
|
65
|
+
// Get the Tokens out as 'Words'.
|
66
|
+
List<Word> words = parser.words();
|
67
|
+
// The .toString() method of each Word is generally the most useful. It shows the surface form of the Tokens.
|
68
|
+
// Output: [お金, が, なければいけない, です, 。]
|
69
|
+
System.out.println(words);
|
70
|
+
```
|
71
|
+
|
72
|
+
Note that each Word object stores all its constituent Tokens (unchanged from how they were passed in) losslessly, in order of their appearance in the Word! Think of the possibilities :D
|
73
|
+
|
74
|
+
# How to run the tests (to prove that it works)
|
75
|
+
|
76
|
+
These tests auto-pass; they purely exist to show you the command-line output of Ve's `parse` function.
|
77
|
+
|
78
|
+
## If using an IDE such as IntelliJ
|
79
|
+
|
80
|
+
1. Open `ve/java` in the IDE (the folder containing `pom.xml`)
|
81
|
+
|
82
|
+
2. Open `src/test/java/ve/VeTest.java`
|
83
|
+
|
84
|
+
3. Run the JUnit `coreUsage()` test by clicking the 'play' button beside it.
|
85
|
+
|
86
|
+
## If using the command line
|
87
|
+
|
88
|
+
1. `cd` into the `ve/java` directory
|
89
|
+
|
90
|
+
2. Run the command:
|
91
|
+
* `mvn test` (whole test suite); or:
|
92
|
+
* `mvn test -Dtest=VeTest#coreUsage` (single test)
|
93
|
+
|
94
|
+
# License
|
95
|
+
|
96
|
+
Java port of Ve © 2017 Jamie Birch, whose many horcruxes can be found across:
|
97
|
+
* [Github] [shirakaba](https://github.com/shirakaba)
|
98
|
+
* [Twitter] [@LinguaBrowse](https://twitter.com/LinguaBrowse)
|
99
|
+
* [Reddit] [r/LinguaBrowse](https://www.reddit.com/r/LinguaBrowse/)
|
100
|
+
* [Tumblr] [#LinguaBrowse](https://linguabrowse.tumblr.com/)
|
101
|
+
* [Facebook] [LinguaBrowse](https://www.facebook.com/LinguaBrowse/)
|
102
|
+
|
103
|
+
This is under the MIT license; a copy is provided at the root of this repository.
|
@@ -0,0 +1 @@
|
|
1
|
+
rootProject.name = 've'
|
@@ -0,0 +1,336 @@
|
|
1
|
+
package ve;
|
2
|
+
|
3
|
+
import org.atilika.kuromoji.Token;
|
4
|
+
|
5
|
+
import java.util.ArrayList;
|
6
|
+
import java.util.Arrays;
|
7
|
+
import java.util.List;
|
8
|
+
|
9
|
+
/** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
|
10
|
+
* Released under MIT license (see LICENSE.txt at root of repository).
|
11
|
+
*
|
12
|
+
* A Java port of Kim Ahlström's Ruby code for Ve's Parse (which identifies word boundaries).
|
13
|
+
**/
|
14
|
+
public class Parse {
|
15
|
+
private final Token[] tokenArray;
|
16
|
+
private static final String NO_DATA = "*";
|
17
|
+
|
18
|
+
private static final int POS1 = 0;
|
19
|
+
private static final int POS2 = 1;
|
20
|
+
private static final int POS3 = 2;
|
21
|
+
private static final int POS4 = 3;
|
22
|
+
private static final int CTYPE = 4;
|
23
|
+
private static final int CFORM = 5;
|
24
|
+
private static final int BASIC = 6;
|
25
|
+
private static final int READING = 7;
|
26
|
+
private static final int PRONUNCIATION = 8;
|
27
|
+
|
28
|
+
public Parse(Token[] tokenArray) {
|
29
|
+
if(tokenArray.length == 0) throw new UnsupportedOperationException("Cannot parse an empty array of tokens.");
|
30
|
+
|
31
|
+
this.tokenArray = tokenArray;
|
32
|
+
}
|
33
|
+
|
34
|
+
/**
|
35
|
+
* @return List of all words in the instance's tokenArray; never empty, because the constructor rejects an empty tokenArray.
|
36
|
+
* Ve returns an asterisk if no word was recognised.
|
37
|
+
* */
|
38
|
+
public List<Word> words(){
|
39
|
+
List<Word> wordList = new ArrayList<>();
|
40
|
+
Token current = null, previous = null, following = null;
|
41
|
+
|
42
|
+
for(int i = 0; i < tokenArray.length; i++){
|
43
|
+
int finalSlot = wordList.size() - 1;
|
44
|
+
current = tokenArray[i];
|
45
|
+
Pos pos = null; // could make this TBD instead.
|
46
|
+
Grammar grammar = Grammar.Unassigned;
|
47
|
+
boolean eat_next = false,
|
48
|
+
eat_lemma = true,
|
49
|
+
attach_to_previous = false,
|
50
|
+
also_attach_to_lemma = false,
|
51
|
+
update_pos = false;
|
52
|
+
|
53
|
+
String[] currentPOSArray = Arrays.copyOfRange(current.getAllFeaturesArray(), POS1, POS4 +1);
|
54
|
+
|
55
|
+
if(currentPOSArray.length == 0 || currentPOSArray[POS1].equals(NO_DATA))
|
56
|
+
throw new IllegalStateException("No Pos data found for token.");
|
57
|
+
|
58
|
+
switch (currentPOSArray[POS1]){
|
59
|
+
case MEISHI:
|
60
|
+
// case MICHIGO:
|
61
|
+
pos = Pos.Noun;
|
62
|
+
if(currentPOSArray[POS2].equals(NO_DATA)) break;
|
63
|
+
switch (currentPOSArray[POS2]){
|
64
|
+
case KOYUUMEISHI:
|
65
|
+
pos = Pos.ProperNoun;
|
66
|
+
break;
|
67
|
+
case DAIMEISHI:
|
68
|
+
pos = Pos.Pronoun;
|
69
|
+
break;
|
70
|
+
case FUKUSHIKANOU:
|
71
|
+
case SAHENSETSUZOKU:
|
72
|
+
case KEIYOUDOUSHIGOKAN:
|
73
|
+
case NAIKEIYOUSHIGOKAN:
|
74
|
+
// Refers to line 213 of Ve.
|
75
|
+
if(currentPOSArray[POS3].equals(NO_DATA)) break;
|
76
|
+
if(i == tokenArray.length -1) break; // protects against array overshooting.
|
77
|
+
following = tokenArray[i+1];
|
78
|
+
switch(following.getAllFeaturesArray()[CTYPE]){
|
79
|
+
case SAHEN_SURU:
|
80
|
+
pos = Pos.Verb;
|
81
|
+
eat_next = true;
|
82
|
+
break;
|
83
|
+
case TOKUSHU_DA:
|
84
|
+
pos = Pos.Adjective;
|
85
|
+
if(following.getAllFeaturesArray()[CFORM].equals(TAIGENSETSUZOKU)){ // TAIGENSETSUZOKU is a conjugation-form value: compare against CFORM, not the POS2 slot.
|
86
|
+
eat_next = true;
|
87
|
+
eat_lemma = false;
|
88
|
+
}
|
89
|
+
break;
|
90
|
+
case TOKUSHU_NAI:
|
91
|
+
pos = Pos.Adjective;
|
92
|
+
eat_next = true;
|
93
|
+
break;
|
94
|
+
default:
|
95
|
+
if(Arrays.copyOfRange(following.getAllFeaturesArray(), POS1, POS4 +1)[POS1].equals(JOSHI)
|
96
|
+
&& following.getSurfaceForm().equals(NI))
|
97
|
+
pos = Pos.Adverb; // Ve script redundantly (I think) also has eat_next = false here.
|
98
|
+
break;
|
99
|
+
}
|
100
|
+
|
101
|
+
break;
|
102
|
+
case HIJIRITSU:
|
103
|
+
case TOKUSHU:
|
104
|
+
// Refers to line 233 of Ve.
|
105
|
+
if(currentPOSArray[POS3].equals(NO_DATA)) break;
|
106
|
+
if(i == tokenArray.length -1) break; // protects against array overshooting.
|
107
|
+
following = tokenArray[i+1];
|
108
|
+
|
109
|
+
switch(currentPOSArray[POS3]){
|
110
|
+
case FUKUSHIKANOU:
|
111
|
+
if(Arrays.copyOfRange(following.getAllFeaturesArray(), POS1, POS4 +1)[POS1].equals(JOSHI)
|
112
|
+
&& following.getSurfaceForm().equals(NI)){
|
113
|
+
pos = Pos.Adverb;
|
114
|
+
eat_next = false; // Changed this to false because 'case JOSHI' has 'attach_to_previous = true'.
|
115
|
+
}
|
116
|
+
break;
|
117
|
+
case JODOUSHIGOKAN:
|
118
|
+
if(following.getAllFeaturesArray()[CTYPE].equals(TOKUSHU_DA)){
|
119
|
+
pos = Pos.Verb;
|
120
|
+
grammar = Grammar.Auxiliary;
|
121
|
+
if(following.getAllFeaturesArray()[CFORM].equals(TAIGENSETSUZOKU)) eat_next = true;
|
122
|
+
}
|
123
|
+
else if (Arrays.copyOfRange(following.getAllFeaturesArray(), POS1, POS4 +1)[POS1].equals(JOSHI)
|
124
|
+
&& Arrays.copyOfRange(following.getAllFeaturesArray(), POS1, POS4 +1)[POS3].equals(FUKUSHIKA)){
|
125
|
+
pos = Pos.Adverb;
|
126
|
+
eat_next = true;
|
127
|
+
}
|
128
|
+
break;
|
129
|
+
case KEIYOUDOUSHIGOKAN:
|
130
|
+
pos = Pos.Adjective;
|
131
|
+
if(following.getAllFeaturesArray()[CTYPE].equals(TOKUSHU_DA) && following.getAllFeaturesArray()[CFORM].equals(TAIGENSETSUZOKU) // second test reads CFORM: CTYPE cannot equal two different constants at once.
|
132
|
+
|| Arrays.copyOfRange(following.getAllFeaturesArray(), POS1, POS4 +1)[POS2].equals(RENTAIKA))
|
133
|
+
eat_next = true;
|
134
|
+
break;
|
135
|
+
default:
|
136
|
+
break;
|
137
|
+
}
|
138
|
+
break;
|
139
|
+
case KAZU:
|
140
|
+
// TODO: "recurse and find following numbers and add to this word. Except non-numbers like 幾"
|
141
|
+
// Refers to line 261.
|
142
|
+
pos = Pos.Number;
|
143
|
+
if(wordList.size() > 0 && wordList.get(finalSlot).getPart_of_speech().equals(Pos.Number)){
|
144
|
+
attach_to_previous = true;
|
145
|
+
also_attach_to_lemma = true;
|
146
|
+
}
|
147
|
+
break;
|
148
|
+
case SETSUBI:
|
149
|
+
// Refers to line 267.
|
150
|
+
if(currentPOSArray[POS3].equals(JINMEI)) pos = Pos.Suffix;
|
151
|
+
else{
|
152
|
+
if(currentPOSArray[POS3].equals(TOKUSHU) && current.getAllFeaturesArray()[BASIC].equals(SA)){
|
153
|
+
update_pos = true;
|
154
|
+
pos = Pos.Noun;
|
155
|
+
}
|
156
|
+
else also_attach_to_lemma = true;
|
157
|
+
attach_to_previous = true;
|
158
|
+
}
|
159
|
+
break;
|
160
|
+
case SETSUZOKUSHITEKI:
|
161
|
+
pos = Pos.Conjunction;
|
162
|
+
break;
|
163
|
+
case DOUSHIHIJIRITSUTEKI:
|
164
|
+
pos = Pos.Verb;
|
165
|
+
grammar = Grammar.Nominal; // not using.
|
166
|
+
break;
|
167
|
+
default:
|
168
|
+
// Keep Pos as Noun, as it currently is.
|
169
|
+
break;
|
170
|
+
}
|
171
|
+
break;
|
172
|
+
case SETTOUSHI:
|
173
|
+
// TODO: "elaborate this when we have the "main part" feature for words?"
|
174
|
+
pos = Pos.Prefix;
|
175
|
+
break;
|
176
|
+
case JODOUSHI:
|
177
|
+
// Refers to line 290.
|
178
|
+
pos = Pos.Postposition;
|
179
|
+
final List<String> qualifyingList1 = Arrays.asList(TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU);
|
180
|
+
if((previous == null || !Arrays.copyOfRange(previous.getAllFeaturesArray(), POS1, POS4 +1)[POS2].equals(KAKARIJOSHI)) // grouped: && binds tighter than ||, and the inflection-type test on the next line must also apply when previous == null.
|
181
|
+
&& qualifyingList1.contains(current.getAllFeaturesArray()[CTYPE]))
|
182
|
+
attach_to_previous = true;
|
183
|
+
else if (current.getAllFeaturesArray()[CTYPE].equals(FUHENKAGATA) && current.getAllFeaturesArray()[BASIC].equals(NN))
|
184
|
+
attach_to_previous = true;
|
185
|
+
else if ((current.getAllFeaturesArray()[CTYPE].equals(TOKUSHU_DA) || current.getAllFeaturesArray()[CTYPE].equals(TOKUSHU_DESU)) // grouped: the surface-form exclusion on the next line must cover both inflection types.
|
186
|
+
&& !current.getSurfaceForm().equals(NA))
|
187
|
+
pos = Pos.Verb;
|
188
|
+
break;
|
189
|
+
case DOUSHI:
|
190
|
+
// Refers to line 299.
|
191
|
+
pos = Pos.Verb;
|
192
|
+
switch (currentPOSArray[POS2]){
|
193
|
+
case SETSUBI:
|
194
|
+
attach_to_previous = true;
|
195
|
+
break;
|
196
|
+
case HIJIRITSU:
|
197
|
+
if(!current.getAllFeaturesArray()[CFORM].equals(MEIREI_I)) attach_to_previous = true;
|
198
|
+
default:
|
199
|
+
break;
|
200
|
+
}
|
201
|
+
break;
|
202
|
+
case KEIYOUSHI:
|
203
|
+
pos = Pos.Adjective;
|
204
|
+
break;
|
205
|
+
case JOSHI:
|
206
|
+
// Refers to line 309.
|
207
|
+
pos = Pos.Postposition;
|
208
|
+
final List<String> qualifyingList2 = Arrays.asList(TE, DE, BA); // added NI
|
209
|
+
if(currentPOSArray[POS2].equals(SETSUZOKUJOSHI) && qualifyingList2.contains(current.getSurfaceForm())
|
210
|
+
|| current.getSurfaceForm().equals(NI))
|
211
|
+
attach_to_previous = true;
|
212
|
+
break;
|
213
|
+
case RENTAISHI:
|
214
|
+
pos = Pos.Determiner;
|
215
|
+
break;
|
216
|
+
case SETSUZOKUSHI:
|
217
|
+
pos = Pos.Conjunction;
|
218
|
+
break;
|
219
|
+
case FUKUSHI:
|
220
|
+
pos = Pos.Adverb;
|
221
|
+
break;
|
222
|
+
case KIGOU:
|
223
|
+
pos = Pos.Symbol;
|
224
|
+
break;
|
225
|
+
case FIRAA:
|
226
|
+
case KANDOUSHI:
|
227
|
+
pos = Pos.Interjection;
|
228
|
+
break;
|
229
|
+
case SONOTA:
|
230
|
+
pos = Pos.Other;
|
231
|
+
break;
|
232
|
+
default:
|
233
|
+
pos = Pos.TBD;
|
234
|
+
// C'est une catastrophe
|
235
|
+
}
|
236
|
+
|
237
|
+
if(attach_to_previous && wordList.size() > 0){
|
238
|
+
// these sometimes try to add to null readings.
|
239
|
+
wordList.get(finalSlot).getTokens().add(current);
|
240
|
+
wordList.get(finalSlot).appendToWord(current.getSurfaceForm());
|
241
|
+
wordList.get(finalSlot).appendToReading(getFeatureSafely(current, READING));
|
242
|
+
wordList.get(finalSlot).appendToTranscription(getFeatureSafely(current, PRONUNCIATION));
|
243
|
+
if(also_attach_to_lemma) wordList.get(finalSlot).appendToLemma(current.getAllFeaturesArray()[BASIC]); // lemma == basic.
|
244
|
+
if(update_pos) wordList.get(finalSlot).setPart_of_speech(pos);
|
245
|
+
}
|
246
|
+
else {
|
247
|
+
Word word = new Word(current.getReading(),
|
248
|
+
getFeatureSafely(current, PRONUNCIATION),
|
249
|
+
grammar,
|
250
|
+
current.getAllFeaturesArray()[BASIC],
|
251
|
+
pos,
|
252
|
+
current.getSurfaceForm(),
|
253
|
+
current);
|
254
|
+
if(eat_next){
|
255
|
+
if(i == tokenArray.length -1) throw new IllegalStateException("There's a path that allows array overshooting.");
|
256
|
+
following = tokenArray[i+1];
|
257
|
+
word.getTokens().add(following);
|
258
|
+
word.appendToWord(following.getSurfaceForm());
|
259
|
+
word.appendToReading(following.getReading());
|
260
|
+
word.appendToTranscription(getFeatureSafely(following, PRONUNCIATION));
|
261
|
+
if (eat_lemma) word.appendToLemma(following.getAllFeaturesArray()[BASIC]);
|
262
|
+
}
|
263
|
+
wordList.add(word);
|
264
|
+
}
|
265
|
+
previous = current;
|
266
|
+
|
267
|
+
}
|
268
|
+
|
269
|
+
return wordList;
|
270
|
+
}
|
271
|
+
|
272
|
+
/** Return an asterisk if pronunciation field isn't in array (READING and PRONUNCIATION fields are left undefined,
|
273
|
+
* rather than as "*" by MeCab). Other feature fields are guaranteed to be safe, however. */
|
274
|
+
private String getFeatureSafely(Token token, int feature) {
|
275
|
+
if(feature > PRONUNCIATION) throw new IllegalStateException("Asked for a feature out of bounds.");
|
276
|
+
return token.getAllFeaturesArray().length >= feature + 1 ? token.getAllFeaturesArray()[feature] : "*";
|
277
|
+
}
|
278
|
+
|
279
|
+
// POS1
|
280
|
+
private static final String MEISHI = "名詞";
|
281
|
+
private static final String KOYUUMEISHI = "固有名詞";
|
282
|
+
private static final String DAIMEISHI = "代名詞";
|
283
|
+
private static final String JODOUSHI = "助動詞";
|
284
|
+
private static final String KAZU = "数";
|
285
|
+
private static final String JOSHI = "助詞";
|
286
|
+
private static final String SETTOUSHI = "接頭詞";
|
287
|
+
private static final String DOUSHI = "動詞";
|
288
|
+
private static final String KIGOU = "記号";
|
289
|
+
private static final String FIRAA = "フィラー";
|
290
|
+
private static final String SONOTA = "その他";
|
291
|
+
private static final String KANDOUSHI = "感動詞";
|
292
|
+
private static final String RENTAISHI = "連体詞";
|
293
|
+
private static final String SETSUZOKUSHI = "接続詞";
|
294
|
+
private static final String FUKUSHI = "副詞";
|
295
|
+
private static final String SETSUZOKUJOSHI = "接続助詞";
|
296
|
+
private static final String KEIYOUSHI = "形容詞";
|
297
|
+
private static final String MICHIGO = "未知語";
|
298
|
+
|
299
|
+
// POS2_BLACKLIST and inflection types
|
300
|
+
private static final String HIJIRITSU = "非自立";
|
301
|
+
private static final String FUKUSHIKANOU = "副詞可能";
|
302
|
+
private static final String SAHENSETSUZOKU = "サ変接続";
|
303
|
+
private static final String KEIYOUDOUSHIGOKAN = "形容動詞語幹";
|
304
|
+
private static final String NAIKEIYOUSHIGOKAN = "ナイ形容詞語幹";
|
305
|
+
private static final String JODOUSHIGOKAN = "助動詞語幹";
|
306
|
+
private static final String FUKUSHIKA = "副詞化";
|
307
|
+
private static final String TAIGENSETSUZOKU = "体言接続";
|
308
|
+
private static final String RENTAIKA = "連体化";
|
309
|
+
private static final String TOKUSHU = "特殊";
|
310
|
+
private static final String SETSUBI = "接尾";
|
311
|
+
private static final String SETSUZOKUSHITEKI = "接続詞的";
|
312
|
+
private static final String DOUSHIHIJIRITSUTEKI = "動詞非自立的";
|
313
|
+
private static final String SAHEN_SURU = "サ変・スル";
|
314
|
+
private static final String TOKUSHU_TA = "特殊・タ";
|
315
|
+
private static final String TOKUSHU_NAI = "特殊・ナイ";
|
316
|
+
private static final String TOKUSHU_TAI = "特殊・タイ";
|
317
|
+
private static final String TOKUSHU_DESU = "特殊・デス";
|
318
|
+
private static final String TOKUSHU_DA = "特殊・ダ";
|
319
|
+
private static final String TOKUSHU_MASU = "特殊・マス";
|
320
|
+
private static final String TOKUSHU_NU = "特殊・ヌ";
|
321
|
+
private static final String FUHENKAGATA = "不変化型";
|
322
|
+
private static final String JINMEI = "人名";
|
323
|
+
private static final String MEIREI_I = "命令i";
|
324
|
+
private static final String KAKARIJOSHI = "係助詞";
|
325
|
+
private static final String KAKUJOSHI = "格助詞";
|
326
|
+
|
327
|
+
// etc
|
328
|
+
private static final String NA = "な";
|
329
|
+
private static final String NI = "に";
|
330
|
+
private static final String TE = "て";
|
331
|
+
private static final String DE = "で";
|
332
|
+
private static final String BA = "ば";
|
333
|
+
private static final String NN = "ん";
|
334
|
+
private static final String SA = "さ";
|
335
|
+
|
336
|
+
}
|