ve 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ package ve;
2
+
3
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
4
+ * Released under MIT license (see LICENSE.txt at root of repository).
5
+ *
6
+ * Based on ve/lib/part_of_speech.rb.
7
+ **/
8
/**
 * The part-of-speech categories that Ve can assign to a {@code Word}.
 * Based on ve/lib/part_of_speech.rb; used e.g. as {@code Pos.Noun}.
 */
public enum Pos {
    Noun,
    ProperNoun,
    Pronoun,
    Adjective,
    Adverb,
    Determiner,
    Preposition,
    Postposition,
    Verb,
    Suffix,
    Prefix,
    Conjunction,
    Interjection,
    Number,
    Unknown,
    Symbol,
    Other,
    TBD // placeholder for categories not yet decided
}
@@ -0,0 +1,104 @@
1
+ package ve;
2
+
3
+ import org.atilika.kuromoji.Token;
4
+
5
+ import java.util.ArrayList;
6
+ import java.util.List;
7
+
8
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
9
+ * Released under MIT license (see LICENSE.txt at root of repository).
10
+ *
11
+ * A Word is composed of one or more Tokens, as stored in an internal List.
12
+ * It also has various fields like 'reading' and 'transcription', which may
13
+ * build up as extra Tokens are added to the list.
14
+ * Words are identified and built up by this project's Parse.words() method.
15
+ **/
16
+ public class Word {
17
+ // These five seem underdeveloped and underpopulated:
18
+ private String reading;
19
+ private String transcription;
20
+ private Grammar grammar;
21
+ // private String reading_script;
22
+ // private String transcription_script;
23
+ private String lemma; // "聞く"
24
+ private Pos part_of_speech; // eg. Pos.Noun
25
+ private List<Token> tokens = new ArrayList<>(); // those which were eaten up by this one word: {聞か, せ, られ}
26
+ private String word; // "聞かせられ"
27
+
28
+ /**
29
+ * Incoming variables are named in the style of Sen; fields are named in the style of Ve.
30
+ * @param read - call token.getReading().
31
+ * @param pronunciation - call token.getPronunciation().
32
+ * @param grammar - this is an underdeveloped enum-like variable originating from Ve.
33
+ * @param basic - call token.getBasicString().
34
+ * @param part_of_speech - this is another underdeveloped enum-like variable originating from Ve.
35
+ * @param nodeStr - call token.getNodeStr().
36
+ * @param token - pass in a Token composing part of the Word. Currently expects the Token to come from Sen, but could
37
+ * be simply adapted to come from Kuromoji.
38
+ */
39
+ public Word(String read,
40
+ String pronunciation,
41
+ Grammar grammar,
42
+ // String reading_script,
43
+ // String transcription_script,
44
+ String basic,
45
+ Pos part_of_speech,
46
+ String nodeStr,
47
+ Token token) {
48
+ this.reading = read;
49
+ this.transcription = pronunciation;
50
+ this.grammar = grammar;
51
+ // this.reading_script = reading_script;
52
+ // this.transcription_script = transcription_script;
53
+ this.lemma = basic;
54
+ this.part_of_speech = part_of_speech;
55
+ this.word = nodeStr;
56
+ tokens.add(token);
57
+ }
58
+
59
+ public void setPart_of_speech(Pos part_of_speech) {
60
+ this.part_of_speech = part_of_speech;
61
+ }
62
+
63
+ public String getLemma() {
64
+ return lemma;
65
+ }
66
+
67
+ public Pos getPart_of_speech() {
68
+ return part_of_speech;
69
+ }
70
+
71
+ public List<Token> getTokens() {
72
+ return tokens;
73
+ }
74
+
75
+ public String getWord() {
76
+ return word;
77
+ }
78
+
79
+ public void appendToWord(String suffix) {
80
+ if(word == null) word = "_".concat(suffix); // likely won't experience a null word, actually.
81
+ else word = word.concat(suffix);
82
+ }
83
+
84
+ public void appendToReading(String suffix) {
85
+ if(reading == null) reading = "_".concat(suffix);
86
+ else reading = reading.concat(suffix);
87
+ }
88
+
89
+ public void appendToTranscription(String suffix) {
90
+ if(transcription == null) transcription = "_".concat(suffix);
91
+ else transcription = transcription.concat(suffix);
92
+ }
93
+
94
+ // Not sure when this would change.
95
+ public void appendToLemma(String suffix) {
96
+ if(lemma == null) lemma = "_".concat(suffix);
97
+ else lemma = lemma.concat(suffix);
98
+ }
99
+
100
+ @Override
101
+ public String toString() {
102
+ return word;
103
+ }
104
+ }
@@ -0,0 +1,41 @@
1
+ package ve;
2
+
3
+ import org.atilika.kuromoji.Token;
4
+ import org.atilika.kuromoji.Tokenizer;
5
+ import org.junit.Test;
6
+
7
+ import java.util.List;
8
+
9
+ /** Copyright © 2017 Jamie Birch: [GitHub] shirakaba | [Twitter] LinguaBrowse
10
+ * Released under MIT license (see LICENSE.txt at root of repository).
11
+ *
12
+ * This test is purely to show the console output; it is unconditional.
13
+ **/
14
+ public class VeTest {
15
+
16
+ /** More hardcore test sentence at: https://hondou.homedns.org/pukiwiki/index.php?cmd=read&page=Java%20SEN%20%A4%C7%B7%C1%C2%D6%C1%C7%B2%F2%C0%CF
17
+ */
18
+ @Test
19
+ public void coreUsage() {
20
+ String kanji = "お金がなければいけないです。";
21
+ List<Token> tokensList = Tokenizer.builder().build().tokenize(kanji);
22
+ Token[] tokensArray = tokensList.toArray(new Token[tokensList.size()]);
23
+
24
+ Parse parser = new Parse(tokensArray);
25
+ List<Word> words = parser.words();
26
+ System.out.println(words);
27
+
28
+ /* Prints out:
29
+ [お金, が, なければいけない, です, 。]
30
+ */
31
+
32
+ /* Note: I have found that, depending on the MeCab dictionary/model, POS-tagging of tokens may vary.
33
+ ie: for a particular sentence, when tokenizing using net.java.sen:
34
+ なけれ is labelled as a DOUSHI-JITATSU-*-*.
35
+ However, when tokenizing using org.atilika.kuromoji:
36
+ なけれ is labelled as a KEIYOUSHI-JITATSU-*-*.
37
+ So your mileage may vary (very slightly) if comparing to other tokenizer results..!
38
+ Not the Ve algorithm's fault, fortunately.
39
+ */
40
+ }
41
+ }
@@ -2,7 +2,7 @@ class Ve
2
2
  class PartOfSpeech
3
3
 
4
4
  def self.name
5
- self.to_s.split('::').last.downcase
5
+ self.to_s.split('::').last.gsub(/(?<=[A-Za-z])(?=[A-Z])/, ' ').downcase # RegEx adds spaces before uppercase letters. Ex: Ve::PartOfSpeech::ProperNoun.name => "proper noun"
6
6
  end
7
7
 
8
8
  class Noun < PartOfSpeech; end
@@ -10,39 +10,40 @@ class Ve
10
10
  class FreelingEn < Ve::Provider
11
11
  # FIX: This class isn't tested
12
12
  BIT_STOP = 'VeEnd'
13
-
13
+
14
14
  # TODO: Automatically set FREELINGSHARE if it's not set?
15
15
  def initialize(config = {})
16
16
  @config = {:app => 'analyzer',
17
17
  :path => '',
18
18
  :flags => ''}.merge(config)
19
-
19
+
20
20
  @config[:app] = `which #{@config[:app]}`.strip!
21
21
  local = @config[:app] =~ /local/ ? '/local' : ''
22
- @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"
23
-
22
+ share_dir = "/usr#{local}/share"
23
+ @config[:freeling_dir_name] = Dir.exist?("#{share_dir}/FreeLing") ? 'FreeLing' : 'freeling'
24
+ @config[:flags] = "-f #{share_dir}/#{@config[:freeling_dir_name]}/config/en.cfg --flush --nonumb --nodate"
25
+
24
26
  start!
25
27
  end
26
-
28
+
27
29
  # Interface methods
28
-
30
+
29
31
  def works?
30
32
  p = parse('Wrote')
31
- ["Wrote write VBD 1", ""] == p.tokens.collect { |t| t[:raw] }
33
+ "Wrote write VBD 1" == p.tokens.collect { |t| t[:raw] }[0]
32
34
  end
33
-
35
+
34
36
  # Talks to the app and returns a parse object
35
37
  def parse(text, options = {})
36
38
  start! if @stdin.nil?
37
39
  # Fix Unicode chars
38
40
  # TODO: These need to be converted back to the original char in the :literal attribute
39
41
  text = text.gsub('’', "'")
40
-
42
+
41
43
  @stdin.puts "#{text}\n#{BIT_STOP}\n"
42
44
  output = []
43
-
45
+
44
46
  while line = @stdout.readline
45
- puts line
46
47
  if line =~ /#{BIT_STOP}/x
47
48
  @stdout.readline
48
49
  break
@@ -56,17 +57,17 @@ class Ve
56
57
  end
57
58
 
58
59
  private
59
-
60
+
60
61
  def start!
61
62
  @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")
62
-
63
+
63
64
  # TODO: Also filter out non-iso-latin-1 characters
64
65
  @stdin.set_encoding('UTF-8', 'ISO-8859-1')
65
66
  @stdout.set_encoding('ISO-8859-1', 'UTF-8')
66
67
  rescue Errno::ENOENT
67
68
  # The parser couldn't be started. Probably not installed on this system
68
69
  end
69
-
70
+
70
71
  end
71
72
  end
72
73
  end
@@ -74,14 +75,14 @@ end
74
75
  class Ve
75
76
  class Parse
76
77
  class FreelingEn < Ve::Parse
77
-
78
+
78
79
  attr_reader :tokens, :text
79
-
80
+
80
81
  def initialize(text, output)
81
82
  @tokens = []
82
83
  @text = text
83
84
  position = 0
84
-
85
+
85
86
  output.each_with_index do |line, index|
86
87
  line.rstrip!
87
88
  token = {:raw => line}
@@ -98,7 +99,7 @@ class Ve
98
99
  @tokens << unparsed_token
99
100
  end
100
101
  end
101
-
102
+
102
103
  # Sentence splits are just empty lines in Freeling
103
104
  if line.length == 0
104
105
  token[:type] = :sentence_split
@@ -106,7 +107,7 @@ class Ve
106
107
  @tokens << token
107
108
  next
108
109
  end
109
-
110
+
110
111
  # The parsed token
111
112
  info = line.split(/\s+/)
112
113
  token[:type] = :parsed
@@ -116,7 +117,7 @@ class Ve
116
117
 
117
118
  token[:literal].gsub!('_', ' ')
118
119
  token[:lemma].gsub!('_', ' ')
119
-
120
+
120
121
  # Anything unparsed preceding this token.
121
122
  # We need to do this complicated dance with _ since Freeling replaces spaces with it.
122
123
  # And so we need to be able to find the token with both spaces and _ in it since
@@ -137,7 +138,7 @@ class Ve
137
138
  @tokens << token
138
139
  end
139
140
  end
140
-
141
+
141
142
  INTERNAL_INFO_FOR_PARSED_POS = {
142
143
  'CC' => [Ve::PartOfSpeech::Conjunction, nil],
143
144
  'CD' => [Ve::PartOfSpeech::Number, nil],
@@ -176,10 +177,10 @@ class Ve
176
177
  'WRB' => [Ve::PartOfSpeech::Adverb, nil],
177
178
  'Z' => [Ve::PartOfSpeech::Determiner, nil]
178
179
  }
179
-
180
+
180
181
  def words
181
182
  words = []
182
-
183
+
183
184
  @tokens.find_all { |t| t[:type] == :parsed }.each do |token|
184
185
  if token[:pos] == 'POS'
185
186
  # Possessive ending, add to previous token
@@ -199,14 +200,14 @@ class Ve
199
200
  words << word
200
201
  end
201
202
  end
202
-
203
+
203
204
  words
204
205
  end
205
-
206
+
206
207
  def sentences
207
208
  sentences = []
208
209
  current = ''
209
-
210
+
210
211
  @tokens.each do |token|
211
212
  if token[:type] == :sentence_split
212
213
  sentences << current
@@ -215,14 +216,14 @@ class Ve
215
216
  current << token[:literal]
216
217
  end
217
218
  end
218
-
219
+
219
220
  # In case there is no :sentence_split at the end
220
221
  sentences << current if current.length > 0
221
222
 
222
223
  sentences.collect { |s| s.strip! }
223
224
  sentences
224
225
  end
225
-
226
+
226
227
  end
227
228
  end
228
229
  end
@@ -25,7 +25,7 @@ class Ve
25
25
 
26
26
  H_SYLLABIC_N = 'ん'
27
27
  H_SMALL_TSU = 'っ'
28
-
28
+
29
29
  HIRA_TO_LATN = {
30
30
  "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e", "お"=>"o",
31
31
  "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
@@ -42,17 +42,17 @@ class Ve
42
42
  "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
43
43
  "ら"=>"ra", "り"=>"ri", "る"=>"ru", "れ"=>"re", "ろ"=>"ro",
44
44
  "わ"=>"wa", "うぃ"=>"whi", "うぇ"=>"whe", "を"=>"wo",
45
- "ゑ"=>"wye", "ゐ"=>"wyi", "ー"=>"-", "ん"=>"n",
45
+ "ゑ"=>"we", "ゐ"=>"wi", "ー"=>"-", "ん"=>"n",
46
46
 
47
47
  "きゃ"=>"kya", "きゅ"=>"kyu", "きょ"=>"kyo", "きぇ"=>"kye", "きぃ"=>"kyi",
48
48
  "ぎゃ"=>"gya", "ぎゅ"=>"gyu", "ぎょ"=>"gyo", "ぎぇ"=>"gye", "ぎぃ"=>"gyi",
49
49
  "くぁ"=>"kwa", "くぃ"=>"kwi", "くぅ"=>"kwu", "くぇ"=>"kwe", "くぉ"=>"kwo",
50
50
  "ぐぁ"=>"qwa", "ぐぃ"=>"gwi", "ぐぅ"=>"gwu", "ぐぇ"=>"gwe", "ぐぉ"=>"gwo",
51
51
  "しゃ"=>"sha", "しぃ"=>"syi", "しゅ"=>"shu", "しぇ"=>"she", "しょ"=>"sho",
52
- "じゃ"=>"jya", "じゅ"=>"zyu", "じぇ"=>"zye", "じょ"=>"zyo", "じぃ"=>"zyi",
52
+ "じゃ"=>"ja", "じゅ"=>"ju", "じぇ"=>"jye", "じょ"=>"jo", "じぃ"=>"jyi",
53
53
  "すぁ"=>"swa", "すぃ"=>"swi", "すぅ"=>"swu", "すぇ"=>"swe", "すぉ"=>"swo",
54
- "ちゃ"=>"tya", "ちゅ"=>"tyu", "ちぇ"=>"tye", "ちょ"=>"tyo", "ちぃ"=>"tyi",
55
- "ぢゃ"=>"dya", "ぢぃ"=>"dyi", "ぢゅ"=>"dyu", "ぢぇ"=>"dye", "ぢょ"=>"dyo",
54
+ "ちゃ"=>"cha", "ちゅ"=>"chu", "ちぇ"=>"tye", "ちょ"=>"cho", "ちぃ"=>"tyi",
55
+ "ぢゃ"=>"ja", "ぢぃ"=>"dyi", "ぢゅ"=>"ju", "ぢぇ"=>"dye", "ぢょ"=>"jo",
56
56
  "つぁ"=>"tsa", "つぃ"=>"tsi", "つぇ"=>"tse", "つぉ"=>"tso", "てゃ"=>"tha",
57
57
  "てぃ"=>"thi", "てゅ"=>"thu", "てぇ"=>"the", "てょ"=>"tho", "とぁ"=>"twa",
58
58
  "とぃ"=>"twi", "とぅ"=>"twu", "とぇ"=>"twe", "とぉ"=>"two", "でゃ"=>"dha",
@@ -72,7 +72,7 @@ class Ve
72
72
  "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
73
73
  "ゕ"=>"xka", "ゖ"=>"xke", "ゎ"=>"xwa"
74
74
  }
75
-
75
+
76
76
  LATN_TO_HIRA = {
77
77
  'a' => 'あ', 'i' => 'い', 'u' => 'う', 'e' => 'え', 'o' => 'お',
78
78
  'ka' => 'か', 'ki' => 'き', 'ku' => 'く', 'ke' => 'け', 'ko' => 'こ',
@@ -98,7 +98,7 @@ class Ve
98
98
  'gya' => 'ぎゃ', 'gyu' => 'ぎゅ', 'gyo' => 'ぎょ', 'gye' => 'ぎぇ', 'gyi' => 'ぎぃ',
99
99
  'kwa' => 'くぁ', 'kwi' => 'くぃ', 'kwu' => 'くぅ', 'kwe' => 'くぇ', 'kwo' => 'くぉ',
100
100
  'gwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
101
- 'qwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
101
+ 'qwa' => 'ぐぁ', 'qwi' => 'ぐぃ', 'qwu' => 'ぐぅ', 'qwe' => 'ぐぇ', 'qwo' => 'ぐぉ',
102
102
 
103
103
  'sya' => 'しゃ', 'syi' => 'しぃ', 'syu' => 'しゅ', 'sye' => 'しぇ', 'syo' => 'しょ',
104
104
  'sha' => 'しゃ', 'shu' => 'しゅ', 'she' => 'しぇ', 'sho' => 'しょ',
@@ -196,7 +196,7 @@ class Ve
196
196
 
197
197
  return romaji
198
198
  end
199
-
199
+
200
200
  def transliterate_from_latn_to_hrkt
201
201
  romaji = @text.dup
202
202
  kana = ''
@@ -220,8 +220,8 @@ class Ve
220
220
  elsif LATN_TO_HIRA[for_conversion]
221
221
  # Generic cases
222
222
  mora = LATN_TO_HIRA[for_conversion]
223
- elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwc])\1/))
224
- # tch and double-consonants for small tsu
223
+ elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwchf])\1/))
224
+ # tch and double-consonants for small tsu
225
225
  mora = H_SMALL_TSU
226
226
  for_removal = 1
227
227
  end
@@ -237,7 +237,7 @@ class Ve
237
237
  else
238
238
  kana << mora
239
239
  end
240
-
240
+
241
241
  romaji[0, for_removal] = ''
242
242
  break
243
243
  elsif length == 1
@@ -250,7 +250,7 @@ class Ve
250
250
 
251
251
  return kana
252
252
  end
253
-
253
+
254
254
  def transliterate_from_kana_to_hira
255
255
  transpose_codepoints_in_range(@text, -96, 12449..12534)
256
256
  end
@@ -268,7 +268,7 @@ class Ve
268
268
  res = transpose_codepoints_in_range(@text, 65248, 33..126)
269
269
  transpose_codepoints_in_range(res, 12256, 32..32)
270
270
  end
271
-
271
+
272
272
  private
273
273
 
274
274
  def transpose_codepoints_in_range(text, distance, range)
@@ -284,7 +284,7 @@ class Ve
284
284
 
285
285
  return result
286
286
  end
287
-
287
+
288
288
  end
289
289
  end
290
290
  end