ve 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +22 -0
- data/Rakefile +9 -0
- data/Readme.md +60 -0
- data/js/test.html +32 -0
- data/js/ve.js +57 -0
- data/lib/language.rb +2 -0
- data/lib/languages/english.rb +6 -0
- data/lib/languages/japanese.rb +9 -0
- data/lib/misc.rb +10 -0
- data/lib/part_of_speech.rb +30 -0
- data/lib/provider.rb +29 -0
- data/lib/providers/fallbacks.rb +0 -0
- data/lib/providers/freeling_en.rb +229 -0
- data/lib/providers/japanese_transliterators.rb +293 -0
- data/lib/providers/mecab_ipadic.rb +362 -0
- data/lib/ve.rb +111 -0
- data/lib/word.rb +43 -0
- data/sinatra/server.rb +46 -0
- data/tests/freeling_en_test.rb +135 -0
- data/tests/japanese_transliterators_test.rb +79 -0
- data/tests/mecab_ipadic_test.rb +452 -0
- data/tests/test_helper.rb +26 -0
- data/tests/ve_test.rb +20 -0
- data/ve.gemspec +20 -0
- metadata +80 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
json (1.6.1)
|
5
|
+
rack (1.3.5)
|
6
|
+
rack-cors (0.2.4)
|
7
|
+
rack
|
8
|
+
rack-protection (1.1.4)
|
9
|
+
rack
|
10
|
+
sinatra (1.3.1)
|
11
|
+
rack (~> 1.3, >= 1.3.4)
|
12
|
+
rack-protection (~> 1.1, >= 1.1.2)
|
13
|
+
tilt (~> 1.3, >= 1.3.3)
|
14
|
+
tilt (1.3.3)
|
15
|
+
|
16
|
+
PLATFORMS
|
17
|
+
ruby
|
18
|
+
|
19
|
+
DEPENDENCIES
|
20
|
+
json
|
21
|
+
rack-cors
|
22
|
+
sinatra
|
data/Rakefile
ADDED
data/Readme.md
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
Ve
|
2
|
+
==
|
3
|
+
|
4
|
+
A linguistic framework for anyone. No degree required.
|
5
|
+
|
6
|
+
Read all about it on [kimtaro.github.com/ve](http://kimtaro.github.com/ve).
|
7
|
+
|
8
|
+
Ruby
|
9
|
+
----
|
10
|
+
|
11
|
+
require 've'
|
12
|
+
words = Ve.in(:en).words('I like melons.')
|
13
|
+
# => [#<Ve::Word:0x8ee00cc @word="I", @lemma="i", @part_of_speech=Ve::PartOfSpeech::Pronoun, @tokens=[{:raw=>"I i PRP 1", :type=>:parsed, :literal=>"I", :lemma=>"i", :pos=>"PRP", :accuracy=>"1", :characters=>0..0}], @extra={:grammar=>:personal}, @info={}>, #<Ve::Word:0x8edff28 @word="like", @lemma="like", @part_of_speech=Ve::PartOfSpeech::Preposition, @tokens=[{:raw=>"like like IN 0.815649", :type=>:parsed, :literal=>"like", :lemma=>"like", :pos=>"IN", :accuracy=>"0.815649", :characters=>2..5}], @extra={:grammar=>nil}, @info={}>, #<Ve::Word:0x8edfe24 @word="melons", @lemma="melon", @part_of_speech=Ve::PartOfSpeech::Noun, @tokens=[{:raw=>"melons melon NNS 1", :type=>:parsed, :literal=>"melons", :lemma=>"melon", :pos=>"NNS", :accuracy=>"1", :characters=>7..12}], @extra={:grammar=>:plural}, @info={}>, #<Ve::Word:0x8edfcbc @word=".", @lemma=".", @part_of_speech=Ve::PartOfSpeech::Symbol, @tokens=[{:raw=>". . Fp 1", :type=>:parsed, :literal=>".", :lemma=>".", :pos=>"Fp", :accuracy=>"1", :characters=>13..13}], @extra={:grammar=>nil}, @info={}>]
|
14
|
+
|
15
|
+
words.collect(&:lemma) # => ["i", "like", "melon", "."]
|
16
|
+
words.collect(&:part_of_speech) # => [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol]
|
17
|
+
|
18
|
+
Javascript
|
19
|
+
----------
|
20
|
+
|
21
|
+
<script type="text/javascript" charset="utf-8" src="ve.js"></script>
|
22
|
+
<script type="text/javascript" charset="utf-8">
|
23
|
+
new Ve('ja').words('ビールがおいしかった', function(words) {
|
24
|
+
// [{"_class":"Word","word":"ビール","lemma":"ビール","part_of_speech":"noun","tokens":[{"raw":"ビール\t名詞,一般,*,*,*,*,ビール,ビール,ビール","type":"parsed","literal":"ビール","pos":"名詞","pos2":"一般","pos3":"*","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"ビール","reading":"ビール","hatsuon":"ビール","characters":"0..2"}],"extra":{"reading":"ビール","transcription":"ビール","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"が","lemma":"が","part_of_speech":"postposition","tokens":[{"raw":"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ","type":"parsed","literal":"が","pos":"助詞","pos2":"格助詞","pos3":"一般","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"が","reading":"ガ","hatsuon":"ガ","characters":"3..3"}],"extra":{"reading":"ガ","transcription":"ガ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"おいしい","lemma":"おいしい","part_of_speech":"adjective","tokens":[{"raw":"おいしい\t形容詞,自立,*,*,形容詞・イ段,基本形,おいしい,オイシイ,オイシイ","type":"parsed","literal":"おいしい","pos":"形容詞","pos2":"自立","pos3":"*","pos4":"*","inflection_type":"形容詞・イ段","inflection_form":"基本形","lemma":"おいしい","reading":"オイシイ","hatsuon":"オイシイ","characters":"4..7"}],"extra":{"reading":"オイシイ","transcription":"オイシイ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}}]
|
25
|
+
|
26
|
+
for ( i in words ) {
|
27
|
+
var word = words[i];
|
28
|
+
console.log(word.lemma + "/" + word.part_of_speech)
|
29
|
+
}
|
30
|
+
|
31
|
+
// ビール/noun
|
32
|
+
// が/postposition
|
33
|
+
// おいしい/adjective
|
34
|
+
});
|
35
|
+
</script>
|
36
|
+
|
37
|
+
Structure
|
38
|
+
---------
|
39
|
+
|
40
|
+
- **Ve::LocalInterface** - Main interface that gives access to functionality in providers that exist locally
|
41
|
+
- **Ve::XInterface** - Allows for different ways of accessing Ve providers. Locally, through an HTTP API, binary protocol or whatever
|
42
|
+
- **Ve::Manager** - Keeps track of providers and what they can do
|
43
|
+
- **Ve::Provider::X** - Talks to the underlying parser
|
44
|
+
- **Ve::Parse::X** - Takes the output from the Provider and turns it into functions the end user can use
|
45
|
+
|
46
|
+
Todo
|
47
|
+
----
|
48
|
+
|
49
|
+
- Expose more through the sinatra server
|
50
|
+
- Alias lemma to base, so people don't need to know what lemmas are
|
51
|
+
- Break out into separate projects for each component. Ve-ruby, Ve-js.
|
52
|
+
- Better UTF-8 handling for Freeling
|
53
|
+
- See all the TODOs in the code
|
54
|
+
|
55
|
+
License
|
56
|
+
-------
|
57
|
+
|
58
|
+
(c) Kim Ahlström 2011
|
59
|
+
|
60
|
+
This is under the MIT license.
|
data/js/test.html
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
<!DOCTYPE html>
<!--
  Browser smoke test for ve.js: runs one English and one Japanese request
  and prints a pass/fail line for each into the #report element.
-->
<html>
<head>
  <!-- Charset declaration: http-equiv is the attribute name, not a value of "name". -->
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  <style type="text/css">
    .fail { color: red; }
    .pass { color: green; }
  </style>
  <script src="ve.js" type="text/javascript" charset="utf-8"></script>
  <script type="text/javascript" charset="utf-8">
    // Appends a "<name>: pass|fail" paragraph to #report depending on the
    // truthiness of `test`.
    function assert(test, name) {
      var report = document.getElementById('report');
      var result = test ? 'pass' : 'fail';

      report.innerHTML = report.innerHTML + '<p class="' + result + '">' + name + ': ' + result + '</p>'
    }

    new Ve('en').words('I ate hamburgers.', function(words){
      assert((4 == words.length && 'eat' == words[1].lemma), 'English');
    });

    new Ve('ja').words('ビールを飲んだ', function(words){
      // TODO: Shouldn't have to encode it here ...
      var word = "\u98f2\u3080"; // 飲む
      assert((3 == words.length && word == words[2].lemma), 'Japanese');
    });
  </script>
</head>
<body>
  <div id="report"></div>
</body>
</html>
|
data/js/ve.js
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
/**
 * ve.js
 *
 * Communicates with a Sinatra-server to facilitate linguistic
 * parsing tech in JS.
 *
 * @Author: Kim Ahlstrom
 * @Author: Ryan McGrath <ryan@venodesigns.net>
 * @Requires: Nothing
 */

;(function(w, d, undefined) {
    // Counter mixed into every JSONP callback name so that two requests
    // issued within the same millisecond do not overwrite each other's callback.
    var callbackCounter = 0;

    /**
     * Client for one language.
     *
     * @param language Language code used as the first path segment of the
     *                 request URL (e.g. 'en', 'ja').
     */
    var Ve = w.Ve = function Ve(language) {
        this.language = language;
        this.url = 'http://localhost:4567/';
        return this;
    };

    Ve.prototype = {
        /**
         * Requests <url><language>/words for `text` via JSONP and hands the
         * response to `callbackfn`. Returns the Ve instance for chaining.
         */
        words: function(text, callbackfn) {
            // Percent-encode the text so non-ASCII input and characters such
            // as '&', '#' or '+' cannot corrupt the query string.
            jsonp(this.url + this.language + '/words?text=' + encodeURIComponent(text), callbackfn);
            return this;
        }
    };

    /**
     * Minimal JSONP helper: injects a <script> whose src is `src` plus a
     * `callback` parameter naming a temporary global that points at `callbackfn`.
     * `src` must already contain a query string, since '&callback=' is appended.
     */
    var jsonp = function jsonp(src, callbackfn) {
        var newScript = d.createElement("script"),
            callback = 've_callback_' + (+new Date()) + '_' + (callbackCounter++),
            // Attach to <head> when there is one instead of assuming that the
            // first child of <html> is an element we can append to.
            parent = d.getElementsByTagName("head")[0] || d.documentElement,
            cleanup = function() {
                if(newScript.parentNode) {
                    newScript.parentNode.removeChild(newScript);
                }
                w[callback] = null;
            };

        newScript.type = "text/javascript";
        newScript.setAttribute("async", "true");
        newScript.setAttribute("src", src + '&callback=' + callback);
        w[callback] = callbackfn;

        /**
         * Automagically handle cleanup of injected script tags, so we don't litter someone's DOM
         * with our stuff. This branches on whether the element exposes readyState.
         */
        if(newScript.readyState) {
            newScript.onreadystatechange = function() {
                if(/loaded|complete/.test(newScript.readyState)) {
                    newScript.onreadystatechange = null;
                    cleanup();
                }
            }
        } else {
            newScript.addEventListener("load", cleanup, false);
            // Also clean up when the request fails, otherwise the script tag
            // and the global callback would be left behind.
            newScript.addEventListener("error", cleanup, false);
        }

        parent.appendChild(newScript);
    }
// Only two arguments are passed so the `undefined` parameter really is undefined.
})(window, document);
|
data/lib/language.rb
ADDED
data/lib/misc.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
class Ve
  # Base class of Ve's part-of-speech categories. Each category is an empty
  # subclass, so a category is represented by the class object itself.
  class PartOfSpeech

    # Lower-cased last segment of the class path,
    # e.g. Ve::PartOfSpeech::ProperNoun.name # => "propernoun"
    def self.name
      to_s.rpartition('::').last.downcase
    end

    Noun         = Class.new(self)
    ProperNoun   = Class.new(self)
    Pronoun      = Class.new(self)
    Adjective    = Class.new(self)
    Adverb       = Class.new(self)
    Determiner   = Class.new(self)
    Preposition  = Class.new(self)
    Postposition = Class.new(self)
    Verb         = Class.new(self)
    Suffix       = Class.new(self)
    Prefix       = Class.new(self)
    Conjunction  = Class.new(self)
    Interjection = Class.new(self)
    Number       = Class.new(self)
    Unknown      = Class.new(self)
    Symbol       = Class.new(self)
    Other        = Class.new(self)

    # Placeholder for provider PoS that haven't had a Ve PoS assigned yet
    TBD          = Class.new(self)

  end
end
|
data/lib/provider.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
class Ve
  # Base class for providers. Every method here is an empty stub returning
  # nil; concrete providers are expected to override them.
  class Provider

    # Interface, to be implemented by providers

    # Stub; presumably reports what the provider offers - confirm against Ve::Manager.
    def provides; end

    # Stub; subclasses start their underlying parser here.
    def start!; end

    # Stub; subclasses report whether the underlying parser is usable.
    def works?; end

    # Stub; subclasses parse text and return a parse object.
    def parse; end

  end
end
|
20
|
+
|
21
|
+
class Ve
  # Base class for parse results produced by providers.
  class Parse

    # TODO
    # Not implemented yet; currently returns nil.
    def as_json; end

  end
end
|
File without changes
|
@@ -0,0 +1,229 @@
|
|
1
|
+
# Encoding: UTF-8
|
2
|
+
|
3
|
+
# TODO: Retain capitalization in lemmas?
|
4
|
+
# TODO: Memoize
|
5
|
+
|
6
|
+
require 'open3'
|
7
|
+
|
8
|
+
class Ve
  class Provider
    # Provider for English that talks to a FreeLing `analyzer` process over
    # its standard input and output.
    class FreelingEn < Ve::Provider

      # Marker line written after every text. Output is read until a line
      # containing this marker shows up, which ends the result for that text.
      BIT_STOP = 'VeEnd'

      # config - optional Hash merged over the defaults (:app, :path, :flags).
      #
      # Resolves the analyzer binary with `which` and starts the process.
      #
      # TODO: Automatically set FREELINGSHARE if it's not set?
      def initialize(config = {})
        @config = {:app => 'analyzer',
                   :path => '',
                   :flags => ''}.merge(config)

        # Use the non-bang strip: strip! returns nil when there is nothing to
        # strip (e.g. `which` printed nothing), which left :app set to nil.
        @config[:app] = `which #{@config[:app]}`.strip
        local = @config[:app] =~ /local/ ? '/local' : ''
        # NOTE(review): this replaces any caller-supplied :flags - confirm that is intended.
        @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"

        start!
      end

      # Interface methods

      # True when parsing the word 'Wrote' yields exactly the expected raw
      # output lines, i.e. the analyzer is installed and answering.
      def works?
        (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
      end

      # Talks to the app and returns a parse object
      #
      # text    - String to analyze.
      # options - accepted but not used by this method.
      #
      # Any StandardError raised while talking to the process (including the
      # analyzer not running, or end of file on its output) results in a
      # parse object built from no output lines.
      def parse(text, options = {})
        start! if @stdin.nil?
        # Fix Unicode chars
        # TODO: These need to be converted back to the original char in the :literal attribute
        text = text.gsub('’', "'")

        @stdin.puts "#{text}\n#{BIT_STOP}\n"
        output = []

        while line = @stdout.readline
          if line =~ /#{BIT_STOP}/x
            # Consume the line that follows the marker so it does not leak
            # into the next call's output.
            @stdout.readline
            break
          end
          output << line
        end

        Ve::Parse::FreelingEn.new(text, output)
      rescue
        Ve::Parse::FreelingEn.new(text, [])
      end

      private

      # Spawns the analyzer and configures the pipe encodings. When the
      # executable cannot be found the streams stay nil.
      def start!
        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")

        # TODO: Also filter out non-iso-latin-1 characters
        @stdin.set_encoding('UTF-8', 'ISO-8859-1')
        @stdout.set_encoding('ISO-8859-1', 'UTF-8')
      rescue Errno::ENOENT
        # The parser couldn't be started. Probably not installed on this system
      end

    end
  end
end
|
71
|
+
|
72
|
+
class Ve
  class Parse
    # Parse result for FreeLing's English analyzer. Turns the analyzer's raw
    # output lines into tokens and derives words and sentences from them.
    class FreelingEn < Ve::Parse

      attr_reader :tokens, :text

      # text   - the String that was sent to the analyzer.
      # output - Array of output lines. A non-empty line is split on
      #          whitespace into literal, lemma, pos and accuracy; an empty
      #          line is a sentence split. Lines are right-stripped in place.
      #
      # Builds @tokens. Each token is a Hash with a :type of :parsed,
      # :unparsed or :sentence_split; parsed and unparsed tokens carry a
      # :characters range locating them in text.
      def initialize(text, output)
        @tokens = []
        @text = text
        position = 0

        output.each_with_index do |line, index|
          line.rstrip!
          token = {:raw => line}

          # Anything unparsed at the end of the text
          # This must happen before sentence splits are detected to avoid funny ordering
          if output.length > 1 && output.length == index + 1
            unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
            # match returns nil when position lies beyond the end of text,
            # so guard it the same way as the per-token match below.
            if unparsed_md && unparsed_md[1].length > 0
              unparsed_token = {:type => :unparsed,
                                :literal => unparsed_md[1],
                                :raw => ''}
              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
              @tokens << unparsed_token
            end
          end

          # Sentence splits are just empty lines in Freeling
          if line.length == 0
            token[:type] = :sentence_split
            token[:literal] = ''
            @tokens << token
            next
          end

          # The parsed token
          info = line.split(/\s+/)
          token[:type] = :parsed
          [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
            token[attr] = info[i]
          end

          token[:literal].gsub!('_', ' ')
          token[:lemma].gsub!('_', ' ')

          # Anything unparsed preceding this token.
          # We need to do this complicated dance with _ since Freeling replaces spaces with it.
          # And so we need to be able to find the token with both spaces and _ in it since
          # we don't know what the original in the text actually is.
          # Once we have the location in the text we can figure out if it should be with spaces or _.
          unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
          unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
          unparsed_md = unparsed_re.match(text, position)
          if unparsed_md && unparsed_md[1].length > 0
            unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
            unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
            @tokens << unparsed_token
            position += unparsed_token[:literal].length
          end

          token[:characters] = (position..(position+token[:literal].length-1))
          position += token[:literal].length
          @tokens << token
        end
      end

      # Maps a parsed token's :pos tag to [Ve part of speech, grammar detail].
      INTERNAL_INFO_FOR_PARSED_POS = {
        'CC' => [Ve::PartOfSpeech::Conjunction, nil],
        'CD' => [Ve::PartOfSpeech::Number, nil],
        'DT' => [Ve::PartOfSpeech::Determiner, nil],
        'EX' => [Ve::PartOfSpeech::Pronoun, nil],
        'FW' => [Ve::PartOfSpeech::Unknown, nil],
        'IN' => [Ve::PartOfSpeech::Preposition, nil],
        'JJ' => [Ve::PartOfSpeech::Adjective, nil],
        'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
        'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
        'LS' => [Ve::PartOfSpeech::Unknown, nil],
        'MD' => [Ve::PartOfSpeech::Verb, :modal],
        'NN' => [Ve::PartOfSpeech::Noun, nil],
        'NNS' => [Ve::PartOfSpeech::Noun, :plural],
        'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
        'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
        'PDT' => [Ve::PartOfSpeech::Determiner, nil],
        'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
        'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'RB' => [Ve::PartOfSpeech::Adverb, nil],
        'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
        'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
        'RP' => [Ve::PartOfSpeech::Postposition, nil],
        'SYM' => [Ve::PartOfSpeech::Symbol, nil],
        'TO' => [Ve::PartOfSpeech::Preposition, nil],
        'UH' => [Ve::PartOfSpeech::Interjection, nil],
        'VB' => [Ve::PartOfSpeech::Verb, nil],
        'VBD' => [Ve::PartOfSpeech::Verb, :past],
        'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
        'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
        'VBP' => [Ve::PartOfSpeech::Verb, nil],
        'VBZ' => [Ve::PartOfSpeech::Verb, nil],
        'WDT' => [Ve::PartOfSpeech::Determiner, nil],
        'WP' => [Ve::PartOfSpeech::Pronoun, nil],
        'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'WRB' => [Ve::PartOfSpeech::Adverb, nil],
        'Z' => [Ve::PartOfSpeech::Determiner, nil]
      }

      # Returns an Array of Ve::Word built from the parsed tokens.
      #
      # A token tagged 'POS' (possessive ending) is appended to the preceding
      # word rather than becoming a word of its own. Tags missing from
      # INTERNAL_INFO_FOR_PARSED_POS become Symbol when they match /^F\w+$/
      # and TBD otherwise.
      def words
        result = []

        @tokens.select { |t| t[:type] == :parsed }.each do |token|
          # Only merge when there is a previous word; a leading 'POS' token
          # used to raise NoMethodError on nil and is now handled like any
          # other unmapped tag.
          if token[:pos] == 'POS' && !result.empty?
            # Possessive ending, add to previous token
            # NOTE(review): if Ve::Word keeps a reference to the literal it was
            # given, this append also changes the previous token's :literal - confirm.
            result[-1].word << token[:literal]
            result[-1].tokens << token
            next
          end

          # All other tokens
          pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
          pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /^F\w+$/
          pos = Ve::PartOfSpeech::TBD if pos.nil?

          result << Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
        end

        result
      end

      # Returns an Array of sentence Strings, made by joining token literals
      # between :sentence_split tokens and stripping surrounding whitespace.
      def sentences
        sentences = []
        current = ''

        @tokens.each do |token|
          if token[:type] == :sentence_split
            sentences << current
            current = ''
          else
            current << token[:literal]
          end
        end

        # In case there is no :sentence_split at the end
        sentences << current if current.length > 0

        # strip! works in place; each returns the array itself.
        sentences.each(&:strip!)
      end

    end
  end
end
|
227
|
+
|
228
|
+
# Register the FreeLing provider class with Ve::Manager under :en.
Ve::Manager.register(Ve::Provider::FreelingEn, :en)
|
229
|
+
|