ve 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .DS_Store
2
+ .*.swp
3
+ *.gem
4
+
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "json"
4
+
5
+ group :server do
6
+ gem "sinatra"
7
+ gem "rack-cors"
8
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ json (1.6.1)
5
+ rack (1.3.5)
6
+ rack-cors (0.2.4)
7
+ rack
8
+ rack-protection (1.1.4)
9
+ rack
10
+ sinatra (1.3.1)
11
+ rack (~> 1.3, >= 1.3.4)
12
+ rack-protection (~> 1.1, >= 1.1.2)
13
+ tilt (~> 1.3, >= 1.3.3)
14
+ tilt (1.3.3)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ json
21
+ rack-cors
22
+ sinatra
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env rake
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.pattern = "tests/*_test.rb"
7
+ end
8
+
9
+ task :default => :test
data/Readme.md ADDED
@@ -0,0 +1,60 @@
1
+ Ve
2
+ ==
3
+
4
+ A linguistic framework for anyone. No degree required.
5
+
6
+ Read all about it on [kimtaro.github.com/ve](http://kimtaro.github.com/ve).
7
+
8
+ Ruby
9
+ ----
10
+
11
+ require 've'
12
+ words = Ve.in(:en).words('I like melons.')
13
+ # => [#<Ve::Word:0x8ee00cc @word="I", @lemma="i", @part_of_speech=Ve::PartOfSpeech::Pronoun, @tokens=[{:raw=>"I i PRP 1", :type=>:parsed, :literal=>"I", :lemma=>"i", :pos=>"PRP", :accuracy=>"1", :characters=>0..0}], @extra={:grammar=>:personal}, @info={}>, #<Ve::Word:0x8edff28 @word="like", @lemma="like", @part_of_speech=Ve::PartOfSpeech::Preposition, @tokens=[{:raw=>"like like IN 0.815649", :type=>:parsed, :literal=>"like", :lemma=>"like", :pos=>"IN", :accuracy=>"0.815649", :characters=>2..5}], @extra={:grammar=>nil}, @info={}>, #<Ve::Word:0x8edfe24 @word="melons", @lemma="melon", @part_of_speech=Ve::PartOfSpeech::Noun, @tokens=[{:raw=>"melons melon NNS 1", :type=>:parsed, :literal=>"melons", :lemma=>"melon", :pos=>"NNS", :accuracy=>"1", :characters=>7..12}], @extra={:grammar=>:plural}, @info={}>, #<Ve::Word:0x8edfcbc @word=".", @lemma=".", @part_of_speech=Ve::PartOfSpeech::Symbol, @tokens=[{:raw=>". . Fp 1", :type=>:parsed, :literal=>".", :lemma=>".", :pos=>"Fp", :accuracy=>"1", :characters=>13..13}], @extra={:grammar=>nil}, @info={}>]
14
+
15
+ words.collect(&:lemma) # => ["i", "like", "melon", "."]
16
+ words.collect(&:part_of_speech) # => [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol]
17
+
18
+ Javascript
19
+ ----------
20
+
21
+ <script type="text/javascript" charset="utf-8" src="ve.js"></script>
22
+ <script type="text/javascript" charset="utf-8">
23
+ new Ve('ja').words('ビールがおいしかった', function(words) {
24
+ // [{"_class":"Word","word":"ビール","lemma":"ビール","part_of_speech":"noun","tokens":[{"raw":"ビール\t名詞,一般,*,*,*,*,ビール,ビール,ビール","type":"parsed","literal":"ビール","pos":"名詞","pos2":"一般","pos3":"*","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"ビール","reading":"ビール","hatsuon":"ビール","characters":"0..2"}],"extra":{"reading":"ビール","transcription":"ビール","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"が","lemma":"が","part_of_speech":"postposition","tokens":[{"raw":"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ","type":"parsed","literal":"が","pos":"助詞","pos2":"格助詞","pos3":"一般","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"が","reading":"ガ","hatsuon":"ガ","characters":"3..3"}],"extra":{"reading":"ガ","transcription":"ガ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"おいしい","lemma":"おいしい","part_of_speech":"adjective","tokens":[{"raw":"おいしい\t形容詞,自立,*,*,形容詞・イ段,基本形,おいしい,オイシイ,オイシイ","type":"parsed","literal":"おいしい","pos":"形容詞","pos2":"自立","pos3":"*","pos4":"*","inflection_type":"形容詞・イ段","inflection_form":"基本形","lemma":"おいしい","reading":"オイシイ","hatsuon":"オイシイ","characters":"4..7"}],"extra":{"reading":"オイシイ","transcription":"オイシイ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}}]
25
+
26
+ for ( i in words ) {
27
+ var word = words[i];
28
+ console.log(word.lemma + "/" + word.part_of_speech)
29
+ }
30
+
31
+ // ビール/noun
32
+ // が/postposition
33
+ // おいしい/adjective
34
+ });
35
+ </script>
36
+
37
+ Structure
38
+ ---------
39
+
40
+ - **Ve::LocalInterface** - Main interface that gives access to functionality in providers that exist locally
41
+ - **Ve::XInterface** - Allows for different ways of accessing Ve providers. Locally, through an HTTP API, binary protocol or whatever
42
+ - **Ve::Manager** - Keeps track of providers and what they can do
43
+ - **Ve::Provider::X** - Talks to the underlying parser
44
+ - **Ve::Parse::X** - Takes the output from the Provider and turns it into functions the end user can use
45
+
46
+ Todo
47
+ ----
48
+
49
+ - Expose more through the sinatra server
50
+ - Alias lemma to base, so people don't need to know what lemmas are
51
+ - Break out into separate projects for each component. Ve-ruby, Ve-js.
52
+ - Better UTF-8 handling for Freeling
53
+ - See all the TODOs in the code
54
+
55
+ License
56
+ -------
57
+
58
+ (c) Kim Ahlström 2011
59
+
60
+ This is under the MIT license.
data/js/test.html ADDED
@@ -0,0 +1,32 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
5
+ <style type="text/css">
6
+ .fail { color: red; }
7
+ .pass { color: green; }
8
+ </style>
9
+ <script src="ve.js" type="text/javascript" charset="utf-8"></script>
10
+ <script type="text/javascript" charset="utf-8">
11
/**
 * Appends one pass/fail line to the element with id "report".
 *
 * test - truthy when the check succeeded
 * name - label printed in front of the outcome
 */
function assert(test, name) {
  var target = document.getElementById('report'),
      outcome = 'fail';

  if (test) {
    outcome = 'pass';
  }

  // The outcome doubles as the CSS class (.pass / .fail) and the visible text.
  target.innerHTML += '<p class="' + outcome + '">' + name + ': ' + outcome + '</p>';
}
17
+
18
+ new Ve('en').words('I ate hamburgers.', function(words){
19
+ assert((4 == words.length && 'eat' == words[1].lemma), 'English');
20
+ });
21
+
22
+ new Ve('ja').words('ビールを飲んだ', function(words){
23
+ // TODO: Shouldn't have to encode it here ...
24
+ var word = "\u98f2\u3080"; // 飲む
25
+ assert((3 == words.length && word == words[2].lemma), 'Japanese');
26
+ });
27
+ </script>
28
+ </head>
29
+ <body>
30
+ <div id="report"></div>
31
+ </body>
32
+ </html>
data/js/ve.js ADDED
@@ -0,0 +1,57 @@
1
+ /**
2
+ * ve.js
3
+ *
4
+ * Communicates with a Sinatra-server to facilitate linguistic
5
+ * parsing tech in JS.
6
+ *
7
+ * @Author: Kim Ahlstrom
8
+ * @Author: Ryan McGrath <ryan@venodesigns.net>
9
+ * @Requires: Nothing
10
+ */
11
+
12
;(function(w, d, undefined) {
  // Incremented per request so that two calls made within the same
  // millisecond still get distinct global callback names.
  var callbackCounter = 0;

  /**
   * Client for a Ve server.
   *
   * language - language code used as the first path segment (e.g. 'en', 'ja')
   */
  var Ve = w.Ve = function Ve(language) {
    this.language = language;
    this.url = 'http://localhost:4567/';
    return this;
  };

  Ve.prototype = {
    /**
     * Requests the words of `text` and hands the result to `callbackfn`.
     * Returns the Ve instance so calls can be chained.
     */
    words: function(text, callbackfn) {
      // encodeURIComponent emits UTF-8 percent-escapes and keeps characters
      // such as '&', '#' and '+' in the text from corrupting the query string.
      jsonp(this.url + this.language + '/words?text=' + encodeURIComponent(text), callbackfn);
      return this;
    }
  };

  /**
   * Minimal JSONP: injects a <script> whose src carries a `callback`
   * parameter naming a temporary global function.
   */
  var jsonp = function jsonp(src, callbackfn) {
    var newScript = d.createElement("script"),
        callback = 've_callback_' + (+new Date()) + '_' + (callbackCounter++);

    // Remove the injected tag and release the global once the script has run,
    // so we don't litter someone's DOM or global namespace.
    var cleanup = function() {
      if (newScript.parentNode) {
        newScript.parentNode.removeChild(newScript);
      }
      w[callback] = null;
    };

    newScript.type = "text/javascript";
    newScript.setAttribute("async", "true");
    newScript.setAttribute("src", src + '&callback=' + callback);
    w[callback] = callbackfn;

    if (newScript.readyState) {
      // Browsers that expose readyState on script elements (old IE).
      newScript.onreadystatechange = function() {
        if (/loaded|complete/.test(newScript.readyState)) {
          newScript.onreadystatechange = null;
          cleanup();
        }
      };
    } else {
      newScript.addEventListener("load", cleanup, false);
    }

    d.documentElement.firstChild.appendChild(newScript);
  };
  // No third argument: the `undefined` parameter must really be undefined,
  // not the string 'undefined'.
})(window, document);
data/lib/language.rb ADDED
@@ -0,0 +1,2 @@
1
# Namespace class for the per-language classes (Ve::Language::English, ...).
#
# Written in the nested form used by the other Ve files so that loading this
# file does not raise NameError when Ve has not been defined yet.
class Ve
  class Language
  end
end
@@ -0,0 +1,6 @@
1
class Ve
  class Language
    # English language class; it currently defines no members.
    class English; end
  end
end
@@ -0,0 +1,9 @@
1
class Ve
  class Language
    # Japanese language class; it currently defines no members.
    class Japanese
      #interface_for :ja
    end
  end
end
data/lib/misc.rb ADDED
@@ -0,0 +1,10 @@
1
class Enumerator
  # Reports whether the enumerator can yield at least one more element.
  #
  # Relies on #peek, so the enumerator's position is not advanced. Only
  # StopIteration (the end-of-sequence signal) is treated as "no more";
  # any other error raised while producing the next element propagates
  # instead of being reported as exhaustion.
  #
  # @return [Boolean] true if another element is available, false otherwise
  def more?
    peek
    true
  rescue StopIteration
    false
  end
end
@@ -0,0 +1,30 @@
1
class Ve
  # Root of the part-of-speech hierarchy. Each part of speech is an empty
  # subclass that is used as a tag.
  class PartOfSpeech

    # Short lowercase label for the class: the last segment of the constant
    # path, downcased (Ve::PartOfSpeech::ProperNoun => "propernoun").
    #
    # Note that this overrides Class#name for PartOfSpeech and its subclasses.
    def self.name
      self.to_s.split('::').last.downcase
    end

    class Noun < PartOfSpeech; end
    class ProperNoun < PartOfSpeech; end
    class Pronoun < PartOfSpeech; end
    class Adjective < PartOfSpeech; end
    class Adverb < PartOfSpeech; end
    class Determiner < PartOfSpeech; end
    class Preposition < PartOfSpeech; end
    class Postposition < PartOfSpeech; end
    class Verb < PartOfSpeech; end
    class Suffix < PartOfSpeech; end
    class Prefix < PartOfSpeech; end
    class Conjunction < PartOfSpeech; end
    class Interjection < PartOfSpeech; end
    class Number < PartOfSpeech; end
    class Unknown < PartOfSpeech; end
    class Symbol < PartOfSpeech; end
    class Other < PartOfSpeech; end

    class TBD < PartOfSpeech; end # Placeholder for provider PoS that haven't had a Ve PoS assigned yet

  end
end
data/lib/provider.rb ADDED
@@ -0,0 +1,29 @@
1
class Ve
  # Interface, to be implemented by providers.
  #
  # Every method here is an empty stub that returns nil; concrete providers
  # override them.
  class Provider

    def provides
    end

    def start!
    end

    def works?
    end

    # Accepts the same arguments as the concrete implementation in this
    # codebase (text plus an options hash). Both are optional so that
    # existing zero-argument calls on the base class keep working.
    def parse(text = nil, options = {})
    end

  end
end
20
+
21
class Ve
  # Superclass for parse results.
  class Parse

    # TODO
    # Stub with no body yet; returns nil.
    def as_json; end

  end
end
File without changes
@@ -0,0 +1,229 @@
1
+ # Encoding: UTF-8
2
+
3
+ # TODO: Retain capitalization in lemmas?
4
+ # TODO: Memoize
5
+
6
+ require 'open3'
7
+
8
class Ve
  class Provider
    # Provider that talks to an external analyzer executable (FreeLing, with
    # its English configuration) over a long-lived stdin/stdout pipe.
    class FreelingEn < Ve::Provider

      # Sentinel line written after every text so the reader can tell where
      # the output for that request ends.
      BIT_STOP = 'VeEnd'

      # TODO: Automatically set FREELINGSHARE if it's not set?
      #
      # config - Hash of overrides:
      #          :app   - name of the analyzer executable, resolved via `which`
      #          :flags - command line flags; when empty, a default set
      #                   pointing at the English FreeLing config is used
      #          :path  - accepted, but not read by this class
      def initialize(config = {})
        @config = {:app => 'analyzer',
                   :path => '',
                   :flags => ''}.merge(config)

        # String#strip! returns nil when there is nothing to strip, so use
        # the non-destructive form to always end up with a String.
        @config[:app] = `which #{@config[:app]}`.strip

        # Only fall back to the built-in flags when the caller gave none;
        # previously caller-supplied flags were silently overwritten.
        if @config[:flags].to_s.empty?
          local = @config[:app] =~ /local/ ? '/local' : ''
          @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"
        end

        start!
      end

      # Interface methods

      # Smoke test: parses a known word and compares the raw output lines.
      def works?
        (["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] })
      end

      # Talks to the app and returns a parse object.
      #
      # text    - String to analyze
      # options - currently not read
      #
      # Any error raised along the way (for example when the analyzer could
      # not be started, or the pipe hits EOF) results in a parse built from
      # empty output rather than an exception.
      def parse(text, options = {})
        start! if @stdin.nil?
        # Fix Unicode chars
        # TODO: These need to be converted back to the original char in the :literal attribute
        text = text.gsub('’', "'")

        @stdin.puts "#{text}\n#{BIT_STOP}\n"
        output = []

        while line = @stdout.readline
          # Plain substring test; BIT_STOP contains no regexp metacharacters.
          if line.include?(BIT_STOP)
            # Consume one more line after the sentinel
            # (presumably the blank line that closes it -- TODO confirm).
            @stdout.readline
            break
          end
          output << line
        end

        Ve::Parse::FreelingEn.new(text, output)
      rescue
        Ve::Parse::FreelingEn.new(text, [])
      end

      private

      # Spawns the analyzer process and sets the encodings on its pipes.
      def start!
        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")

        # TODO: Also filter out non-iso-latin-1 characters
        @stdin.set_encoding('UTF-8', 'ISO-8859-1')
        @stdout.set_encoding('ISO-8859-1', 'UTF-8')
      rescue Errno::ENOENT
        # The parser couldn't be started. Probably not installed on this system
      end

    end
  end
end
71
+
72
class Ve
  class Parse
    # Turns the raw analyzer output lines for one text into tokens, and
    # derives words and sentences from those tokens.
    class FreelingEn < Ve::Parse

      attr_reader :tokens, :text

      # text   - the String that was sent to the analyzer
      # output - Array of output lines, one token per line; an empty line
      #          marks a sentence split. The lines are right-stripped in place.
      #
      # Builds @tokens, a list of hashes with :type (:parsed, :unparsed or
      # :sentence_split), :literal, and for located tokens a :characters range
      # into text.
      def initialize(text, output)
        @tokens = []
        @text = text
        position = 0

        output.each_with_index do |line, index|
          line.rstrip!
          token = {:raw => line}

          # Anything unparsed at the end of the text
          # This must happen before sentence splits are detected to avoid funny ordering
          if output.length > 1 && output.length == index + 1
            unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
            if unparsed_md[1].length > 0
              unparsed_token = {:type => :unparsed,
                                :literal => unparsed_md[1],
                                :raw => ''}
              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
              @tokens << unparsed_token
            end
          end

          # Sentence splits are just empty lines in Freeling
          if line.length == 0
            token[:type] = :sentence_split
            token[:literal] = ''
            @tokens << token
            next
          end

          # The parsed token
          info = line.split(/\s+/)
          token[:type] = :parsed
          [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
            token[attr] = info[i]
          end

          token[:literal].gsub!('_', ' ')
          token[:lemma].gsub!('_', ' ')

          # Anything unparsed preceding this token.
          # We need to do this complicated dance with _ since Freeling replaces spaces with it.
          # And so we need to be able to find the token with both spaces and _ in it since
          # we don't know what the original in the text actually is.
          # Once we have the location in the text we can figure out if it should be with spaces or _.
          unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
          unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
          unparsed_md = unparsed_re.match(text, position)
          if unparsed_md && unparsed_md[1].length > 0
            unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
            unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
            @tokens << unparsed_token
            position += unparsed_token[:literal].length
          end

          token[:characters] = (position..(position+token[:literal].length-1))
          position += token[:literal].length
          @tokens << token
        end
      end

      # Maps an analyzer PoS tag to [Ve part of speech, grammar detail].
      INTERNAL_INFO_FOR_PARSED_POS = {
        'CC' => [Ve::PartOfSpeech::Conjunction, nil],
        'CD' => [Ve::PartOfSpeech::Number, nil],
        'DT' => [Ve::PartOfSpeech::Determiner, nil],
        'EX' => [Ve::PartOfSpeech::Pronoun, nil],
        'FW' => [Ve::PartOfSpeech::Unknown, nil],
        'IN' => [Ve::PartOfSpeech::Preposition, nil],
        'JJ' => [Ve::PartOfSpeech::Adjective, nil],
        'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
        'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
        'LS' => [Ve::PartOfSpeech::Unknown, nil],
        'MD' => [Ve::PartOfSpeech::Verb, :modal],
        'NN' => [Ve::PartOfSpeech::Noun, nil],
        'NNS' => [Ve::PartOfSpeech::Noun, :plural],
        'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
        'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
        'PDT' => [Ve::PartOfSpeech::Determiner, nil],
        'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
        'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'RB' => [Ve::PartOfSpeech::Adverb, nil],
        'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
        'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
        'RP' => [Ve::PartOfSpeech::Postposition, nil],
        'SYM' => [Ve::PartOfSpeech::Symbol, nil],
        'TO' => [Ve::PartOfSpeech::Preposition, nil],
        'UH' => [Ve::PartOfSpeech::Interjection, nil],
        'VB' => [Ve::PartOfSpeech::Verb, nil],
        'VBD' => [Ve::PartOfSpeech::Verb, :past],
        'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
        'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
        'VBP' => [Ve::PartOfSpeech::Verb, nil],
        'VBZ' => [Ve::PartOfSpeech::Verb, nil],
        'WDT' => [Ve::PartOfSpeech::Determiner, nil],
        'WP' => [Ve::PartOfSpeech::Pronoun, nil],
        'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'WRB' => [Ve::PartOfSpeech::Adverb, nil],
        'Z' => [Ve::PartOfSpeech::Determiner, nil]
      }.freeze

      # Builds one Ve::Word per parsed token, folding possessive endings
      # (tag 'POS') into the word that precedes them.
      #
      # Returns an Array of Ve::Word.
      def words
        words = []

        @tokens.find_all { |t| t[:type] == :parsed }.each do |token|
          # Possessive ending, add to previous token. When there is no
          # previous word the token is handled like any other tag, instead
          # of failing on words[-1] being nil.
          if token[:pos] == 'POS' && !words.empty?
            words[-1].word << token[:literal]
            words[-1].tokens << token
            next
          end

          # All other tokens
          pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]

          # Tags that are not in the table but start with "F" are treated as symbols.
          pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /^F\w+$/
          pos = Ve::PartOfSpeech::TBD if pos.nil?

          # Hand the word its own copy of the literal: the possessive branch
          # above appends to the word string in place, which would otherwise
          # also alter this token's :literal if Ve::Word keeps the reference
          # (Ve::Word is defined elsewhere -- TODO confirm).
          words << Ve::Word.new(token[:literal].dup, token[:lemma], pos, [token], {:grammar => grammar})
        end

        words
      end

      # Joins token literals into sentence strings, cutting at
      # :sentence_split tokens.
      #
      # Returns an Array of whitespace-stripped Strings.
      def sentences
        sentences = []
        current = ''

        @tokens.each do |token|
          if token[:type] == :sentence_split
            sentences << current
            current = ''
          else
            current << token[:literal]
          end
        end

        # In case there is no :sentence_split at the end
        sentences << current if current.length > 0

        # strip! works in place; each returns the array itself.
        sentences.each(&:strip!)
      end

    end
  end
end
227
+
228
+ Ve::Manager.register(Ve::Provider::FreelingEn, :en)
229
+