ve 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .DS_Store
2
+ .*.swp
3
+ *.gem
4
+
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source "http://rubygems.org" # NOTE(review): gems are fetched over plain HTTP; consider https — confirm
2
+
3
+ gem "json"
4
+
5
+ group :server do # gems below belong to the :server group only
6
+ gem "sinatra"
7
+ gem "rack-cors"
8
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ json (1.6.1)
5
+ rack (1.3.5)
6
+ rack-cors (0.2.4)
7
+ rack
8
+ rack-protection (1.1.4)
9
+ rack
10
+ sinatra (1.3.1)
11
+ rack (~> 1.3, >= 1.3.4)
12
+ rack-protection (~> 1.1, >= 1.1.2)
13
+ tilt (~> 1.3, >= 1.3.3)
14
+ tilt (1.3.3)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ json
21
+ rack-cors
22
+ sinatra
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env rake
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t| # Defines the Rake test task for this project.
6
+ t.pattern = "tests/*_test.rb" # test files are those matching tests/*_test.rb
7
+ end
8
+
9
+ task :default => :test # the default task depends on :test
data/Readme.md ADDED
@@ -0,0 +1,60 @@
1
+ Ve
2
+ ==
3
+
4
+ A linguistic framework for anyone. No degree required.
5
+
6
+ Read all about it on [kimtaro.github.com/ve](http://kimtaro.github.com/ve).
7
+
8
+ Ruby
9
+ ----
10
+
11
+ require 've'
12
+ words = Ve.in(:en).words('I like melons.')
13
+ # => [#<Ve::Word:0x8ee00cc @word="I", @lemma="i", @part_of_speech=Ve::PartOfSpeech::Pronoun, @tokens=[{:raw=>"I i PRP 1", :type=>:parsed, :literal=>"I", :lemma=>"i", :pos=>"PRP", :accuracy=>"1", :characters=>0..0}], @extra={:grammar=>:personal}, @info={}>, #<Ve::Word:0x8edff28 @word="like", @lemma="like", @part_of_speech=Ve::PartOfSpeech::Preposition, @tokens=[{:raw=>"like like IN 0.815649", :type=>:parsed, :literal=>"like", :lemma=>"like", :pos=>"IN", :accuracy=>"0.815649", :characters=>2..5}], @extra={:grammar=>nil}, @info={}>, #<Ve::Word:0x8edfe24 @word="melons", @lemma="melon", @part_of_speech=Ve::PartOfSpeech::Noun, @tokens=[{:raw=>"melons melon NNS 1", :type=>:parsed, :literal=>"melons", :lemma=>"melon", :pos=>"NNS", :accuracy=>"1", :characters=>7..12}], @extra={:grammar=>:plural}, @info={}>, #<Ve::Word:0x8edfcbc @word=".", @lemma=".", @part_of_speech=Ve::PartOfSpeech::Symbol, @tokens=[{:raw=>". . Fp 1", :type=>:parsed, :literal=>".", :lemma=>".", :pos=>"Fp", :accuracy=>"1", :characters=>13..13}], @extra={:grammar=>nil}, @info={}>]
14
+
15
+ words.collect(&:lemma) # => ["i", "like", "melon", "."]
16
+ words.collect(&:part_of_speech) # => [Ve::PartOfSpeech::Pronoun, Ve::PartOfSpeech::Preposition, Ve::PartOfSpeech::Noun, Ve::PartOfSpeech::Symbol]
17
+
18
+ Javascript
19
+ ----------
20
+
21
+ <script type="text/javascript" charset="utf-8" src="ve.js"></script>
22
+ <script type="text/javascript" charset="utf-8">
23
+ new Ve('ja').words('ビールがおいしかった', function(words) {
24
+ // [{"_class":"Word","word":"ビール","lemma":"ビール","part_of_speech":"noun","tokens":[{"raw":"ビール\t名詞,一般,*,*,*,*,ビール,ビール,ビール","type":"parsed","literal":"ビール","pos":"名詞","pos2":"一般","pos3":"*","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"ビール","reading":"ビール","hatsuon":"ビール","characters":"0..2"}],"extra":{"reading":"ビール","transcription":"ビール","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"が","lemma":"が","part_of_speech":"postposition","tokens":[{"raw":"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ","type":"parsed","literal":"が","pos":"助詞","pos2":"格助詞","pos3":"一般","pos4":"*","inflection_type":"*","inflection_form":"*","lemma":"が","reading":"ガ","hatsuon":"ガ","characters":"3..3"}],"extra":{"reading":"ガ","transcription":"ガ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}},{"_class":"Word","word":"おいしい","lemma":"おいしい","part_of_speech":"adjective","tokens":[{"raw":"おいしい\t形容詞,自立,*,*,形容詞・イ段,基本形,おいしい,オイシイ,オイシイ","type":"parsed","literal":"おいしい","pos":"形容詞","pos2":"自立","pos3":"*","pos4":"*","inflection_type":"形容詞・イ段","inflection_form":"基本形","lemma":"おいしい","reading":"オイシイ","hatsuon":"オイシイ","characters":"4..7"}],"extra":{"reading":"オイシイ","transcription":"オイシイ","grammar":null},"info":{"reading_script":"kata","transcription_script":"kata"}}]
25
+
26
+ for ( i in words ) {
27
+ var word = words[i];
28
+ console.log(word.lemma + "/" + word.part_of_speech)
29
+ }
30
+
31
+ // ビール/noun
32
+ // が/postposition
33
+ // おいしい/adjective
34
+ });
35
+ </script>
36
+
37
+ Structure
38
+ ---------
39
+
40
+ - **Ve::LocalInterface** - Main interface that gives access to functionality in providers that exist locally
41
+ - **Ve::XInterface** - Allows for different ways of accessing Ve providers. Locally, through an HTTP API, binary protocol or whatever
42
+ - **Ve::Manager** - Keeps track of providers and what they can do
43
+ - **Ve::Provider::X** - Talks to the underlying parser
44
+ - **Ve::Parse::X** - Takes the output from the Provider and turns it into functions the end user can use
45
+
46
+ Todo
47
+ ----
48
+
49
+ - Expose more through the sinatra server
50
+ - Alias lemma to base, so people don't need to know what lemmas are
51
+ - Break out into separate projects for each component. Ve-ruby, Ve-js.
52
+ - Better UTF-8 handling for Freeling
53
+ - See all the TODO's in the code
54
+
55
+ License
56
+ -------
57
+
58
+ (c) Kim Ahlström 2011
59
+
60
+ This is under the MIT license.
data/js/test.html ADDED
@@ -0,0 +1,32 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
5
+ <style type="text/css">
6
+ .fail { color: red; }
7
+ .pass { color: green; }
8
+ </style>
9
+ <script src="ve.js" type="text/javascript" charset="utf-8"></script>
10
+ <script type="text/javascript" charset="utf-8">
11
+ function assert(test, name) { // Appends a "<name>: pass|fail" paragraph to the #report element.
12
+ var report = document.getElementById('report');
13
+ var result = test ? 'pass' : 'fail'; // used both as the CSS class and as the visible label
14
+
15
+ report.innerHTML = report.innerHTML + '<p class="' + result + '">' + name + ': ' + result + '</p>'
16
+ }
17
+
18
+ new Ve('en').words('I ate hamburgers.', function(words){
19
+ assert((4 == words.length && 'eat' == words[1].lemma), 'English'); // expects 4 words, the second having lemma 'eat'
20
+ });
21
+
22
+ new Ve('ja').words('ビールを飲んだ', function(words){
23
+ // TODO: Shouldn't have to encode it here ...
24
+ var word = "\u98f2\u3080"; // 飲む
25
+ assert((3 == words.length && word == words[2].lemma), 'Japanese'); // expects 3 words, the third having the lemma stored in `word`
26
+ });
27
+ </script>
28
+ </head>
29
+ <body>
30
+ <div id="report"></div>
31
+ </body>
32
+ </html>
data/js/ve.js ADDED
@@ -0,0 +1,57 @@
1
+ /**
2
+ * ve.js
3
+ *
4
+ * Communicates with a Sinatra-server to facilitate linguistic
5
+ * parsing tech in JS.
6
+ *
7
+ * @Author: Kim Ahlstrom
8
+ * @Author: Ryan McGrath <ryan@venodesigns.net>
9
+ * @Requires: Nothing
10
+ */
11
+
12
;(function(w, d, undefined) {
    // Per-page sequence number; keeps JSONP callback names unique even when
    // several requests are started within the same millisecond.
    var callbackCounter = 0;

    /**
     * Client bound to one language.
     *
     * @param {string} language Language code placed in the request path (e.g. 'en', 'ja').
     */
    var Ve = w.Ve = function Ve(language) {
        this.language = language;
        this.url = 'http://localhost:4567/';
        return this;
    };

    Ve.prototype = {
        /**
         * Requests the words of `text` from the server via JSONP.
         *
         * @param {string} text Text to analyse; it is percent-encoded before being put in the URL.
         * @param {function} callbackfn Called with the server's response.
         * @return {Ve} this, to allow chaining.
         */
        words: function(text, callbackfn) {
            jsonp(this.url + this.language + '/words?text=' + encodeURIComponent(text), callbackfn);
            return this;
        }
    };

    /**
     * Loads `src` through an injected <script> tag, exposing `callbackfn` under a
     * unique global name that is appended to the URL as the `callback` parameter.
     */
    var jsonp = function jsonp(src, callbackfn) {
        var newScript = d.createElement("script"),
            callback = 've_callback_' + (+new Date()) + '_' + (callbackCounter++),
            // Removes the injected tag and the global callback once the script has loaded,
            // so we don't litter someone's DOM or global scope with our stuff.
            cleanup = function() {
                if(newScript.parentNode) {
                    newScript.parentNode.removeChild(newScript);
                }
                w[callback] = null;
            };

        newScript.type = "text/javascript";
        newScript.setAttribute("async", "true");
        newScript.setAttribute("src", src + '&callback=' + callback);
        w[callback] = callbackfn;

        // Branches on whether the element exposes readyState or needs a "load" listener.
        if(newScript.readyState) {
            newScript.onreadystatechange = function() {
                if(/loaded|complete/.test(newScript.readyState)) {
                    newScript.onreadystatechange = null;
                    cleanup();
                }
            };
        } else {
            newScript.addEventListener("load", cleanup, false);
        }

        d.documentElement.firstChild.appendChild(newScript);
    };
})(window, document);
data/lib/language.rb ADDED
@@ -0,0 +1,2 @@
1
+ class Ve::Language # Empty class; no behaviour is defined here.
2
+ end
@@ -0,0 +1,6 @@
1
+ class Ve
2
+ class Language
3
+ class English # Placeholder class: no methods are defined for English here.
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,9 @@
1
+ class Ve
2
+ class Language
3
+ class Japanese # Placeholder class: its only content is a commented-out interface_for call.
4
+
5
+ #interface_for :ja
6
+
7
+ end
8
+ end
9
+ end
data/lib/misc.rb ADDED
@@ -0,0 +1,10 @@
1
class Enumerator
  # Reports whether the enumerator still has an element to yield.
  #
  # Uses #peek, so the enumerator's position is not advanced. Only
  # StopIteration (raised by #peek at the end) is treated as "no more";
  # any other error raised while producing the next element propagates
  # to the caller instead of being reported as exhaustion.
  #
  # @return [Boolean] true if another element is available, false otherwise
  def more?
    peek
    true
  rescue StopIteration
    false
  end
end
@@ -0,0 +1,30 @@
1
class Ve
  # Base class for part-of-speech markers. Each part of speech is an empty
  # subclass; the classes themselves are used as values.
  class PartOfSpeech

    # Lower-cased last segment of the class name,
    # e.g. Ve::PartOfSpeech::ProperNoun.name # => "propernoun".
    #
    # @return [String]
    def self.name
      to_s.split('::').last.downcase
    end

    class Noun < PartOfSpeech; end
    class ProperNoun < PartOfSpeech; end
    class Pronoun < PartOfSpeech; end
    class Adjective < PartOfSpeech; end
    class Adverb < PartOfSpeech; end
    class Determiner < PartOfSpeech; end
    class Preposition < PartOfSpeech; end
    class Postposition < PartOfSpeech; end
    class Verb < PartOfSpeech; end
    class Suffix < PartOfSpeech; end
    class Prefix < PartOfSpeech; end
    class Conjunction < PartOfSpeech; end
    class Interjection < PartOfSpeech; end
    class Number < PartOfSpeech; end
    class Unknown < PartOfSpeech; end
    class Symbol < PartOfSpeech; end
    class Other < PartOfSpeech; end

    class TBD < PartOfSpeech; end # Placeholder for provider PoS that haven't had a Ve PoS assigned yet

  end
end
data/lib/provider.rb ADDED
@@ -0,0 +1,29 @@
1
class Ve
  # Interface, to be implemented by providers. Every method here is an empty
  # stub that returns nil; concrete providers override them.
  class Provider

    # What the provider offers. Stub.
    def provides
    end

    # Starts whatever the provider needs in order to parse. Stub.
    def start!
    end

    # Whether the provider is usable. Stub.
    def works?
    end

    # Parses text. Stub. The parameters mirror the signature used by the
    # concrete provider (text, options = {}); both are optional here so that
    # existing zero-argument calls keep working.
    def parse(text = nil, options = {})
    end

  end
end
20
+
21
+ class Ve
22
+ class Parse # Base class for parse results.
23
+
24
+ # TODO
25
+ def as_json # Stub: the body is empty, so this currently returns nil.
26
+ end
27
+
28
+ end
29
+ end
File without changes
@@ -0,0 +1,229 @@
1
+ # Encoding: UTF-8
2
+
3
+ # TODO: Retain capitalization in lemmas?
4
+ # TODO: Memoize
5
+
6
+ require 'open3'
7
+
8
class Ve
  class Provider
    # Provider that pipes text through an external analyzer process
    # (FreeLing, English configuration) and wraps its output in a
    # Ve::Parse::FreelingEn.
    class FreelingEn < Ve::Provider

      # Marker sent after each text; seeing it in the output means the
      # analyzer has finished with that text.
      BIT_STOP = 'VeEnd'

      # TODO: Automatically set FREELINGSHARE if it's not set?
      #
      # @param config [Hash] overrides for :app (executable name), :path and :flags.
      #   NOTE(review): :flags is always replaced below and :path is never read.
      def initialize(config = {})
        @config = {:app => 'analyzer',
                   :path => '',
                   :flags => ''}.merge(config)

        # String#strip, not strip!: the bang version returns nil when there is
        # nothing to strip (e.g. empty output because the executable wasn't found).
        @config[:app] = `which #{@config[:app]}`.strip
        # The config directory is looked up under /usr/local when the resolved
        # executable path contains "local", otherwise under /usr.
        local = @config[:app] =~ /local/ ? '/local' : ''
        @config[:flags] = "-f /usr#{local}/share/FreeLing/config/en.cfg --flush --nonumb --nodate"

        start!
      end

      # Interface methods

      # Sanity check: parses a known word and compares the raw output lines.
      #
      # @return [Boolean]
      def works?
        ["Wrote write VBD 1", ""] == parse('Wrote').tokens.collect { |t| t[:raw] }
      end

      # Talks to the app and returns a parse object.
      #
      # Best effort: if the analyzer is unavailable or anything goes wrong while
      # talking to it, a parse with no output lines is returned instead of raising.
      #
      # @param text [String] text to analyse
      # @param options [Hash] currently unused
      # @return [Ve::Parse::FreelingEn]
      def parse(text, options = {})
        start! if @stdin.nil?
        # Fix Unicode chars
        # TODO: These need to be converted back to the original char in the :literal attribute
        text = text.gsub('’', "'")

        # The analyzer could not be started; nothing to talk to.
        return Ve::Parse::FreelingEn.new(text, []) if @stdin.nil?

        @stdin.puts "#{text}\n#{BIT_STOP}\n"
        output = []

        while line = @stdout.readline
          if line.include?(BIT_STOP)
            # Consume the line that follows the stop marker so it doesn't
            # leak into the next parse.
            @stdout.readline
            break
          end
          output << line
        end

        Ve::Parse::FreelingEn.new(text, output)
      rescue
        Ve::Parse::FreelingEn.new(text, [])
      end

      private

      # Spawns the analyzer and sets up the pipe encodings. Leaves @stdin nil
      # when there is no executable to run or it cannot be started.
      def start!
        return if @config[:app].to_s.empty?

        @stdin, @stdout, @stderr = Open3.popen3("#{@config[:app]} #{@config[:flags]}")

        # TODO: Also filter out non-iso-latin-1 characters
        @stdin.set_encoding('UTF-8', 'ISO-8859-1')
        @stdout.set_encoding('ISO-8859-1', 'UTF-8')
      rescue Errno::ENOENT
        # The parser couldn't be started. Probably not installed on this system
      end

    end
  end
end
71
+
72
class Ve
  class Parse
    # Parse result built from the analyzer's line-based output.
    #
    # Each non-empty output line is split on whitespace into
    # literal, lemma, pos and accuracy; empty lines are sentence splits.
    # Text between parsed tokens is kept as :unparsed tokens so that
    # character ranges refer back to the original text.
    class FreelingEn < Ve::Parse

      attr_reader :tokens, :text

      # @param text [String] the text that was sent to the analyzer
      # @param output [Array<String>] the analyzer's output lines; they are not modified
      def initialize(text, output)
        @tokens = []
        @text = text
        position = 0

        output.each_with_index do |raw_line, index|
          # Work on a copy so the caller's strings are left untouched
          # (and frozen strings are accepted).
          line = raw_line.rstrip
          token = {:raw => line}

          # Anything unparsed at the end of the text
          # This must happen before sentence splits are detected to avoid funny ordering
          if output.length > 1 && output.length == index + 1
            unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
            # The match is nil when position has run past the end of the text.
            if unparsed_md && unparsed_md[1].length > 0
              unparsed_token = {:type => :unparsed,
                                :literal => unparsed_md[1],
                                :raw => ''}
              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
              @tokens << unparsed_token
            end
          end

          # Sentence splits are just empty lines in Freeling
          if line.length == 0
            token[:type] = :sentence_split
            token[:literal] = ''
            @tokens << token
            next
          end

          # The parsed token
          info = line.split(/\s+/)
          token[:type] = :parsed
          [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
            token[attr] = info[i]
          end

          token[:literal] = token[:literal].tr('_', ' ')
          # A line with a single field has no lemma.
          token[:lemma] = token[:lemma].tr('_', ' ') if token[:lemma]

          # Anything unparsed preceding this token.
          # We need to do this complicated dance with _ since Freeling replaces spaces with it.
          # And so we need to be able to find the token with both spaces and _ in it since
          # we don't know what the original in the text actually is.
          # Once we have the location in the text we can figure out if it should be with spaces or _.
          # NOTE(review): by this point '_' in the literal has already been turned into
          # spaces, so the '_' substitution below looks like a no-op — confirm intent.
          unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
          unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
          unparsed_md = unparsed_re.match(text, position)
          if unparsed_md && unparsed_md[1].length > 0
            unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
            unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
            @tokens << unparsed_token
            position += unparsed_token[:literal].length
          end

          token[:characters] = (position..(position+token[:literal].length-1))
          position += token[:literal].length
          @tokens << token
        end
      end

      # Maps the analyzer's PoS tag to [Ve part of speech, grammar detail].
      INTERNAL_INFO_FOR_PARSED_POS = {
        'CC' => [Ve::PartOfSpeech::Conjunction, nil],
        'CD' => [Ve::PartOfSpeech::Number, nil],
        'DT' => [Ve::PartOfSpeech::Determiner, nil],
        'EX' => [Ve::PartOfSpeech::Pronoun, nil],
        'FW' => [Ve::PartOfSpeech::Unknown, nil],
        'IN' => [Ve::PartOfSpeech::Preposition, nil],
        'JJ' => [Ve::PartOfSpeech::Adjective, nil],
        'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
        'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
        'LS' => [Ve::PartOfSpeech::Unknown, nil],
        'MD' => [Ve::PartOfSpeech::Verb, :modal],
        'NN' => [Ve::PartOfSpeech::Noun, nil],
        'NNS' => [Ve::PartOfSpeech::Noun, :plural],
        'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
        'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
        'PDT' => [Ve::PartOfSpeech::Determiner, nil],
        'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
        'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'RB' => [Ve::PartOfSpeech::Adverb, nil],
        'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
        'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
        'RP' => [Ve::PartOfSpeech::Postposition, nil],
        'SYM' => [Ve::PartOfSpeech::Symbol, nil],
        'TO' => [Ve::PartOfSpeech::Preposition, nil],
        'UH' => [Ve::PartOfSpeech::Interjection, nil],
        'VB' => [Ve::PartOfSpeech::Verb, nil],
        'VBD' => [Ve::PartOfSpeech::Verb, :past],
        'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
        'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
        'VBP' => [Ve::PartOfSpeech::Verb, nil],
        'VBZ' => [Ve::PartOfSpeech::Verb, nil],
        'WDT' => [Ve::PartOfSpeech::Determiner, nil],
        'WP' => [Ve::PartOfSpeech::Pronoun, nil],
        'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
        'WRB' => [Ve::PartOfSpeech::Adverb, nil],
        'Z' => [Ve::PartOfSpeech::Determiner, nil]
      }.freeze

      # Builds Ve::Word objects from the parsed tokens.
      #
      # A possessive ending (tag 'POS') is appended to the preceding word. Tags
      # missing from the table become Symbol when they start with 'F', TBD otherwise.
      #
      # @return [Array<Ve::Word>]
      def words
        words = []

        @tokens.find_all { |t| t[:type] == :parsed }.each do |token|
          # Possessive ending, add to previous token. With no previous word to
          # attach to, it falls through and is handled like any other token.
          if token[:pos] == 'POS' && !words.empty?
            words[-1].word << token[:literal]
            words[-1].tokens << token
            next
          end

          # All other tokens
          pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
          pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /^F\w+$/
          pos = Ve::PartOfSpeech::TBD if pos.nil?

          words << Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
        end

        words
      end

      # Reassembles the text of each sentence from the token literals.
      #
      # @return [Array<String>] sentences with surrounding whitespace removed
      def sentences
        sentences = []
        current = ''

        @tokens.each do |token|
          if token[:type] == :sentence_split
            sentences << current
            current = ''
          else
            current << token[:literal]
          end
        end

        # In case there is no :sentence_split at the end
        sentences << current if current.length > 0

        sentences.map(&:strip)
      end

    end
  end
end
227
+
228
+ Ve::Manager.register(Ve::Provider::FreelingEn, :en) # Registers this provider class with Ve::Manager for :en.
229
+