wlapi 0.0.6 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +5 -0
- data/README +11 -5
- data/README.rdoc +22 -0
- data/Rakefile +14 -0
- data/lib/wlapi/api.rb +354 -0
- data/lib/wlapi.rb +1 -267
- data/test/test_api.rb +233 -0
- data/wlapi.gemspec +9 -7
- metadata +14 -47
- data/doc/INSTALL.html +0 -90
- data/doc/LICENSE.html +0 -115
- data/doc/README.html +0 -119
- data/doc/WLAPI/API.html +0 -1008
- data/doc/WLAPI.html +0 -155
- data/doc/created.rid +0 -6
- data/doc/example/example_rb.html +0 -65
- data/doc/images/brick.png +0 -0
- data/doc/images/brick_link.png +0 -0
- data/doc/images/bug.png +0 -0
- data/doc/images/bullet_black.png +0 -0
- data/doc/images/bullet_toggle_minus.png +0 -0
- data/doc/images/bullet_toggle_plus.png +0 -0
- data/doc/images/date.png +0 -0
- data/doc/images/find.png +0 -0
- data/doc/images/loadingAnimation.gif +0 -0
- data/doc/images/macFFBgHack.png +0 -0
- data/doc/images/package.png +0 -0
- data/doc/images/page_green.png +0 -0
- data/doc/images/page_white_text.png +0 -0
- data/doc/images/page_white_width.png +0 -0
- data/doc/images/plugin.png +0 -0
- data/doc/images/ruby.png +0 -0
- data/doc/images/tag_green.png +0 -0
- data/doc/images/wrench.png +0 -0
- data/doc/images/wrench_orange.png +0 -0
- data/doc/images/zoom.png +0 -0
- data/doc/index.html +0 -131
- data/doc/js/darkfish.js +0 -116
- data/doc/js/jquery.js +0 -32
- data/doc/js/quicksearch.js +0 -114
- data/doc/js/thickbox-compressed.js +0 -10
- data/doc/lib/wlapi_rb.html +0 -59
- data/doc/rdoc.css +0 -706
- data/example/example.rb +0 -17
data/INSTALL
CHANGED
data/README
CHANGED
@@ -1,16 +1,22 @@
|
|
1
|
-
|
1
|
+
WLAPI
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
RubyGems (http://rubygems.org/gems/wlapi)
|
4
|
+
Developers Homepage (http://www.uni-trier.de/index.php?id=24140)
|
5
|
+
WLAPI Project Page (http://wlapi.rubyforge.org/)
|
5
6
|
|
6
|
-
|
7
|
+
DESCRIPTION
|
7
8
|
|
8
9
|
WLAPI is a simple API for the Wortschatz Leipzig project.
|
9
10
|
|
11
|
+
SYNOPSIS
|
10
12
|
$ require 'wlapi'
|
11
13
|
$ api = WLAPI::API.new
|
12
14
|
$ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
|
13
15
|
|
14
16
|
See the RDoc documentation for details on particular methods.
|
15
17
|
|
16
|
-
|
18
|
+
LICENSE
|
19
|
+
|
20
|
+
WLAPI is copyrighted software by Andrei Beliankou, 2010.
|
21
|
+
You may use, redistribute and change it under the terms
|
22
|
+
provided in the LICENSE file.
|
data/README.rdoc
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
= WLAPI
|
2
|
+
|
3
|
+
* {RubyGems}[http://rubygems.org/gems/wlapi]
|
4
|
+
* Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
|
5
|
+
* {WLAPI Project Page}[http://wlapi.rubyforge.org/]
|
6
|
+
|
7
|
+
== DESCRIPTION
|
8
|
+
|
9
|
+
WLAPI is a simple API for the Wortschatz Leipzig project.
|
10
|
+
|
11
|
+
== SYNOPSIS
|
12
|
+
$ require 'wlapi'
|
13
|
+
$ api = WLAPI::API.new
|
14
|
+
$ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
|
15
|
+
|
16
|
+
See the RDoc documentation for details on particular methods.
|
17
|
+
|
18
|
+
== LICENSE
|
19
|
+
|
20
|
+
WLAPI is copyrighted software by Andrei Beliankou, 2010.
|
21
|
+
You may use, redistribute and change it under the terms
|
22
|
+
provided in the LICENSE file.
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
# we can require 'rake/clean' to add 'clobber' and 'clean' tasks
|
4
|
+
require 'rake/clean'
|
5
|
+
require 'rake/testtask'
|
6
|
+
|
7
|
+
|
8
|
+
SRC = FileList['*.rb']
|
9
|
+
|
10
|
+
CLOBBER.include('doc', '**/*.html', '*.gem')
|
11
|
+
|
12
|
+
Rake::TestTask.new do |t|
|
13
|
+
t.test_files = FileList['test/*.rb']
|
14
|
+
end
|
data/lib/wlapi/api.rb
ADDED
@@ -0,0 +1,354 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# 2010-, Andrei Beliankou
|
3
|
+
|
4
|
+
# :title: Ruby based API for Wortschatz Leipzig web services
|
5
|
+
|
6
|
+
|
7
|
+
# SAVON is a SOAP client.
|
8
|
+
require 'savon'
|
9
|
+
|
10
|
+
# REXML is fast enough for our task.
|
11
|
+
require 'rexml/document'
|
12
|
+
include REXML
|
13
|
+
|
14
|
+
# Top level namespace wrapper for WLAPI
|
15
|
+
module WLAPI
|
16
|
+
|
17
|
+
# This class represents an interface to the linguistic web services
|
18
|
+
# provided by the University of Leipzig.
|
19
|
+
#
|
20
|
+
# See the project 'Wortschatz Leipzig' for more details.
|
21
|
+
class API
|
22
|
+
|
23
|
+
# At the creation point clients for all services are being instantiated.
|
24
|
+
# You can also set the login and the password (both default to 'anonymous').
|
25
|
+
# api = WLAPI::API.new
|
26
|
+
def initialize(login = 'anonymous', pass = 'anonymous')
|
27
|
+
|
28
|
+
# This hash contains the URLs to the single services.
|
29
|
+
endpoint = 'http://wortschatz.uni-leipzig.de/axis/services'
|
30
|
+
@services = {
|
31
|
+
'Thesaurus' => "#{endpoint}/Thesaurus",
|
32
|
+
'Baseform' => "#{endpoint}/Baseform",
|
33
|
+
'Similarity' => "#{endpoint}/Similarity",
|
34
|
+
'Synonyms' => "#{endpoint}/Synonyms",
|
35
|
+
'Sachgebiet' => "#{endpoint}/Sachgebiet",
|
36
|
+
'Frequencies' => "#{endpoint}/Frequencies",
|
37
|
+
'Kookurrenzschnitt' => "#{endpoint}/Kookkurrenzschnitt",
|
38
|
+
'ExperimentalSynonyms' => "#{endpoint}/ExperimentalSynonyms",
|
39
|
+
'RightCollocationFinder' => "#{endpoint}/RightCollocationFinder",
|
40
|
+
'LeftCollocationFinder' => "#{endpoint}/LeftCollocationFinder",
|
41
|
+
'Wordforms' => "#{endpoint}/Wordforms",
|
42
|
+
'CooccurrencesAll' => "#{endpoint}/CooccurrencesAll",
|
43
|
+
'LeftNeighbours' => "#{endpoint}/LeftNeighbours",
|
44
|
+
'RightNeighbours' => "#{endpoint}/RightNeighbours",
|
45
|
+
'Sentences' => "#{endpoint}/Sentences",
|
46
|
+
'Cooccurrences' => "#{endpoint}/Cooccurrences"
|
47
|
+
# no MARSService and Kreuzwortraetsel
|
48
|
+
}
|
49
|
+
|
50
|
+
# cl short for client.
|
51
|
+
# Dynamically create all the clients and set access credentials.
|
52
|
+
# It can be a very bad idea to instantiate all the clients at once,
|
53
|
+
# we should investigate the typical user behaviour.
|
54
|
+
# If only one service is used in the separate session => rewrite the class!
|
55
|
+
@services.each do |key, val|
|
56
|
+
cl_name = '@cl_' + key
|
57
|
+
eval("#{cl_name} = Savon::Client.new(val)")
|
58
|
+
eval("#{cl_name}.request.basic_auth(login, pass)")
|
59
|
+
end
|
60
|
+
|
61
|
+
# Savon creates very verbose logs, switching off.
|
62
|
+
Savon::Request.log = false unless $DEBUG
|
63
|
+
end
|
64
|
+
|
65
|
+
# Main methods to access different services.
|
66
|
+
#
|
67
|
+
# You can define the limit for the result set, it defaults to 10.
|
68
|
+
# If you want to get all the results, you should provide a number,
|
69
|
+
# which would be greater than the result set since we cannot
|
70
|
+
# predict how many answers the server will give us. Just try it.
|
71
|
+
##############################################################################
|
72
|
+
|
73
|
+
## One parameter methods.
|
74
|
+
##############################################################################
|
75
|
+
|
76
|
+
# Returns the frequency and frequency class of the input word.
|
77
|
+
# Frequency class is computed in relation to the most frequent word
|
78
|
+
# in the corpus. The higher the class, the rarer the word:
|
79
|
+
# api.frequencies("Autos") => ["40614", "9"]
|
80
|
+
def frequencies(word)
|
81
|
+
arg1 = ['Wort', word]
|
82
|
+
answer = query(@cl_Frequencies, @services['Frequencies'], arg1)
|
83
|
+
return get_answer(answer)
|
84
|
+
end
|
85
|
+
|
86
|
+
# Gets the baseform (whatever it is :) not lemma).
|
87
|
+
# Returns the lemmatized (base) form of the input word
|
88
|
+
# and the POS tag in an array:
|
89
|
+
# api.baseform("Auto") => ["Auto", "N"]
|
90
|
+
def baseform(word)
|
91
|
+
arg1 = ['Wort', word]
|
92
|
+
answer = query(@cl_Baseform, @services['Baseform'], arg1)
|
93
|
+
return get_answer(answer)
|
94
|
+
end
|
95
|
+
|
96
|
+
# Returns categories for a given input word as an array:
|
97
|
+
# api.domain("Michael") => ["Vorname", "Nachname", "Männername"]
|
98
|
+
#--
|
99
|
+
# Is it a good name? all names are in English, but here..
|
100
|
+
# let's call it domain, not sachgebiet
|
101
|
+
def domain(word)
|
102
|
+
arg1 = ['Wort', word]
|
103
|
+
answer = query(@cl_Sachgebiet, @services['Sachgebiet'], arg1)
|
104
|
+
return get_answer(answer)
|
105
|
+
end
|
106
|
+
|
107
|
+
## Two parameter methods.
|
108
|
+
##############################################################################
|
109
|
+
|
110
|
+
# Returns all other word forms of the same lemma for a given word form.
|
111
|
+
# api.wordforms("Auto") => ["Auto", "Autos"]
|
112
|
+
def wordforms(word, limit = 10)
|
113
|
+
# note, it is the only service which requires 'Word', not 'Wort'
|
114
|
+
arg1 = ['Word', word]
|
115
|
+
arg2 = ['Limit', limit]
|
116
|
+
answer = query(@cl_Wordforms, @services['Wordforms'], arg1, arg2)
|
117
|
+
return get_answer(answer)
|
118
|
+
end
|
119
|
+
|
120
|
+
# Like the Synonyms service, this returns synonyms of the given input word.
|
121
|
+
# However, this first builds a lemma of the input word
|
122
|
+
# and thus returns more synonyms:
|
123
|
+
# api.thesaurus("Auto") => ["Auto", "Bahn", "Wagen", "Zug", "Schiff", ...]
|
124
|
+
def thesaurus(word, limit = 10)
|
125
|
+
arg1 = ['Wort', word]
|
126
|
+
arg2 = ['Limit', limit]
|
127
|
+
answer = query(@cl_Thesaurus, @services['Thesaurus'], arg1, arg2)
|
128
|
+
return get_answer(answer)
|
129
|
+
end
|
130
|
+
|
131
|
+
# This method searches for synonyms.
|
132
|
+
# Returns synonyms of the input word. In other words, this is a thesaurus.
|
133
|
+
# api.synonyms("Auto") => ["Kraftwagen", "Automobil", "Benzinkutsche", ...]
|
134
|
+
def synonyms(word, limit = 10)
|
135
|
+
arg1 = ['Wort', word]
|
136
|
+
arg2 = ['Limit', limit]
|
137
|
+
answer = query(@cl_Synonyms, @services['Synonyms'], arg1, arg2)
|
138
|
+
# The Synonyms service returns duplicate values, so we take only the odd-positioned ones.
|
139
|
+
return get_answer(answer, '[position() mod 2 = 1 ]')
|
140
|
+
end
|
141
|
+
|
142
|
+
# Returns sample sentences containing the input word.
|
143
|
+
# The return value is an array:
|
144
|
+
# api.sentences("Auto") => ["40808144", "Zweitens der freche, frische Klang der Hupe
|
145
|
+
# und drittens die hinreißend gestylten 16-Zoll-Felgen,
|
146
|
+
# die es leider nur für dieses Auto gibt.", ...]
|
147
|
+
#--
|
148
|
+
# ok, but results should be filtered
|
149
|
+
def sentences(word, limit = 10)
|
150
|
+
arg1 = ['Wort', word]
|
151
|
+
arg2 = ['Limit', limit]
|
152
|
+
answer = query(@cl_Sentences, @services['Sentences'], arg1, arg2)
|
153
|
+
return get_answer(answer)
|
154
|
+
end
|
155
|
+
|
156
|
+
# For a given input word, returns statistically significant left neighbours
|
157
|
+
# (words co-occurring immediately to the left of the input word).
|
158
|
+
# api.left_neighbours("Auto") => ["geparktes", "Auto", "561", ...]
|
159
|
+
#--
|
160
|
+
# ok, but results should be filtered
|
161
|
+
def left_neighbours(word, limit = 10)
|
162
|
+
arg1 = ['Wort', word]
|
163
|
+
arg2 = ['Limit', limit]
|
164
|
+
answer = query(@cl_LeftNeighbours, @services['LeftNeighbours'], arg1, arg2)
|
165
|
+
return get_answer(answer)
|
166
|
+
end
|
167
|
+
|
168
|
+
# For a given input word, returns statistically significant right neighbours
|
169
|
+
# (words co-occurring immediately to the right of the input word).
|
170
|
+
# api.right_neighbours("Auto") => ["Auto", "erfaßt", "575", ...]
|
171
|
+
#--
|
172
|
+
# ok, but results should be filtered
|
173
|
+
def right_neighbours(word, limit = 10)
|
174
|
+
arg1 = ['Wort', word]
|
175
|
+
arg2 = ['Limit', limit]
|
176
|
+
answer = query(@cl_RightNeighbours, @services['RightNeighbours'], arg1, arg2)
|
177
|
+
return get_answer(answer)
|
178
|
+
end
|
179
|
+
|
180
|
+
|
181
|
+
# Returns automatically computed contextually similar words of the input word.
|
182
|
+
# Such similar words may be antonyms, hyperonyms, synonyms,
|
183
|
+
# cohyponyms or other.
|
184
|
+
# Note that due to the huge amount of data any query to this service
|
185
|
+
# may take a long time.
|
186
|
+
# api.similarity("Auto") => ["Auto", "Wagen", "26", ...]
|
187
|
+
def similarity(word, limit = 10)
|
188
|
+
arg1 = ['Wort', word]
|
189
|
+
arg2 = ['Limit', limit]
|
190
|
+
answer = query(@cl_Similarity, @services['Similarity'], arg1, arg2)
|
191
|
+
return get_answer(answer)
|
192
|
+
end
|
193
|
+
|
194
|
+
# This service delivers an experimental synonyms request for internal tests.
|
195
|
+
#--
|
196
|
+
# don't know, if we have to include this service...
|
197
|
+
def experimental_synonyms(word, limit = 10)
|
198
|
+
arg1 = ['Wort', word]
|
199
|
+
arg2 = ['Limit', limit]
|
200
|
+
answer = query(@cl_ExperimentalSynonyms, @services['ExperimentalSynonyms'], arg1, arg2)
|
201
|
+
return get_answer(answer)
|
202
|
+
end
|
203
|
+
|
204
|
+
## Three parameter methods.
|
205
|
+
##############################################################################
|
206
|
+
|
207
|
+
# Attempts to find linguistic collocations that occur to the right
|
208
|
+
# of the given input word.
|
209
|
+
# The parameter 'Wortart' accepts four values 'A, V, N, S'
|
210
|
+
# which stand for adjective, verb, noun and stopword respectively.
|
211
|
+
# The parameter restricts the type of words found.
|
212
|
+
# It returns an array:
|
213
|
+
# api.right_collocation_finder("Auto", "V", 10) => ["Auto", "abfackeln", "V", ...]
|
214
|
+
def right_collocation_finder(word, pos, limit = 10)
|
215
|
+
arg1 = ['Wort', word]
|
216
|
+
arg2 = ['Wortart', pos]
|
217
|
+
arg3 = ['Limit', limit]
|
218
|
+
answer = query(@cl_RightCollocationFinder, @services['RightCollocationFinder'], arg1, arg2, arg3)
|
219
|
+
return get_answer(answer)
|
220
|
+
end
|
221
|
+
|
222
|
+
# Attempts to find linguistic collocations that occur to the left
|
223
|
+
# of the given input word.
|
224
|
+
# The parameter 'Wortart' accepts four values 'A, V, N, S'
|
225
|
+
# which stand for adjective, verb, noun and stopword respectively.
|
226
|
+
# The parameter restricts the type of words found.
|
227
|
+
# It returns an array:
|
228
|
+
# api.left_collocation_finder("Stuhl", "A", 10) => ["apostolisch", "A", "Stuhl", ...]
|
229
|
+
def left_collocation_finder(word, pos, limit = 10)
|
230
|
+
arg1 = ['Wort', word]
|
231
|
+
arg2 = ['Wortart', pos]
|
232
|
+
arg3 = ['Limit', limit]
|
233
|
+
answer = query(@cl_LeftCollocationFinder, @services['LeftCollocationFinder'], arg1, arg2, arg3)
|
234
|
+
return get_answer(answer)
|
235
|
+
end
|
236
|
+
|
237
|
+
# Returns statistically significant co-occurrences of the input word.
|
238
|
+
def cooccurrences(word, sign, limit = 10)
|
239
|
+
arg1 = ['Wort', word]
|
240
|
+
arg2 = ['Mindestsignifikanz', sign]
|
241
|
+
arg3 = ['Limit', limit]
|
242
|
+
raise 'Not implemented yet!'
|
243
|
+
end
|
244
|
+
|
245
|
+
# Returns statistically significant co-occurrences of the input word.
|
246
|
+
# However, it searches in the unrestricted version of the co-occurrences table
|
247
|
+
# as in the Cooccurrences services,
|
248
|
+
# which means significantly longer wait times.
|
249
|
+
def cooccurrences_all(word, sign, limit = 10)
|
250
|
+
arg1 = ['Wort', word]
|
251
|
+
arg2 = ['Mindestsignifikanz', sign]
|
252
|
+
arg3 = ['Limit', limit]
|
253
|
+
raise 'Not implemented yet!'
|
254
|
+
end
|
255
|
+
|
256
|
+
# Returns the intersection of the co-occurrences of the two given words.
|
257
|
+
# The result set is ordered according to the sum of the significances
|
258
|
+
# in descending order. Note that due to the join involved,
|
259
|
+
# this may take some time.
|
260
|
+
#--
|
261
|
+
# let's call it intersection, not kookurrenzschnitt
|
262
|
+
# is being used INTERN, we need additional credentials
|
263
|
+
def intersection(word1, word2, limit = 10)
|
264
|
+
arg1 = ['Wort 1', word1]
|
265
|
+
arg2 = ['Wort 2', word2]
|
266
|
+
arg3 = ['Limit', limit]
|
267
|
+
# we are not going to implement it now
|
268
|
+
raise 'Will never be implemented!'
|
269
|
+
end
|
270
|
+
|
271
|
+
private
|
272
|
+
|
273
|
+
# Main query method, it invokes the soap engine.
|
274
|
+
# This method combines all the data to one SOAP request and gets the answer.
|
275
|
+
# It handles one, two and three parameter methods.
|
276
|
+
# args contains an array [[key1, value1], [key2, value2], [key3, value3]]
|
277
|
+
# with keys and values for the soap query
|
278
|
+
def query(cl, namespace, *args)
|
279
|
+
|
280
|
+
# Calling the action with ! (disables the wsdl query).
|
281
|
+
# wsdl is disabled since calling the server for wsdl can last too long.
|
282
|
+
resp = cl.execute! do |soap|
|
283
|
+
|
284
|
+
# adding a namespace, wsdl is disabled
|
285
|
+
soap.namespace = namespace
|
286
|
+
|
287
|
+
soap.namespaces['xmlns:soapenv'] = "http://schemas.xmlsoap.org/soap/envelope/"
|
288
|
+
|
289
|
+
# Every service has a different namespace.
|
290
|
+
soap.namespaces['xmlns:urn'] = "urn:#{namespace.sub(/.+ces\//, '')}"
|
291
|
+
|
292
|
+
soap.namespaces['xmlns:dat'] = "http://datatypes.webservice.wortschatz.uni_leipzig.de"
|
293
|
+
|
294
|
+
|
295
|
+
body = "<urn:objRequestParameters>"
|
296
|
+
body << "<urn:corpus>de</urn:corpus>"
|
297
|
+
body << "<urn:parameters>"
|
298
|
+
|
299
|
+
# setting the first argument (usually 'Wort')
|
300
|
+
if args[0]
|
301
|
+
body << "<urn:dataVectors><dat:dataRow>#{args[0][0]}</dat:dataRow><dat:dataRow>#{args[0][1]}</dat:dataRow></urn:dataVectors>"
|
302
|
+
end
|
303
|
+
|
304
|
+
# setting the second argument (usually 'Limit')
|
305
|
+
if args[1]
|
306
|
+
body << "<urn:dataVectors><dat:dataRow>#{args[1][0]}</dat:dataRow><dat:dataRow>#{args[1][1]}</dat:dataRow></urn:dataVectors>"
|
307
|
+
end
|
308
|
+
|
309
|
+
# setting the third argument (no common value)
|
310
|
+
if args[2]
|
311
|
+
body << "<urn:dataVectors><dat:dataRow>#{args[2][0]}</dat:dataRow><dat:dataRow>#{args[2][1]}</dat:dataRow></urn:dataVectors>"
|
312
|
+
end
|
313
|
+
body << "</urn:parameters>"
|
314
|
+
body << "</urn:objRequestParameters>"
|
315
|
+
|
316
|
+
soap.body = body
|
317
|
+
|
318
|
+
|
319
|
+
|
320
|
+
STDERR.puts soap.to_xml if $DEBUG
|
321
|
+
|
322
|
+
end
|
323
|
+
|
324
|
+
doc = Document.new(resp.to_xml)
|
325
|
+
|
326
|
+
STDERR.puts doc if $DEBUG
|
327
|
+
|
328
|
+
return doc
|
329
|
+
|
330
|
+
end
|
331
|
+
|
332
|
+
# This method extracts valuable data from the XML structure
|
333
|
+
# of the soap response. It returns an array with extracted xml text nodes
|
334
|
+
# or nil, if the service provided no answer.
|
335
|
+
# The same collection is printed to stderr in the DEBUG mode.
|
336
|
+
#--
|
337
|
+
# TODO: what if the answer is empty?
|
338
|
+
def get_answer(doc, mod='')
|
339
|
+
result = []
|
340
|
+
# The path seems to be weird, because the namespaces change incrementally
|
341
|
+
# in the output, so I don't want to treat it here.
|
342
|
+
# A modifier needed because synonyms service provides duplicate values.
|
343
|
+
XPath.each(doc, "//result/*/*#{mod}") {|el| STDERR.puts el.text} if $DEBUG
|
344
|
+
XPath.each(doc, "//result/*/*#{mod}") {|el| result << el.text}
|
345
|
+
|
346
|
+
if result.empty?
|
347
|
+
return nil
|
348
|
+
end
|
349
|
+
|
350
|
+
return result
|
351
|
+
end
|
352
|
+
|
353
|
+
end # class
|
354
|
+
end # module
|