wlapi 0.0.6 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
data/INSTALL CHANGED
@@ -0,0 +1,5 @@
1
+ Install this library using:
2
+
3
+ gem install wlapi-x.y.z.gem
4
+
5
+ You can find the appropriate .gem file in the ./pkg directory.
data/README CHANGED
@@ -1,16 +1,22 @@
1
- = WLAPI
1
+ WLAPI
2
2
 
3
- * http://rubygems.org/gems/wlapi
4
- * http://www.uni-trier.de/index.php?id=24140
3
+ RubyGems (http://rubygems.org/gems/wlapi)
4
+ Developer's Homepage (http://www.uni-trier.de/index.php?id=24140)
5
+ WLAPI Project Page (http://wlapi.rubyforge.org/)
5
6
 
6
- == DESCRIPTION
7
+ DESCRIPTION
7
8
 
8
9
  WLAPI is a simple API for the Wortschatz Leipzig project.
9
10
 
11
+ SYNOPSIS
10
12
  $ require 'wlapi'
11
13
  $ api = WLAPI::API.new
12
14
  $ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
13
15
 
14
16
  See RDOC documentation for details on particular methods.
15
17
 
16
- You can find some invocation examples in example/example.rb
18
+ LICENSE
19
+
20
+ WLAPI is copyrighted software by Andrei Beliankou, 2010.
21
+ You may use, redistribute and change it under the terms
22
+ provided in the LICENSE file.
data/README.rdoc ADDED
@@ -0,0 +1,22 @@
1
+ = WLAPI
2
+
3
+ * {RubyGems}[http://rubygems.org/gems/wlapi]
4
+ * Developer's {Homepage}[http://www.uni-trier.de/index.php?id=24140]
5
+ * {WLAPI Project Page}[http://wlapi.rubyforge.org/]
6
+
7
+ == DESCRIPTION
8
+
9
+ WLAPI is a simple API for the Wortschatz Leipzig project.
10
+
11
+ == SYNOPSIS
12
+ $ require 'wlapi'
13
+ $ api = WLAPI::API.new
14
+ $ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
15
+
16
+ See RDOC documentation for details on particular methods.
17
+
18
+ == LICENSE
19
+
20
+ WLAPI is copyrighted software by Andrei Beliankou, 2010.
21
+ You may use, redistribute and change it under the terms
22
+ provided in the LICENSE file.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # we can require 'rake/clean' to add 'clobber' and 'clean' tasks
4
+ require 'rake/clean'
5
+ require 'rake/testtask'
6
+
7
+
8
# Ruby source files located in the project root.
SRC = FileList['*.rb']

# Generated artifacts that `rake clobber` removes.
%w[doc **/*.html *.gem].each { |pattern| CLOBBER.include(pattern) }

# Defines the test task; it runs every Ruby file found in test/.
Rake::TestTask.new { |task| task.test_files = FileList['test/*.rb'] }
data/lib/wlapi/api.rb ADDED
@@ -0,0 +1,354 @@
1
+ # -*- coding: utf-8 -*-
2
+ # 2010-, Andrei Beliankou
3
+
4
+ # :title: Ruby based API for Wortschatz Leipzig web services
5
+
6
+
7
+ # SAVON is a SOAP client.
8
+ require 'savon'
9
+
10
+ # REXML is fast enough for our task.
11
+ require 'rexml/document'
12
+ include REXML
13
+
14
# Top level namespace wrapper for WLAPI
module WLAPI

  # This class represents an interface to the linguistic web services
  # provided by the University of Leipzig.
  #
  # See the project 'Wortschatz Leipzig' for more details.
  #
  # Every public query method returns an array of strings extracted from
  # the SOAP response, or +nil+ if the response contained no result rows.
  class API

    # At the creation point clients for all services are being instantiated.
    # You can also set the login and the password (both default to 'anonymous').
    #   api = WLAPI::API.new
    def initialize(login = 'anonymous', pass = 'anonymous')

      # This hash maps the service names to the URLs of the single services.
      endpoint = 'http://wortschatz.uni-leipzig.de/axis/services'
      @services = {
        'Thesaurus' => "#{endpoint}/Thesaurus",
        'Baseform' => "#{endpoint}/Baseform",
        'Similarity' => "#{endpoint}/Similarity",
        'Synonyms' => "#{endpoint}/Synonyms",
        'Sachgebiet' => "#{endpoint}/Sachgebiet",
        'Frequencies' => "#{endpoint}/Frequencies",
        # NOTE(review): the key is spelled with a single 'k' while the URL
        # uses 'Kookkurrenzschnitt'; kept unchanged because the key also
        # names the client instance variable.
        'Kookurrenzschnitt' => "#{endpoint}/Kookkurrenzschnitt",
        'ExperimentalSynonyms' => "#{endpoint}/ExperimentalSynonyms",
        'RightCollocationFinder' => "#{endpoint}/RightCollocationFinder",
        'LeftCollocationFinder' => "#{endpoint}/LeftCollocationFinder",
        'Wordforms' => "#{endpoint}/Wordforms",
        'CooccurrencesAll' => "#{endpoint}/CooccurrencesAll",
        'LeftNeighbours' => "#{endpoint}/LeftNeighbours",
        'RightNeighbours' => "#{endpoint}/RightNeighbours",
        'Sentences' => "#{endpoint}/Sentences",
        'Cooccurrences' => "#{endpoint}/Cooccurrences"
        # no MARSService and Kreuzwortraetsel
      }

      # Create one client per service, stored as @cl_<ServiceName>, and set
      # the access credentials on each of them.
      # It can be a very bad idea to instantiate all the clients at once,
      # we should investigate the typical user behaviour.
      # If only one service is used in a separate session => rewrite the class!
      @services.each do |name, url|
        client = Savon::Client.new(url)
        client.request.basic_auth(login, pass)
        # instance_variable_set avoids evaluating a generated code string.
        instance_variable_set("@cl_#{name}", client)
      end

      # Savon creates very verbose logs, switching off.
      Savon::Request.log = false unless $DEBUG
    end

    # Main methods to access different services.
    #
    # You can define the limit for the result set, it defaults to 10.
    # If you want to get all the results, you should provide a number,
    # which would be greater than the result set since we cannot
    # predict how many answers the server will give us. Just try it.
    ##############################################################################

    ## One parameter methods.
    ##############################################################################

    # Returns the frequency and frequency class of the input word.
    # Frequency class is computed in relation to the most frequent word
    # in the corpus. The higher the class, the rarer the word:
    #   api.frequencies("Autos") => ["40614", "9"]
    def frequencies(word)
      call_service('Frequencies', [['Wort', word]])
    end

    # Gets the baseform (whatever it is :) not lemma).
    # Returns the lemmatized (base) form of the input word
    # and the POS tag in an array:
    #   api.baseform("Auto") => ["Auto", "N"]
    def baseform(word)
      call_service('Baseform', [['Wort', word]])
    end

    # Returns categories for a given input word as an array:
    #   api.domain("Michael") => ["Vorname", "Nachname", "Männername"]
    #--
    # Is it a good name? all names are in English, but here..
    # let's call it domain, not sachgebiet
    def domain(word)
      call_service('Sachgebiet', [['Wort', word]])
    end

    ## Two parameter methods.
    ##############################################################################

    # Returns all other word forms of the same lemma for a given word form.
    #   api.wordforms("Auto") => ["Auto", "Autos"]
    def wordforms(word, limit = 10)
      # note, it is the only service which requires 'Word', not 'Wort'
      call_service('Wordforms', [['Word', word], ['Limit', limit]])
    end

    # Like the Synonyms service, returns synonyms of the given input word.
    # However, this first builds a lemma of the input word
    # and thus returns more synonyms:
    #   api.thesaurus("Auto") => ["Auto", "Bahn", "Wagen", "Zug", "Schiff", ...]
    def thesaurus(word, limit = 10)
      call_service('Thesaurus', [['Wort', word], ['Limit', limit]])
    end

    # This method searches for synonyms.
    # Returns synonyms of the input word. In other words, this is a thesaurus.
    #   api.synonyms("Auto") => ["Kraftwagen", "Automobil", "Benzinkutsche", ...]
    def synonyms(word, limit = 10)
      # The Synonyms service provides multiple values, so we take only the odd ones.
      call_service('Synonyms', [['Wort', word], ['Limit', limit]],
                   '[position() mod 2 = 1 ]')
    end

    # Returns sample sentences containing the input word.
    # The return value is an array:
    #   api.sentences("Auto") => ["40808144", "Zweitens der freche, frische Klang der Hupe
    #   und drittens die hinreißend gestylten 16-Zoll-Felgen,
    #   die es leider nur für dieses Auto gibt.", ...]
    #--
    # ok, but results should be filtered
    def sentences(word, limit = 10)
      call_service('Sentences', [['Wort', word], ['Limit', limit]])
    end

    # For a given input word, returns statistically significant left neighbours
    # (words co-occurring immediately to the left of the input word).
    #   api.left_neighbours("Auto") => ["geparktes", "Auto", "561", ...]
    #--
    # ok, but results should be filtered
    def left_neighbours(word, limit = 10)
      call_service('LeftNeighbours', [['Wort', word], ['Limit', limit]])
    end

    # For a given input word, returns statistically significant right neighbours
    # (words co-occurring immediately to the right of the input word).
    #   api.right_neighbours("Auto") => ["Auto", "erfaßt", "575", ...]
    #--
    # ok, but results should be filtered
    def right_neighbours(word, limit = 10)
      call_service('RightNeighbours', [['Wort', word], ['Limit', limit]])
    end

    # Returns automatically computed contextually similar words of the input word.
    # Such similar words may be antonyms, hyperonyms, synonyms,
    # cohyponyms or other.
    # Note that due to the huge amount of data any query to this service
    # may take a long time.
    #   api.similarity("Auto") => ["Auto", "Wagen", "26", ...]
    def similarity(word, limit = 10)
      call_service('Similarity', [['Wort', word], ['Limit', limit]])
    end

    # This service delivers an experimental synonyms request for internal tests.
    #--
    # don't know, if we have to include this service...
    def experimental_synonyms(word, limit = 10)
      call_service('ExperimentalSynonyms', [['Wort', word], ['Limit', limit]])
    end

    ## Three parameter methods.
    ##############################################################################

    # Attempts to find linguistic collocations that occur to the right
    # of the given input word.
    # The parameter 'Wortart' accepts four values 'A, V, N, S'
    # which stand for adjective, verb, noun and stopword respectively.
    # The parameter restricts the type of words found.
    # It returns an array:
    #   api.right_collocation_finder("Auto", "V", 10) => ["Auto", "abfackeln", "V", ...]
    def right_collocation_finder(word, pos, limit = 10)
      call_service('RightCollocationFinder',
                   [['Wort', word], ['Wortart', pos], ['Limit', limit]])
    end

    # Attempts to find linguistic collocations that occur to the left
    # of the given input word.
    # The parameter 'Wortart' accepts four values 'A, V, N, S'
    # which stand for adjective, verb, noun and stopword respectively.
    # The parameter restricts the type of words found.
    # It returns an array:
    #   api.left_collocation_finder("Stuhl", "A", 10) => ["apostolisch", "A", "Stuhl", ...]
    def left_collocation_finder(word, pos, limit = 10)
      call_service('LeftCollocationFinder',
                   [['Wort', word], ['Wortart', pos], ['Limit', limit]])
    end

    # Returns statistically significant co-occurrences of the input word.
    # Not implemented: always raises a RuntimeError.
    def cooccurrences(word, sign, limit = 10)
      raise 'Not implemented yet!'
    end

    # Returns statistically significant co-occurrences of the input word.
    # However, it searches in the unrestricted version of the co-occurrences table
    # as in the Cooccurrences services,
    # which means significantly longer wait times.
    # Not implemented: always raises a RuntimeError.
    def cooccurrences_all(word, sign, limit = 10)
      raise 'Not implemented yet!'
    end

    # Returns the intersection of the co-occurrences of the two given words.
    # The result set is ordered according to the sum of the significances
    # in descending order. Note that due to the join involved,
    # this may take some time.
    # Not implemented: always raises a RuntimeError.
    #--
    # let's call it intersection, not kookurrenzschnitt
    # is being used INTERN, we need additional credentials
    def intersection(word1, word2, limit = 10)
      # we are not going to implement it now
      raise 'Will never be implemented!'
    end

    private

    # Runs one request against the named service and extracts the answer.
    # +service+ is a key of @services; the matching client is read from
    # the instance variable @cl_<service>.
    # +args+ is an array of [key, value] pairs passed on to #query.
    # +filter+ is an optional XPath predicate passed on to #get_answer.
    def call_service(service, args, filter = '')
      client = instance_variable_get("@cl_#{service}")
      get_answer(query(client, @services[service], *args), filter)
    end

    # Main query method, it invokes the soap engine.
    # This method combines all the data to one SOAP request and gets the answer.
    # args contains an array [[key1, value1], [key2, value2], [key3, value3]]
    # with keys and values for the soap query; nil entries are skipped.
    # Returns the response parsed into a REXML::Document.
    def query(cl, namespace, *args)
      body = request_body(args)

      # Calling the action with ! (disables the wsdl query).
      # wsdl is disabled since calling the server for wsdl can last too long.
      resp = cl.execute! do |soap|

        # adding a namespace, wsdl is disabled
        soap.namespace = namespace

        soap.namespaces['xmlns:soapenv'] = "http://schemas.xmlsoap.org/soap/envelope/"

        # Every service has a different namespace.
        soap.namespaces['xmlns:urn'] = "urn:#{namespace.sub(/.+ces\//, '')}"

        soap.namespaces['xmlns:dat'] = "http://datatypes.webservice.wortschatz.uni_leipzig.de"

        soap.body = body

        STDERR.puts soap.to_xml if $DEBUG
      end

      doc = REXML::Document.new(resp.to_xml)

      STDERR.puts doc if $DEBUG

      doc
    end

    # Builds the XML request body: one dataVectors element per [key, value]
    # pair, wrapped in the fixed objRequestParameters envelope for the
    # 'de' corpus. Keys and values are XML-escaped.
    def request_body(args)
      vectors = args.compact.map do |key, value|
        '<urn:dataVectors>' +
          "<dat:dataRow>#{escape_xml(key)}</dat:dataRow>" +
          "<dat:dataRow>#{escape_xml(value)}</dat:dataRow>" +
          '</urn:dataVectors>'
      end

      '<urn:objRequestParameters>' +
        '<urn:corpus>de</urn:corpus>' +
        "<urn:parameters>#{vectors.join}</urn:parameters>" +
        '</urn:objRequestParameters>'
    end

    # Escapes XML special characters so that a value such as 'A&B'
    # cannot break the request document.
    def escape_xml(value)
      REXML::Text.normalize(value.to_s)
    end

    # This method extracts valuable data from the XML structure
    # of the soap response. It returns an array with extracted xml text nodes
    # or nil, if the service provided no answer.
    # The same collection is printed to stderr in the DEBUG mode.
    # +mod+ is an optional XPath predicate appended to the node path.
    def get_answer(doc, mod='')
      # The path seems to be weird, because the namespaces change incrementally
      # in the output, so I don't want to treat it here.
      # A modifier is needed because the synonyms service provides duplicate values.
      result = REXML::XPath.match(doc, "//result/*/*#{mod}").map { |el| el.text }

      result.each { |text| STDERR.puts text } if $DEBUG

      result.empty? ? nil : result
    end

  end # class
end # module