wlapi 0.0.6 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/INSTALL CHANGED
@@ -0,0 +1,5 @@
1
+ Install this library using:
2
+
3
+ gem install wlapi-x.y.z.gem
4
+
5
+ You can find the appropriate .gem file in the ./pkg directory.
data/README CHANGED
@@ -1,16 +1,22 @@
1
- = WLAPI
1
+ WLAPI
2
2
 
3
- * http://rubygems.org/gems/wlapi
4
- * http://www.uni-trier.de/index.php?id=24140
3
+ RubyGems (http://rubygems.org/gems/wlapi)
4
+ Developers Homepage (http://www.uni-trier.de/index.php?id=24140)
5
+ WLAPI Project Page (http://wlapi.rubyforge.org/)
5
6
 
6
- == DESCRIPTION
7
+ DESCRIPTION
7
8
 
8
9
  WLAPI is a simple API for the Wortschatz Leipzig project.
9
10
 
11
+ SYNOPSIS
10
12
  $ require 'wlapi'
11
13
  $ api = WLAPI::API.new
12
14
  $ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
13
15
 
14
16
  See RDOC documentation for details on particular methods.
15
17
 
16
- You can find some invocation examples in example/example.rb
18
+ LICENSE
19
+
20
+ WLAPI is copyrighted software by Andrei Beliankou, 2010.
21
+ You may use, redistribute and change it under the terms
22
+ provided in the LICENSE file.
data/README.rdoc ADDED
@@ -0,0 +1,22 @@
1
+ = WLAPI
2
+
3
+ * {RubyGems}[http://rubygems.org/gems/wlapi]
4
+ * Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
5
+ * {WLAPI Project Page}[http://wlapi.rubyforge.org/]
6
+
7
+ == DESCRIPTION
8
+
9
+ WLAPI is a simple API for the Wortschatz Leipzig project.
10
+
11
+ == SYNOPSIS
12
+ $ require 'wlapi'
13
+ $ api = WLAPI::API.new
14
+ $ api.synonyms('Haus', 15) # returns an array with string values (UTF8 encoded)
15
+
16
+ See RDOC documentation for details on particular methods.
17
+
18
+ == LICENSE
19
+
20
+ WLAPI is copyrighted software by Andrei Beliankou, 2010.
21
+ You may use, redistribute and change it under the terms
22
+ provided in the LICENSE file.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # we can require 'rake/clean' to add 'clobber' and 'clean' tasks
4
+ require 'rake/clean'
5
+ require 'rake/testtask'
6
+
7
+
8
+ SRC = FileList['*.rb']
9
+
10
+ CLOBBER.include('doc', '**/*.html', '*.gem')
11
+
12
+ Rake::TestTask.new do |t|
13
+ t.test_files = FileList['test/*.rb']
14
+ end
data/lib/wlapi/api.rb ADDED
@@ -0,0 +1,354 @@
1
+ # -*- coding: utf-8 -*-
2
+ # 2010-, Andrei Beliankou
3
+
4
+ # :title: Ruby based API for Wortschatz Leipzig web services
5
+
6
+
7
+ # SAVON is a SOAP client.
8
+ require 'savon'
9
+
10
+ # REXML is fast enough for our task.
11
+ require 'rexml/document'
12
+ include REXML
13
+
14
+ # Top level namespace wrapper for WLAPI
15
+ module WLAPI
16
+
17
+ # This class represents an interface to the linguistic web services
18
+ # provided by the University of Leipzig.
19
+ #
20
+ # See the project 'Wortschatz Leipzig' for more details.
21
+ class API
22
+
23
+ # At the creation point clients for all services are being instantiated.
24
+ # You can also set the login and the password (both default to 'anonymous').
25
+ # api = WLAPI::API.new
26
+ def initialize(login = 'anonymous', pass = 'anonymous')
27
+
28
+ # This hash contains the URLs to the single services.
29
+ endpoint = 'http://wortschatz.uni-leipzig.de/axis/services'
30
+ @services = {
31
+ 'Thesaurus' => "#{endpoint}/Thesaurus",
32
+ 'Baseform' => "#{endpoint}/Baseform",
33
+ 'Similarity' => "#{endpoint}/Similarity",
34
+ 'Synonyms' => "#{endpoint}/Synonyms",
35
+ 'Sachgebiet' => "#{endpoint}/Sachgebiet",
36
+ 'Frequencies' => "#{endpoint}/Frequencies",
37
+ 'Kookurrenzschnitt' => "#{endpoint}/Kookkurrenzschnitt",
38
+ 'ExperimentalSynonyms' => "#{endpoint}/ExperimentalSynonyms",
39
+ 'RightCollocationFinder' => "#{endpoint}/RightCollocationFinder",
40
+ 'LeftCollocationFinder' => "#{endpoint}/LeftCollocationFinder",
41
+ 'Wordforms' => "#{endpoint}/Wordforms",
42
+ 'CooccurrencesAll' => "#{endpoint}/CooccurrencesAll",
43
+ 'LeftNeighbours' => "#{endpoint}/LeftNeighbours",
44
+ 'RightNeighbours' => "#{endpoint}/RightNeighbours",
45
+ 'Sentences' => "#{endpoint}/Sentences",
46
+ 'Cooccurrences' => "#{endpoint}/Cooccurrences"
47
+ # no MARSService and Kreuzwortraetsel
48
+ }
49
+
50
+ # cl short for client.
51
+ # Dynamically create all the clients and set access credentials.
52
+ # It can be a very bad idea to instantiate all the clients at once,
53
+ # we should investigate the typical user behaviour.
54
+ # If only one service is used in the separate session => rewrite the class!
55
+ @services.each do |key, val|
56
+ cl_name = '@cl_' + key
57
+ eval("#{cl_name} = Savon::Client.new(val)")
58
+ eval("#{cl_name}.request.basic_auth(login, pass)")
59
+ end
60
+
61
+ # Savon creates very verbose logs, switching off.
62
+ Savon::Request.log = false unless $DEBUG
63
+ end
64
+
65
+ # Main methods to access different services.
66
+ #
67
+ # You can define the limit for the result set, it defaults to 10.
68
+ # If you want to get all the results, you should provide a number,
69
+ # which would be greater than the result set since we cannot
70
+ # predict how many answers the server will give us. Just try it.
71
+ ##############################################################################
72
+
73
+ ## One parameter methods.
74
+ ##############################################################################
75
+
76
+ # Returns the frequency and frequency class of the input word.
77
+ # Frequency class is computed in relation to the most frequent word
78
+ # in the corpus. The higher the class, the rarer the word:
79
+ # api.frequencies("Autos") => ["40614", "9"]
80
+ def frequencies(word)
81
+ arg1 = ['Wort', word]
82
+ answer = query(@cl_Frequencies, @services['Frequencies'], arg1)
83
+ return get_answer(answer)
84
+ end
85
+
86
+ # Gets the baseform (whatever it is :) not lemma).
87
+ # Returns the lemmatized (base) form of the input word
88
+ # and the POS tag in an array:
89
+ # api.baseform("Auto") => ["Auto", "N"]
90
+ def baseform(word)
91
+ arg1 = ['Wort', word]
92
+ answer = query(@cl_Baseform, @services['Baseform'], arg1)
93
+ return get_answer(answer)
94
+ end
95
+
96
+ # Returns categories for a given input word as an array:
97
+ # api.domain("Michael") => ["Vorname", "Nachname", "Männername"]
98
+ #--
99
+ # Is it a good name? all names are in English, but here..
100
+ # let's call it domain, not sachgebiet
101
+ def domain(word)
102
+ arg1 = ['Wort', word]
103
+ answer = query(@cl_Sachgebiet, @services['Sachgebiet'], arg1)
104
+ return get_answer(answer)
105
+ end
106
+
107
+ ## Two parameter methods.
108
+ ##############################################################################
109
+
110
+ # Returns all other word forms of the same lemma for a given word form.
111
+ # api.wordforms("Auto") => ["Auto", "Autos"]
112
+ def wordforms(word, limit = 10)
113
+ # note, it is the only service which requires 'Word', not 'Wort'
114
+ arg1 = ['Word', word]
115
+ arg2 = ['Limit', limit]
116
+ answer = query(@cl_Wordforms, @services['Wordforms'], arg1, arg2)
117
+ return get_answer(answer)
118
+ end
119
+
120
+ # Like the Synonyms service, returns synonyms of the given input word.
121
+ # However, this first builds a lemma of the input word
122
+ # and thus returns more synonyms:
123
+ # api.thesaurus("Auto") => ["Auto", "Bahn", "Wagen", "Zug", "Schiff", ...]
124
+ def thesaurus(word, limit = 10)
125
+ arg1 = ['Wort', word]
126
+ arg2 = ['Limit', limit]
127
+ answer = query(@cl_Thesaurus, @services['Thesaurus'], arg1, arg2)
128
+ return get_answer(answer)
129
+ end
130
+
131
+ # This method searches for synonyms.
132
+ # Returns synonyms of the input word. In other words, this is a thesaurus.
133
+ # api.synonyms("Auto") => ["Kraftwagen", "Automobil", "Benzinkutsche", ...]
134
+ def synonyms(word, limit = 10)
135
+ arg1 = ['Wort', word]
136
+ arg2 = ['Limit', limit]
137
+ answer = query(@cl_Synonyms, @services['Synonyms'], arg1, arg2)
138
+ # The Synonyms service returns duplicate values, so we take only the odd positions.
139
+ return get_answer(answer, '[position() mod 2 = 1 ]')
140
+ end
141
+
142
+ # Returns sample sentences containing the input word.
143
+ # The return value is an array:
144
+ # api.sentences("Auto") => ["40808144", "Zweitens der freche, frische Klang der Hupe
145
+ # und drittens die hinreißend gestylten 16-Zoll-Felgen,
146
+ # die es leider nur für dieses Auto gibt.", ...]
147
+ #--
148
+ # ok, but results should be filtered
149
+ def sentences(word, limit = 10)
150
+ arg1 = ['Wort', word]
151
+ arg2 = ['Limit', limit]
152
+ answer = query(@cl_Sentences, @services['Sentences'], arg1, arg2)
153
+ return get_answer(answer)
154
+ end
155
+
156
+ # For a given input word, returns statistically significant left neighbours
157
+ # (words co-occurring immediately to the left of the input word).
158
+ # api.left_neighbours("Auto") => ["geparktes", "Auto", "561", ...]
159
+ #--
160
+ # ok, but results should be filtered
161
+ def left_neighbours(word, limit = 10)
162
+ arg1 = ['Wort', word]
163
+ arg2 = ['Limit', limit]
164
+ answer = query(@cl_LeftNeighbours, @services['LeftNeighbours'], arg1, arg2)
165
+ return get_answer(answer)
166
+ end
167
+
168
+ # For a given input word, returns statistically significant right neighbours
169
+ # (words co-occurring immediately to the right of the input word).
170
+ # api.right_neighbours("Auto") => ["Auto", "erfaßt", "575", ...]
171
+ #--
172
+ # ok, but results should be filtered
173
+ def right_neighbours(word, limit = 10)
174
+ arg1 = ['Wort', word]
175
+ arg2 = ['Limit', limit]
176
+ answer = query(@cl_RightNeighbours, @services['RightNeighbours'], arg1, arg2)
177
+ return get_answer(answer)
178
+ end
179
+
180
+
181
+ # Returns automatically computed contextually similar words of the input word.
182
+ # Such similar words may be antonyms, hyperonyms, synonyms,
183
+ # cohyponyms or other.
184
+ # Note that due to the huge amount of data any query to this service
185
+ # may take a long time.
186
+ # api.similarity("Auto") => ["Auto", "Wagen", "26", ...]
187
+ def similarity(word, limit = 10)
188
+ arg1 = ['Wort', word]
189
+ arg2 = ['Limit', limit]
190
+ answer = query(@cl_Similarity, @services['Similarity'], arg1, arg2)
191
+ return get_answer(answer)
192
+ end
193
+
194
+ # This service delivers an experimental synonyms request for internal tests.
195
+ #--
196
+ # don't know, if we have to include this service...
197
+ def experimental_synonyms(word, limit = 10)
198
+ arg1 = ['Wort', word]
199
+ arg2 = ['Limit', limit]
200
+ answer = query(@cl_ExperimentalSynonyms, @services['ExperimentalSynonyms'], arg1, arg2)
201
+ return get_answer(answer)
202
+ end
203
+
204
+ ## Three parameter methods.
205
+ ##############################################################################
206
+
207
+ # Attempts to find linguistic collocations that occur to the right
208
+ # of the given input word.
209
+ # The parameter 'Wortart' accepts four values 'A, V, N, S'
210
+ # which stand for adjective, verb, noun and stopword respectively.
211
+ # The parameter restricts the type of words found.
212
+ # It returns an array:
213
+ # api.right_collocation_finder("Auto", "V", 10) => ["Auto", "abfackeln", "V", ...]
214
+ def right_collocation_finder(word, pos, limit = 10)
215
+ arg1 = ['Wort', word]
216
+ arg2 = ['Wortart', pos]
217
+ arg3 = ['Limit', limit]
218
+ answer = query(@cl_RightCollocationFinder, @services['RightCollocationFinder'], arg1, arg2, arg3)
219
+ return get_answer(answer)
220
+ end
221
+
222
+ # Attempts to find linguistic collocations that occur to the left
223
+ # of the given input word.
224
+ # The parameter 'Wortart' accepts four values 'A, V, N, S'
225
+ # which stand for adjective, verb, noun and stopword respectively.
226
+ # The parameter restricts the type of words found.
227
+ # It returns an array:
228
+ # api.left_collocation_finder("Stuhl", "A", 10) => ["apostolisch", "A", "Stuhl", ...]
229
+ def left_collocation_finder(word, pos, limit = 10)
230
+ arg1 = ['Wort', word]
231
+ arg2 = ['Wortart', pos]
232
+ arg3 = ['Limit', limit]
233
+ answer = query(@cl_LeftCollocationFinder, @services['LeftCollocationFinder'], arg1, arg2, arg3)
234
+ return get_answer(answer)
235
+ end
236
+
237
+ # Returns statistically significant co-occurrences of the input word.
238
+ def cooccurrences(word, sign, limit = 10)
239
+ arg1 = ['Wort', word]
240
+ arg2 = ['Mindestsignifikanz', sign]
241
+ arg3 = ['Limit', limit]
242
+ raise 'Not implemented yet!'
243
+ end
244
+
245
+ # Returns statistically significant co-occurrences of the input word.
246
+ # However, it searches in the unrestricted version of the co-occurrences table
247
+ # as in the Cooccurrences services,
248
+ # which means significantly longer wait times.
249
+ def cooccurrences_all(word, sign, limit = 10)
250
+ arg1 = ['Wort', word]
251
+ arg2 = ['Mindestsignifikanz', sign]
252
+ arg3 = ['Limit', limit]
253
+ raise 'Not implemented yet!'
254
+ end
255
+
256
+ # Returns the intersection of the co-occurrences of the two given words.
257
+ # The result set is ordered according to the sum of the significances
258
+ # in descending order. Note that due to the join involved,
259
+ # this may take some time.
260
+ #--
261
+ # let's call it intersection, not kookurrenzschnitt
262
+ # is being used INTERN, we need additional credentials
263
+ def intersection(word1, word2, limit = 10)
264
+ arg1 = ['Wort 1', word1]
265
+ arg2 = ['Wort 2', word2]
266
+ arg3 = ['Limit', limit]
267
+ # we are not going to implement it now
268
+ raise 'Will never be implemented!'
269
+ end
270
+
271
+ private
272
+
273
+ # Main query method, it invokes the soap engine.
274
+ # This method combines all the data to one SOAP request and gets the answer.
275
+ # It accepts up to three [key, value] argument pairs; any further pairs are ignored.
276
+ # args contains an array [[key1, value1], [key2, value2], [key3, value3]]
277
+ # with keys and values for the soap query
278
+ def query(cl, namespace, *args)
279
+
280
+ # Calling the action with ! (disables the wsdl query).
281
+ # wsdl is disabled since calling the server for wsdl can last too long.
282
+ resp = cl.execute! do |soap|
283
+
284
+ # adding a namespace, wsdl is disabled
285
+ soap.namespace = namespace
286
+
287
+ soap.namespaces['xmlns:soapenv'] = "http://schemas.xmlsoap.org/soap/envelope/"
288
+
289
+ # Every service has a different namespace.
290
+ soap.namespaces['xmlns:urn'] = "urn:#{namespace.sub(/.+ces\//, '')}"
291
+
292
+ soap.namespaces['xmlns:dat'] = "http://datatypes.webservice.wortschatz.uni_leipzig.de"
293
+
294
+
295
+ body = "<urn:objRequestParameters>"
296
+ body << "<urn:corpus>de</urn:corpus>"
297
+ body << "<urn:parameters>"
298
+
299
+ # setting the first argument (usually 'Wort')
300
+ if args[0]
301
+ body << "<urn:dataVectors><dat:dataRow>#{args[0][0]}</dat:dataRow><dat:dataRow>#{args[0][1]}</dat:dataRow></urn:dataVectors>"
302
+ end
303
+
304
+ # setting the second argument (usually 'Limit')
305
+ if args[1]
306
+ body << "<urn:dataVectors><dat:dataRow>#{args[1][0]}</dat:dataRow><dat:dataRow>#{args[1][1]}</dat:dataRow></urn:dataVectors>"
307
+ end
308
+
309
+ # setting the third argument (no common value)
310
+ if args[2]
311
+ body << "<urn:dataVectors><dat:dataRow>#{args[2][0]}</dat:dataRow><dat:dataRow>#{args[2][1]}</dat:dataRow></urn:dataVectors>"
312
+ end
313
+ body << "</urn:parameters>"
314
+ body << "</urn:objRequestParameters>"
315
+
316
+ soap.body = body
317
+
318
+
319
+
320
+ STDERR.puts soap.to_xml if $DEBUG
321
+
322
+ end
323
+
324
+ doc = Document.new(resp.to_xml)
325
+
326
+ STDERR.puts doc if $DEBUG
327
+
328
+ return doc
329
+
330
+ end
331
+
332
+ # This method extracts valuable data from the XML structure
333
+ # of the soap response. It returns an array with extracted xml text nodes
334
+ # or nil, if the service provided no answer.
335
+ # The same collection is printed to stderr in the DEBUG mode.
336
+ #--
337
+ # TODO: what if the answer is empty?
338
+ def get_answer(doc, mod='')
339
+ result = []
340
+ # The path seems to be weird, because the namespaces change incrementally
341
+ # in the output, so I don't want to treat it here.
342
+ # A modifier needed because synonyms service provides duplicate values.
343
+ XPath.each(doc, "//result/*/*#{mod}") {|el| STDERR.puts el.text} if $DEBUG
344
+ XPath.each(doc, "//result/*/*#{mod}") {|el| result << el.text}
345
+
346
+ if result.empty?
347
+ return nil
348
+ end
349
+
350
+ return result
351
+ end
352
+
353
+ end # class
354
+ end # module