yanser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -0
- data/README +1 -0
- data/README.rdoc +21 -0
- data/Rakefile +14 -0
- data/bin/yanser +19 -0
- data/lib/option_parser.rb +263 -0
- data/lib/tester.rb +18 -0
- data/lib/version.rb +3 -0
- data/lib/yanser.rb +121 -0
- data/test/data/bad_xml.txt +236 -0
- data/test/data/empty_result.txt +13 -0
- data/test/data/error_code.txt +237 -0
- data/test/data/response_with_error.txt +15 -0
- data/test/data/successfull_response.txt +237 -0
- data/test/test_option_parser.rb +91 -0
- data/test/test_yanser.rb +31 -0
- data/test/yanapi/test_query.rb +112 -0
- data/test/yanapi/test_term_query.rb +64 -0
- metadata +104 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2011 Andrei Beliankou, University of Trier, Germany
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
new readme
|
data/README.rdoc
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
= YANSER
|
2
|
+
|
3
|
+
* {RubyGems}[http://rubygems.org/gems/yanser]
|
4
|
+
* Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
|
5
|
+
* {YANSER Project Page}[http://yanser.rubyforge.org/]
|
6
|
+
|
7
|
+
== DESCRIPTION
|
8
|
+
|
9
|
+
YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI, it provides a simple CLI and helps to search for questions and answers which contain a set of keywords, belong to a specific semantic domain or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
|
10
|
+
|
11
|
+
== SYNOPSIS
|
12
|
+
$ yanser --help
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
== LICENSE
|
17
|
+
|
18
|
+
YANSER is a copyrighted software by Andrei Beliankou, 2011.
|
19
|
+
You may use, redistribute and change it under the terms
|
20
|
+
provided in the LICENSE file.
|
21
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
require 'rake'
# 'rake/clean' contributes the 'clean' and 'clobber' tasks.
require 'rake/clean'
require 'rake/testtask'

# Every Ruby source file below the project root.
SRC = FileList['**/*.rb']

# Generated artefacts that `rake clobber` removes.
%w[doc **/*.html **/*.gem].each { |pattern| CLOBBER.include(pattern) }

# `rake test` runs the files matched by the glob below.
# NOTE(review): the glob only matches files directly inside test/yanapi/;
# test files placed directly under test/ are not picked up - confirm that
# this is intended.
Rake::TestTask.new { |task| task.test_files = FileList['test/yanapi/*.rb'] }
|
data/bin/yanser
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Executable entry point: parses the command line and runs the harvester.

# Hard-wiring the location of the library files like this is not the right
# solution; the script should not need to know where they live. It is kept
# this way for now.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# Requiring both files here is the correct approach: only this way do the
# Yanser and OptionParser classes stay independent of each other.
require 'yanser'
require 'option_parser'

# OptionParser.parse is a class method; it turns ARGV into the options hash
# that Yanser is constructed from.
Yanser.new(OptionParser.parse(ARGV)).start
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'optparse'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
require 'version'
|
6
|
+
|
7
|
+
# Command line handling for yanser.
#
# This reopens the OptionParser class (the file requires 'optparse' before
# this definition) and adds a class-level +parse+ together with a few helper
# class methods.
class OptionParser
  # Parses the command line and returns the options hash used by Yanser.
  #
  # cmd_args:: an array of strings like ARGV; recognized options are removed
  #            from it (it is handed to +parse!+).
  #
  # Returns a hash with the keys :query_params (a hash), :query_type (a
  # string naming the query class) and, when given on the command line,
  # :output_dir and :limit.
  #
  # Prints a message and terminates the process (SystemExit, status 1) when
  # no arguments are given, when parsing fails, when --appid is missing or
  # when no search argument is given. --help and --version print and exit.
  def self.parse(cmd_args)
    options = { :query_params => {} }

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: yanser OPTIONS'

      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('--appid APPID',
              'Provide an ApplicationID given by Yahoo,',
              'to test Yanser you can use <YahooDemo> as the APPID,',
              'think in this case on limitations placed by Yahoo.',
              'This option is required!'
              ) do |appid|
        options[:query_params][:appid] = appid
      end

      opts.separator ''
      opts.separator ' Mandatory search arguments:'

      opts.on('-k', '--key-word KEYWORD',
              'Provide a single keyword or a boolean expression.'
              ) do |keyword|
        options[:query_params][:query] = keyword
        # Not a real solution: the search scope is hard-coded to questions.
        options[:query_params][:search_in] = 'question'
      end

      opts.separator ''

      opts.on('-c', '--category CATEGORY',
              'Provide a category name or ID.'
              ) do |category|
        # A value made of digits only is an ID, anything else is a name.
        # \A and \z anchor the whole string; ^ and $ would accept a value
        # in which only one line consists of digits.
        if category =~ /\A[[:digit:]]+\z/
          options[:query_params][:category_id] = category
        else
          options[:query_params][:category_name] = category
        end
      end

      opts.separator ''

      opts.on('--user-id ID',
              'Provide an user ID of questions you search for.',
              'This way you can get question by a specific user.'
              ) do |user_id|
        options[:query_params][:user_id] = user_id
      end

      opts.separator ''

      opts.on('--question-id ID',
              'Provide a question ID of the question you search for.',
              'It returns a unique question.'
              ) do |question_id|
        options[:query_params][:question_id] = question_id
      end

      opts.separator ''
      opts.separator ' Optional search arguments:'

      opts.on('-r', '--region REGION',
              'Provide a geographic region to search in for terms.',
              'Possible values are: de, us, uk, ca, au, in, es, br,',
              ' ar, mx, e1, it, fr, sg.',
              'This defaults to en.'
              ) do |region|
        # Exits the process when the region is unknown.
        prove_region(region)
        options[:query_params][:region] = region
      end

      opts.separator ''

      opts.on('-o', '--output-dir DIR',
              'Provide an output folder.',
              'This directory will be created if it does not exist yet.'
              ) do |output_dir|
        options[:output_dir] = provide_dir(output_dir)
      end

      opts.separator ''

      opts.on('-l', '--limit NUMBER', Integer,
              'Provide a number of answers you want to get from Yahoo.',
              'This argument is not mandatory, if you want to get',
              'all answers simply ommit this argument.'
              ) do |limit|
        options[:limit] = limit
      end

      opts.separator ''
      opts.on('-f', '--output-format FORMAT',
              'Provide an output format: xml, json, rss, php.',
              'It defaults to xml, and you can simply ommit this option.'
              ) do |_format|
        raise NotImplementedError, 'Only default output format is implemented'
      end

      opts.separator ''
      opts.on('--prefix PREFIX',
              'Provide a prefix for the output files. By default',
              'the filename begins with the index of the retrieved',
              'question. You can alter this by providing a prefix.',
              'It can be useful if you want to put many query results',
              'in the same output folder.'
              ) do |_prefix|
        raise NotImplementedError, 'No prefixes implemeted.'
      end

      opts.separator ""
      opts.separator "Common options:"

      opts.on_tail('-h', '--help', 'Show the help message.') do
        puts opts
        exit
      end

      opts.on_tail('-v', '--version', 'Show the program version.') do
        puts YANSER::VERSION
        exit
      end
    end

    # Without any arguments there is nothing to do: show the help.
    if cmd_args.empty?
      $stderr.print "You have to provide some options.\n\n"
      puts parser
      exit 1
    end

    # Parse the arguments; every failure ends in a message plus the help.
    begin
      parser.parse!(cmd_args)
    rescue StandardError, NotImplementedError => e
      # NotImplementedError is a ScriptError and therefore not covered by a
      # bare rescue; it is raised by the options that are not implemented.
      # Only the first character is upcased: String#capitalize would also
      # lowercase the rest and mangle option names inside the message.
      message = e.message.sub(/\A./) { |first| first.upcase }
      # print instead of printf: the message may contain '%' sequences
      # that printf would try to interpret as format directives.
      $stderr.print "#{message}\n\n"
      puts parser
      exit 1
    end

    # Check to see if we got the required arguments needed.
    check_required_options(options)

    # Set the search method.
    options[:query_type] = set_query_type(options[:query_params])

    # Without a search argument no query type can be derived, and nothing
    # could be built from the options later on.
    if options[:query_type].nil?
      $stderr.puts 'A search argument is missing, provide one of: --key-word, --category, --user-id, --question-id.'
      exit 1
    end

    options
  end # parse

  # The helpers below are internal to +parse+. A bare +private+ has no
  # effect on methods defined with +def self.+, so they have always been
  # publicly callable; they are left that way for compatibility.

  # Checks that the given region is one of the supported region codes.
  # Prints a message to $stderr and exits with status 1 otherwise.
  def self.prove_region(region)
    regions = ['de', 'us', 'uk', 'ca', 'au', 'in', 'es',
               'br', 'ar', 'mx', 'e1', 'it', 'fr', 'sg']
    return if regions.include?(region)

    $stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
    exit 1
  end

  # Derives the query type from the collected query parameters.
  #
  # Returns 'TermQuery' when a keyword is present (with or without a
  # category), 'CategoryQuery' for a category alone, then 'UserQuery' or
  # 'QuestionQuery'; nil when no search argument is present.
  def self.set_query_type(params)
    if params[:query]
      'TermQuery'
    elsif params[:category_id] || params[:category_name]
      'CategoryQuery'
    elsif params[:user_id]
      'UserQuery'
    elsif params[:question_id]
      'QuestionQuery'
    end
  end

  # Makes sure every required option is present in options[:query_params].
  # Prints a message to $stderr and exits with status 1 for the first
  # missing one.
  def self.check_required_options(options)
    required_opts = [:appid]

    required_opts.each do |opt|
      next if options[:query_params].has_key?(opt)

      $stderr.puts "A required option --#{opt} is missing."
      exit 1
    end
  end

  # Returns the absolute path of dir, creating the directory (including
  # missing parents) when it does not exist yet. Exits with status 1 when
  # the directory exists but is not writable.
  def self.provide_dir(dir)
    dir = File.expand_path(dir)

    unless File.directory?(dir)
      FileUtils.mkdir_p(dir)
      return dir
    end

    unless File.writable?(dir)
      $stderr.puts 'The directory you have provided is not writable!'
      exit 1
    end

    dir
  end # provide_dir

end # OptionParser
|
226
|
+
|
227
|
+
__END__
|
228
|
+
|
229
|
+
-c, --category-id # Term & CategorySearch
|
230
|
+
-q, --question-id # QuestionSearch
|
231
|
+
|
232
|
+
-t, --time-interval
|
233
|
+
-f, --output-format
|
234
|
+
-p, --prefix #prefix for output files
|
235
|
+
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
instance interface
|
240
|
+
|
241
|
+
{
|
242
|
+
:query_params => {
|
243
|
+
:appid => 'YahooDemo' | 'SomeStringWithYourID',
|
244
|
+
:callback => '',
|
245
|
+
:category_id => '',
|
246
|
+
:category_name => '',
|
247
|
+
:date_range => '',
|
248
|
+
:filter => '',
|
249
|
+
:output => '',
|
250
|
+
:query => '',
|
251
|
+
:question_id => '',
|
252
|
+
:region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
|
253
|
+
:results => Integer, # 0..50
|
254
|
+
:search_in => "all" | "question" | "best_answer", # default 'all'
|
255
|
+
:sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
|
256
|
+
:start => Integer, # <= 1000
|
257
|
+
:type => "all" | "resolved" | "open" | "undecided", # default 'all'
|
258
|
+
:user_id => ''
|
259
|
+
},
|
260
|
+
:query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
|
261
|
+
:output_dir => 'some path',
|
262
|
+
:prefix => 'some prefix' # prefix for output files
|
263
|
+
}
|
data/lib/tester.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-

# Manual smoke test: builds an options hash by hand (instead of going
# through the command line parser) and starts a Yanser run with it.

require 'yanser'

params = {
  # Yanser looks the query class up under :query_type; the former key
  # :query_method was never read, so no query could be created.
  :query_type => 'TermQuery',
  :query_params => {
    :appid => 'YahooDemo',
    :query => 'Köln',
    :region => 'de',
    :results => 5,
    :start => 0
  }
}

y = Yanser.new(params)

y.start
|
data/lib/version.rb
ADDED
data/lib/yanser.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'yanapi'
|
2
|
+
|
3
|
+
# :title: YANSER, Yahoo! ANSwers harvestER
# :main: Yanser
# Main processing class.
# Yanser encapsulates the main routine and instantiates
# all other classes.
#--
# Yanser receives an options hash (normally produced by the command line
# parser), fills in default values, creates a query of the requested type
# starting at position 0 and outputs the first result page.
# If more results were requested, it keeps creating queries for the
# following pages until the limit set by the user is met or a query fails.
#
class Yanser

  # Yahoo! Answers returns starting not more than at the 1000st question.
  # It makes no sense to step over.
  START_LIMIT = 1000

  # We query the web service every two seconds.
  QUERY_INTERVAL = 2

  # Yahoo! Answers returns maximum 50 results.
  MAX_RESULTS = 50

  # opts:: the options hash, at least
  #        {:query_type => String, :query_params => Hash}; optional keys
  #        read here are :limit and :output_dir. The values are taken as
  #        they are, no validation happens here.
  #
  # The hash and its :query_params are modified in place: missing :limit
  # and :start entries get their defaults.
  def initialize(opts)
    @options = opts
    @options[:limit] ||= START_LIMIT + MAX_RESULTS

    @query_params = @options[:query_params]
    @output_type = @query_params[:output] || 'xml'
    @query_params[:start] ||= 0
  end

  # Runs the harvest.
  #
  # A 'QuestionQuery' is sent once as it is. For a limit below MAX_RESULTS
  # a single query asking for exactly that many results is sent. Otherwise
  # the result pages are requested one after another, MAX_RESULTS entries
  # at a time, until the limit is reached or a query reports that paging
  # should stop.
  #
  # TODO: implement some logging
  def start
    if @options[:query_type] == 'QuestionQuery'
      query(@query_params)
    elsif @options[:limit] < MAX_RESULTS
      @query_params[:results] = @options[:limit]
      query(@query_params)
    else
      loop do
        results_left = @options[:limit] - @query_params[:start]
        break if results_left <= 0

        # The last page may be smaller than a full one; it still has to be
        # requested before the loop ends.
        @query_params[:results] = [results_left, MAX_RESULTS].min
        break unless query(@query_params)

        # we get the next start point here
        @query_params[:start] += MAX_RESULTS
      end
    end
  end # start

  private

  # Sends one query and outputs its result, then waits QUERY_INTERVAL
  # seconds before returning.
  #
  # Returns true when paging may continue. Returns false when the service
  # answered with YANAPI::EmptyResponse, or when any other StandardError
  # persisted through four attempts (with growing pauses in between); the
  # error is written to $stderr in both cases.
  def query(params)
    q = create_query(params)
    tries = 0
    begin
      tries += 1
      output(q.get)
      sleep(QUERY_INTERVAL)
    rescue YANAPI::EmptyResponse => e
      $stderr.puts e
      return false # do not iterate further
    rescue => e # some errors to retry
      if tries < 4
        sleep(QUERY_INTERVAL**tries)
        retry
      end
      $stderr.puts e
      return false # do not iterate further
    end
    true # we may iterate further
  end

  # Instantiates the class named by @options[:query_type] from the YANAPI
  # namespace. The constant is looked up with const_get, so the option
  # value is never evaluated as Ruby code.
  def create_query(params)
    YANAPI.const_get(@options[:query_type]).new(params)
  end

  # Saves the result when an output directory is configured, prints it to
  # standard output otherwise.
  def output(result)
    if @options[:output_dir]
      save(result)
    else
      puts result
    end
  end

  # Writes the result into the output directory. The file is named after
  # the current start position and the output type, e.g. "50.xml".
  # The directory is expected to exist already.
  def save(result)
    filename = File.join(@options[:output_dir], "#{@query_params[:start]}.#{@output_type}")
    # The block form closes the file even when writing raises.
    File.open(filename, 'w') { |file| file.puts result }
  end

end # Yanser
|