yanser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -0
- data/README +1 -0
- data/README.rdoc +21 -0
- data/Rakefile +14 -0
- data/bin/yanser +19 -0
- data/lib/option_parser.rb +263 -0
- data/lib/tester.rb +18 -0
- data/lib/version.rb +3 -0
- data/lib/yanser.rb +121 -0
- data/test/data/bad_xml.txt +236 -0
- data/test/data/empty_result.txt +13 -0
- data/test/data/error_code.txt +237 -0
- data/test/data/response_with_error.txt +15 -0
- data/test/data/successfull_response.txt +237 -0
- data/test/test_option_parser.rb +91 -0
- data/test/test_yanser.rb +31 -0
- data/test/yanapi/test_query.rb +112 -0
- data/test/yanapi/test_term_query.rb +64 -0
- metadata +104 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2011 Andrei Beliankou, University of Trier, Germany
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
new readme
|
data/README.rdoc
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
= YANSER
|
2
|
+
|
3
|
+
* {RubyGems}[http://rubygems.org/gems/yanser]
|
4
|
+
* Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
|
5
|
+
* {YANSER Project Page}[http://yanser.rubyforge.org/]
|
6
|
+
|
7
|
+
== DESCRIPTION
|
8
|
+
|
9
|
+
YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI, it provides a simple CLI and helps to search for questions and answers which contain a set of keywords, belong to a specific semantic domain, or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
|
10
|
+
|
11
|
+
== SYNOPSIS
|
12
|
+
$ yanser --help
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
== LICENSE
|
17
|
+
|
18
|
+
YANSER is copyrighted software by Andrei Beliankou, 2011.
|
19
|
+
You may use, redistribute and change it under the terms
|
20
|
+
provided in the LICENSE file.
|
21
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
# we can require 'rake/clean' to add 'clobber' and 'clean' tasks
|
4
|
+
require 'rake/clean'
|
5
|
+
require 'rake/testtask'
|
6
|
+
|
7
|
+
|
8
|
+
# All Ruby sources of the project.
SRC = FileList['**/*.rb']

# `rake clobber` additionally removes generated docs, HTML files and built gems.
CLOBBER.include('doc', '**/*.html', '**/*.gem')

# `rake test` runs every test file below test/, i.e. the top-level
# test_*.rb files as well as the ones in test/yanapi/ (previously only
# the latter were picked up, so the Yanser and OptionParser tests never ran).
Rake::TestTask.new do |t|
  t.test_files = FileList.new('test/**/test_*.rb')
end
|
data/bin/yanser
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# This is not the right solution: strictly speaking the script should not
# state directly where the library files live, but it will do for now.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# Requiring both files here is the correct approach: only this way do the
# Yanser and OptionParser classes stay independent of each other.
require 'yanser'
require 'option_parser'

# The command line is parsed through a class method (why not?); the
# resulting options hash configures the harvester, which is then started.
Yanser.new(OptionParser.parse(ARGV)).start
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'optparse'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
require 'version'
|
6
|
+
|
7
|
+
# Command line front end of Yanser.
#
# This reopens the OptionParser class provided by 'optparse' and adds the
# class-level entry point OptionParser.parse plus a few helpers to it.
class OptionParser
  # Parses the command line and builds the options hash used by Yanser.
  #
  # cmd_args - an Array of Strings like ARGV; recognized options are
  #            removed from it while parsing.
  #
  # Returns a Hash with the keys :query_params (Hash of query parameters),
  # :query_type (String or nil) and, when given on the command line,
  # :output_dir and :limit.
  #
  # Prints a message plus the usage text and exits with status 1 when no
  # arguments are given, when parsing fails or when --appid is missing.
  def self.parse(cmd_args)
    options = {}
    options[:query_params] = {}

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: yanser OPTIONS'

      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('--appid APPID',
              'Provide an ApplicationID given by Yahoo,',
              'to test Yanser you can use <YahooDemo> as the APPID,',
              'think in this case on limitations placed by Yahoo.',
              'This option is required!') do |appid|
        options[:query_params][:appid] = appid
      end

      opts.separator ''
      opts.separator ' Mandatory search arguments:'

      opts.on('-k', '--key-word KEYWORD',
              'Provide a single keyword or a boolean expression.') do |keyword|
        options[:query_params][:query] = keyword
        # not a solution!!! The search scope is hard-wired to questions.
        options[:query_params][:search_in] = 'question'
      end

      opts.separator ''

      opts.on('-c', '--category CATEGORY',
              'Provide a category name or ID.') do |category|
        # A value consisting of digits only is an ID, anything else a name.
        # \A and \z anchor the whole argument, not just one of its lines.
        if category =~ /\A[[:digit:]]+\z/
          options[:query_params][:category_id] = category
        else
          options[:query_params][:category_name] = category
        end
      end

      opts.separator ''

      opts.on('--user-id ID',
              'Provide an user ID of questions you search for.',
              'This way you can get question by a specific user.') do |user_id|
        options[:query_params][:user_id] = user_id
      end

      opts.separator ''

      opts.on('--question-id ID',
              'Provide a question ID of the question you search for.',
              'It returns a unique question.') do |question_id|
        options[:query_params][:question_id] = question_id
      end

      opts.separator ''
      opts.separator ' Optional search arguments:'

      opts.on('-r', '--region REGION',
              'Provide a geographic region to search in for terms.',
              'Possible values are: de, us, uk, ca, au, in, es, br,',
              ' ar, mx, e1, it, fr, sg.',
              'This defaults to en.') do |region|
        # Exits the program if the region is unknown.
        prove_region(region)
        options[:query_params][:region] = region
      end

      opts.separator ''

      opts.on('-o', '--output-dir DIR',
              'Provide an output folder.',
              'This directory will be created if it does not exist yet.') do |output_dir|
        options[:output_dir] = provide_dir(output_dir)
      end

      opts.separator ''

      opts.on('-l', '--limit NUMBER', Integer,
              'Provide a number of answers you want to get from Yahoo.',
              'This argument is not mandatory, if you want to get',
              'all answers simply ommit this argument.') do |limit|
        options[:limit] = limit
      end

      opts.separator ''
      opts.on('-f', '--output-format FORMAT',
              'Provide an output format: xml, json, rss, php.',
              'It defaults to xml, and you can simply ommit this option.') do |_format|
        raise NotImplementedError, 'Only default output format is implemented'
      end

      opts.separator ''
      opts.on('--prefix PREFIX',
              'Provide a prefix for the output files. By default',
              'the filename begins with the index of the retrieved',
              'question. You can alter this by providing a prefix.',
              'It can be useful if you want to put many query results',
              'in the same output folder.') do |_prefix|
        raise NotImplementedError, 'No prefixes implemeted.'
      end

      opts.separator ""
      opts.separator "Common options:"

      opts.on_tail('-h', '--help', 'Show the help message.') do
        puts opts
        exit
      end

      opts.on_tail('-v', '--version', 'Show the program version.') do
        puts YANSER::VERSION
        exit
      end
    end

    # If no options are provided print the help.
    if cmd_args.empty?
      $stderr.print "You have to provide some options.\n\n"
      puts parser
      exit 1
    end

    # Parse the arguments and fill the options hash.
    begin
      parser.parse!(cmd_args)
    rescue StandardError, NotImplementedError => e
      # NotImplementedError is a ScriptError, not a StandardError, so it has
      # to be listed explicitly; otherwise -f and --prefix end in a backtrace.
      # The message is printed verbatim (no printf): it may contain '%'.
      $stderr.print "#{e.message.capitalize}\n\n"
      puts parser
      exit 1
    end

    # Check to see if we got the required arguments needed.
    check_required_options(options)

    # Set the search method.
    options[:query_type] = set_query_type(options[:query_params])

    options
  end # parse

  # The helpers below are class methods and stay publicly callable: a bare
  # `private` does not apply to methods defined with `def self.`.

  # Checks that the given region is one of the 14 regions listed here.
  # Prints a message to stderr and exits with status 1 otherwise.
  def self.prove_region(region)
    regions = ['de', 'us', 'uk', 'ca', 'au', 'in', 'es',
               'br', 'ar', 'mx', 'e1', 'it', 'fr', 'sg']

    unless regions.include?(region)
      $stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
      exit 1
    end
  end

  # Derives the query type from the collected query parameters.
  #
  # Returns one of 'CategoryQuery', 'TermQuery', 'UserQuery',
  # 'QuestionQuery', or nil when no search argument was given. A keyword
  # takes precedence over a category; a category alone wins over the rest.
  def self.set_query_type(params)
    if (params[:category_id] || params[:category_name]) && !params[:query]
      'CategoryQuery'
    elsif params[:query]
      'TermQuery'
    elsif params[:user_id]
      'UserQuery'
    elsif params[:question_id]
      'QuestionQuery'
    end
  end

  # Makes sure every required option is present in options[:query_params].
  # Prints a message to stderr and exits with status 1 for the first
  # missing one.
  def self.check_required_options(options)
    required_opts = [:appid]

    required_opts.each do |opt|
      next if options[:query_params].has_key?(opt)

      $stderr.puts "A required option --#{opt} is missing."
      exit 1
    end
  end

  # Expands +dir+ to an absolute path and makes sure it can be used as the
  # output directory: an existing directory has to be writable (otherwise
  # the program exits with status 1), a missing one is created.
  #
  # Returns the expanded path.
  def self.provide_dir(dir)
    dir = File.expand_path(dir)

    if File.directory?(dir)
      unless File.writable?(dir)
        $stderr.puts 'The directory you have provided is not writable!'
        exit 1
      end
    else
      FileUtils.mkdir_p(dir)
    end

    dir
  end # provide_dir
end # OptionParser
|
226
|
+
|
227
|
+
__END__
|
228
|
+
|
229
|
+
-c, --category-id # Term & CategorySearch
|
230
|
+
-q, --question-id # QuestionSearch
|
231
|
+
|
232
|
+
-t, --time-interval
|
233
|
+
-f, --output-format
|
234
|
+
-p, --prefix #prefix for output files
|
235
|
+
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
instance interface
|
240
|
+
|
241
|
+
{
|
242
|
+
:query_params => {
|
243
|
+
:appid => 'YahooDemo' | 'SomeStringWithYourID',
|
244
|
+
:callback => '',
|
245
|
+
:category_id => '',
|
246
|
+
:category_name => '',
|
247
|
+
:date_range => '',
|
248
|
+
:filter => '',
|
249
|
+
:output => '',
|
250
|
+
:query => '',
|
251
|
+
:question_id => '',
|
252
|
+
:region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
|
253
|
+
:results => Integer, # 0..50
|
254
|
+
:search_in => "all" | "question" | "best_answer", # default 'all'
|
255
|
+
:sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
|
256
|
+
:start => Integer, # <= 1000
|
257
|
+
:type => "all" | "resolved" | "open" | "undecided", # default 'all'
|
258
|
+
:user_id => ''
|
259
|
+
},
|
260
|
+
:query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
|
261
|
+
:output_dir => 'some path',
|
262
|
+
:prefix => 'some prefix' # prefix for output files
|
263
|
+
}
|
data/lib/tester.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'yanser'
|
4
|
+
|
5
|
+
# Minimal manual smoke test: runs one term query against the web service.
# Yanser reads the query class name from :query_type (not :query_method),
# so that is the key that has to be set here.
params = {
  :query_type => 'TermQuery',
  :query_params => {
    :appid => 'YahooDemo',
    :query => 'Köln',
    :region => 'de',
    :results => 5,
    :start => 0
  }
}

y = Yanser.new(params)

y.start
|
data/lib/version.rb
ADDED
data/lib/yanser.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'yanapi'
|
2
|
+
|
3
|
+
# :title: YANSER, Yahoo! ANSwers harvestER
|
4
|
+
# :main: Yanser
|
5
|
+
# Main processing class.
|
6
|
+
# Yanser encapsulates the main routine and instantiates
|
7
|
+
# all other classes.
|
8
|
+
#--
|
9
|
+
# Yanser takes the users input and validates it.
|
10
|
+
# It decides which search method to choose.
|
11
|
+
# Then it collects all parameters and useful default values,
|
12
|
+
# creates an XyzQuery with the starting point of 0 and gets the first results.
|
13
|
+
# If more results were requested, Yanser creates a similar XyzQuery and gets
|
14
|
+
# the next result set until the result limitation set by the user is met.
|
15
|
+
#
|
16
|
+
# Main processing class: takes the options hash built by the OptionParser,
# creates the matching YANAPI query objects and pages through the results.
class Yanser

  # Yahoo! Answers does not return results starting beyond the 1000th
  # question. It makes no sense to step over.
  START_LIMIT = 1000

  # We query the web service every two seconds.
  QUERY_INTERVAL = 2

  # Yahoo! Answers returns at most 50 results per query.
  MAX_RESULTS = 50

  # opts - the Hash produced by the OptionParser, minimally
  #        {:query_type => ..., :query_params => {...}}; optional keys
  #        read here are :limit and :output_dir.
  #
  # The options are supposed to be correct, no validation happens here.
  # Note that the given hash is stored and filled with defaults in place.
  def initialize(opts)
    @options = opts
    @options[:limit] ||= START_LIMIT + MAX_RESULTS

    @query_params = @options[:query_params]
    @output_type = @query_params[:output] || 'xml'
    @query_params[:start] ||= 0
  end

  # Runs the harvest.
  #
  # A QuestionQuery is sent exactly once. For all other query types the
  # results are fetched in pages of at most MAX_RESULTS until :limit is
  # reached or a query fails or comes back empty.
  #
  # TODO: implement some logging
  def start
    if @options[:query_type] == 'QuestionQuery'
      query(@query_params)
    elsif @options[:limit] < MAX_RESULTS
      @query_params[:results] = @options[:limit]
      query(@query_params)
    else
      @query_params[:results] = MAX_RESULTS

      while query(@query_params)
        # we get the next start point here
        @query_params[:start] += MAX_RESULTS

        results_left = @options[:limit] - @query_params[:start]
        break if results_left <= 0

        # The last page may be a partial one; it still has to be fetched,
        # so only the page size is adjusted and the loop goes on.
        @query_params[:results] = [results_left, MAX_RESULTS].min
      end
    end
  end # start

  private

  # Sends one query and outputs its result.
  #
  # An empty response ends the harvest at once; any other error is retried
  # up to three times with growing pauses before giving up.
  #
  # Returns true if the caller may go on with the next page, false
  # otherwise.
  def query(params)
    q = create_query(params)
    tries = 0
    begin
      tries += 1
      result = q.get
      output(result)
      sleep(QUERY_INTERVAL)
    rescue YANAPI::EmptyResponse => e
      $stderr.puts e
      return false # do not iterate further
    rescue => e # some errors to retry
      if tries < 4
        sleep(QUERY_INTERVAL**tries)
        retry
      end
      $stderr.puts e
      return false # do not iterate further
    end
    true # we may iterate further
  end

  # Instantiates the YANAPI class named by @options[:query_type].
  # The constant is looked up by name instead of evaluating a string.
  def create_query(params)
    YANAPI.const_get(@options[:query_type]).new(params)
  end

  # Saves the result if an output directory is configured, prints it to
  # stdout otherwise.
  def output(result)
    if @options[:output_dir]
      save(result)
    else
      puts result
    end
  end

  # Writes the result to "<start>.<output type>" in the output directory,
  # which is expected to exist already (the OptionParser takes care of it).
  # The block form closes the file even if writing fails.
  def save(result)
    filename = File.join(@options[:output_dir], "#{@query_params[:start]}.#{@output_type}")
    File.open(filename, 'w') { |file| file.puts result }
  end

end # Yanser
|