yanser 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +21 -1
- data/Rakefile +31 -4
- data/bin/yanser +8 -8
- data/lib/yanser/error.rb +11 -0
- data/lib/yanser/opt_parser.rb +313 -0
- data/lib/yanser/version.rb +3 -0
- data/lib/yanser/yanser.rb +135 -0
- data/test/test_opt_parser.rb +138 -0
- data/test/test_yanser.rb +7 -3
- metadata +13 -22
- data/README.rdoc +0 -21
- data/lib/option_parser.rb +0 -263
- data/lib/version.rb +0 -3
- data/lib/yanser.rb +0 -121
- data/test/data/bad_xml.txt +0 -236
- data/test/data/empty_result.txt +0 -13
- data/test/data/error_code.txt +0 -237
- data/test/data/response_with_error.txt +0 -15
- data/test/data/successfull_response.txt +0 -237
- data/test/test_option_parser.rb +0 -91
- data/test/yanapi/test_query.rb +0 -112
- data/test/yanapi/test_term_query.rb +0 -64
data/test/test_yanser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'test/unit'
|
3
|
-
require 'yanser'
|
3
|
+
require 'yanser/yanser'
|
4
|
+
require 'yanser/version'
|
4
5
|
|
5
6
|
class TestYanser < Test::Unit::TestCase
|
6
7
|
def setup
|
@@ -10,7 +11,7 @@ class TestYanser < Test::Unit::TestCase
|
|
10
11
|
end
|
11
12
|
|
12
13
|
def test_public_methods
|
13
|
-
yanser = Yanser.new(
|
14
|
+
yanser = Yanser::Yanser.new(
|
14
15
|
:query_params => {
|
15
16
|
:appid => 'YahooDemo',
|
16
17
|
:query => 'Haus AND grün',
|
@@ -24,7 +25,10 @@ class TestYanser < Test::Unit::TestCase
|
|
24
25
|
|
25
26
|
assert_respond_to(yanser, :start)
|
26
27
|
end
|
27
|
-
|
28
|
+
|
29
|
+
def test_constants
|
30
|
+
assert(Yanser::VERSION.instance_of?(String))
|
31
|
+
end
|
28
32
|
def test_mandatory_options
|
29
33
|
end
|
30
34
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yanser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Andrei Beliankou
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-07-08 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: yanapi
|
@@ -41,28 +40,20 @@ executables:
|
|
41
40
|
extensions: []
|
42
41
|
|
43
42
|
extra_rdoc_files:
|
44
|
-
- README
|
43
|
+
- README
|
45
44
|
- LICENSE
|
46
45
|
files:
|
47
46
|
- lib/tester.rb
|
48
|
-
- lib/yanser.rb
|
49
|
-
- lib/
|
50
|
-
- lib/
|
47
|
+
- lib/yanser/error.rb
|
48
|
+
- lib/yanser/yanser.rb
|
49
|
+
- lib/yanser/opt_parser.rb
|
50
|
+
- lib/yanser/version.rb
|
51
51
|
- bin/yanser
|
52
|
-
- README.rdoc
|
53
52
|
- LICENSE
|
54
53
|
- Rakefile
|
55
54
|
- README
|
56
|
-
- test/
|
57
|
-
- test/yanapi/test_query.rb
|
58
|
-
- test/yanapi/test_term_query.rb
|
55
|
+
- test/test_opt_parser.rb
|
59
56
|
- test/test_yanser.rb
|
60
|
-
- test/data/bad_xml.txt
|
61
|
-
- test/data/empty_result.txt
|
62
|
-
- test/data/error_code.txt
|
63
|
-
- test/data/response_with_error.txt
|
64
|
-
- test/data/successfull_response.txt
|
65
|
-
has_rdoc: true
|
66
57
|
homepage: http://www.uni-trier.de/index.php?id=34451
|
67
58
|
licenses: []
|
68
59
|
|
@@ -95,10 +86,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
95
86
|
requirements: []
|
96
87
|
|
97
88
|
rubyforge_project: yanser
|
98
|
-
rubygems_version: 1.
|
89
|
+
rubygems_version: 1.7.2
|
99
90
|
signing_key:
|
100
91
|
specification_version: 3
|
101
92
|
summary: Yanser is a convenient search tool providing access to the Yahoo! Answers Q/A collection.
|
102
93
|
test_files:
|
103
|
-
- test/
|
94
|
+
- test/test_opt_parser.rb
|
104
95
|
- test/test_yanser.rb
|
data/README.rdoc
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
= YANSER
|
2
|
-
|
3
|
-
* {RubyGems}[http://rubygems.org/gems/yanser]
|
4
|
-
* Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
|
5
|
-
* {YANAPI Project Page}[http://yanser.rubyforge.org/]
|
6
|
-
|
7
|
-
== DESCRIPTION
|
8
|
-
|
9
|
-
YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI, it provides a simple CLI and helps to search for Questions and Answers which contain a set of keywords, belong to a specific semantic domain or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
|
10
|
-
|
11
|
-
== SYNOPSIS
|
12
|
-
$ yanser --help
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
== LICENSE
|
17
|
-
|
18
|
-
YANSER is a copyrighted software by Andrei Beliankou, 2011.
|
19
|
-
You may use, redistribute and change it under the terms
|
20
|
-
provided in the LICENSE file.
|
21
|
-
|
data/lib/option_parser.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'optparse'
|
3
|
-
require 'fileutils'
|
4
|
-
|
5
|
-
require 'version'
|
6
|
-
|
7
|
-
class OptionParser
|
8
|
-
# OP expects cmd_args to be an array like ARGV
|
9
|
-
# dummy output for temporary usage
|
10
|
-
def self.parse(cmd_args)
|
11
|
-
options = {}
|
12
|
-
options[:query_params] = {}
|
13
|
-
|
14
|
-
parser = OptionParser.new do |opts|
|
15
|
-
opts.banner = 'Usage: yanser OPTIONS'
|
16
|
-
|
17
|
-
opts.separator ''
|
18
|
-
opts.separator 'Program specific options:'
|
19
|
-
|
20
|
-
opts.on('--appid APPID',
|
21
|
-
'Provide an ApplicationID given by Yahoo,',
|
22
|
-
'to test Yanser you can use <YahooDemo> as the APPID,',
|
23
|
-
'think in this case on limitations placed by Yahoo.',
|
24
|
-
'This option is required!'
|
25
|
-
) do |appid|
|
26
|
-
options[:query_params][:appid] = appid
|
27
|
-
end
|
28
|
-
|
29
|
-
|
30
|
-
opts.separator ''
|
31
|
-
opts.separator ' Mandatory search arguments:'
|
32
|
-
|
33
|
-
opts.on('-k', '--key-word KEYWORD',
|
34
|
-
'Provide a single keyword or a boolean expression.'
|
35
|
-
) do |keyword|
|
36
|
-
options[:query_params][:query] = keyword
|
37
|
-
# not a solution!!!
|
38
|
-
options[:query_params][:search_in] = 'question'
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
opts.separator ''
|
43
|
-
|
44
|
-
opts.on('-c', '--category CATEGORY',
|
45
|
-
'Provide a category name or ID.'
|
46
|
-
) do |category|
|
47
|
-
|
48
|
-
if category =~ /^[[:digit:]]+$/
|
49
|
-
options[:query_params][:category_id] = category
|
50
|
-
else
|
51
|
-
options[:query_params][:category_name] = category
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
|
56
|
-
opts.separator ''
|
57
|
-
|
58
|
-
opts.on('--user-id ID',
|
59
|
-
'Provide an user ID of questions you search for.',
|
60
|
-
'This way you can get question by a specific user.'
|
61
|
-
) do |user_id|
|
62
|
-
options[:query_params][:user_id] = user_id
|
63
|
-
end
|
64
|
-
|
65
|
-
opts.separator ''
|
66
|
-
|
67
|
-
opts.on('--question-id ID',
|
68
|
-
'Provide a question ID of the question you search for.',
|
69
|
-
'It returns a unique question.'
|
70
|
-
) do |question_id|
|
71
|
-
options[:query_params][:question_id] = question_id
|
72
|
-
end
|
73
|
-
|
74
|
-
opts.separator ''
|
75
|
-
opts.separator ' Optional search arguments:'
|
76
|
-
|
77
|
-
opts.on('-r', '--region REGION',
|
78
|
-
'Provide a geographic region to search in for terms.',
|
79
|
-
'Possible values are: de, us, uk, ca, au, in, es, br,',
|
80
|
-
' ar, mx, e1, it, fr, sg.',
|
81
|
-
'This defaults to en.'
|
82
|
-
) do |region|
|
83
|
-
# todo
|
84
|
-
prove_region(region)
|
85
|
-
options[:query_params][:region] = region
|
86
|
-
end
|
87
|
-
|
88
|
-
opts.separator ''
|
89
|
-
|
90
|
-
opts.on('-o', '--output-dir DIR',
|
91
|
-
'Provide an output folder.',
|
92
|
-
'This directory will be created if it does not exist yet.'
|
93
|
-
) do |output_dir|
|
94
|
-
options[:output_dir] = provide_dir(output_dir)
|
95
|
-
end
|
96
|
-
|
97
|
-
opts.separator ''
|
98
|
-
|
99
|
-
opts.on('-l', '--limit NUMBER', Integer,
|
100
|
-
'Provide a number of answers you want to get from Yahoo.',
|
101
|
-
'This argument is not mandatory, if you want to get',
|
102
|
-
'all answers simply ommit this argument.'
|
103
|
-
) do |limit|
|
104
|
-
options[:limit] = limit
|
105
|
-
end
|
106
|
-
|
107
|
-
opts.separator ''
|
108
|
-
opts.on('-f', '--output-format FORMAT',
|
109
|
-
'Provide an output format: xml, json, rss, php.',
|
110
|
-
'It defaults to xml, and you can simply ommit this option.'
|
111
|
-
) do |f|
|
112
|
-
raise NotImplementedError, 'Only default output format is implemented'
|
113
|
-
end
|
114
|
-
|
115
|
-
opts.separator ''
|
116
|
-
opts.on('--prefix PREFIX',
|
117
|
-
'Provide a prefix for the output files. By default',
|
118
|
-
'the filename begins with the index of the retrieved',
|
119
|
-
'question. You can alter this by providing a prefix.',
|
120
|
-
'It can be useful if you want to put many query results',
|
121
|
-
'in the same output folder.'
|
122
|
-
) do |pref|
|
123
|
-
raise NotImplementedError, 'No prefixes implemeted.'
|
124
|
-
end
|
125
|
-
|
126
|
-
opts.separator ""
|
127
|
-
opts.separator "Common options:"
|
128
|
-
|
129
|
-
opts.on_tail('-h', '--help', 'Show the help message.') do
|
130
|
-
puts opts
|
131
|
-
exit
|
132
|
-
end
|
133
|
-
|
134
|
-
opts.on_tail('-v', '--version', 'Show the program version.') do
|
135
|
-
puts YANSER::VERSION
|
136
|
-
exit
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
# if no options provided print the help
|
141
|
-
if cmd_args.empty?
|
142
|
-
$stderr.printf "You have to provide some options.\n\n"
|
143
|
-
puts parser
|
144
|
-
exit 1
|
145
|
-
end
|
146
|
-
|
147
|
-
# Parse ARGV and provide the options hash.
|
148
|
-
# Check if everything is correct and handle exceptions
|
149
|
-
begin
|
150
|
-
parser.parse!(cmd_args)
|
151
|
-
# rescue all exceptions from OptionParser
|
152
|
-
rescue => e
|
153
|
-
$stderr.printf "#{e.message.capitalize}\n\n"
|
154
|
-
puts parser
|
155
|
-
exit 1
|
156
|
-
end
|
157
|
-
|
158
|
-
# Check to see if we got the required arguments needed.
|
159
|
-
check_required_options(options)
|
160
|
-
|
161
|
-
# Set the search method.
|
162
|
-
options[:query_type] = set_query_type(options[:query_params])
|
163
|
-
|
164
|
-
return options
|
165
|
-
end # parse
|
166
|
-
|
167
|
-
private
|
168
|
-
|
169
|
-
# Check if the value of given region is correct.
|
170
|
-
# Now 14 regions are supported by Yahoo! Answers.
|
171
|
-
def self.prove_region(region)
|
172
|
-
regions = ['de', 'us', 'uk', 'ca', 'au', 'in', 'es',
|
173
|
-
'br', 'ar', 'mx', 'e1', 'it', 'fr', 'sg']
|
174
|
-
|
175
|
-
unless regions.include?(region)
|
176
|
-
$stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
|
177
|
-
exit 1
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
# define one of the following query types: TermQuery|CategoryQuery|
|
182
|
-
# QuestionQuery|UserQuery
|
183
|
-
def self.set_query_type(params)
|
184
|
-
case
|
185
|
-
when (params[:category_id] || params[:category_name]) && ! params[:query]
|
186
|
-
return 'CategoryQuery'
|
187
|
-
when params[:query]
|
188
|
-
return 'TermQuery'
|
189
|
-
when params[:user_id]
|
190
|
-
return 'UserQuery'
|
191
|
-
when params[:question_id]
|
192
|
-
return 'QuestionQuery'
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
def self.check_required_options(options)
|
197
|
-
required_opts = [:appid]
|
198
|
-
|
199
|
-
required_opts.each do |opt|
|
200
|
-
if options[:query_params].has_key?(opt)
|
201
|
-
next
|
202
|
-
else
|
203
|
-
$stderr.puts "A required option --#{opt} is missing."
|
204
|
-
exit 1
|
205
|
-
end
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
def self.provide_dir(dir)
|
210
|
-
dir = File.expand_path(dir)
|
211
|
-
# check for existence
|
212
|
-
if File.directory?(dir)
|
213
|
-
if File.writable?(dir)
|
214
|
-
return dir
|
215
|
-
else
|
216
|
-
$stderr.puts 'The directory you have provided is not writable!'
|
217
|
-
exit 1
|
218
|
-
end
|
219
|
-
else
|
220
|
-
FileUtils.mkdir_p(dir)
|
221
|
-
return dir
|
222
|
-
end
|
223
|
-
end # provide_dir
|
224
|
-
|
225
|
-
end # OptionParser
|
226
|
-
|
227
|
-
__END__
|
228
|
-
|
229
|
-
-c, --category-id # Term & CategorySearch
|
230
|
-
-q, --question-id # QuestionSearch
|
231
|
-
|
232
|
-
-t, --time-interval
|
233
|
-
-f, --output-format
|
234
|
-
-p, --prefix #prefix for output files
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
instance interface
|
240
|
-
|
241
|
-
{
|
242
|
-
:query_params => {
|
243
|
-
:appid => 'YahooDemo' | 'SomeStringWithYourID',
|
244
|
-
:callback => '',
|
245
|
-
:category_id => '',
|
246
|
-
:category_name => '',
|
247
|
-
:date_range => '',
|
248
|
-
:filter => '',
|
249
|
-
:output => '',
|
250
|
-
:query => '',
|
251
|
-
:question_id => '',
|
252
|
-
:region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
|
253
|
-
:results => Integer, # 0..50
|
254
|
-
:search_in => "all" | "question" | "best_answer", # default 'all'
|
255
|
-
:sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
|
256
|
-
:start => Integer, # <= 1000
|
257
|
-
:type => "all" | "resolved" | "open" | "undecided", # default 'all'
|
258
|
-
:user_id => ''
|
259
|
-
},
|
260
|
-
:query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
|
261
|
-
:output_dir => 'some path',
|
262
|
-
:prefix => 'some prefix' # prefix for output files
|
263
|
-
}
|
data/lib/version.rb
DELETED
data/lib/yanser.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
require 'yanapi'
|
2
|
-
|
3
|
-
# :title: YANSER, Yahoo! ANSwers harvestER
|
4
|
-
# :main: Yanser
|
5
|
-
# Main processing class.
|
6
|
-
# Yanser encapsulates the main routine and instantiates
|
7
|
-
# all other classes.
|
8
|
-
#--
|
9
|
-
# Yanser takes the users input and validates it.
|
10
|
-
# It decides which search method to choose.
|
11
|
-
# Then it collects all parameters and useful default values,
|
12
|
-
# creates an XyzQuery with the starting point of 0 and gets the first results.
|
13
|
-
# If more results were requested, Yanser creates a similar XyzQuery and gets
|
14
|
-
# the next result set until the result limitation set by the user is met.
|
15
|
-
#
|
16
|
-
class Yanser
|
17
|
-
|
18
|
-
# Yahoo! Answers accepts a start offset no greater than the 1000th question.
|
19
|
-
# It makes no sense to step over.
|
20
|
-
START_LIMIT = 1000
|
21
|
-
|
22
|
-
# We query the web service every two seconds.
|
23
|
-
QUERY_INTERVAL = 2
|
24
|
-
|
25
|
-
# Yahoo! Answers returns maximum 50 results.
|
26
|
-
MAX_RESULTS = 50
|
27
|
-
|
28
|
-
def initialize(opts)
|
29
|
-
|
30
|
-
# the minimal output of an OptionParser
|
31
|
-
# {:query_type=>u|q|c|w, :query_params=>{appid, start, results, query}}
|
32
|
-
# opts come from the OptionParser
|
33
|
-
# they are supposed to be correct, no validation here
|
34
|
-
@options = opts
|
35
|
-
|
36
|
-
@options[:limit] = @options[:limit] || START_LIMIT + MAX_RESULTS
|
37
|
-
|
38
|
-
@query_params = @options[:query_params]
|
39
|
-
|
40
|
-
@output_type = @query_params[:output] || 'xml'
|
41
|
-
|
42
|
-
@query_params[:start] = @query_params[:start] || 0
|
43
|
-
end
|
44
|
-
|
45
|
-
# TODO: implement some logging
|
46
|
-
def start
|
47
|
-
|
48
|
-
if @options[:query_type] == 'QuestionQuery'
|
49
|
-
query(@query_params)
|
50
|
-
elsif @options[:limit] < MAX_RESULTS
|
51
|
-
@query_params[:results] = @options[:limit]
|
52
|
-
query(@query_params)
|
53
|
-
else
|
54
|
-
@query_params[:results] = MAX_RESULTS
|
55
|
-
|
56
|
-
while query(@query_params) do
|
57
|
-
# we get the next start point here
|
58
|
-
@query_params[:start] += MAX_RESULTS
|
59
|
-
|
60
|
-
results_left = @options[:limit] - @query_params[:start]
|
61
|
-
if results_left == 0
|
62
|
-
break
|
63
|
-
elsif results_left < MAX_RESULTS
|
64
|
-
@query_params[:results] = results_left
|
65
|
-
break
|
66
|
-
end
|
67
|
-
end # while
|
68
|
-
|
69
|
-
end # if
|
70
|
-
|
71
|
-
end # start
|
72
|
-
|
73
|
-
private
|
74
|
-
|
75
|
-
def query(params)
|
76
|
-
q = create_query(params)
|
77
|
-
tries = 0
|
78
|
-
begin
|
79
|
-
tries += 1
|
80
|
-
result = q.get
|
81
|
-
output(result)
|
82
|
-
sleep(2)
|
83
|
-
rescue YANAPI::EmptyResponse => e
|
84
|
-
$sderr.puts e
|
85
|
-
return false # do not iterate futher
|
86
|
-
rescue => e # some errors to retry
|
87
|
-
if (tries < 4)
|
88
|
-
sleep(QUERY_INTERVAL**tries)
|
89
|
-
retry
|
90
|
-
else
|
91
|
-
$stderr.puts e
|
92
|
-
return false # do not iterate futher
|
93
|
-
end
|
94
|
-
end
|
95
|
-
return true # we may iterate futher
|
96
|
-
end
|
97
|
-
|
98
|
-
def create_query(params)
|
99
|
-
eval("YANAPI::#{@options[:query_type]}.new(params)")
|
100
|
-
end
|
101
|
-
|
102
|
-
def output(result)
|
103
|
-
if @options[:output_dir]
|
104
|
-
save(result)
|
105
|
-
else
|
106
|
-
puts result
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# save results to a dir
|
111
|
-
# this dir exists since has been proved by OptionParser
|
112
|
-
# not a good implementation
|
113
|
-
# interface (filename, data)
|
114
|
-
def save(result)
|
115
|
-
filename = File.join(@options[:output_dir], "#{@query_params[:start]}.#{@output_type}")
|
116
|
-
file = File.new(filename, 'w')
|
117
|
-
file.puts result
|
118
|
-
file.close
|
119
|
-
end
|
120
|
-
|
121
|
-
end # Yanser
|