yanser 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +21 -1
- data/Rakefile +31 -4
- data/bin/yanser +8 -8
- data/lib/yanser/error.rb +11 -0
- data/lib/yanser/opt_parser.rb +313 -0
- data/lib/yanser/version.rb +3 -0
- data/lib/yanser/yanser.rb +135 -0
- data/test/test_opt_parser.rb +138 -0
- data/test/test_yanser.rb +7 -3
- metadata +13 -22
- data/README.rdoc +0 -21
- data/lib/option_parser.rb +0 -263
- data/lib/version.rb +0 -3
- data/lib/yanser.rb +0 -121
- data/test/data/bad_xml.txt +0 -236
- data/test/data/empty_result.txt +0 -13
- data/test/data/error_code.txt +0 -237
- data/test/data/response_with_error.txt +0 -15
- data/test/data/successfull_response.txt +0 -237
- data/test/test_option_parser.rb +0 -91
- data/test/yanapi/test_query.rb +0 -112
- data/test/yanapi/test_term_query.rb +0 -64
data/test/test_yanser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
require 'test/unit'
|
3
|
-
require 'yanser'
|
3
|
+
require 'yanser/yanser'
|
4
|
+
require 'yanser/version'
|
4
5
|
|
5
6
|
class TestYanser < Test::Unit::TestCase
|
6
7
|
def setup
|
@@ -10,7 +11,7 @@ class TestYanser < Test::Unit::TestCase
|
|
10
11
|
end
|
11
12
|
|
12
13
|
def test_public_methods
|
13
|
-
yanser = Yanser.new(
|
14
|
+
yanser = Yanser::Yanser.new(
|
14
15
|
:query_params => {
|
15
16
|
:appid => 'YahooDemo',
|
16
17
|
:query => 'Haus AND grün',
|
@@ -24,7 +25,10 @@ class TestYanser < Test::Unit::TestCase
|
|
24
25
|
|
25
26
|
assert_respond_to(yanser, :start)
|
26
27
|
end
|
27
|
-
|
28
|
+
|
29
|
+
def test_constants
|
30
|
+
assert(Yanser::VERSION.instance_of?(String))
|
31
|
+
end
|
28
32
|
def test_mandatory_options
|
29
33
|
end
|
30
34
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yanser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Andrei Beliankou
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-07-08 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: yanapi
|
@@ -41,28 +40,20 @@ executables:
|
|
41
40
|
extensions: []
|
42
41
|
|
43
42
|
extra_rdoc_files:
|
44
|
-
- README
|
43
|
+
- README
|
45
44
|
- LICENSE
|
46
45
|
files:
|
47
46
|
- lib/tester.rb
|
48
|
-
- lib/yanser.rb
|
49
|
-
- lib/
|
50
|
-
- lib/
|
47
|
+
- lib/yanser/error.rb
|
48
|
+
- lib/yanser/yanser.rb
|
49
|
+
- lib/yanser/opt_parser.rb
|
50
|
+
- lib/yanser/version.rb
|
51
51
|
- bin/yanser
|
52
|
-
- README.rdoc
|
53
52
|
- LICENSE
|
54
53
|
- Rakefile
|
55
54
|
- README
|
56
|
-
- test/
|
57
|
-
- test/yanapi/test_query.rb
|
58
|
-
- test/yanapi/test_term_query.rb
|
55
|
+
- test/test_opt_parser.rb
|
59
56
|
- test/test_yanser.rb
|
60
|
-
- test/data/bad_xml.txt
|
61
|
-
- test/data/empty_result.txt
|
62
|
-
- test/data/error_code.txt
|
63
|
-
- test/data/response_with_error.txt
|
64
|
-
- test/data/successfull_response.txt
|
65
|
-
has_rdoc: true
|
66
57
|
homepage: http://www.uni-trier.de/index.php?id=34451
|
67
58
|
licenses: []
|
68
59
|
|
@@ -95,10 +86,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
95
86
|
requirements: []
|
96
87
|
|
97
88
|
rubyforge_project: yanser
|
98
|
-
rubygems_version: 1.
|
89
|
+
rubygems_version: 1.7.2
|
99
90
|
signing_key:
|
100
91
|
specification_version: 3
|
101
92
|
summary: Yanser is a convinient search tool providing access to the Yahoo! Answers Q/A collection.
|
102
93
|
test_files:
|
103
|
-
- test/
|
94
|
+
- test/test_opt_parser.rb
|
104
95
|
- test/test_yanser.rb
|
data/README.rdoc
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
= YANSER
|
2
|
-
|
3
|
-
* {RubyGems}[http://rubygems.org/gems/yanser]
|
4
|
-
* Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
|
5
|
-
* {YANAPI Project Page}[http://yanser.rubyforge.org/]
|
6
|
-
|
7
|
-
== DESCRIPTION
|
8
|
-
|
9
|
-
YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI it provides a simple CLI and helps to search for Questions and Answers which contain a set of key words, belong to a specific semantic domain or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
|
10
|
-
|
11
|
-
== SYNOPSIS
|
12
|
-
$ yanser --help
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
== LICENSE
|
17
|
-
|
18
|
-
YANSER is a copyrighted software by Andrei Beliankou, 2011.
|
19
|
-
You may use, redistribute and change it under the terms
|
20
|
-
provided in the LICENSE file.
|
21
|
-
|
data/lib/option_parser.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'optparse'
|
3
|
-
require 'fileutils'
|
4
|
-
|
5
|
-
require 'version'
|
6
|
-
|
7
|
-
class OptionParser
  # Parse yanser's command line and build the options hash.
  #
  # cmd_args is expected to be an array like ARGV; recognised options are
  # removed from it in place (parse! is used).
  #
  # Returns a hash with the keys :query_params (hash of query parameters)
  # and :query_type (String or nil), plus :output_dir and :limit when the
  # corresponding options were given.
  #
  # Terminates the process (exit status 1) when no arguments are given, when
  # an option cannot be parsed, when the region is unknown, when the output
  # directory is not writable or when --appid is missing. --help and
  # --version print their text and exit as well.
  #
  # NOTE: --output-format and --prefix raise NotImplementedError, which is
  # not a StandardError and therefore is not caught by the rescue below.
  def self.parse(cmd_args)
    options = {}
    options[:query_params] = {}

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: yanser OPTIONS'

      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('--appid APPID',
              'Provide an ApplicationID given by Yahoo,',
              'to test Yanser you can use <YahooDemo> as the APPID,',
              'think in this case on limitations placed by Yahoo.',
              'This option is required!'
              ) do |appid|
        options[:query_params][:appid] = appid
      end

      opts.separator ''
      opts.separator ' Mandatory search arguments:'

      opts.on('-k', '--key-word KEYWORD',
              'Provide a single keyword or a boolean expression.'
              ) do |keyword|
        options[:query_params][:query] = keyword
        # Temporary: the search scope is hard-wired to questions.
        options[:query_params][:search_in] = 'question'
      end

      opts.separator ''

      opts.on('-c', '--category CATEGORY',
              'Provide a category name or ID.'
              ) do |category|
        # \A and \z anchor the whole string; ^ and $ would also accept a
        # multi-line value that merely contains an all-digit line.
        if category =~ /\A[[:digit:]]+\z/
          options[:query_params][:category_id] = category
        else
          options[:query_params][:category_name] = category
        end
      end

      opts.separator ''

      opts.on('--user-id ID',
              'Provide an user ID of questions you search for.',
              'This way you can get question by a specific user.'
              ) do |user_id|
        options[:query_params][:user_id] = user_id
      end

      opts.separator ''

      opts.on('--question-id ID',
              'Provide a question ID of the question you search for.',
              'It returns a unique question.'
              ) do |question_id|
        options[:query_params][:question_id] = question_id
      end

      opts.separator ''
      opts.separator ' Optional search arguments:'

      opts.on('-r', '--region REGION',
              'Provide a geographic region to search in for terms.',
              'Possible values are: de, us, uk, ca, au, in, es, br,',
              ' ar, mx, e1, it, fr, sg.',
              'This defaults to en.'
              ) do |region|
        prove_region(region)
        options[:query_params][:region] = region
      end

      opts.separator ''

      opts.on('-o', '--output-dir DIR',
              'Provide an output folder.',
              'This directory will be created if it does not exist yet.'
              ) do |output_dir|
        options[:output_dir] = provide_dir(output_dir)
      end

      opts.separator ''

      opts.on('-l', '--limit NUMBER', Integer,
              'Provide a number of answers you want to get from Yahoo.',
              'This argument is not mandatory, if you want to get',
              'all answers simply ommit this argument.'
              ) do |limit|
        options[:limit] = limit
      end

      opts.separator ''
      opts.on('-f', '--output-format FORMAT',
              'Provide an output format: xml, json, rss, php.',
              'It defaults to xml, and you can simply ommit this option.'
              ) do |f|
        raise NotImplementedError, 'Only default output format is implemented'
      end

      opts.separator ''
      opts.on('--prefix PREFIX',
              'Provide a prefix for the output files. By default',
              'the filename begins with the index of the retrieved',
              'question. You can alter this by providing a prefix.',
              'It can be useful if you want to put many query results',
              'in the same output folder.'
              ) do |pref|
        raise NotImplementedError, 'No prefixes implemeted.'
      end

      opts.separator ""
      opts.separator "Common options:"

      opts.on_tail('-h', '--help', 'Show the help message.') do
        puts opts
        exit
      end

      opts.on_tail('-v', '--version', 'Show the program version.') do
        puts YANSER::VERSION
        exit
      end
    end

    # Without any arguments there is nothing to do: show the help and fail.
    if cmd_args.empty?
      $stderr.print "You have to provide some options.\n\n"
      puts parser
      exit 1
    end

    # Parse the arguments; every StandardError raised while parsing
    # (including those from the option blocks) is reported to the user.
    begin
      parser.parse!(cmd_args)
    rescue => e
      # The message is printed verbatim: it is never used as a printf format
      # (a '%' in a mistyped option would raise) and only its first character
      # is upcased, so the offending option keeps its original spelling.
      $stderr.puts e.message.sub(/\A./) { |first| first.upcase }
      $stderr.puts
      puts parser
      exit 1
    end

    # Check to see if we got the required arguments needed.
    check_required_options(options)

    # Derive the search method from the collected query parameters.
    options[:query_type] = set_query_type(options[:query_params])

    options
  end # parse

  # The helpers below are singleton methods. A bare `private` does not apply
  # to methods defined with `def self.`, so they have always been publicly
  # callable; their visibility is left unchanged for existing callers.

  # Check that the given region is one of the 14 regions listed here.
  # Prints a message to stderr and exits with status 1 otherwise.
  def self.prove_region(region)
    regions = ['de', 'us', 'uk', 'ca', 'au', 'in', 'es',
               'br', 'ar', 'mx', 'e1', 'it', 'fr', 'sg']

    unless regions.include?(region)
      $stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
      exit 1
    end
  end

  # Pick one of the query types 'CategoryQuery', 'TermQuery', 'UserQuery' or
  # 'QuestionQuery' from the query parameters. A category without a keyword
  # is a category search; a keyword always results in a term search.
  # Returns nil when no search criterion is present.
  def self.set_query_type(params)
    if (params[:category_id] || params[:category_name]) && !params[:query]
      'CategoryQuery'
    elsif params[:query]
      'TermQuery'
    elsif params[:user_id]
      'UserQuery'
    elsif params[:question_id]
      'QuestionQuery'
    end
  end

  # Exit with status 1 and a message on stderr unless every required query
  # parameter (currently only :appid) has been provided.
  def self.check_required_options(options)
    required_opts = [:appid]

    required_opts.each do |opt|
      next if options[:query_params].has_key?(opt)

      $stderr.puts "A required option --#{opt} is missing."
      exit 1
    end
  end

  # Expand dir to an absolute path and make sure it can be used for output.
  # An existing directory must be writable (otherwise the process exits with
  # status 1); a missing directory is created together with its parents.
  # Returns the absolute path.
  def self.provide_dir(dir)
    dir = File.expand_path(dir)

    unless File.directory?(dir)
      FileUtils.mkdir_p(dir)
      return dir
    end

    unless File.writable?(dir)
      $stderr.puts 'The directory you have provided is not writable!'
      exit 1
    end

    dir
  end # provide_dir

end # OptionParser
|
226
|
-
|
227
|
-
__END__
|
228
|
-
|
229
|
-
-c, --category-id # Term & CategorySearch
|
230
|
-
-q, --question-id # QuestionSearch
|
231
|
-
|
232
|
-
-t, --time-interval
|
233
|
-
-f, --output-format
|
234
|
-
-p, --prefix #prefix for output files
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
instance interface
|
240
|
-
|
241
|
-
{
|
242
|
-
:query_params => {
|
243
|
-
:appid => 'YahooDemo' | 'SomeStringWithYourID',
|
244
|
-
:callback => '',
|
245
|
-
:category_id => '',
|
246
|
-
:category_name => '',
|
247
|
-
:date_range => '',
|
248
|
-
:filter => '',
|
249
|
-
:output => '',
|
250
|
-
:query => '',
|
251
|
-
:question_id => '',
|
252
|
-
:region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
|
253
|
-
:results => Integer, # 0..50
|
254
|
-
:search_in => "all" | "question" | "best_answer", # default 'all'
|
255
|
-
:sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
|
256
|
-
:start => Integer, # <= 1000
|
257
|
-
:type => "all" | "resolved" | "open" | "undecided", # default 'all'
|
258
|
-
:user_id => ''
|
259
|
-
},
|
260
|
-
:query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
|
261
|
-
:output_dir => 'some path',
|
262
|
-
:prefix => 'some prefix' # prefix for output files
|
263
|
-
}
|
data/lib/version.rb
DELETED
data/lib/yanser.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
require 'yanapi'
|
2
|
-
|
3
|
-
# :title: YANSER, Yahoo! ANSwers harvestER
|
4
|
-
# :main: Yanser
|
5
|
-
# Main processing class.
|
6
|
-
# Yanser encapsulates the main routine and instantiates
|
7
|
-
# all other classes.
|
8
|
-
#--
|
9
|
-
# Yanser takes the users input and validates it.
|
10
|
-
# It decides which search method to choose.
|
11
|
-
# Then it collects all parameters and useful default values,
|
12
|
-
# creates an XyzQuery with the starting point of 0 and gets the first results.
|
13
|
-
# If more results were requested, Yanser creates a similar XyzQuery and gets
|
14
|
-
# the next result set until the result limitation set by the user is met.
|
15
|
-
#
|
16
|
-
class Yanser

  # Yahoo! Answers does not return results starting beyond the 1000th
  # question, so it makes no sense to page any further.
  START_LIMIT = 1000

  # Pause in seconds between two requests; also the base of the growing
  # delay used between retries of a failed request.
  QUERY_INTERVAL = 2

  # Yahoo! Answers returns at most 50 results per request.
  MAX_RESULTS = 50

  # Number of attempts made for a single request before giving up.
  MAX_TRIES = 4

  # opts is the hash produced by the option parser, at least
  # {:query_type => String, :query_params => {...}}. It is expected to be
  # valid already; no validation happens here.
  #
  # The hash is stored and modified in place: a missing :limit becomes
  # START_LIMIT + MAX_RESULTS and a missing :start query parameter becomes 0.
  def initialize(opts)
    @options = opts
    @options[:limit] ||= START_LIMIT + MAX_RESULTS

    @query_params = @options[:query_params]
    @output_type = @query_params[:output] || 'xml'
    @query_params[:start] ||= 0
  end

  # Run the harvest.
  #
  # A QuestionQuery is sent once. A limit below MAX_RESULTS is fetched with a
  # single request of that size. Otherwise pages of MAX_RESULTS are requested
  # until the limit is reached or a request fails; a final partial page is
  # fetched with the remaining number of results.
  #
  # TODO: implement some logging
  def start
    if @options[:query_type] == 'QuestionQuery'
      query(@query_params)
    elsif @options[:limit] < MAX_RESULTS
      @query_params[:results] = @options[:limit]
      query(@query_params)
    else
      @query_params[:results] = MAX_RESULTS

      while query(@query_params)
        # Move on to the next page.
        @query_params[:start] += MAX_RESULTS

        results_left = @options[:limit] - @query_params[:start]
        # <= also stops when the start offset is already past the limit.
        break if results_left <= 0

        if results_left < MAX_RESULTS
          # Fetch the last, partial page before stopping.
          @query_params[:results] = results_left
          query(@query_params)
          break
        end
      end
    end
  end # start

  private

  # Send one request built from params and hand its result to #output,
  # then pause for QUERY_INTERVAL seconds.
  #
  # Returns true when the caller may continue with the next page. Returns
  # false after an empty response, or when the request still fails after
  # MAX_TRIES attempts (the error is printed to stderr in both cases).
  def query(params)
    q = create_query(params)
    tries = 0
    begin
      tries += 1
      result = q.get
      output(result)
      sleep(QUERY_INTERVAL)
    rescue YANAPI::EmptyResponse => e
      $stderr.puts e
      return false # nothing more to fetch
    rescue => e
      if tries < MAX_TRIES
        # Wait longer after every failed attempt: 2, 4, 8 seconds.
        sleep(QUERY_INTERVAL**tries)
        retry
      end
      $stderr.puts e
      return false # give up
    end
    true
  end

  # Instantiate the YANAPI query class named by @options[:query_type].
  # The class is looked up by name instead of evaluating a code string.
  def create_query(params)
    YANAPI.const_get(@options[:query_type]).new(params)
  end

  # Save the result when an output directory is configured,
  # print it to stdout otherwise.
  def output(result)
    if @options[:output_dir]
      save(result)
    else
      puts result
    end
  end

  # Write the result to "<start>.<output type>" inside the output directory.
  # The directory is assumed to exist already (presumably prepared by the
  # option parser).
  def save(result)
    filename = File.join(@options[:output_dir],
                         "#{@query_params[:start]}.#{@output_type}")
    # The block form closes the file even when writing fails.
    File.open(filename, 'w') { |file| file.puts result }
  end

end # Yanser
|