yanser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2011 Andrei Beliankou, University of Trier, Germany
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README ADDED
@@ -0,0 +1 @@
1
+ new readme
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = YANSER
2
+
3
+ * {RubyGems}[http://rubygems.org/gems/yanser]
4
+ * Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
5
+ * {YANSER Project Page}[http://yanser.rubyforge.org/]
6
+
7
+ == DESCRIPTION
8
+
9
+ YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI, it provides a simple CLI and helps to search for questions and answers which contain a set of keywords, belong to a specific semantic domain or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
10
+
11
+ == SYNOPSIS
12
+ $ yanser --help
13
+
14
+
15
+
16
+ == LICENSE
17
+
18
+ YANSER is copyrighted software by Andrei Beliankou, 2011.
19
+ You may use, redistribute and change it under the terms
20
+ provided in the LICENSE file.
21
+
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # we can require 'rake/clean' to add 'clobber' and 'clean' tasks
4
+ require 'rake/clean'
5
+ require 'rake/testtask'
6
+
7
+
8
+ SRC = FileList['**/*.rb'] # every Ruby file below the current directory; not referenced again in this Rakefile
9
+
10
+ CLOBBER.include('doc', '**/*.html', '**/*.gem') # adds the doc directory, HTML files and built gems to the clobber list
11
+
12
+ Rake::TestTask.new do |t|
13
+ t.test_files = FileList.new('test/yanapi/*.rb') # NOTE(review): the path names yanapi, not yanser -- confirm this is the intended test directory
14
+ end
data/bin/yanser ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # This is not the right solution: the script should not have to state directly
5
+ # where the library files live, but it stays this way for now.
6
+ lib_path = File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
8
+
9
+ # Requiring both files here is the right approach: only this way do the classes Yanser and OptionParser stay independent of each other.
10
+ require 'yanser'
11
+ require 'option_parser'
12
+
13
+ # Parse the command line through the class method; it returns the options hash.
14
+ options = OptionParser.parse(ARGV)
15
+
16
+
17
+ yanser = Yanser.new(options)
18
+
19
+ yanser.start
@@ -0,0 +1,263 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'optparse'
3
+ require 'fileutils'
4
+
5
+ require 'version'
6
+
7
# Command line front end of yanser.
#
# This reopens the standard library's OptionParser and adds the class method
# OptionParser.parse, which turns an ARGV-like array into the options hash
# consumed by Yanser:
#
#   {
#     :query_params => { :appid => ..., :query => ..., ... },
#     :query_type   => 'TermQuery' | 'CategoryQuery' | 'UserQuery' | 'QuestionQuery',
#     :output_dir   => '/absolute/path', # only with --output-dir
#     :limit        => Integer           # only with --limit
#   }
class OptionParser
  # Parses +cmd_args+ (an array like ARGV, consumed in place) and returns the
  # options hash described above.
  #
  # Prints a message and terminates the process with status 1 when no
  # arguments are given, when parsing fails, when --appid is missing or when
  # no search argument was provided. --help and --version print their text
  # and exit with status 0.
  def self.parse(cmd_args)
    options = {}
    options[:query_params] = {}

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: yanser OPTIONS'

      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('--appid APPID',
              'Provide an ApplicationID given by Yahoo,',
              'to test Yanser you can use <YahooDemo> as the APPID,',
              'think in this case on limitations placed by Yahoo.',
              'This option is required!'
             ) do |appid|
        options[:query_params][:appid] = appid
      end

      opts.separator ''
      opts.separator ' Mandatory search arguments:'

      opts.on('-k', '--key-word KEYWORD',
              'Provide a single keyword or a boolean expression.'
             ) do |keyword|
        options[:query_params][:query] = keyword
        # Keyword searches are restricted to the question text for now.
        options[:query_params][:search_in] = 'question'
      end

      opts.separator ''

      opts.on('-c', '--category CATEGORY',
              'Provide a category name or ID.'
             ) do |category|
        # \A and \z anchor the whole argument; ^ and $ would accept a value
        # that merely contains one all-digit line.
        if category =~ /\A[[:digit:]]+\z/
          options[:query_params][:category_id] = category
        else
          options[:query_params][:category_name] = category
        end
      end

      opts.separator ''

      opts.on('--user-id ID',
              'Provide an user ID of questions you search for.',
              'This way you can get question by a specific user.'
             ) do |user_id|
        options[:query_params][:user_id] = user_id
      end

      opts.separator ''

      opts.on('--question-id ID',
              'Provide a question ID of the question you search for.',
              'It returns a unique question.'
             ) do |question_id|
        options[:query_params][:question_id] = question_id
      end

      opts.separator ''
      opts.separator ' Optional search arguments:'

      opts.on('-r', '--region REGION',
              'Provide a geographic region to search in for terms.',
              'Possible values are: de, us, uk, ca, au, in, es, br,',
              ' ar, mx, e1, it, fr, sg.',
              'This defaults to en.'
             ) do |region|
        prove_region(region)
        options[:query_params][:region] = region
      end

      opts.separator ''

      opts.on('-o', '--output-dir DIR',
              'Provide an output folder.',
              'This directory will be created if it does not exist yet.'
             ) do |output_dir|
        options[:output_dir] = provide_dir(output_dir)
      end

      opts.separator ''

      opts.on('-l', '--limit NUMBER', Integer,
              'Provide a number of answers you want to get from Yahoo.',
              'This argument is not mandatory, if you want to get',
              'all answers simply ommit this argument.'
             ) do |limit|
        options[:limit] = limit
      end

      opts.separator ''
      opts.on('-f', '--output-format FORMAT',
              'Provide an output format: xml, json, rss, php.',
              'It defaults to xml, and you can simply ommit this option.'
             ) do |_format|
        raise NotImplementedError, 'Only default output format is implemented'
      end

      opts.separator ''
      opts.on('--prefix PREFIX',
              'Provide a prefix for the output files. By default',
              'the filename begins with the index of the retrieved',
              'question. You can alter this by providing a prefix.',
              'It can be useful if you want to put many query results',
              'in the same output folder.'
             ) do |_prefix|
        raise NotImplementedError, 'No prefixes implemeted.'
      end

      opts.separator ""
      opts.separator "Common options:"

      opts.on_tail('-h', '--help', 'Show the help message.') do
        puts opts
        exit
      end

      opts.on_tail('-v', '--version', 'Show the program version.') do
        puts YANSER::VERSION
        exit
      end
    end

    # Without any argument there is nothing to do: show the help and fail.
    if cmd_args.empty?
      $stderr.print "You have to provide some options.\n\n"
      puts parser
      exit 1
    end

    begin
      parser.parse!(cmd_args)
    rescue StandardError, NotImplementedError => e
      # NotImplementedError is a ScriptError, so a bare rescue would let the
      # "not implemented" options crash with a backtrace instead of a message.
      # print is used instead of printf because the message may contain user
      # input with '%' sequences, which printf would read as format directives.
      $stderr.print "#{e.message.capitalize}\n\n"
      puts parser
      exit 1
    end

    check_required_options(options)

    options[:query_type] = set_query_type(options[:query_params])

    # Every query class needs a search argument; fail here with a clear
    # message instead of handing a nil query type to Yanser.
    unless options[:query_type]
      $stderr.puts 'A search argument is missing: provide one of --key-word, --category, --user-id or --question-id.'
      exit 1
    end

    options
  end # parse

  # Terminates the process with status 1 unless +region+ is one of the
  # 14 region codes listed below.
  def self.prove_region(region)
    regions = ['de', 'us', 'uk', 'ca', 'au', 'in', 'es',
               'br', 'ar', 'mx', 'e1', 'it', 'fr', 'sg']
    return if regions.include?(region)

    $stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
    exit 1
  end

  # Maps the collected query parameters to the name of a query class:
  # 'TermQuery', 'CategoryQuery', 'UserQuery' or 'QuestionQuery'.
  # A keyword wins over a category, a category over a user ID and a user ID
  # over a question ID. Returns nil when no search argument is present.
  def self.set_query_type(params)
    if params[:query]
      'TermQuery'
    elsif params[:category_id] || params[:category_name]
      'CategoryQuery'
    elsif params[:user_id]
      'UserQuery'
    elsif params[:question_id]
      'QuestionQuery'
    end
  end

  # Terminates the process with status 1 when a required option (currently
  # only --appid) is absent from options[:query_params].
  def self.check_required_options(options)
    required_opts = [:appid]

    required_opts.each do |opt|
      next if options[:query_params].has_key?(opt)

      $stderr.puts "A required option --#{opt} is missing."
      exit 1
    end
  end

  # Expands +dir+ to an absolute path and returns it. An existing directory
  # must be writable (otherwise the process exits with status 1); a missing
  # one is created together with its parents.
  def self.provide_dir(dir)
    dir = File.expand_path(dir)

    unless File.directory?(dir)
      FileUtils.mkdir_p(dir)
      return dir
    end

    return dir if File.writable?(dir)

    $stderr.puts 'The directory you have provided is not writable!'
    exit 1
  end # provide_dir

  # A bare `private` does not affect methods defined with `def self.`, so the
  # helpers are hidden explicitly.
  private_class_method :prove_region, :set_query_type,
                       :check_required_options, :provide_dir
end # OptionParser
226
+
227
+ __END__
228
+
229
+ -c, --category-id # Term & CategorySearch
230
+ -q, --question-id # QuestionSearch
231
+
232
+ -t, --time-interval
233
+ -f, --output-format
234
+ -p, --prefix #prefix for output files
235
+
236
+
237
+
238
+
239
+ instance interface
240
+
241
+ {
242
+ :query_params => {
243
+ :appid => 'YahooDemo' | 'SomeStringWithYourID',
244
+ :callback => '',
245
+ :category_id => '',
246
+ :category_name => '',
247
+ :date_range => '',
248
+ :filter => '',
249
+ :output => '',
250
+ :query => '',
251
+ :question_id => '',
252
+ :region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
253
+ :results => Integer, # 0..50
254
+ :search_in => "all" | "question" | "best_answer", # default 'all'
255
+ :sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
256
+ :start => Integer, # <= 1000
257
+ :type => "all" | "resolved" | "open" | "undecided", # default 'all'
258
+ :user_id => ''
259
+ },
260
+ :query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
261
+ :output_dir => 'some path',
262
+ :prefix => 'some prefix' # prefix for output files
263
+ }
data/lib/tester.rb ADDED
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'yanser'
4
+
5
# Manual smoke test: runs a small keyword search for 'Köln' in the German
# region using Yahoo's demo application ID.
params = {
  # Yanser reads the name of the query class from :query_type.
  :query_type => 'TermQuery',
  # Yanser#start computes :results from :limit, so the limit is what keeps
  # this run down to five results.
  :limit => 5,
  :query_params => {
    :appid => 'YahooDemo',
    :query => 'Köln',
    :region => 'de',
    :results => 5,
    :start => 0
  }
}

y = Yanser.new(params)

y.start
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
# Namespace holding the version information of the yanser gem.
module YANSER
  # Version string of this release; frozen so it cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
data/lib/yanser.rb ADDED
@@ -0,0 +1,121 @@
1
+ require 'yanapi'
2
+
3
+ # :title: YANSER, Yahoo! ANSwers harvestER
4
+ # :main: Yanser
5
+ # Main processing class.
6
+ # Yanser encapsulates the main routine and instantiates
7
+ # all other classes.
8
+ #--
9
+ # Yanser takes the users input and validates it.
10
+ # It decides which search method to choose.
11
+ # Then it collects all parameters and useful default values,
12
+ # creates an XyzQuery with the starting point of 0 and gets the first results.
13
+ # If more results were requested, Yanser creates a similar XyzQuery and gets
14
+ # the next result set until the result limitation set by the user is met.
15
+ #
16
# Main processing class: takes the options hash produced by the option
# parser, builds YANAPI queries from it and prints or saves every result set.
class Yanser

  # Yahoo! Answers does not return results beyond the 1000th question,
  # so it makes no sense to step over this start offset.
  START_LIMIT = 1000

  # Base pause in seconds between two requests to the web service.
  QUERY_INTERVAL = 2

  # Yahoo! Answers returns at most 50 results per request.
  MAX_RESULTS = 50

  # Number of attempts made for one request before giving up.
  MAX_TRIES = 4

  # +opts+ is the hash produced by the option parser:
  #   :query_type   - name of a query class inside YANAPI, e.g. 'TermQuery'
  #   :query_params - parameters handed to that query class (must be a Hash)
  #   :limit        - optional overall number of results to fetch
  #   :output_dir   - optional directory the results are written to
  # The options are supposed to be correct, they are not validated here.
  # Defaults for :limit and :query_params[:start] are written into the
  # given hash.
  def initialize(opts)
    @options = opts
    @options[:limit] ||= START_LIMIT + MAX_RESULTS

    @query_params = @options[:query_params]
    @output_type = @query_params[:output] || 'xml'
    @query_params[:start] ||= 0
  end

  # Runs the search. A QuestionQuery is sent once as is, a limit below
  # MAX_RESULTS results in a single request of that size, anything else is
  # fetched in consecutive batches until the limit is reached or a request
  # fails.
  # TODO: implement some logging
  def start
    if @options[:query_type] == 'QuestionQuery'
      query(@query_params)
    elsif @options[:limit] < MAX_RESULTS
      @query_params[:results] = @options[:limit]
      query(@query_params)
    else
      harvest_batches
    end
  end # start

  private

  # Requests batches of up to MAX_RESULTS results, advancing
  # @query_params[:start] after every successful request. The last batch is
  # shrunk to the number of results still missing and is requested as well.
  def harvest_batches
    loop do
      results_left = @options[:limit] - @query_params[:start]
      break if results_left <= 0

      @query_params[:results] = [results_left, MAX_RESULTS].min
      break unless query(@query_params)

      @query_params[:start] += @query_params[:results]
    end
  end

  # Sends one request built from +params+ and outputs its result.
  # Returns true when the caller may go on with the next batch and false when
  # the response was empty or the request kept failing. A failing request is
  # repeated up to MAX_TRIES times with a growing pause in between.
  def query(params)
    q = create_query(params)
    tries = 0
    begin
      tries += 1
      result = q.get
      output(result)
      sleep(QUERY_INTERVAL)
    rescue YANAPI::EmptyResponse => e
      $stderr.puts e
      return false # do not iterate further
    rescue => e # some errors to retry
      if tries < MAX_TRIES
        sleep(QUERY_INTERVAL**tries)
        retry
      else
        $stderr.puts e
        return false # do not iterate further
      end
    end
    true # we may iterate further
  end

  # Instantiates the query class named by @options[:query_type].
  # The class is looked up as a constant of YANAPI only, so no code is
  # evaluated from the option value.
  # Raises ArgumentError when no query type is set.
  def create_query(params)
    type = @options[:query_type].to_s
    raise ArgumentError, 'No query type given.' if type.empty?

    YANAPI.const_get(type, false).new(params)
  end

  # Saves +result+ when an output directory is configured, prints it to the
  # standard output otherwise.
  def output(result)
    if @options[:output_dir]
      save(result)
    else
      puts result
    end
  end

  # Writes +result+ into the output directory; the file is named after the
  # start offset of the current request and the output type.
  # The block form closes the file even when writing fails.
  def save(result)
    filename = File.join(@options[:output_dir], "#{@query_params[:start]}.#{@output_type}")
    File.open(filename, 'w') { |file| file.puts result }
  end

end # Yanser