yanser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2011 Andrei Beliankou, University of Trier, Germany
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README ADDED
@@ -0,0 +1 @@
1
+ new readme
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = YANSER
2
+
3
+ * {RubyGems}[http://rubygems.org/gems/yanser]
4
+ * Developers {Homepage}[http://www.uni-trier.de/index.php?id=24140]
5
+ * {YANSER Project Page}[http://yanser.rubyforge.org/]
6
+
7
+ == DESCRIPTION
8
+
9
+ YANSER (Yahoo! ANSwers harvestER) is a convenient search tool providing access to the Yahoo! Answers Q&A collection. Based on YANAPI, it provides a simple CLI and helps to search for Questions and Answers which contain a set of keywords, belong to a specific semantic domain or are posted by a certain user. Yanser is a research tool in the field of Computational Linguistics.
10
+
11
+ == SYNOPSIS
12
+ $ yanser --help
13
+
14
+
15
+
16
+ == LICENSE
17
+
18
+ YANSER is copyrighted software by Andrei Beliankou, 2011.
19
+ You may use, redistribute and change it under the terms
20
+ provided in the LICENSE file.
21
+
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # we can require 'rake/clean' to add 'clobber' and 'clean' tasks
4
+ require 'rake/clean'
5
+ require 'rake/testtask'
6
+
7
+
8
+ SRC = FileList['**/*.rb'] # every Ruby file below the project root; not referenced again in this Rakefile
9
+
10
+ CLOBBER.include('doc', '**/*.html', '**/*.gem') # extra patterns for the 'clobber' task provided by rake/clean
11
+
12
+ Rake::TestTask.new do |t| # test task built from the file list assigned below
13
+ t.test_files = FileList.new('test/yanapi/*.rb') # NOTE(review): the path says yanapi, not yanser - confirm the test directory
14
+ end
data/bin/yanser ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # This is the wrong approach: the script should not have to state directly
5
+ # where the library files live, but it stays this way for now.
6
+ lib_path = File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
8
+
9
+ # Requiring both files here is the right way: only like this do the Yanser and OptionParser classes stay independent of each other.
10
+ require 'yanser'
11
+ require 'option_parser'
12
+
13
+ # OptionParser.parse is a class method, so no parser instance is created here.
14
+ options = OptionParser.parse(ARGV)
15
+
16
+
17
+ yanser = Yanser.new(options)
18
+
19
+ yanser.start
@@ -0,0 +1,263 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'optparse'
3
+ require 'fileutils'
4
+
5
+ require 'version'
6
+
7
# Command line front end of yanser. It reopens the OptionParser class that
# 'optparse' provides and adds a handful of class methods to it.
class OptionParser
  # Turns the command line into the options hash that is handed to Yanser.new.
  #
  # cmd_args:: Array of Strings shaped like ARGV. Recognised switches are
  #            removed from it, because it is passed to parse!.
  #
  # Returns a Hash holding :query_params (Hash of search parameters),
  # :query_type ('TermQuery', 'CategoryQuery', 'UserQuery' or
  # 'QuestionQuery') and, when given on the command line, :output_dir and
  # :limit.
  #
  # Every failure is reported on $stderr and ends the process with status 1:
  # an empty command line, an option that cannot be parsed or is not
  # implemented, an unsupported region, a missing --appid and a missing
  # search argument. --help and --version print their text and exit with
  # status 0.
  def self.parse(cmd_args)
    options = { :query_params => {} }
    query_params = options[:query_params]

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: yanser OPTIONS'

      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('--appid APPID',
              'Provide an ApplicationID given by Yahoo,',
              'to test Yanser you can use <YahooDemo> as the APPID,',
              'think in this case on limitations placed by Yahoo.',
              'This option is required!') do |appid|
        query_params[:appid] = appid
      end

      opts.separator ''
      opts.separator ' Mandatory search arguments:'

      opts.on('-k', '--key-word KEYWORD',
              'Provide a single keyword or a boolean expression.') do |keyword|
        query_params[:query] = keyword
        # Provisional: the search scope is hard-coded to the question text.
        query_params[:search_in] = 'question'
      end

      opts.separator ''

      opts.on('-c', '--category CATEGORY',
              'Provide a category name or ID.') do |category|
        # \A and \z anchor the whole argument; ^ and $ would accept any
        # multi-line value that merely contains one all-digit line.
        if category =~ /\A[[:digit:]]+\z/
          query_params[:category_id] = category
        else
          query_params[:category_name] = category
        end
      end

      opts.separator ''

      opts.on('--user-id ID',
              'Provide an user ID of questions you search for.',
              'This way you can get question by a specific user.') do |user_id|
        query_params[:user_id] = user_id
      end

      opts.separator ''

      opts.on('--question-id ID',
              'Provide a question ID of the question you search for.',
              'It returns a unique question.') do |question_id|
        query_params[:question_id] = question_id
      end

      opts.separator ''
      opts.separator ' Optional search arguments:'

      opts.on('-r', '--region REGION',
              'Provide a geographic region to search in for terms.',
              'Possible values are: de, us, uk, ca, au, in, es, br,',
              ' ar, mx, e1, it, fr, sg.',
              'This defaults to en.') do |region|
        prove_region(region)
        query_params[:region] = region
      end

      opts.separator ''

      opts.on('-o', '--output-dir DIR',
              'Provide an output folder.',
              'This directory will be created if it does not exist yet.') do |output_dir|
        options[:output_dir] = provide_dir(output_dir)
      end

      opts.separator ''

      opts.on('-l', '--limit NUMBER', Integer,
              'Provide a number of answers you want to get from Yahoo.',
              'This argument is not mandatory, if you want to get',
              'all answers simply ommit this argument.') do |limit|
        options[:limit] = limit
      end

      opts.separator ''
      opts.on('-f', '--output-format FORMAT',
              'Provide an output format: xml, json, rss, php.',
              'It defaults to xml, and you can simply ommit this option.') do |_format|
        raise NotImplementedError, 'Only default output format is implemented'
      end

      opts.separator ''
      opts.on('--prefix PREFIX',
              'Provide a prefix for the output files. By default',
              'the filename begins with the index of the retrieved',
              'question. You can alter this by providing a prefix.',
              'It can be useful if you want to put many query results',
              'in the same output folder.') do |_prefix|
        raise NotImplementedError, 'No prefixes implemeted.'
      end

      opts.separator ""
      opts.separator "Common options:"

      opts.on_tail('-h', '--help', 'Show the help message.') do
        puts opts
        exit
      end

      opts.on_tail('-v', '--version', 'Show the program version.') do
        puts YANSER::VERSION
        exit
      end
    end

    # Without any argument there is nothing to do: show the help instead.
    if cmd_args.empty?
      $stderr.print "You have to provide some options.\n\n"
      puts parser
      exit 1
    end

    begin
      parser.parse!(cmd_args)
    rescue StandardError, NotImplementedError => e
      # NotImplementedError is a ScriptError, so a bare rescue would let the
      # "not implemented" switches end in a backtrace. The message is written
      # with print, not printf: user input inside it may contain '%'.
      $stderr.print "#{e.message.capitalize}\n\n"
      puts parser
      exit 1
    end

    check_required_options(options)

    options[:query_type] = set_query_type(query_params)
    unless options[:query_type]
      # Without a query type no query class can be chosen later on.
      $stderr.puts 'You have to provide a search argument: ' \
                   '--key-word, --category, --user-id or --question-id.'
      exit 1
    end

    options
  end # parse

  # Checks the region code given with --region against the 14 regions
  # listed in the help text; any other value ends the process with status 1.
  def self.prove_region(region)
    regions = %w[de us uk ca au in es br ar mx e1 it fr sg]
    return if regions.include?(region)

    $stderr.puts "The provided search region #{region} is currently not supported by Yahoo!"
    exit 1
  end

  # Derives the query type from the collected search parameters.
  # A keyword wins over everything else; a category alone gives a
  # CategoryQuery. Returns nil when no search argument was given.
  def self.set_query_type(params)
    if params[:query]
      'TermQuery'
    elsif params[:category_id] || params[:category_name]
      'CategoryQuery'
    elsif params[:user_id]
      'UserQuery'
    elsif params[:question_id]
      'QuestionQuery'
    end
  end

  # Ends the process with status 1 when a required option (currently only
  # --appid) is missing from options[:query_params].
  def self.check_required_options(options)
    [:appid].each do |opt|
      next if options[:query_params].key?(opt)

      $stderr.puts "A required option --#{opt} is missing."
      exit 1
    end
  end

  # Expands dir to an absolute path and makes sure it can be used for
  # output: a missing directory is created, an existing one has to be
  # writable, otherwise the process ends with status 1.
  # Returns the expanded path.
  def self.provide_dir(dir)
    dir = File.expand_path(dir)

    unless File.directory?(dir)
      FileUtils.mkdir_p(dir)
      return dir
    end

    return dir if File.writable?(dir)

    $stderr.puts 'The directory you have provided is not writable!'
    exit 1
  end # provide_dir

  # A bare `private` does not cover `def self.` methods, so the helpers are
  # hidden explicitly.
  private_class_method :prove_region, :set_query_type,
                       :check_required_options, :provide_dir
end # OptionParser
226
+
227
+ __END__
228
+
229
+ -c, --category-id # Term & CategorySearch
230
+ -q, --question-id # QuestionSearch
231
+
232
+ -t, --time-interval
233
+ -f, --output-format
234
+ -p, --prefix #prefix for output files
235
+
236
+
237
+
238
+
239
+ instance interface
240
+
241
+ {
242
+ :query_params => {
243
+ :appid => 'YahooDemo' | 'SomeStringWithYourID',
244
+ :callback => '',
245
+ :category_id => '',
246
+ :category_name => '',
247
+ :date_range => '',
248
+ :filter => '',
249
+ :output => '',
250
+ :query => '',
251
+ :question_id => '',
252
+ :region => 'de'|'us'|'uk'|'ca'|'au'|'in'|'es'|'br'|'ar'|'mx'|'e1'|'it'|'fr'|'sg', # default 'en'
253
+ :results => Integer, # 0..50
254
+ :search_in => "all" | "question" | "best_answer", # default 'all'
255
+ :sort => 'relevance' | 'date_desc'| 'date_asc', # default 'relevance'
256
+ :start => Integer, # <= 1000
257
+ :type => "all" | "resolved" | "open" | "undecided", # default 'all'
258
+ :user_id => ''
259
+ },
260
+ :query_type => 'TermQuery'|'QuestionQuery'|'UserQuery'|'CategoryQuery',
261
+ :output_dir => 'some path',
262
+ :prefix => 'some prefix' # prefix for output files
263
+ }
data/lib/tester.rb ADDED
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'yanser'
4
+
5
# Manual smoke test: runs Yanser once with a small term search, using the
# demo application ID.
params = {
  # Yanser chooses the query class from :query_type; the former
  # :query_method key was never read, so no query class could be selected.
  :query_type => 'TermQuery',
  :query_params => {
    :appid => 'YahooDemo',
    :query => 'Köln',
    :region => 'de',
    :results => 5,
    :start => 0
  }
}

y = Yanser.new(params)

y.start
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
# Namespace for gem-wide metadata of yanser.
module YANSER
  # Version string of the gem, printed by the --version switch.
  # Frozen so that no caller can modify the shared constant in place.
  VERSION = '0.0.1'.freeze
end
data/lib/yanser.rb ADDED
@@ -0,0 +1,121 @@
1
+ require 'yanapi'
2
+
3
+ # :title: YANSER, Yahoo! ANSwers harvestER
4
+ # :main: Yanser
5
+ # Main processing class.
6
+ # Yanser encapsulates the main routine and instantiates
7
+ # all other classes.
8
+ #--
9
+ # Yanser takes the users input and validates it.
10
+ # It decides which search method to choose.
11
+ # Then it collects all parameters and useful default values,
12
+ # creates an XyzQuery with the starting point of 0 and gets the first results.
13
+ # If more results were requested, Yanser creates a similar XyzQuery and gets
14
+ # the next result set until the result limitation set by the user is met.
15
+ #
16
class Yanser

  # Yahoo! Answers returns starting not more than at the 1000st question.
  # It makes no sense to step over.
  START_LIMIT = 1000

  # Pause in seconds after every successful query; also the base of the
  # growing delay between retries.
  QUERY_INTERVAL = 2

  # Yahoo! Answers returns maximum 50 results.
  MAX_RESULTS = 50

  # opts:: the Hash produced by OptionParser.parse, at least
  #        {:query_type => String, :query_params => Hash}. It is trusted and
  #        not validated here. The hash is kept and modified in place:
  #        :limit and query_params[:start] receive defaults when missing.
  def initialize(opts)
    @options = opts
    @options[:limit] ||= START_LIMIT + MAX_RESULTS

    @query_params = @options[:query_params]
    @output_type = @query_params[:output] || 'xml'
    @query_params[:start] ||= 0
  end

  # Runs the search and writes every result set with #output.
  #
  # A QuestionQuery is sent exactly once. A limit below MAX_RESULTS is
  # fetched with a single query. Anything else is fetched in batches of
  # MAX_RESULTS, the last batch being smaller when the limit is not a
  # multiple of MAX_RESULTS. Paging stops when the limit is reached, when
  # the start offset passes START_LIMIT or when a query fails.
  #
  # TODO: implement some logging
  def start
    if @options[:query_type] == 'QuestionQuery'
      query(@query_params)
    elsif @options[:limit] < MAX_RESULTS
      @query_params[:results] = @options[:limit]
      query(@query_params)
    else
      @query_params[:results] = MAX_RESULTS

      while query(@query_params)
        # Move on to the next page.
        @query_params[:start] += MAX_RESULTS
        break if @query_params[:start] > START_LIMIT

        results_left = @options[:limit] - @query_params[:start]
        break if results_left <= 0

        # Shrink the final batch and keep looping, so that it is actually
        # fetched instead of being dropped.
        @query_params[:results] = results_left if results_left < MAX_RESULTS
      end
    end
  end # start

  private

  # Sends one query and outputs its result.
  #
  # Returns true when the result was written and paging may continue.
  # Returns false on an empty response, or when the query still fails after
  # four attempts; the error is then printed on $stderr.
  def query(params)
    q = create_query(params)
    tries = 0
    begin
      tries += 1
      output(q.get)
      sleep(QUERY_INTERVAL)
    rescue YANAPI::EmptyResponse => e
      $stderr.puts e
      return false # do not iterate further
    rescue => e # anything else is worth a retry
      if tries < 4
        # Wait 2, 4, then 8 seconds before the next attempt.
        sleep(QUERY_INTERVAL**tries)
        retry
      end
      $stderr.puts e
      return false # do not iterate further
    end
    true # we may iterate further
  end

  # Instantiates the YANAPI class named by @options[:query_type].
  # The class is looked up inside the YANAPI namespace only (no eval, no
  # fallback to top-level constants), because the name originates from
  # user input.
  def create_query(params)
    YANAPI.const_get(@options[:query_type], false).new(params)
  end

  # Saves the result when an output directory was given, otherwise prints
  # it on standard output.
  def output(result)
    if @options[:output_dir]
      save(result)
    else
      puts result
    end
  end

  # Writes the result to "<start offset>.<output type>" in the output
  # directory. The directory is expected to exist already (OptionParser
  # creates it). The block form of File.open closes the file even when
  # writing fails.
  def save(result)
    filename = File.join(@options[:output_dir], "#{@query_params[:start]}.#{@output_type}")
    File.open(filename, 'w') { |file| file.puts result }
  end

end # Yanser