bio-biostars-analytics 0.1.0

data/lib/bio-biostars-analytics.rb ADDED
@@ -0,0 +1,12 @@
+ # Please require your code below, respecting the naming conventions in the
+ # bioruby directory tree.
+ #
+ # For example, say you have a plugin named bio-plugin, the only uncommented
+ # line in this file would be
+ #
+ #   require 'bio/bio-plugin/plugin'
+ #
+ # In this file only require other files. Avoid other source code.
+
+ require 'bio-biostars-analytics/biostars-analytics.rb'
+
data/lib/bio-biostars-analytics/biostars-analytics.rb ADDED
@@ -0,0 +1,365 @@
+
+ require 'rubygems'
+ require 'hpricot'
+ require 'open-uri'
+ require 'chronic'
+ require 'date'
+ require 'json'
+
+ module BioBiostarsAnalytics
+
+   # Categories in Biostar:
+   # Type ID  Type
+   #  1       Question
+   #  2       Answer
+   #  3       Comment
+   #  4       Tutorial
+   #  5       Blog
+   #  6       Forum
+   #  7       News
+   #  8
+   #  9       Tool
+   # 10       FixMe
+   # 11       Video
+   # 12       Job
+   # 13       Research Paper
+   # 14       Tip
+   # 15       Poll
+   # 16       Ad
+   @@CATEGORIES = 16
+
+   # Extract the date (day, month, year) from a date string as formatted in
+   # Biostar forum posts.
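+   #
+   # For example, '3.4 years ago' is resolved as '20 weeks ago' (0.4 years =
+   # 4 * 5.2 weeks, truncated to 20) relative to Chronic.parse('3 years ago'),
+   # i.e. a point in time roughly 3.4 years in the past.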
+   def self.extract_date(datestring)
+     # Major headache: fractional years like "3.4 years ago", which Chronic
+     # cannot parse directly. Convert the fractional part to weeks (one tenth
+     # of a year is roughly 5.2 weeks) and parse that relative to the whole-year date.
+     if datestring.match(/\d+\.\d+ years ago/) then
+       return Chronic.parse("#{(datestring.sub(/\d+\./, '').sub(/\s.*$/, '').to_i * 5.2).to_i} weeks ago",
+                            :now => Chronic.parse(datestring.sub(/\.\d+/, '')))
+     else
+       return Chronic.parse(datestring)
+     end
+   end
+
+   # Extracts data from the rendered forum post as well as Biostar's "post" API.
+   #
+   # Algorithm:
+   #   1. mine data from the rendered forum post
+   #   2. retrieve limited information from Biostar's API
+   #   3. check that the gathered data matches up
+   #   4. log it
+   def self.minecontent(log, id)
+     # This hash aggregates information about a particular Biostar question and its answers/comments:
+     post = { 'id' => id }
+
+     #
+     # First: mine data from the rendered forum post
+     #
+
+     url = "http://www.biostars.org/p/#{id}/"
+     page = nil
+
+     begin
+       page = open(url)
+     rescue
+       return
+     end
+
+     if page.base_uri.to_s != url then
+       # Answer URL.
+       return
+     end
+
+     # Question URL that contains the question, its answers and edits.
+     doc = Hpricot(page.read)
+
+     # Bail out if this page does not explicitly mention a question in its title.
+     title = doc.search('title')
+     return unless title[0] and title[0].inner_html.match(/^Question:/)
+
+     users = []
+
+     # Extract user interactions: questions asked, answered and edits being made
+     times = doc.search('span.relativetime|div.lastedit').map { |element|
+       element.inner_html.sub(/^[^0-9]+/, '').sub(/by\s+$/, '').split("\n").first.strip
+     }
+     links = (doc/'a').delete_if { |link|
+       if link.get_attribute('href') then
+         not link.get_attribute('href').match(/^\/u\/\d+\//) # Has to be a relative link, or we catch Dropbox link-outs too...
+       else
+         true
+       end
+     }.map { |userlink| "#{userlink.get_attribute('href').gsub(/[\/u]+/, '')}\t#{userlink.inner_html}" }
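+     # (The gsub above reduces a profile href such as "/u/123/" to the bare user
+     # ID, so each entry in "links" is "user ID<TAB>user name".)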
+     votes = doc.search('div.vote-count').map { |vote|
+       if vote.inner_html.match(/^\d+$/) then
+         vote.inner_html.to_i
+       else
+         nil
+       end
+     }
+     tags = doc.search('a.tag').map { |link|
+       link.inner_html
+     }
+     # Sanity check: times and author links need to match up (i.e., both arrays need to be of the same length)
+     unless times.length == links.length then
+       $stderr.puts "Post ##{id}: recorded times and author links do not match up (#{times.length} vs. #{links.length})."
+       return
+     end
+     # Sanity check: there cannot be more votes than times/links
+     if votes.length > times.length then
+       $stderr.puts "Post ##{id}: there are more votes than recorded user actions? (#{votes.length} vs. #{times.length})"
+       return
+     end
+     # Question/answer specific stats regarding votes; the "average" answer vote
+     # is the midpoint between the lowest and the highest answer vote:
+     question_vote = votes[0]
+     answer_number = votes[1..-1].compact.length
+     answer_min_vote = votes[1..-1].compact.sort[0]
+     answer_max_vote = votes[1..-1].compact.sort[-1]
+     answer_avg_vote = nil
+     answer_avg_vote = (answer_min_vote + answer_max_vote).to_f / 2.0 if answer_min_vote and answer_max_vote
+     # Helper variables to deal with the "votes" array, which is shorter than the times/links arrays.
+     # These variables determine when the index counter for the "votes" array is incremented and when
+     # said index is valid.
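+     #
+     # For example, a question with one answer and one comment might yield
+     # votes == [3, 1, nil]: non-numeric vote counts (i.e. comments) map to nil above.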
+     vote_used = false
+     vote_index = 0
+     # Go through each time occurrence/author link pair (and also consider votes):
+     post['records'] = times.length
+     times.each_index { |index|
+       # Sanity check: first time is not an update...
+       if index == 0 and times[index].match(/updated/) then
+         $stderr.puts "Post ##{id}: First recorded time is also an update?"
+         return
+       end
+       # Sanity check: first time is also not a comment...
+       if index == 0 and votes[index] == nil then
+         $stderr.puts "Post ##{id}: First recorded time is a comment?"
+         return
+       end
+       action = 'answered'
+       action = 'asked' if index == 0
+       if votes[vote_index] == nil and not vote_used then
+         action = 'commented'
+         vote_used = true
+       end
+       if times[index].match(/updated/) then
+         action = 'edited'
+       else
+         vote_index += 1
+         vote_used = false
+       end
+       times[index] = times[index].sub(/^[^0-9]+/, '')
+       datetime = extract_date(times[index])
+       post["#{index}"] = {
+         'datestring' => times[index],
+         'year' => datetime.year,
+         'month' => datetime.month,
+         'day' => datetime.day,
+         'action' => action,
+         'uid' => links[index],
+         'question_vote' => question_vote,
+         'answer_number' => answer_number,
+         'answer_min_vote' => answer_min_vote,
+         'answer_max_vote' => answer_max_vote,
+         'answer_avg_vote' => answer_avg_vote,
+         'tags' => tags
+       }
+     }
+
+     page.close
+
+     #
+     # Second: retrieve limited information from Biostar's API
+     #
+
+     url = "http://www.biostars.org/api/post/#{id}/"
+
+     begin
+       doc = JSON.parse(open(url).read)
+     rescue
+       return
+     end
+
+     # Extract the limited information the API offers:
+     post['api_creation_date'] = Chronic.parse(doc['creation_date'])
+     post['api_answer_number'] = doc['answer_count']
+     post['api_question_vote'] = doc['score']
+     post['api_type'] = doc['type']
+     post['api_type_id'] = doc['type_id']
+
+     #
+     # Third: check that the gathered data matches up (API and data-mined results agree)
+     #
+
+     # Warning (not usable as a sanity check): the number of answers matches.
+     #
+     # This cannot be used as a sanity check, because the Biostar implementation
+     # actually returns a wrong number of answers at times. For example,
+     # http://www.biostars.org/p/7542/ (20 March 2014)
+     # says "4 answers" even though there are clearly just three answers being displayed.
+     # The same applies to underreporting of answers, such as in http://www.biostars.org/p/10927/
+     # (20 March 2014), where 12 answers are shown on the web page, but the summary on top
+     # reports only 11 answers.
+     unless post['api_answer_number'] == post['0']['answer_number'] then
+       $stderr.puts "Post ##{id}: number of answers differs (#{post['api_answer_number']} vs. #{post['0']['answer_number']}). Resetting number returned by API; using actual count of answers visible to the user."
+       post['api_answer_number'] = post['0']['answer_number']
+     end
+
+     # Sanity check: voting score for the question matches
+     unless post['api_question_vote'] == post['0']['question_vote'] then
+       $stderr.puts "Post ##{id}: mismatch between API's reported question vote and data mined voting score (#{post['api_question_vote']} vs. #{post['0']['question_vote']})."
+       return
+     end
+
+     #
+     # Fourth: log it
+     #
+
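+     # Each record becomes one TSV line with the columns: post ID, date string,
+     # year, month, day, action, user ID, user name, question vote, number of
+     # answers, lowest/highest/average answer vote, comma-separated tags, API
+     # type and API type ID.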
+     (0..post['records']-1).each { |index|
+       record = post["#{index}"]
+       log.puts "#{post['id']}\t#{record['datestring']}\t#{record['year']}\t#{record['month']}\t#{record['day']}\t#{record['action']}\t#{record['uid']}\t#{record['question_vote']}\t#{record['answer_number']}\t#{record['answer_min_vote']}\t#{record['answer_max_vote']}\t#{record['answer_avg_vote']}\t#{record['tags'].join(',')}\t#{post['api_type']}\t#{post['api_type_id']}"
+     }
+   end
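+   # (A typical call of the method above: minecontent(crawler_log, 7542). Posts
+   # that fail to load, or whose URL redirects because the ID belongs to an
+   # answer, are skipped silently.)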
+
+   # Extracts data from Biostar's "stats" API.
+   def self.minehistory(log, age)
+     url = "http://www.biostars.org/api/stats/#{age}/"
+
+     begin
+       stats = JSON.parse(open(url).read)
+     rescue
+       return
+     end
+
+     # Extract the limited information the API offers:
+     parseddate = Chronic.parse(stats['date'])
+     stats['year'] = parseddate.year
+     stats['month'] = parseddate.month
+     stats['day'] = parseddate.day
+
+     (1..@@CATEGORIES).each { |category|
+       stats["new_posts_in_category_#{category}"] = 0
+     }
+
+     # Types of votes in Biostar:
+     #   Accept
+     #   Bookmark
+     #   Downvote
+     #   Upvote
+     stats['new_votes_of_type_Accept'] = 0
+     stats['new_votes_of_type_Bookmark'] = 0
+     stats['new_votes_of_type_Downvote'] = 0
+     stats['new_votes_of_type_Upvote'] = 0
+
+     stats['posters'] = []
+     stats['poster_ages'] = []
+     stats['root_post_ages'] = []
+     stats['vote_post_ages'] = []
+     stats['biostarbabies'] = []
+
+     if stats.has_key?('x_new_users') then
+       stats['x_new_users'].each { |post|
+         @user_age[post['id']] = age
+         stats['biostarbabies'] = stats['biostarbabies'] + [ post['id'] ]
+       }
+       stats['new_users'] = stats['x_new_users'].length
+     else
+       stats['new_users'] = 0
+     end
+
+     if stats.has_key?('x_new_posts') then
+       stats['x_new_posts'].each { |post|
+         @post_age[post['id']] = age
+         stats['posters'] = stats['posters'] + [ post['author_id'] ]
+         stats['poster_ages'] = stats['poster_ages'] + [ @user_age[post['author_id']] ]
+         stats['root_post_ages'] = stats['root_post_ages'] + [ @post_age[post['root_id']] ] if post['root_id'] != post['id']
+         stats["new_posts_in_category_#{post['type_id']}"] = stats["new_posts_in_category_#{post['type_id']}"] + 1
+       }
+       stats['new_posts'] = stats['x_new_posts'].length
+     else
+       stats['new_posts'] = 0
+     end
+
+     # Poster ages are unknown for users who signed up before the mined time window; drop those entries.
+     stats['poster_ages'].reject! { |i| i == nil }
+
+     if stats.has_key?('x_new_votes') then
+       stats['x_new_votes'].each { |vote|
+         stats['vote_post_ages'] = stats['vote_post_ages'] + [ @post_age[vote['post_id']] ] if vote['type'] == 'Upvote' or vote['type'] == 'Downvote'
+         stats["new_votes_of_type_#{vote['type']}"] = stats["new_votes_of_type_#{vote['type']}"] + 1
+       }
+       stats['new_votes'] = stats['x_new_votes'].length
+     else
+       stats['new_votes'] = 0
+     end
+
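+     # One TSV line per day: age, date, year, month, day, new posts per
+     # category (16 columns), new votes by type (Accept, Bookmark, Downvote,
+     # Upvote), totals of new posts/votes/users, and the comma-separated lists
+     # of poster IDs, poster ages, root post ages, vote post ages and new user IDs.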
+     line = "#{age}\t#{stats['date']}\t#{stats['year']}\t#{stats['month']}\t#{stats['day']}\t"
+     (1..@@CATEGORIES).each { |category|
+       line << "#{stats["new_posts_in_category_#{category}"]}\t"
+     }
+     line << "#{stats['new_votes_of_type_Accept']}\t"
+     line << "#{stats['new_votes_of_type_Bookmark']}\t"
+     line << "#{stats['new_votes_of_type_Downvote']}\t"
+     line << "#{stats['new_votes_of_type_Upvote']}\t"
+     line << "#{stats['new_posts']}\t#{stats['new_votes']}\t#{stats['new_users']}\t"
+     line << "#{stats['posters'].join(',')}\t#{stats['poster_ages'].join(',')}\t#{stats['root_post_ages'].join(',')}\t#{stats['vote_post_ages'].join(',')}\t#{stats['biostarbabies'].join(',')}\t"
+
+     log.puts line
+   end
+
+   def self.cli
+     if not ARGV.length.between?(2, 3) or
+        not ARGV[0].match(/^\d+$/) or
+        not ARGV[1].match(/^\d+$/) or
+        (ARGV.length == 3 and not ARGV[2].match(/^\d+$/)) then
+       puts 'Usage: biostars-analytics max_post_number months_look_back [min_post_number]'
+       puts ''
+       puts 'Required parameters:'
+       puts '  max_post_number    : highest number (ID) of the post that should'
+       puts '                       be mined for data; the crawler will go over'
+       puts '                       posts min_post_number to max_post_number'
+       puts '  months_look_back   : how many months back should queries to the'
+       puts '                       Biostar API go (1 month = 30 days)'
+       puts ''
+       puts 'Optional parameters:'
+       puts '  min_post_number    : lowest number (ID) of the post that should'
+       puts '                       be mined for data; default value is 1'
+       puts ''
+       puts 'Output (date matches the script\'s invocation):'
+       puts '  <date>_crawled.tsv : data mined from crawling over posts'
+       puts '  <date>_api.tsv     : data extracted from the Biostar API'
+       puts ''
+       puts 'Example 1: mining Biostars in March 2014:'
+       puts '  biostars-analytics 96000 54'
+       puts ''
+       puts 'Example 2: mining last month\'s data with post numbers determined manually:'
+       puts '  biostars-analytics 234 1 123'
+       exit 1
+     end
+
+     max_post_number = ARGV[0].to_i
+     months_look_back = ARGV[1].to_i
+     min_post_number = 1
+     min_post_number = ARGV[2].to_i if ARGV.length == 3
+
+     # Make sure not to buffer stdout, so that it is possible to
+     # snoop around whilst the script is running.
+     STDOUT.sync = true
+
+     today = Time.now.strftime('%Y%m%d')
+     crawler_log = File.open("#{today}_crawled.tsv", 'w')
+     api_log = File.open("#{today}_api.tsv", 'w')
+
+     (min_post_number..max_post_number).each { |i|
+       minecontent(crawler_log, i)
+     }
+
+     # Ages (stats API day offsets) at which posts and users were first seen:
+     @post_age = {}
+     @user_age = {}
+
+     # Go through the days oldest-first, so that user and post ages are already
+     # recorded when later posts and votes refer back to them.
+     (1..months_look_back*30).to_a.reverse.each { |i|
+       minehistory(api_log, i)
+     }
+
+     crawler_log.close
+     api_log.close
+   end
+
+ end
+
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'test/unit'
+ require 'shoulda'
+
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ require 'bio-biostars-analytics'
+
+ class Test::Unit::TestCase
+ end
data/test/test_bio-biostars-analytics.rb ADDED
@@ -0,0 +1,12 @@
+ require 'helper'
+
+ class TestBioBiostarsAnalytics < Test::Unit::TestCase
+   should 'convert a relative time to an absolute time' do
+     assert_contains([
+         "#{BioBiostarsAnalytics::extract_date('3.5 years ago')}",
+         "#{BioBiostarsAnalytics::extract_date('5 days ago')}",
+         "#{BioBiostarsAnalytics::extract_date('8 months ago')}"
+       ], /^\d{4}-\d+-\d+ \d+:\d+:\d+.*/)
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,193 @@
+ --- !ruby/object:Gem::Specification
+ name: bio-biostars-analytics
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Joachim Baran
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-04-06 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: shoulda
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rdoc
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '3.12'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '3.12'
+ - !ruby/object:Gem::Dependency
+   name: jeweler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 2.0.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 2.0.1
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.0.21
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.0.21
+ - !ruby/object:Gem::Dependency
+   name: bio
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.4.2
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.4.2
+ - !ruby/object:Gem::Dependency
+   name: rdoc
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '3.12'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '3.12'
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.8.6
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.8.6
+ - !ruby/object:Gem::Dependency
+   name: chronic
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.10.2
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.10.2
+ - !ruby/object:Gem::Dependency
+   name: json
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.8.0
+ description: Ruby script for data-mining biostars.org using web-crawling techniques
+   as well as utilizing the Biostars RESTful API. Statistical analysis requires R (http://www.r-project.org).
+ email: joachim.baran@gmail.com
+ executables:
+ - biostars-analytics
+ - biostar_api_stats
+ - biostar_crawled_stats
+ extensions: []
+ extra_rdoc_files:
+ - LICENSE.txt
+ - README.md
+ - README.rdoc
+ files:
+ - .document
+ - .travis.yml
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - README.rdoc
+ - Rakefile
+ - VERSION
+ - bin/biostar_api_stats
+ - bin/biostar_crawled_stats
+ - bin/biostars-analytics
+ - data/20140328_api.tsv
+ - data/20140328_crawled.tsv
+ - lib/bio-biostars-analytics.rb
+ - lib/bio-biostars-analytics/biostars-analytics.rb
+ - test/helper.rb
+ - test/test_bio-biostars-analytics.rb
+ homepage: http://github.com/joejimbo/bioruby-biostars-analytics
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.0.5
+ signing_key:
+ specification_version: 4
+ summary: Biostars data-mining and statistical analysis.
+ test_files: []