bio-biostars-analytics 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio-biostars-analytics/biostars-analytics.rb'
12
+
@@ -0,0 +1,365 @@
1
+
2
+ require 'rubygems'
3
+ require 'hpricot'
4
+ require 'open-uri'
5
+ require 'chronic'
6
+ require 'date'
7
+ require 'json'
8
+
9
+ module BioBiostarsAnalytics
10
+
11
+ # Categories in Biostar:
12
+ # Type ID Type
13
+ # 1 Question
14
+ # 2 Answer
15
+ # 3 Comment
16
+ # 4 Tutorial
17
+ # 5 Blog
18
+ # 6 Forum
19
+ # 7 News
20
+ # 8
21
+ # 9 Tool
22
+ # 10 FixMe
23
+ # 11 Video
24
+ # 12 Job
25
+ # 13 Research Paper
26
+ # 14 Tip
27
+ # 15 Poll
28
+ # 16 Ad
29
+ @@CATEGORIES = 16
30
+
31
+ # Extract the date (day, month, year) from a Biostar forum post formatted date string.
32
+ def self.extract_date(datestring)
33
+ # Major headache: weird years like "3.4 years ago"
34
+ if datestring.match(/\d+\.\d+ years ago/) then
35
+ return Chronic.parse("#{(datestring.sub(/\d+\./, '').sub(/\s.*$/, '').to_i * 5.2).to_i} weeks ago",
36
+ :now => Chronic.parse(datestring.sub(/\.\d+/, '')))
37
+ else
38
+ return Chronic.parse(datestring)
39
+ end
40
+ end
41
+
42
+ # Extracts data from the rendered forum post as well as from Biostar's "post" API.
43
+ #
44
+ # Algorithm:
45
+ # 1. mine data from the rendered forum post
46
+ # 2. retrieve limited information from Biostar's API
47
+ # 3. check that gathered data matches up
48
+ # 4. log it
49
+ def self.minecontent(log, id)
50
+ # This hash aggregates information about a particular Biostar question and its answers/comments:
51
+ post = { 'id' => id }
52
+
53
+ #
54
+ # First: mine data from the rendered forum post
55
+ #
56
+
57
+ url = "http://www.biostars.org/p/#{id}/"
58
+ page = nil
59
+
60
+ begin
61
+ page = open(url)
62
+ rescue
63
+ return
64
+ end
65
+
66
+ if page.base_uri.to_s != url then
67
+ # Answer URL.
68
+ return
69
+ end
70
+
71
+ # Question URL that contains the question, its answers and edits.
72
+ doc = Hpricot(page.read)
73
+
74
+ # Bail out if this page does not explicitly mention a question.
75
+ return unless doc.search('doc.title') or doc.search('doc.title')[0].inner_html.match(/^Question:/)
76
+
77
+ users = []
78
+
79
+ # Extract user interactions: questions asked, answered and edits being made
80
+ times = doc.search('span.relativetime|div.lastedit').map { |element|
81
+ element.inner_html.sub(/^[^0-9]+/, '').sub(/by\s+$/, '').split("\n").first.strip
82
+ }
83
+ links = (doc/'a').delete_if { |link|
84
+ if link.get_attribute('href') then
85
+ not link.get_attribute('href').match(/^\/u\/\d+\//) # Has to be a relative link, or we catch Dropbox link-outs too...
86
+ else
87
+ true
88
+ end
89
+ }.map { |userlink| "#{userlink.get_attribute('href').gsub(/[\/u]+/, '')}\t#{userlink.inner_html}" }
90
+ votes = doc.search('div.vote-count').map { |vote|
91
+ if vote.inner_html.match(/^\d+$/) then
92
+ vote.inner_html.to_i
93
+ else
94
+ nil
95
+ end
96
+ }
97
+ tags = doc.search('a.tag').map { |link|
98
+ link.inner_html
99
+ }
100
+ # Sanity check: times and users need to match up (i.e., both arrays need to be of the same length)
101
+ unless times.length == links.length then
102
+ $stderr.puts "Post ##{id}: recorded times and author links do not match up (#{times.length} vs. #{links.length})."
103
+ return
104
+ end
105
+ # Sanity check: there cannot be more votes than times/links
106
+ if votes.length > times.length then
107
+ $stderr.puts "Post ##{id}: there are more votes than recorded user actions? (#{votes.length} vs. #{links.length})"
108
+ return
109
+ end
110
+ # Question/answer specific stats regarding votes:
111
+ question_vote = votes[0]
112
+ answer_number = votes[1..-1].compact.length
113
+ answer_min_vote = votes[1..-1].compact.sort[0]
114
+ answer_max_vote = votes[1..-1].compact.sort[-1]
115
+ answer_avg_vote = nil
116
+ answer_avg_vote = (answer_min_vote + answer_max_vote).to_f / 2.0 if answer_min_vote and answer_max_vote
117
+ # Helper variables to deal with the "votes" array, which is shorter than the times/links arrays.
118
+ # These variables determine when the index counter for the "votes" array is incremented and when
119
+ # said index is valid.
120
+ vote_used = false
121
+ vote_index = 0
122
+ # Go through each time occurrence/author link pair (and also consider votes):
123
+ post['records'] = times.length
124
+ times.each_index { |index|
125
+ # Sanity check: first time is not an update...
126
+ if index == 0 and times[index].match(/updated/) then
127
+ $stderr.puts "Post ##{id}: First recorded time is also an update?"
128
+ return
129
+ end
130
+ # Sanity check: first time is also not a comment...
131
+ if index == 0 and votes[index] == nil then
132
+ $stderr.puts "Post ##{id}: First recorded time is a comment?"
133
+ return
134
+ end
135
+ action = 'answered'
136
+ action = 'asked' if index == 0
137
+ if votes[vote_index] == nil and not vote_used then
138
+ action = 'commented'
139
+ vote_used = true
140
+ end
141
+ if times[index].match(/updated/) then
142
+ action = 'edited'
143
+ else
144
+ vote_index += 1
145
+ vote_used = false
146
+ end
147
+ times[index] = times[index].sub(/^[^0-9]+/, '')
148
+ datetime = extract_date(times[index])
149
+ post["#{index}"] = {
150
+ 'datestring' => times[index],
151
+ 'year' => datetime.year,
152
+ 'month' => datetime.month,
153
+ 'day' => datetime.day,
154
+ 'action' => action,
155
+ 'uid' => links[index],
156
+ 'question_vote' => question_vote,
157
+ 'answer_number' => answer_number,
158
+ 'answer_min_vote' => answer_min_vote,
159
+ 'answer_max_vote' => answer_max_vote,
160
+ 'answer_avg_vote' => answer_avg_vote,
161
+ 'tags' => tags
162
+ }
163
+ }
164
+
165
+ page.close
166
+
167
+ #
168
+ # Second: retrieve limited information from Biostar's API
169
+ #
170
+
171
+ url = "http://www.biostars.org/api/post/#{id}/"
172
+
173
+ begin
174
+ doc = JSON.parse(open(url).read)
175
+ rescue
176
+ return
177
+ end
178
+
179
+ # Extract the limited information the API offers:
180
+ post['api_creation_date'] = Chronic.parse(doc['creation_date'])
181
+ post['api_answer_number'] = doc['answer_count']
182
+ post['api_question_vote'] = doc['score']
183
+ post['api_type'] = doc['type']
184
+ post['api_type_id'] = doc['type_id']
185
+
186
+ #
187
+ # Third: check that gathered data matches up (API and data mined results are matching)
188
+ #
189
+
190
+ # Warning only: check whether the number of answers matches
191
+ #
192
+ # Cannot be used as sanity check, because the Biostar implementation actually returns
193
+ # a wrong number of answers. For example, http://www.biostars.org/p/7542/ (20 March 2014)
194
+ # says "4 answers" even though there are clearly just three answers being displayed.
195
+ # The same applies to underreporting of answers, such as in http://www.biostars.org/p/10927/
196
+ # (20 March 2014), where 12 answers are shown on the web-page, but the summary on top
197
+ # reports only 11 answers.
198
+ unless post['api_answer_number'] == post['0']['answer_number'] then
199
+ $stderr.puts "Post ##{id}: number of answers differ (#{post['api_answer_number']} vs. #{post['0']['answer_number']}). Resetting number returned by API; using actual count of answers visible to the user."
200
+ post['api_answer_number'] = post['0']['answer_number']
201
+ end
202
+
203
+ # Sanity check: voting score for the question matches
204
+ unless post['api_question_vote'] == post['0']['question_vote'] then
205
+ $stderr.puts "Post ##{id}: mismatch between API's reported question vote and data mined voting score (#{post['api_question_vote']} vs. #{post['0']['question_vote']})."
206
+ return
207
+ end
208
+
209
+ #
210
+ # Fourth: log it
211
+ #
212
+
213
+ (0..post['records']-1).each { |index|
214
+ record = post["#{index}"]
215
+ log.puts "#{post['id']}\t#{record['datestring']}\t#{record['year']}\t#{record['month']}\t#{record['day']}\t#{record['action']}\t#{record['uid']}\t#{record['question_vote']}\t#{record['answer_number']}\t#{record['answer_min_vote']}\t#{record['answer_max_vote']}\t#{record['answer_avg_vote']}\t#{record['tags'].join(',')}\t#{post['api_type']}\t#{post['api_type_id']}"
216
+ }
217
+ end
218
+
219
+ # Extracts data from Biostar's "stats" API.
220
+ def self.minehistory(log, age)
221
+ url = "http://www.biostars.org/api/stats/#{age}/"
222
+
223
+ begin
224
+ stats = JSON.parse(open(url).read)
225
+ rescue
226
+ return
227
+ end
228
+
229
+ # Extract the limited information the API offers:
230
+ parseddate = Chronic.parse(stats['date'])
231
+ stats['year'] = parseddate.year
232
+ stats['month'] = parseddate.month
233
+ stats['day'] = parseddate.day
234
+
235
+ (1..@@CATEGORIES).each { |category|
236
+ stats["new_posts_in_category_#{category}"] = 0
237
+ }
238
+
239
+ # Types of votes in Biostar:
240
+ # Accept
241
+ # Bookmark
242
+ # Downvote
243
+ # Upvote
244
+ stats['new_votes_of_type_Accept'] = 0
245
+ stats['new_votes_of_type_Bookmark'] = 0
246
+ stats['new_votes_of_type_Downvote'] = 0
247
+ stats['new_votes_of_type_Upvote'] = 0
248
+
249
+ stats['posters'] = []
250
+ stats['poster_ages'] = []
251
+ stats['root_post_ages'] = []
252
+ stats['vote_post_ages'] = []
253
+ stats['biostarbabies'] = []
254
+
255
+ if stats.has_key?('x_new_users') then
256
+ stats['x_new_users'].each { |post|
257
+ @user_age[post['id']] = age
258
+ stats['biostarbabies'] = stats['biostarbabies'] + [ post['id'] ]
259
+ }
260
+ stats['new_users'] = stats['x_new_users'].length
261
+ else
262
+ stats['new_users'] = 0
263
+ end
264
+
265
+ if stats.has_key?('x_new_posts') then
266
+ stats['x_new_posts'].each { |post|
267
+ @post_age[post['id']] = age
268
+ stats['posters'] = stats['posters'] + [ post['author_id'] ]
269
+ stats['poster_ages'] = stats['poster_ages'] + [ @user_age[post['author_id']] ]
270
+ stats['root_post_ages'] = stats['root_post_ages'] + [ @post_age[post['root_id']] ] if post['root_id'] != post['id']
271
+ stats["new_posts_in_category_#{post['type_id']}"] = stats["new_posts_in_category_#{post['type_id']}"] + 1
272
+ }
273
+ stats['new_posts'] = stats['x_new_posts'].length
274
+ else
275
+ stats['new_posts'] = 0
276
+ end
277
+
278
+ # Poster age might not be applicable when having gone too far back in time...
279
+ stats['poster_ages'].reject! { |i| i == nil }
280
+
281
+ if stats.has_key?('x_new_votes') then
282
+ stats['x_new_votes'].each { |vote|
283
+ stats['vote_post_ages'] = stats['vote_post_ages'] + [ @post_age[vote['post_id']] ] if vote['type'] == 'Upvote' or vote['type'] == 'Downvote'
284
+ stats["new_votes_of_type_#{vote['type']}"] = stats["new_votes_of_type_#{vote['type']}"] + 1
285
+ }
286
+ stats['new_votes'] = stats['x_new_votes'].length
287
+ else
288
+ stats['new_votes'] = 0
289
+ end
290
+
291
+ line = "#{age}\t#{stats['date']}\t#{stats['year']}\t#{stats['month']}\t#{stats['day']}\t"
292
+ (1..@@CATEGORIES).each { |category|
293
+ line << "#{stats["new_posts_in_category_#{category}"]}\t"
294
+ }
295
+ line << "#{stats['new_votes_of_type_Accept']}\t"
296
+ line << "#{stats['new_votes_of_type_Bookmark']}\t"
297
+ line << "#{stats['new_votes_of_type_Downvote']}\t"
298
+ line << "#{stats['new_votes_of_type_Upvote']}\t"
299
+ line << "#{stats['new_posts']}\t#{stats['new_votes']}\t#{stats['new_users']}\t"
300
+ line << "#{stats['posters'].join(',')}\t#{stats['poster_ages'].join(',')}\t#{stats['root_post_ages'].join(',')}\t#{stats['vote_post_ages'].join(',')}\t#{stats['biostarbabies'].join(',')}\t"
301
+
302
+ log.puts line
303
+ end
304
+
305
+ def self.cli
306
+ if not ARGV.length.between?(2, 3) or
307
+ not ARGV[0].match(/\d+/) or
308
+ not ARGV[1].match(/\d+/) or
309
+ (ARGV.length == 3 and not ARGV[2].match(/\d+/))then
310
+ puts 'Usage: biostars-analytics max_post_number months_look_back [min_post_number]'
311
+ puts ''
312
+ puts 'Required parameters:'
313
+ puts ' max_post_number : highest number (ID) of the post that should'
314
+ puts ' be mined for data; the crawler will go over'
315
+ puts ' posts min_post_number to max_post_number'
316
+ puts ' months_look_back : how many months back should queries to the'
317
+ puts ' Biostar API go (1 month = 30 days); default'
318
+ puts ' value is 1'
319
+ puts ''
320
+ puts 'Optional parameters:'
321
+ puts ' min_post_number : lowest number (ID) of the post that should'
322
+ puts ' be mined for data'
323
+ puts ''
324
+ puts 'Output (date matches the script\'s invokation):'
325
+ puts ' <date>_crawled.tsv : data mined from crawling over posts'
326
+ puts ' <date>_api.tsv : data extracted from the Biostar API'
327
+ puts ''
328
+ puts 'Example 1: mining Biostars in March 2014:'
329
+ puts ' biostars-analytics 96000 54'
330
+ puts ''
331
+ puts 'Example 2: mining last month data with post numbers determined manually:'
332
+ puts ' biostars-analytics 234 1 123'
333
+ exit 1
334
+ end
335
+
336
+ max_post_number = ARGV[0].to_i
337
+ months_look_back = ARGV[1].to_i
338
+ min_post_number = 1
339
+ min_post_number = ARGV[2].to_i if ARGV.length == 3
340
+
341
+ # Make sure not to buffer stdout, so that it is possible to
342
+ # snoop around whilst the script is running.
343
+ STDOUT.sync = true
344
+
345
+ today = Time.now.strftime('%Y%m%d')
346
+ crawler_log = File.open("#{today}_crawled.tsv", 'w')
347
+ api_log = File.open("#{today}_api.tsv", 'w')
348
+
349
+ (min_post_number..max_post_number).each { |i|
350
+ minecontent(crawler_log, i)
351
+ }
352
+
353
+ @post_age = {}
354
+ @user_age = {}
355
+
356
+ (1..months_look_back*30).to_a.reverse.each { |i|
357
+ minehistory(api_log, i)
358
+ }
359
+
360
+ crawler_log.close
361
+ api_log.close
362
+ end
363
+
364
+ end
365
+
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'bio-biostars-analytics'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,12 @@
1
+ require 'helper'
2
+
3
+ class TestBioBiostarsAnalytics < Test::Unit::TestCase
4
+ should 'convert a relative time to an absolute time' do
5
+ assert_contains([
6
+ "#{BioBiostarsAnalytics::extract_date('3.5 years ago')}",
7
+ "#{BioBiostarsAnalytics::extract_date('5 days ago')}",
8
+ "#{BioBiostarsAnalytics::extract_date('8 months ago')}"
9
+ ], /^\d{4}-\d+-\d+ \d+:\d+:\d+.*/)
10
+ end
11
+ end
12
+
metadata ADDED
@@ -0,0 +1,193 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-biostars-analytics
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Joachim Baran
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rdoc
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '3.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '3.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: jeweler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 2.0.1
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 2.0.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.21
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: 1.0.21
69
+ - !ruby/object:Gem::Dependency
70
+ name: bio
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: 1.4.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: 1.4.2
83
+ - !ruby/object:Gem::Dependency
84
+ name: rdoc
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '3.12'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '3.12'
97
+ - !ruby/object:Gem::Dependency
98
+ name: hpricot
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: 0.8.6
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: 0.8.6
111
+ - !ruby/object:Gem::Dependency
112
+ name: chronic
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 0.10.2
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: 0.10.2
125
+ - !ruby/object:Gem::Dependency
126
+ name: json
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ~>
130
+ - !ruby/object:Gem::Version
131
+ version: 1.8.0
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: 1.8.0
139
+ description: Ruby script for data-mining biostars.org using web-crawling techniques
140
+ as well as utilizing the Biostars RESTful API. Statistical analysis requires R (http://www.r-project.org).
141
+ email: joachim.baran@gmail.com
142
+ executables:
143
+ - biostars-analytics
144
+ - biostar_api_stats
145
+ - biostar_crawled_stats
146
+ extensions: []
147
+ extra_rdoc_files:
148
+ - LICENSE.txt
149
+ - README.md
150
+ - README.rdoc
151
+ files:
152
+ - .document
153
+ - .travis.yml
154
+ - Gemfile
155
+ - LICENSE.txt
156
+ - README.md
157
+ - README.rdoc
158
+ - Rakefile
159
+ - VERSION
160
+ - bin/biostar_api_stats
161
+ - bin/biostar_crawled_stats
162
+ - bin/biostars-analytics
163
+ - data/20140328_api.tsv
164
+ - data/20140328_crawled.tsv
165
+ - lib/bio-biostars-analytics.rb
166
+ - lib/bio-biostars-analytics/biostars-analytics.rb
167
+ - test/helper.rb
168
+ - test/test_bio-biostars-analytics.rb
169
+ homepage: http://github.com/joejimbo/bioruby-biostars-analytics
170
+ licenses:
171
+ - MIT
172
+ metadata: {}
173
+ post_install_message:
174
+ rdoc_options: []
175
+ require_paths:
176
+ - lib
177
+ required_ruby_version: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - '>='
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ required_rubygems_version: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - '>='
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ requirements: []
188
+ rubyforge_project:
189
+ rubygems_version: 2.0.5
190
+ signing_key:
191
+ specification_version: 4
192
+ summary: Biostars data-mining and statistical analysis.
193
+ test_files: []