repocrawler 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6dcd4f22099b5b0f090b5062695b9db581cf4b7b
4
+ data.tar.gz: 4ccaaaa819b2b7eedab27c676be626e957360377
5
+ SHA512:
6
+ metadata.gz: 68b6670db01659e7ba8cd60f175141cf29e03928fc8dceae120516af0004e63dc673e45d90a0c43db1effcb0760a9a1c1d5cc85b19d612d91f71e610e91b4f2f
7
+ data.tar.gz: 2f8a36b9e827ec7ceea8c4fd3c48153723e034dd489a3a76be7fafd575ddb087a81551dc3b3c3f1eedb1e33f61e2fd1ad7836105aad30937b160d1c972483d68
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ data/*
2
+ config/local.rb
3
+ *.gem
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # A sample Gemfile
2
+ source "https://rubygems.org"
3
+
4
+ gem "gems"
5
+ gem "github_api"
6
+ gem "httparty"
7
+ gem 'mongo', '~> 2.1'
8
+ gem 'configuration'
9
+ gem 'nokogiri'
data/Gemfile.lock ADDED
@@ -0,0 +1,52 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ addressable (2.3.8)
5
+ bson (3.2.6)
6
+ configuration (1.3.4)
7
+ descendants_tracker (0.0.4)
8
+ thread_safe (~> 0.3, >= 0.3.1)
9
+ faraday (0.9.2)
10
+ multipart-post (>= 1.2, < 3)
11
+ gems (0.8.3)
12
+ github_api (0.12.4)
13
+ addressable (~> 2.3)
14
+ descendants_tracker (~> 0.0.4)
15
+ faraday (~> 0.8, < 0.10)
16
+ hashie (>= 3.4)
17
+ multi_json (>= 1.7.5, < 2.0)
18
+ nokogiri (~> 1.6.6)
19
+ oauth2
20
+ hashie (3.4.3)
21
+ httparty (0.13.7)
22
+ json (~> 1.8)
23
+ multi_xml (>= 0.5.2)
24
+ json (1.8.3)
25
+ jwt (1.5.2)
26
+ mini_portile (0.6.2)
27
+ mongo (2.1.2)
28
+ bson (~> 3.0)
29
+ multi_json (1.11.2)
30
+ multi_xml (0.5.5)
31
+ multipart-post (2.0.0)
32
+ nokogiri (1.6.6.2)
33
+ mini_portile (~> 0.6.0)
34
+ oauth2 (1.0.0)
35
+ faraday (>= 0.8, < 0.10)
36
+ jwt (~> 1.0)
37
+ multi_json (~> 1.3)
38
+ multi_xml (~> 0.5)
39
+ rack (~> 1.2)
40
+ rack (1.6.4)
41
+ thread_safe (0.3.5)
42
+
43
+ PLATFORMS
44
+ ruby
45
+
46
+ DEPENDENCIES
47
+ configuration
48
+ gems
49
+ github_api
50
+ httparty
51
+ mongo (~> 2.1)
52
+ nokogiri
data/README.md ADDED
@@ -0,0 +1,9 @@
1
+ # Description
2
+ A simple program that collects the needed data for a single gem (in this case, oga).
3
+
4
+ # Run
5
+ 1. ```bundle install``` for no reason.
6
+ 2. copy the ```config/local.example.rb``` to ```config/local.rb```.
7
+ 3. replace the ```STACKOVERFLOW_TOKEN```, ```TOKEN```, ```GITHUB_ACCOUNT```, and ```USER_AGENT``` in the ```config/local.rb``` with yours.
8
+ 4. run your local MongoDB with the command ```sudo mongod```.
9
+ 5. run ```ruby repos_exec.rb``` to start the collecting process.
data/bin/repocrawler ADDED
@@ -0,0 +1,75 @@
1
#!/usr/bin/env ruby
# Interactive entry point: asks for the GitHub password and the gem/repo
# coordinates, collects statistics from GitHub, RubyGems, Ruby Toolbox and
# Stack Overflow, prints the aggregated record and stores it in the local
# MongoDB database 'gems_info' (collection 'gems').

require 'date'
require 'io/console'
require 'repocrawler'

# Prints +label+ and reads one line from standard input.
# When +secret+ is true and stdin is a terminal, the typed characters are not
# echoed; IO#noecho restores the terminal state even if reading is interrupted.
ask = lambda do |label, secret = false|
  print label
  hidden = secret && $stdin.tty?
  answer = hidden ? $stdin.noecho(&:gets) : $stdin.gets
  # The user's own newline was swallowed together with the echo.
  puts "\n" if hidden
  answer.to_s.chomp
end

# Only the password is hidden; the remaining answers stay visible while typing.
github_password = ask.call('Github Password (not stored): ', true)
gem_name        = ask.call('Name of the gem: ')
repo_name       = ask.call('Name of the repo: ')
repo_username   = ask.call('Username of the repo: ')
puts ""

client = Mongo::Client.new([ '127.0.0.1:27017' ], :database => 'gems_info')

github = Repos::GithubData.new(repo_username, repo_name, github_password)
rubygems = Repos::RubyGemsData.new(gem_name)
ruby_toolbox = Repos::RubyToolBoxData.new(gem_name)
stackoverflow = Repos::StackOverflow.new(gem_name)

# GitHub statistics
last_year_commit_activity = github.get_last_year_commit_activity
contributors = github.get_contributors
total_commits = github.get_total_commits
forks = github.get_forks
stars = github.get_stars
issues = github.get_issues
issues_info = github.get_issues_info
last_commits_days = github.get_last_commits_days
readme_word_count = github.get_readme_word_count

# RubyGems statistics
version_downloads = rubygems.get_version_downloads
version_downloads_trend = rubygems.get_version_downloads_trend
dependencies = rubygems.get_dependencies
total_downloads = rubygems.get_total_downloads

# Ruby Toolbox ranking
ranking = ruby_toolbox.get_ranking

# Stack Overflow questions and the word frequencies of their titles
questions, questions_word_count = stackoverflow.get_questions

# aggregate the data
gem_info = {
  'name' => gem_name,
  'repo_name' => repo_name,
  'repo_username' => repo_username,
  'total_downloads' => total_downloads,
  'version_downloads' => version_downloads,
  'version_downloads_days' => version_downloads_trend,
  'dependencies' => dependencies,
  'last_commit' => last_commits_days,
  'forks' => forks,
  'stars' => stars,
  'issues' => issues,
  'ranking' => ranking,
  'commits' => total_commits,
  'commit_activity_last_year' => last_year_commit_activity,
  'contributors' => contributors,
  'issues_info' => issues_info,
  'readme_word_count' => readme_word_count,
  'questions' => questions,
  'questions_word_count' => questions_word_count,
  'created_at' => DateTime.now
}

puts gem_info

client[:gems].insert_one(gem_info)
@@ -0,0 +1,12 @@
1
require 'configuration'

# Settings read through `Configuration.for 'rubygems'`: the GitHub token,
# account and password plus the User-Agent header value.
# NOTE(review): TOKEN, GITHUB_ACCOUNT, GITHUB_PASSWORD and USER_AGENT are not
# defined in this file; presumably they are placeholders to be replaced with
# real values in config/local.rb -- confirm against the README.
Configuration.for 'rubygems' do
  github_token(TOKEN)
  github_account(GITHUB_ACCOUNT)
  github_password(GITHUB_PASSWORD)
  user_agent(USER_AGENT)
end

# Settings read through `Configuration.for 'stackoverflow'`: the API key that
# is appended to Stack Exchange search requests.
Configuration.for 'stackoverflow' do
  stackoverflow_token(STACKOVERFLOW_TOKEN)
end
@@ -0,0 +1,283 @@
1
+ require 'rubygems'
2
+ require 'gems'
3
+ require 'mongo'
4
+ require 'github_api'
5
+ require 'httparty'
6
+ require 'configuration'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+
10
+ module Repos
11
+ Kernel.load 'config/local.rb'
12
+
13
# Collects statistics about a single GitHub repository: commit activity,
# contributors, forks/stars/open issues, closed-issue durations, the age of the
# last commit and README word frequencies.
class GithubData
  # repo_user       - owner of the repository
  # repo_name       - name of the repository
  # github_password - password used for basic auth; when blank, the password
  #                   from the 'rubygems' configuration is used instead
  def initialize(repo_user, repo_name, github_password = '')
    @GITHUB_README_URL = "https://raw.githubusercontent.com/#{repo_user}/#{repo_name}/master"
    @GITHUB_API_BASE_URL = "https://api.github.com/repos/#{repo_user}/#{repo_name}"
    @rubygems = Configuration.for 'rubygems'
    @access_token = @rubygems.github_token
    @github_password = github_password.to_s.empty? ? @rubygems.github_password : github_password
    @user_agent = @rubygems.user_agent
    @repo_user = repo_user
    @repo_name = repo_name
  end

  # get the commit activity in last year
  # Returns the raw response of the /stats/commit_activity endpoint.
  def get_last_year_commit_activity
    api_get('/stats/commit_activity')
  end

  # Get the contributors
  # Returns an array of { 'name' => login, 'contributions' => count } hashes.
  def get_contributors
    expect_list(api_get('/contributors'), 'contributors').map do |contributor|
      {
        'name' => contributor['login'],
        'contributions' => contributor['contributions']
      }
    end
  end

  # get the total commits
  # Returns the sum of the contribution counts of all listed contributors.
  def get_total_commits
    get_contributors.reduce(0) { |sum, contributor| sum + contributor['contributions'] }
  end

  # get numbers of forks, stars and issues
  def get_forks
    repo_meta['forks_count']
  end

  def get_stars
    repo_meta['stargazers_count']
  end

  def get_issues
    repo_meta['open_issues_count']
  end

  # get information of the closed issues
  # Walks the paginated closed-issues listing until an empty page is returned.
  # Returns the issues in reverse fetch order, each with its number, creation
  # and close timestamps and the whole-day duration between the two.
  def get_issues_info
    closed_issues = []
    page = 1

    loop do
      batch = expect_list(api_get("/issues?state=closed&page=#{page}"), 'closed issues')
      break if batch.empty?

      batch.each do |issue|
        closed_issues << {
          'number' => issue['number'],
          'created_at' => issue['created_at'],
          'closed_at' => issue['closed_at'],
          'duration' => (Date.parse(issue['closed_at']) - Date.parse(issue['created_at'])).to_i
        }
      end

      page += 1
    end

    closed_issues.reverse
  end

  # get the date of the last commit
  # Returns the number of days between today and the author date of the first
  # commit in the repository's commit listing.
  def get_last_commits_days
    github = Github.new basic_auth: "#{@rubygems.github_account}:#{@github_password}"

    newest = github.repos.commits.list(@repo_user, @repo_name).to_ary[0].to_hash
    (Date.today - Date.parse(newest['commit']['author']['date'])).to_i
  end

  # get the readme file
  # Returns [word, count] pairs sorted by descending count for the README found
  # in the repository root. Only tokens made of word characters that are not
  # stop words are counted. Returns an empty array when no README is listed.
  def get_readme_word_count
    entries = expect_list(api_get('/contents'), 'repository contents')
    # When several entries start with README, the last one listed wins.
    readme_file = entries.map { |entry| entry['name'].to_s }
                         .select { |name| name.start_with?('README') }
                         .last
    # Without a README there is nothing to download and count.
    return [] if readme_file.nil?

    stop_words = load_stop_words
    readme = HTTParty.get(@GITHUB_README_URL + "/#{readme_file}")

    freqs = Hash.new(0)
    readme.split(' ').each do |word|
      freqs[word] += 1 if word =~ /^\w+$/ && !stop_words.include?(word.downcase)
    end

    freqs.sort_by { |_word, freq| freq }.reverse
  end

  private

  # Performs a GET against the repository API. The token travels in the
  # Authorization header rather than in the query string.
  def api_get(path)
    HTTParty.get(@GITHUB_API_BASE_URL + path, headers: {
      'User-Agent' => @user_agent,
      'Authorization' => "token #{@access_token}"
    })
  end

  # Repository metadata, fetched once and shared by forks/stars/issues.
  def repo_meta
    @repo_meta ||= api_get('')
  end

  # Returns the response payload when it is a list and [] when it is empty.
  # Raises RuntimeError with the API's message for any other payload, so an
  # error document is never iterated as if it were data.
  def expect_list(response, what)
    data = response.respond_to?(:parsed_response) ? response.parsed_response : response
    return [] if data.nil?
    return data if data.is_a?(Array)

    detail = data.is_a?(Hash) ? data['message'] : data.inspect
    raise "GitHub API error while fetching #{what}: #{detail}"
  end

  # Reads the stop-word list, one word per line.
  def load_stop_words
    path = File.expand_path("../../public/stop_words.txt", File.dirname(__FILE__))
    File.open(path, "r") { |file| file.each_line.map(&:chomp) }
  end
end
153
+
154
# Collects download and dependency information for one gem through the
# `Gems` RubyGems.org client.
class RubyGemsData
  # gem_name - name of the gem to query
  def initialize(gem_name)
    @gem_name = gem_name
  end

  # get the downloads of each versions
  # Returns { 'number', 'downloads' } hashes for the 'ruby' platform versions,
  # in the reverse of the order delivered by Gems.versions.
  def get_version_downloads
    ruby_versions.map do |version|
      {
        'number' => version['number'],
        'downloads' => version['downloads_count']
      }
    end.reverse
  end

  # Returns { 'number', 'downloads_date' } hashes for the 'ruby' platform
  # versions, in the reverse of the order delivered by Gems.versions.
  #
  # start_date - first day of the queried range; when blank, each version's own
  #              'created_at' value is used
  # end_date   - last day of the queried range; when blank, today is used
  def get_version_downloads_trend(start_date = '', end_date = '')
    end_date = Date.today if end_date.to_s == ''

    ruby_versions.map do |version|
      # An explicit start_date must be honoured; only a blank one falls back to
      # the version's creation date.
      start = start_date.to_s == '' ? version['created_at'] : start_date

      {
        'number' => version['number'],
        'downloads_date' => Gems.downloads(@gem_name, version['number'], start, end_date)
      }
    end.reverse
  end

  # get the dependencies
  def get_dependencies
    Gems.info(@gem_name)['dependencies']
  end

  # total number of downloads
  def get_total_downloads
    Gems.info(@gem_name)['downloads']
  end

  private

  # Versions of the gem published for the 'ruby' platform.
  def ruby_versions
    Gems.versions(@gem_name).select { |version| version['platform'] == 'ruby' }
  end
end
212
+
213
# Scrapes a gem's project page on ruby-toolbox.com.
class RubyToolBoxData
  # gem_name - name of the gem whose project page is requested
  def initialize(gem_name)
    rubygems = Configuration.for 'rubygems'
    @user_agent = rubygems.user_agent
    @RUBY_TOOLBOX_BASE_URL = "https://www.ruby-toolbox.com/projects/"
    @RANKING_PATH = "//div[@class='teaser-bar']//li[last()-1]//a"
    @gem_name = gem_name
  end

  # get the ranking on Ruby ToolBox
  # Returns the text of the node selected by the ranking XPath, or 0 when the
  # page cannot be fetched or parsed.
  def get_ranking
    open_project_page do |page|
      Nokogiri::HTML(page).xpath(@RANKING_PATH).text
    end
  rescue StandardError
    0
  end

  private

  # Opens the project page and yields the handle; the block form closes it
  # afterwards. Kernel#open no longer accepts URLs on current Rubies, so
  # URI.open is preferred and the bare open is kept only for Rubies whose
  # open-uri does not provide it.
  def open_project_page(&block)
    url = @RUBY_TOOLBOX_BASE_URL + @gem_name
    headers = { 'User-Agent' => @user_agent }

    if URI.respond_to?(:open)
      URI.open(url, headers, &block)
    else
      open(url, headers, &block)
    end
  end
end
236
+
237
# Searches Stack Overflow (via the Stack Exchange API) for questions that
# mention a gem.
class StackOverflow
  # gem_name - search term; it is form-encoded before being placed in the URL
  #            so that spaces, '&' or '#' cannot corrupt the query string
  def initialize(gem_name)
    stackoverflow = Configuration.for 'stackoverflow'
    query = URI.encode_www_form_component(gem_name)
    @STACKOVERFLOW_API = "https://api.stackexchange.com/2.2/search/advanced?order=desc&sort=creation&q=#{query}&site=stackoverflow&key=#{stackoverflow.stackoverflow_token}"
  end

  #get questions from stackexchange
  # Returns a two-element array:
  #   [0] the questions as { 'creation_date', 'title', 'views' } hashes, where
  #       'title' is the list of title words that are not stop words
  #   [1] [word, count] pairs over all kept title words, sorted by descending
  #       count
  # Raises RuntimeError when the response carries no 'items' list.
  def get_questions
    stop_words = load_stop_words

    response = HTTParty.get(@STACKOVERFLOW_API)
    items = response['items']
    if items.nil?
      raise "Stack Exchange API error: #{response['error_message'] || 'no items in response'}"
    end

    questions = items.map do |item|
      {
        'creation_date' => item['creation_date'],
        #don't store stop words
        'title' => item['title'].split(' ').reject { |word| stop_words.include?(word.downcase) },
        'views' => item['view_count']
      }
    end

    questions_word_count = Hash.new(0)
    questions.each do |question|
      question['title'].each { |word| questions_word_count[word] += 1 }
    end

    [questions, questions_word_count.sort_by { |_word, freq| freq }.reverse]
  end

  private

  # Reads the stop-word list, one word per line.
  def load_stop_words
    path = File.expand_path("../../public/stop_words.txt", File.dirname(__FILE__))
    File.open(path, "r") { |file| file.each_line.map(&:chomp) }
  end
end
282
+
283
+ end
@@ -0,0 +1,4 @@
1
# Namespace of the repocrawler gem; this file only carries the release number.
module Repos
  # Version string of the packaged gem.
  VERSION = "0.1.3"
end
@@ -0,0 +1 @@
1
+ require 'repocrawler/crawler.rb'