repocrawler 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +52 -0
- data/README.md +9 -0
- data/bin/repocrawler +75 -0
- data/config/local.example.rb +12 -0
- data/lib/repocrawler/crawler.rb +283 -0
- data/lib/repocrawler/version.rb +4 -0
- data/lib/repocrawler.rb +1 -0
- data/public/stop_words.txt +1161 -0
- data/repocrawler.gemspec +23 -0
- metadata +129 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6dcd4f22099b5b0f090b5062695b9db581cf4b7b
|
4
|
+
data.tar.gz: 4ccaaaa819b2b7eedab27c676be626e957360377
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 68b6670db01659e7ba8cd60f175141cf29e03928fc8dceae120516af0004e63dc673e45d90a0c43db1effcb0760a9a1c1d5cc85b19d612d91f71e610e91b4f2f
|
7
|
+
data.tar.gz: 2f8a36b9e827ec7ceea8c4fd3c48153723e034dd489a3a76be7fafd575ddb087a81551dc3b3c3f1eedb1e33f61e2fd1ad7836105aad30937b160d1c972483d68
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
addressable (2.3.8)
|
5
|
+
bson (3.2.6)
|
6
|
+
configuration (1.3.4)
|
7
|
+
descendants_tracker (0.0.4)
|
8
|
+
thread_safe (~> 0.3, >= 0.3.1)
|
9
|
+
faraday (0.9.2)
|
10
|
+
multipart-post (>= 1.2, < 3)
|
11
|
+
gems (0.8.3)
|
12
|
+
github_api (0.12.4)
|
13
|
+
addressable (~> 2.3)
|
14
|
+
descendants_tracker (~> 0.0.4)
|
15
|
+
faraday (~> 0.8, < 0.10)
|
16
|
+
hashie (>= 3.4)
|
17
|
+
multi_json (>= 1.7.5, < 2.0)
|
18
|
+
nokogiri (~> 1.6.6)
|
19
|
+
oauth2
|
20
|
+
hashie (3.4.3)
|
21
|
+
httparty (0.13.7)
|
22
|
+
json (~> 1.8)
|
23
|
+
multi_xml (>= 0.5.2)
|
24
|
+
json (1.8.3)
|
25
|
+
jwt (1.5.2)
|
26
|
+
mini_portile (0.6.2)
|
27
|
+
mongo (2.1.2)
|
28
|
+
bson (~> 3.0)
|
29
|
+
multi_json (1.11.2)
|
30
|
+
multi_xml (0.5.5)
|
31
|
+
multipart-post (2.0.0)
|
32
|
+
nokogiri (1.6.6.2)
|
33
|
+
mini_portile (~> 0.6.0)
|
34
|
+
oauth2 (1.0.0)
|
35
|
+
faraday (>= 0.8, < 0.10)
|
36
|
+
jwt (~> 1.0)
|
37
|
+
multi_json (~> 1.3)
|
38
|
+
multi_xml (~> 0.5)
|
39
|
+
rack (~> 1.2)
|
40
|
+
rack (1.6.4)
|
41
|
+
thread_safe (0.3.5)
|
42
|
+
|
43
|
+
PLATFORMS
|
44
|
+
ruby
|
45
|
+
|
46
|
+
DEPENDENCIES
|
47
|
+
configuration
|
48
|
+
gems
|
49
|
+
github_api
|
50
|
+
httparty
|
51
|
+
mongo (~> 2.1)
|
52
|
+
nokogiri
|
data/README.md
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
# Description
|
2
|
+
A simple program that collects the needed data for a single gem; in this case the gem is oga.
|
3
|
+
|
4
|
+
# Run
|
5
|
+
1. run ```bundle install``` to install the dependencies.
|
6
|
+
2. copy the ```config/local.example.rb``` to ```config/local.rb```.
|
7
|
+
3. replace the ```STACKOVERFLOW_TOKEN```, ```TOKEN```, ```GITHUB_ACCOUNT```, ```GITHUB_PASSWORD```, and ```USER_AGENT``` placeholders in ```config/local.rb``` with your own values.
|
8
|
+
4. run your local MongoDB with the command ```sudo mongod```.
|
9
|
+
5. run ```repocrawler``` (the executable shipped as ```bin/repocrawler```) to start the collecting process.
|
data/bin/repocrawler
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'io/console'
require 'repocrawler'

# Prints +label+ and returns the line typed on stdin without its trailing
# newline. Returns '' when stdin is already at end of input.
def prompt(label)
  print label
  $stdout.flush
  $stdin.gets.to_s.chomp
end

# Like #prompt, but the typed characters are not echoed to the terminal.
# Echo is disabled only for the duration of this single read and is restored
# even if the read is interrupted. When stdin is not a terminal (piped
# input) the line is read normally.
def prompt_secret(label)
  print label
  $stdout.flush
  secret = $stdin.tty? ? $stdin.noecho(&:gets) : $stdin.gets
  puts "\n" # the Enter key pressed by the user was not echoed
  secret.to_s.chomp
end

# Only the password is hidden; the remaining answers are visible while typing.
github_password = prompt_secret('Github Password (not stored): ')
gem_name        = prompt('Name of the gem: ')
repo_name       = prompt('Name of the repo: ')
repo_username   = prompt('Username of the repo: ')
puts ""

client = Mongo::Client.new([ '127.0.0.1:27017' ], :database => 'gems_info')

github        = Repos::GithubData.new(repo_username, repo_name, github_password)
rubygems      = Repos::RubyGemsData.new(gem_name)
ruby_toolbox  = Repos::RubyToolBoxData.new(gem_name)
stackoverflow = Repos::StackOverflow.new(gem_name)

# GitHub statistics
last_year_commit_activity = github.get_last_year_commit_activity
contributors              = github.get_contributors
total_commits             = github.get_total_commits
forks                     = github.get_forks
stars                     = github.get_stars
issues                    = github.get_issues
issues_info               = github.get_issues_info
last_commits_days         = github.get_last_commits_days
readme_word_count         = github.get_readme_word_count

# RubyGems statistics
version_downloads       = rubygems.get_version_downloads
version_downloads_trend = rubygems.get_version_downloads_trend
dependencies            = rubygems.get_dependencies
total_downloads         = rubygems.get_total_downloads

# Ruby Toolbox ranking
ranking = ruby_toolbox.get_ranking

# Stack Overflow questions and the word frequencies of their titles
questions, questions_word_count = stackoverflow.get_questions

# Aggregate everything into the single document stored for this gem.
gem_info = {
  'name' => gem_name,
  'repo_name' => repo_name,
  'repo_username' => repo_username,
  'total_downloads' => total_downloads,
  'version_downloads' => version_downloads,
  'version_downloads_days' => version_downloads_trend,
  'dependencies' => dependencies,
  'last_commit' => last_commits_days,
  'forks' => forks,
  'stars' => stars,
  'issues' => issues,
  'ranking' => ranking,
  'commits' => total_commits,
  'commit_activity_last_year' => last_year_commit_activity,
  'contributors' => contributors,
  'issues_info' => issues_info,
  'readme_word_count' => readme_word_count,
  'questions' => questions,
  'questions_word_count' => questions_word_count,
  'created_at' => Time.now
}

puts gem_info

client[:gems].insert_one(gem_info)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'configuration'
|
2
|
+
|
3
|
+
# Settings read by the crawler for GitHub access and outgoing HTTP requests.
# TOKEN, GITHUB_ACCOUNT, GITHUB_PASSWORD and USER_AGENT are placeholders:
# replace each one with your own value after copying this file to
# config/local.rb.
Configuration.for('rubygems') {
  github_token(TOKEN)
  github_account(GITHUB_ACCOUNT)
  github_password(GITHUB_PASSWORD)
  user_agent(USER_AGENT)
}

# Key appended to Stack Exchange API requests. STACKOVERFLOW_TOKEN is a
# placeholder to be replaced as well.
Configuration.for('stackoverflow') {
  stackoverflow_token(STACKOVERFLOW_TOKEN)
}
|
@@ -0,0 +1,283 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'gems'
|
3
|
+
require 'mongo'
|
4
|
+
require 'github_api'
|
5
|
+
require 'httparty'
|
6
|
+
require 'configuration'
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'open-uri'
|
9
|
+
|
10
|
+
module Repos
|
11
|
+
Kernel.load 'config/local.rb'
|
12
|
+
|
13
|
+
# Gathers statistics about a single GitHub repository: commit activity,
# contributors, fork/star/open-issue counters, closed-issue history, age of
# the most recent commit and a word-frequency table of the README.
#
# The API token, fallback password, account name and User-Agent string are
# read from the 'rubygems' Configuration section.
class GithubData
  # Bundled stop-word list, one word per line.
  STOP_WORDS_PATH = File.expand_path('../../public/stop_words.txt', File.dirname(__FILE__))

  # Page size requested from paginated endpoints (the largest GitHub accepts).
  PAGE_SIZE = 100

  # repo_user       - owner (user or organisation) of the repository
  # repo_name       - name of the repository
  # github_password - account password; when blank or nil, the password from
  #                   the 'rubygems' configuration is used instead
  def initialize(repo_user, repo_name, github_password='')
    @readme_base_url = "https://raw.githubusercontent.com/#{repo_user}/#{repo_name}/master"
    @api_base_url = "https://api.github.com/repos/#{repo_user}/#{repo_name}"
    @rubygems = Configuration.for 'rubygems'
    @access_token = @rubygems.github_token
    @github_password = github_password.to_s.empty? ? @rubygems.github_password : github_password
    @user_agent = @rubygems.user_agent
    @repo_user = repo_user
    @repo_name = repo_name
  end

  # Returns the response of the /stats/commit_activity endpoint (commit
  # activity of the last year) exactly as HTTParty delivers it.
  def get_last_year_commit_activity
    api_get('/stats/commit_activity')
  end

  # Returns every contributor as { 'name' => login, 'contributions' => n }.
  # All result pages are read, not only the first one.
  def get_contributors
    fetch_all_pages('/contributors').map do |contributor|
      {
        'name' => contributor['login'],
        'contributions' => contributor['contributions']
      }
    end
  end

  # Returns the sum of the contributions of all contributors.
  def get_total_commits
    get_contributors.inject(0) { |total, contributor| total + contributor['contributions'] }
  end

  # Returns the number of forks.
  def get_forks
    repo_metadata['forks_count']
  end

  # Returns the number of stargazers.
  def get_stars
    repo_metadata['stargazers_count']
  end

  # Returns the number of open issues.
  def get_issues
    repo_metadata['open_issues_count']
  end

  # Returns one hash per closed issue ('number', 'created_at', 'closed_at'
  # and 'duration' in whole days between the two dates), in the reverse of
  # the order in which the API lists them.
  def get_issues_info
    closed_issues = fetch_all_pages('/issues?state=closed').map do |issue|
      {
        'number' => issue['number'],
        'created_at' => issue['created_at'],
        'closed_at' => issue['closed_at'],
        'duration' => (Date.parse(issue['closed_at']) - Date.parse(issue['created_at'])).to_i
      }
    end

    closed_issues.reverse
  end

  # Returns the number of days between today and the author date of the
  # first commit returned by the commit listing.
  def get_last_commits_days
    # NOTE(review): this still authenticates with account name + password via
    # the github_api client; GitHub appears to have retired password-based
    # API authentication, so this call may need to move to the token - confirm.
    github = Github.new basic_auth: "#{@rubygems.github_account}:#{@github_password}"

    newest = github.repos.commits.list(@repo_user, @repo_name).to_ary[0].to_hash
    (Date.today - Date.parse(newest['commit']['author']['date'])).to_i
  end

  # Returns [word, count] pairs for the repository README, most frequent
  # first. Only tokens made up entirely of word characters are counted and
  # stop words are skipped (case-insensitively). Returns [] when the
  # repository root contains no file whose name starts with "README".
  def get_readme_word_count
    # The last matching entry wins, as before.
    readme_entry = fetch_list('/contents').select { |entry| entry['name'] =~ /\AREADME/ }.last
    return [] if readme_entry.nil?

    # Prefer the download URL reported by the API (it points at the
    # repository's actual default branch); fall back to the "master" URL.
    readme_url = readme_entry['download_url'] || "#{@readme_base_url}/#{readme_entry['name']}"

    frequencies = Hash.new(0)
    HTTParty.get(readme_url).split(' ').each do |word|
      frequencies[word] += 1 if word =~ /\A\w+\z/ && !stop_words.key?(word.downcase)
    end

    frequencies.sort_by { |_word, count| count }.reverse
  end

  private

  # Performs a GET on the repository API. +path+ is appended to the
  # repository base URL and may already carry a query string. The token is
  # sent in the Authorization header instead of the URL so that it does not
  # end up in logs and is accepted by the current GitHub API.
  def api_get(path)
    HTTParty.get(@api_base_url + path, headers: {
      'User-Agent' => @user_agent,
      'Authorization' => "token #{@access_token}"
    })
  end

  # GETs +path+ and returns the decoded JSON array. Raises when the API
  # answers with anything else (typically an error object).
  def fetch_list(path)
    payload = api_get(path).parsed_response
    return payload if payload.is_a?(Array)

    raise "GitHub API request #{path} did not return a list: #{payload.inspect}"
  end

  # Collects the items of every page of a paginated endpoint; stops at the
  # first empty page.
  def fetch_all_pages(path)
    joiner = path.include?('?') ? '&' : '?'
    items = []
    page = 1

    loop do
      batch = fetch_list("#{path}#{joiner}per_page=#{PAGE_SIZE}&page=#{page}")
      break if batch.empty?

      items.concat(batch)
      page += 1
    end

    items
  end

  # Repository metadata, fetched once and shared by the counter getters.
  def repo_metadata
    @repo_metadata ||= api_get('')
  end

  # Stop words as hash keys for constant-time lookup; loaded on first use.
  def stop_words
    @stop_words ||= File.foreach(STOP_WORDS_PATH).each_with_object({}) do |line, words|
      words[line.chomp] = true
    end
  end
end
|
153
|
+
|
154
|
+
# Reads download and dependency information for one gem through the `gems`
# client (Gems.versions, Gems.downloads, Gems.info).
class RubyGemsData
  # gem_name - name of the gem to look up
  def initialize(gem_name)
    @gem_name = gem_name
  end

  # Returns { 'number' => version, 'downloads' => count } for every version
  # published for the 'ruby' platform, in the reverse of the order returned
  # by Gems.versions.
  def get_version_downloads
    ruby_versions.map do |version|
      {
        'number' => version['number'],
        'downloads' => version['downloads_count']
      }
    end
  end

  # Returns { 'number' => version, 'downloads_date' => Gems.downloads result }
  # for every 'ruby' platform version, in the same order as
  # #get_version_downloads.
  #
  # start_date - first day of the range; when blank, each version's own
  #              'created_at' value is used
  # end_date   - last day of the range; when blank, today is used
  def get_version_downloads_trend(start_date='', end_date='')
    end_date = Date.today if end_date.to_s == ''

    ruby_versions.map do |version|
      # An explicit start_date must be honoured; previously it was dropped
      # and nil was passed to Gems.downloads instead.
      from = start_date.to_s == '' ? version['created_at'] : start_date

      {
        'number' => version['number'],
        'downloads_date' => Gems.downloads(@gem_name, version['number'], from, end_date)
      }
    end
  end

  # Returns the 'dependencies' entry of the gem's info record.
  def get_dependencies
    Gems.info(@gem_name)['dependencies']
  end

  # Returns the 'downloads' entry (total downloads) of the gem's info record.
  def get_total_downloads
    Gems.info(@gem_name)['downloads']
  end

  private

  # Versions whose platform is 'ruby', in reversed listing order.
  def ruby_versions
    Gems.versions(@gem_name).select { |version| version['platform'] == 'ruby' }.reverse
  end
end
|
212
|
+
|
213
|
+
# Scrapes the ranking of a gem from its Ruby Toolbox project page.
class RubyToolBoxData
  # Project pages live directly below this URL.
  RUBY_TOOLBOX_BASE_URL = "https://www.ruby-toolbox.com/projects/"

  # XPath of the link whose text is taken as the ranking.
  RANKING_PATH = "//div[@class='teaser-bar']//li[last()-1]//a"

  # gem_name - name of the gem; appended to RUBY_TOOLBOX_BASE_URL
  #
  # The User-Agent header value is read from the 'rubygems' Configuration
  # section.
  def initialize(gem_name)
    rubygems = Configuration.for 'rubygems'
    @user_agent = rubygems.user_agent
    @gem_name = gem_name
  end

  # Returns the text of the ranking link on the gem's project page, or 0
  # when the page cannot be fetched or parsed (best effort: any
  # StandardError is swallowed).
  def get_ranking
    # URI#open (provided by open-uri) is used instead of Kernel#open, which
    # no longer opens URLs on Ruby 3.0+. The block form closes the
    # downloaded document once it has been parsed.
    URI.parse(RUBY_TOOLBOX_BASE_URL + @gem_name).open('User-Agent' => @user_agent) do |page|
      Nokogiri::HTML(page).xpath(RANKING_PATH).text
    end
  rescue StandardError
    0
  end
end
|
236
|
+
|
237
|
+
# Searches Stack Overflow (Stack Exchange API 2.2, search/advanced) for
# questions mentioning a gem and summarises the words used in their titles.
class StackOverflow
  # Bundled stop-word list, one word per line.
  STOP_WORDS_PATH = File.expand_path('../../public/stop_words.txt', File.dirname(__FILE__))

  # gem_name - search term; it is URL-encoded before being placed in the
  #            query string, so names containing characters such as '+',
  #            '&' or spaces produce a valid request
  #
  # The API key is read from the 'stackoverflow' Configuration section.
  def initialize(gem_name)
    stackoverflow = Configuration.for 'stackoverflow'
    query = URI.encode_www_form_component(gem_name.to_s)
    @search_url = "https://api.stackexchange.com/2.2/search/advanced?order=desc&sort=creation&q=#{query}&site=stackoverflow&key=#{stackoverflow.stackoverflow_token}"
  end

  # Returns a two-element array [questions, word_count]:
  #
  # questions  - one hash per returned question with 'creation_date',
  #              'title' (the title split on whitespace, stop words removed
  #              case-insensitively) and 'views'
  # word_count - [word, count] pairs over all remaining title words, most
  #              frequent first
  #
  # A response without an 'items' entry is treated as "no questions".
  def get_questions
    response = HTTParty.get(@search_url)

    questions = Array(response['items']).map do |item|
      {
        'creation_date' => item['creation_date'],
        'title' => item['title'].split(' ').reject { |word| stop_words.key?(word.downcase) },
        'views' => item['view_count']
      }
    end

    tally = Hash.new(0)
    questions.each do |question|
      question['title'].each { |word| tally[word] += 1 }
    end

    [questions, tally.sort_by { |_word, count| count }.reverse]
  end

  private

  # Stop words as hash keys for constant-time lookup; loaded on first use.
  def stop_words
    @stop_words ||= File.foreach(STOP_WORDS_PATH).each_with_object({}) do |line, words|
      words[line.chomp] = true
    end
  end
end
|
282
|
+
|
283
|
+
end
|
data/lib/repocrawler.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'repocrawler/crawler.rb'
|