corpus 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +674 -0
- data/README.rdoc +47 -0
- data/bin/corpus +133 -0
- data/conf/github_languages.txt +10 -0
- data/conf/ubuntu_urls.txt +3 -0
- data/ext/cloc-1.56.pl +8511 -0
- metadata +64 -0
data/README.rdoc
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
= Corpus -- a collection of open source software projects
|
|
2
|
+
|
|
3
|
+
== Table of Contents
|
|
4
|
+
1. Introduction
|
|
5
|
+
|
|
6
|
+
== 1. Introduction
|
|
7
|
+
{corpus}[https://bitbucket.org/martinvelez/corpus] downloads a collection of open source
|
|
8
|
+
software projects.
|
|
9
|
+
|
|
10
|
+
* Downloads the Ubuntu source distribution package,
|
|
11
|
+
* Downloads the Most Watched projects for a given language from Github.
|
|
12
|
+
|
|
13
|
+
== 2. Dependencies
|
|
14
|
+
|
|
15
|
+
* {Ruby 1.9.3}[http://www.ruby-lang.org/en/downloads/] or greater
|
|
16
|
+
* Unix Tools
|
|
17
|
+
* curl
|
|
18
|
+
* wget
|
|
19
|
+
* sqlite3
|
|
20
|
+
|
|
21
|
+
== 3. Installation
|
|
22
|
+
|
|
23
|
+
=== Rubygems:
|
|
24
|
+
You might need to use sudo.
|
|
25
|
+
gem install corpus
|
|
26
|
+
|
|
27
|
+
=== Not Rubygems:
|
|
28
|
+
1. Download corpus[http://bitbucket.org/martinvelez/corpus/downloads]
|
|
29
|
+
2. Executable is inside the bin directory
|
|
30
|
+
|
|
31
|
+
== 4. Usage
|
|
32
|
+
List all available tasks
|
|
33
|
+
corpus -T
|
|
34
|
+
|
|
35
|
+
== 5. Development
|
|
36
|
+
|
|
37
|
+
Author:: {Martin Velez}[http://www.martinvelez.com]
|
|
38
|
+
Copyright:: Copyright (C) 2012 {Martin Velez}[http://www.martinvelez.com]
|
|
39
|
+
License:: GPL[http://www.gnu.org/copyleft/gpl.html]
|
|
40
|
+
|
|
41
|
+
=== Source
|
|
42
|
+
Bitbucket[https://bitbucket.org/martinvelez/corpus/src] is hosting this code.
|
|
43
|
+
http://bitbucket.org/martinvelez/corpus/src
|
|
44
|
+
|
|
45
|
+
=== Issues and Bug Reports
|
|
46
|
+
Provide feedback, get help, request features, and report bugs here:
|
|
47
|
+
https://bitbucket.org/martinvelez/corpus/issues?status=new&status=open
|
data/bin/corpus
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#!/usr/bin/env ruby

require 'rake'

# Script-wide settings. They are globals because the task bodies defined
# later in this file read them by these exact names.

# Root download directory, relative to the current working directory.
$download_dir = 'downloads'
# Absolute per-source download directories underneath $download_dir.
$ubuntu_dir = File.expand_path('ubuntu', $download_dir)
$github_dir = File.expand_path('github', $download_dir)
# Database file name for each source.
$sql = { ubuntu: 'ubuntu.db', github: 'github.db' }
# Command prefix that runs the cloc script shipped in ../ext next to this file.
$cloc = "perl #{File.expand_path('../ext/cloc-1.56.pl', File.dirname(__FILE__))}"

# Initialise Rake under the application name 'corpus'.
Rake.application.init('corpus')

# START TASKS DEFINITIONS
# -----------------------
# Crawl specific code repositories
[$github_dir, $ubuntu_dir].each { |dir| directory dir }
|
|
18
|
+
|
|
19
|
+
namespace :crawl do
|
|
20
|
+
|
|
21
|
+
# The Ubuntu source distribution package can be downloaded as multiple ISO files.
# Their URLs are read, one per line, from conf/ubuntu_urls.txt next to this script.
#
# Arguments:
#   download_dir - directory handed to wget as --directory-prefix
#                  (defaults to $ubuntu_dir)
desc "Download Ubuntu source distribution package"
task :ubuntu, [:download_dir] do |t, args|
  urls_file = File.expand_path(File.join(File.dirname(__FILE__), "..", "conf", "ubuntu_urls.txt"))
  args.with_defaults(:download_dir => $ubuntu_dir)

  puts "Downloading [Ubuntu source distribution package]..."
  puts "download_dir = #{args.download_dir}"
  puts "urls_file = #{urls_file}"

  # TODO: killing/interrupting this Ruby process does not kill wget processes
  # File.foreach closes the file when the block finishes.
  File.foreach(urls_file) do |line|
    url = line.strip
    next if url.empty? # a blank line would start wget with no URL at all

    # The argument-list form of system bypasses the shell, so characters such
    # as '&' or '?' inside a URL are passed to wget untouched.
    ok = system('wget', '-c', "--directory-prefix=#{args.download_dir}", url)
    puts "ERROR: #{url}" unless ok
  end
end
|
|
37
|
+
|
|
38
|
+
# Github lists the Most Watched projects for a given language.
# The list is paginated into 10 webpages.
#
# Arguments:
#   lang         - language name as it appears in the Github URL (default "Java")
#   download_dir - directory the repositories are cloned into
#                  (default: $github_dir/LANG)
#
# Repositories that already exist locally are updated with git pull instead
# of being cloned again.
require 'uri' # used to percent-encode the language name, e.g. "C++"

desc "Downloads the Most Watched projects for a given language from Github"
task :github, [:lang,:download_dir] => $github_dir do |t, args|
  lang = args.lang || "Java"
  dd = args.download_dir || File.join($github_dir, lang)

  # Runs a command without going through a shell (no quoting problems) and
  # returns its stdout and stderr combined, like the former `cmd 2>&1`.
  run = lambda { |*cmd| IO.popen(cmd + [{ :err => [:child, :out] }]) { |io| io.read } }

  # encode_www_form_component turns spaces into '+'; literal pluses are
  # already %2B at that point, so every remaining '+' is a space.
  lang_path = URI.encode_www_form_component(lang).gsub('+', '%20')

  repos = []
  (1..10).each do |page|
    url = "https://github.com/languages/#{lang_path}/most_watched?page=#{page}"
    html = IO.popen(['curl', url]) { |io| io.read }
    # <a href="/user/repo">Repo</a> => "/user/repo"
    links = html.each_line.map { |line| line[/\s<a href="(\/\w+\/[^"]*)"/, 1] }.compact
    # Drop the last link of THIS page only, so an empty page cannot remove a
    # repository collected from an earlier page.
    links.pop # TODO: improve regex to exclude pagination bar
    repos.concat(links)
  end

  FileUtils.mkdir_p(dd)
  # Block form restores the previous working directory even if an error is raised.
  Dir.chdir(dd) do
    repos.each do |r|
      repo_name = r.split('/', 3)[2].to_s # "/user/repo" => "repo"
      next if repo_name.empty?

      clone_url = "https://github.com#{r}"
      clone_output = run.call('git', 'clone', clone_url)
      unless clone_output =~ /fatal/
        puts clone_output
        next
      end

      # "fatal" usually means the directory already exists from an earlier
      # run; if it does not, the clone failed for another reason.
      unless File.directory?(repo_name)
        puts "ERROR: could not clone [#{repo_name}]"
        puts clone_output
        next
      end

      puts "git-pulling [#{repo_name}]"
      fallback_output = Dir.chdir(repo_name) do
        master_output = run.call('git', 'pull', 'origin', 'master') # try master branch
        master_output =~ /fatal/ ? run.call('git', 'pull') : nil    # no master branch
      end

      next unless fallback_output.to_s.include?('You asked me to pull') # git pull failed

      # Deleting can be unsafe, so the broken checkout is moved aside. Each one
      # gets its own name under errors/ so a second failure cannot collide.
      FileUtils.mkdir_p('errors')
      File.rename(repo_name, File.join('errors', "#{File.basename(repo_name)}-#{Time.now.to_i}"))
      run.call('git', 'clone', clone_url) # Try one last time.
    end
  end
end # task
|
|
76
|
+
|
|
77
|
+
end # namespace
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Database functions
namespace :db do

  # Create database files
  # Both tasks only depend on the matching database file task, which does the work.
  namespace :create do
    desc "Counts LOC from projects downloaded from github"
    task :github => $sql[:github]
    desc "Counts LOC from projects downloaded from ubuntu"
    task :ubuntu => $sql[:ubuntu]
  end

  # Show programming language LOC counts
  namespace :show do
    desc "print table showing LOC for a given database"
    task :github => $sql[:github] do
      query = 'select language, SUM(nCode) as total from t GROUP BY language ORDER BY total DESC'
      output = `sqlite3 #{$sql[:github]} '#{query}'`

      # sqlite3 separates columns with '|'. The -1 limit keeps trailing empty
      # fields so every row has the same number of cells; otherwise a row with
      # an empty total would make transpose raise IndexError.
      table = output.split("\n").map { |row| row.split("|", -1) }
      col_width = table.transpose.map { |col| col.map { |cell| cell.to_s.length }.max }
      table.each do |row|
        puts row.zip(col_width).map { |cell, width| cell.to_s.ljust(width) }.join(" ")
      end
    end
  end

  # Argument:
  #   file - key of the database to delete; must be one of the keys of $sql
  desc "deletes database FILE"
  task :clobber, [:file] do |t, args|
    # Look the key up by name instead of calling to_sym on user input, and
    # fail with a readable message when it is missing or unknown.
    key = $sql.keys.find { |k| k.to_s == args.file.to_s }
    unless key
      raise ArgumentError, "db:clobber needs a FILE argument, one of: #{$sql.keys.join(', ')}"
    end
    rm $sql[key]
  end

end #namespace
|
|
107
|
+
|
|
108
|
+
# Private task
#
# Builds the Github database file by running cloc on every entry matching
# downloads/github/LANGUAGE/repo and piping cloc's SQL output into sqlite3.
require 'shellwords' # project names and paths are interpolated into shell commands

file $sql[:github] do
  repos = Dir.glob("#{$github_dir}/*/*") # downloads/github/LANGUAGE/repo
  if repos.empty?
    raise "no projects found under #{$github_dir}; run the crawl:github task first"
  end

  db = Shellwords.escape($sql[:github])
  # Builds one "cloc | sqlite3" pipeline with the project name and path quoted
  # for the shell.
  count = lambda do |path, extra|
    name = Shellwords.escape(File.basename(path))
    `#{$cloc} --sql 1 --sql-project #{name}#{extra} #{Shellwords.escape(path)} | sqlite3 #{db}`
  end

  # First cloc call creates database, subsequent calls append to database
  count.call(repos.pop, '')
  repos.each { |r| count.call(r, ' --sql-append') }
end
|
|
120
|
+
|
|
121
|
+
# File task for the Ubuntu database. The body is a placeholder: it contains
# no code yet, only the outline of the intended steps.
file $sql[:ubuntu] do
  # Planned steps:
  #   1. get isos
  #   2. extract
  #   3. pass to cloc
end
|
|
126
|
+
|
|
127
|
+
# Default task: prints a hint pointing the user at the -T task listing.
task :default do
  app_name = Rake.application.name
  puts "usage: '#{app_name} -T' to get list of all available commands."
end
# ---------------------
# END TASKS DEFINITIONS

# Hand control to Rake to run the top-level tasks.
Rake.application.top_level
|