corpus 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,47 @@
1
+ = Corpus -- a collection of open source software projects
2
+
3
+ == Table of Contents
4
+ 1. Introduction
5
+
6
+ == 1. Introduction
7
+ {corpus}[https://bitbucket.org/martinvelez/corpus] downloads a collection of open source
8
+ software projects.
9
+
10
+ * Downloads the Ubuntu source distribution package.
11
+ * Downloads the Most Watched projects for a given language from Github.
12
+
13
+ == 2. Dependencies
14
+
15
+ * {Ruby 1.9.3}[http://www.ruby-lang.org/en/downloads/] or greater
16
+ * Unix Tools
17
+ * curl
18
+ * wget
19
+ * sqlite3
20
+
21
+ == 3. Installation
22
+
23
+ === Rubygems:
24
+ You might need to use sudo.
25
+ gem install corpus
26
+
27
+ === Not Rubygems:
28
+ 1. Download corpus[http://bitbucket.org/martinvelez/corpus/downloads]
29
+ 2. Executable is inside the bin directory
30
+
31
+ == 4. Usage
32
+ List all available tasks
33
+ corpus -T
34
+
35
+ == 5. Development
36
+
37
+ Author:: {Martin Velez}[http://www.martinvelez.com]
38
+ Copyright:: Copyright (C) 2012 {Martin Velez}[http://www.martinvelez.com]
39
+ License:: GPL[http://www.gnu.org/copyleft/gpl.html]
40
+
41
+ === Source
42
+ Bitbucket[https://bitbucket.org/martinvelez/corpus/src] is hosting this code.
43
+ http://bitbucket.org/martinvelez/corpus/src
44
+
45
+ === Issues and Bug Reports
46
+ Provide feedback, get help, request features, and report bugs here:
47
+ https://bitbucket.org/martinvelez/corpus/issues?status=new&status=open
data/bin/corpus ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rake'
4
+
5
+ $download_dir = 'downloads'
6
+ $ubuntu_dir = File.expand_path(File.join($download_dir,'ubuntu'))
7
+ $github_dir = File.expand_path(File.join($download_dir,'github'))
8
+ $sql = {:ubuntu => 'ubuntu.db',:github => 'github.db'}
9
+ $cloc = "perl #{File.expand_path(File.join(File.dirname(__FILE__),"..","ext","cloc-1.56.pl"))}"
10
+
11
+ Rake.application.init('corpus')
12
+
13
+ # START TASKS DEFINITIONS
14
+ # -----------------------
15
+ # Crawl specific code repositories
16
+ directory $github_dir
17
+ directory $ubuntu_dir
18
+
19
+ namespace :crawl do
20
+
21
+ # Ubuntu source distribution package can be download in multiple ISO files
22
+ desc "Download Ubuntu source distribution package"
23
+ task :ubuntu, [:download_dir] do |t, args|
24
+ urls_file = File.expand_path(File.join(File.dirname(__FILE__),"..","conf","ubuntu_urls.txt"))
25
+ args.with_defaults(:download_dir => $ubuntu_dir)
26
+
27
+ puts "Downloading [Ubuntu source distribution package]..."
28
+ puts "download_dir = #{args.download_dir}"
29
+ puts "urls_file = #{urls_file}"
30
+
31
+ # TODO: killing/interrupting this Ruby process does not kill wget processes
32
+ File.open(urls_file).each_line do |url|
33
+ `wget -c --directory-prefix=#{args.download_dir} #{url}`
34
+ puts "ERROR: #{url}" if $? != 0
35
+ end
36
+ end
37
+
38
+ # Github lists the Most Watched projects for a given language.
39
+ # The list is paginated into 10 webpages.
40
+ desc "Downloads the Most Watched projects for a given language from Github"
41
+ task :github, [:lang,:download_dir] => $github_dir do |t, args|
42
+ lang = args.lang || "Java"
43
+ dd = args.download_dir || File.join($github_dir, lang)
44
+ repos = []
45
+
46
+ for i in 1..10
47
+ #TODO: use URI to encode "C++" into html entities
48
+ o = `curl https://github.com/languages/#{lang}/most_watched?page=#{i}`
49
+ # <a href="/user/repo">Repo</a> => "/user/repo"
50
+ o.each_line {|l| repos.push(l[/"(.*)"/,1]) if l =~ /\s<a href="\/\w+\// }
51
+ repos.pop # TODO: improve regex to exclude pagination bar
52
+ end
53
+
54
+ curr_dir = Dir.pwd
55
+ Dir.mkdir(dd) if not(File.exists?(dd) and File.directory?(dd))
56
+ chdir(dd)
57
+ repos.each do |r|
58
+ repo_name = r[/\/(.*)/,1][/\/(.*)/,1]
59
+ o = `git clone https://github.com#{r} 2>&1`
60
+ if o =~ /fatal/ # downloaded already attempted
61
+ chdir(repo_name)
62
+ puts "git-pulling [#{repo_name}]"
63
+ out = `git pull origin master 2>&1` # try master branch
64
+ output = `git pull 2>&1` if out =~ /fatal/ # no master branch
65
+ chdir(dd)
66
+ if output =~ /You asked me to pull/ # git pull failed
67
+ File.rename(repo_name,"errors") # Deleting can be unsafe
68
+ `git clone https://github.com#{r} 2>&1` # Try one last time.
69
+ end
70
+ else
71
+ puts o
72
+ end
73
+ end
74
+ chdir(curr_dir)
75
+ end # task
76
+
77
+ end # namespace
78
+
79
+
80
+ # Database functions
81
+ namespace :db do
82
+
83
+ # Create database files
84
+ namespace :create do
85
+ desc "Counts LOC from projects downloaded from github"
86
+ task :github => $sql[:github]
87
+ desc "Counts LOC from projects downloaded from ubuntu"
88
+ task :ubuntu => $sql[:ubuntu]
89
+ end
90
+
91
+ # Show programming language LOC counts
92
+ namespace :show do
93
+ desc "print table showing LOC for a given database"
94
+ task :github => $sql[:github] do
95
+ output = `sqlite3 #{$sql[:github]} \
96
+ 'select language, SUM(nCode) as total from t GROUP BY language ORDER BY total DESC'`
97
+ table = output.split("\n").map{|s| s.split("|")}
98
+ col_width = table.transpose.map{|col| col.map{|cell| cell.to_s.length}.max}
99
+ table.each{|row| puts row.zip(col_width).map{|lang,count| lang.to_s.ljust(count)}.join(" ")}
100
+ end
101
+ end
102
+
103
+ desc "deletes database FILE"
104
+ task :clobber, [:file] { |t, args| rm $sql[args.file.to_sym] }
105
+
106
+ end #namespace
107
+
108
+ # Private task
109
+ file $sql[:github] do
110
+ repos = Dir.glob("#{$github_dir}/*/*") # downloads/github/LANGUAGE/repo
111
+ reports = []
112
+ project = repos.pop
113
+
114
+ # First cloc call creates database, subsequent calls append to database
115
+ `#{$cloc} --sql 1 --sql-project #{File.basename(project)} #{project} | sqlite3 #{$sql[:github]}`
116
+ repos.each do |r|
117
+ `#{$cloc} --sql 1 --sql-project #{File.basename(r)} --sql-append #{r} | sqlite3 #{$sql[:github]}`
118
+ end
119
+ end
120
+
121
+ file $sql[:ubuntu] do
122
+ # TODO: not implemented yet; the body is empty. Planned steps: get isos,
123
+ # extract them,
124
+ # and pass the extracted sources to cloc
125
+ end
126
+
127
+ task :default do # NOTE(review): presumably the task Rake runs when no task name is given — confirm
128
+ puts "usage: '#{Rake.application.name} -T' to get list of all available commands."
129
+ end
130
+ # ---------------------
131
+ # END TASKS DEFINITIONS
132
+ # Hand control to Rake; presumably this runs the tasks named on the command line — see Rake::Application#top_level
133
+ Rake.application.top_level
@@ -0,0 +1,10 @@
1
+ JavaScript
2
+ Ruby
3
+ Python
4
+ Shell
5
+ Java
6
+ PHP
7
+ C
8
+ C++
9
+ Perl
10
+ Objective-C
@@ -0,0 +1,3 @@
1
+ http://cdimage.ubuntu.com/source/current/source/precise-src-0.iso
2
+ http://cdimage.ubuntu.com/source/current/source/precise-src-2.iso
3
+ http://cdimage.ubuntu.com/source/current/source/precise-src-3.iso