licensee 5.0.0 → 6.0.0b1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +15 -50
  3. data/bin/licensee +7 -8
  4. data/lib/licensee.rb +9 -33
  5. data/lib/licensee/content_helper.rb +7 -8
  6. data/lib/licensee/license.rb +5 -28
  7. data/lib/licensee/matchers/copyright_matcher.rb +17 -16
  8. data/lib/licensee/matchers/dice_matcher.rb +65 -0
  9. data/lib/licensee/matchers/exact_matcher.rb +12 -6
  10. data/lib/licensee/matchers/gemspec_matcher.rb +11 -11
  11. data/lib/licensee/matchers/npm_bower_matcher.rb +10 -10
  12. data/lib/licensee/matchers/package_matcher.rb +11 -10
  13. data/lib/licensee/project.rb +96 -30
  14. data/lib/licensee/project_file.rb +57 -77
  15. data/lib/licensee/version.rb +1 -1
  16. data/licensee.gemspec +26 -0
  17. data/test/fixtures/npm.git/HEAD +1 -0
  18. data/test/fixtures/npm.git/config +4 -0
  19. data/test/fixtures/npm.git/objects/info/packs +2 -0
  20. data/test/fixtures/npm.git/objects/pack/pack-03c0879445cabcc37f91d97c7955465adef26f4a.idx +0 -0
  21. data/test/fixtures/npm.git/objects/pack/pack-03c0879445cabcc37f91d97c7955465adef26f4a.pack +0 -0
  22. data/test/fixtures/npm.git/packed-refs +2 -0
  23. data/test/functions.rb +4 -15
  24. data/test/test_licensee.rb +1 -13
  25. data/test/test_licensee_copyright_matcher.rb +19 -28
  26. data/test/test_licensee_dice_matcher.rb +21 -0
  27. data/test/test_licensee_exact_matcher.rb +4 -6
  28. data/test/test_licensee_gemspec_matcher.rb +3 -11
  29. data/test/test_licensee_license.rb +2 -12
  30. data/test/test_licensee_npm_bower_matcher.rb +10 -16
  31. data/test/test_licensee_project.rb +24 -35
  32. data/test/test_licensee_project_file.rb +5 -10
  33. data/vendor/choosealicense.com/_licenses/afl-3.0.txt +69 -0
  34. data/vendor/choosealicense.com/_licenses/isc.txt +2 -2
  35. metadata +14 -26
  36. data/lib/licensee/filesystem_repository.rb +0 -38
  37. data/lib/licensee/matcher.rb +0 -32
  38. data/lib/licensee/matchers/git_matcher.rb +0 -27
  39. data/lib/licensee/matchers/levenshtein_matcher.rb +0 -75
  40. data/test/test_licensee_content_helper.rb +0 -40
  41. data/test/test_licensee_git_matcher.rb +0 -19
  42. data/test/test_licensee_levenshtein_matcher.rb +0 -34
  43. data/test/test_licensee_matcher.rb +0 -7
@@ -1,16 +1,16 @@
1
1
  class Licensee
2
- class GemspecMatcher < PackageMatcher
2
+ module Matchers
3
+ class Gemspec < Package
4
+ # We definitely don't want to be evaling arbitrary Gemspec files
5
+ # While not 100% accurate, use some lenient regex to try to grep the
6
+ # license declaration from the Gemspec as a string, if any
7
+ LICENSE_REGEX = /^\s*[a-z0-9_]+\.license\s*\=\s*[\'\"]([a-z\-0-9\.]+)[\'\"]\s*$/i
3
8
 
4
- # We definitely don't want to be evaling arbitrary Gemspec files
5
- # While not 100% accurate, use some lenient regex to try to grep the
6
- # license declaration from the Gemspec as a string, if any
7
- LICENSE_REGEX = /^\s*[a-z0-9_]+\.license\s*\=\s*[\'\"]([a-z\-0-9\.]+)[\'\"]\s*$/i
8
-
9
- private
10
-
11
- def license_property
12
- match = file.content.match LICENSE_REGEX
13
- match[1].downcase if match && match[1]
9
+ private
10
+ def license_property
11
+ match = @file.content.match LICENSE_REGEX
12
+ match[1].downcase if match && match[1]
13
+ end
14
14
  end
15
15
  end
16
16
  end
@@ -1,15 +1,15 @@
1
1
  class Licensee
2
- class NpmBowerMatcher < PackageMatcher
2
+ module Matchers
3
+ class NpmBower < Package
4
+ # While we could parse the package.json or bower.json file, prefer
5
+ # a lenient regex for speed and security. Moar parsing moar problems.
6
+ LICENSE_REGEX = /\s*[\"\']license[\"\']\s*\:\s*[\'\"]([a-z\-0-9\.]+)[\'\"],?\s*/i
3
7
 
4
- # While we could parse the package.json or bower.json file, prefer
5
- # a lenient regex for speed and security. Moar parsing moar problems.
6
- LICENSE_REGEX = /\s*[\"\']license[\"\']\s*\:\s*[\'\"]([a-z\-0-9\.]+)[\'\"],?\s*/i
7
-
8
- private
9
-
10
- def license_property
11
- match = file.content.match LICENSE_REGEX
12
- match[1].downcase if match && match[1]
8
+ private
9
+ def license_property
10
+ match = @file.content.match LICENSE_REGEX
11
+ match[1].downcase if match && match[1]
12
+ end
13
13
  end
14
14
  end
15
15
  end
@@ -1,16 +1,17 @@
1
1
  class Licensee
2
- class PackageMatcher < Matcher
2
+ module Matchers
3
+ class Package
4
+ def initialize(file)
5
+ @file = file
6
+ end
3
7
 
4
- def match
5
- Licensee.licenses(:hidden => true).find { |l| l.key == license_property } if file.package?
6
- end
7
-
8
- def confidence
9
- 90
10
- end
8
+ def match
9
+ Licensee.licenses(:hidden => true).find { |l| l.key == license_property }
10
+ end
11
11
 
12
- def self.package_manager?
13
- true
12
+ def confidence
13
+ 90
14
+ end
14
15
  end
15
16
  end
16
17
  end
@@ -1,59 +1,125 @@
1
+ require 'rugged'
2
+
1
3
  class Licensee
4
+ private
2
5
  class Project
3
- attr_reader :repository, :revision
4
-
5
- # Initializes a new project
6
- #
7
- # path_or_repo path to git repo or Rugged::Repository instance
8
- # revsion - revision ref, if any
9
- def initialize(path_or_repo, revision = nil)
10
- if path_or_repo.kind_of? Rugged::Repository
11
- @repository = path_or_repo
12
- else
13
- begin
14
- @repository = Rugged::Repository.new(path_or_repo)
15
- rescue Rugged::RepositoryError
16
- raise if revision
17
- @repository = FilesystemRepository.new(path_or_repo)
18
- end
19
- end
6
+ def initialize(detect_packages)
7
+ @detect_packages = detect_packages
8
+ end
20
9
 
21
- @revision = revision
10
+ def detect_packages?
11
+ @detect_packages
22
12
  end
23
13
 
24
14
  # Returns the matching Licensee::License instance if a license can be detected
25
15
  def license
26
- @license ||= matched_file.match if matched_file
16
+ @license ||= matched_file && matched_file.license
17
+ end
18
+
19
+ def matched_file
20
+ @matched_file ||= (license_file || package_file)
27
21
  end
28
22
 
29
23
  def license_file
30
24
  return @license_file if defined? @license_file
31
- @license_file = files.select { |f| f.license? }.sort_by { |f| f.license_score }.last
25
+ @license_file = begin
26
+ content, name = find_file { |name| LicenseFile.name_score(name) }
27
+ if content && name
28
+ LicenseFile.new(content, name)
29
+ end
30
+ end
32
31
  end
33
32
 
34
33
  def package_file
35
- return unless Licensee.package_manager_files?
34
+ return unless detect_packages?
36
35
  return @package_file if defined? @package_file
37
- @package_file = files.select { |f| f.package? }.sort_by { |f| f.package_score }.last
36
+ @package_file = begin
37
+ content, name = find_file { |name| PackageInfo.name_score(name) }
38
+ if content && name
39
+ PackageInfo.new(content, name)
40
+ end
41
+ end
38
42
  end
43
+ end
39
44
 
40
- def matched_file
41
- return license_file if license_file && license_file.match
42
- return package_file if package_file && package_file.match
45
+ public
46
+
47
+ # Git-based project
48
+ #
49
+ # analyze a given git repository for license information
50
+ class GitProject < Project
51
+ attr_reader :repository, :revision
52
+
53
+ class InvalidRepository < ArgumentError; end
54
+
55
+ def initialize(repo, revision: nil, detect_packages: false)
56
+ if repo.kind_of? Rugged::Repository
57
+ @repository = repo
58
+ else
59
+ @repository = Rugged::Repository.new(repo)
60
+ end
61
+
62
+ @revision = revision
63
+ super(detect_packages)
64
+ rescue Rugged::RepositoryError
65
+ raise InvalidRepository
43
66
  end
44
67
 
45
68
  private
46
-
47
69
  def commit
48
70
  @commit ||= revision ? repository.lookup(revision) : repository.last_commit
49
71
  end
50
72
 
51
- def tree
52
- @tree ||= commit.tree.select { |blob| blob[:type] == :blob }
73
+ MAX_LICENSE_SIZE = 64 * 1024
74
+
75
+ def load_blob_data(oid)
76
+ data, _ = Rugged::Blob.to_buffer(repository, oid, MAX_LICENSE_SIZE)
77
+ data
78
+ end
79
+
80
+ def find_file
81
+ files = commit.tree.map do |entry|
82
+ next unless entry[:type] == :blob
83
+ if (score = yield entry[:name]) > 0
84
+ { :name => entry[:name], :oid => entry[:oid], :score => score }
85
+ end
86
+ end.compact
87
+
88
+ return if files.empty?
89
+ files.sort! { |a, b| b[:score] <=> a[:score] }
90
+
91
+ f = files.first
92
+ [load_blob_data(f[:oid]), f[:name]]
53
93
  end
94
+ end
95
+
96
+ # Filesystem-based project
97
+ #
98
+ # Analyze a folder on the filesystem for license information
99
+ class FSProject < Project
100
+ attr_reader :path
101
+
102
+ def initialize(path, detect_packages: false)
103
+ @path = path
104
+ super(detect_packages)
105
+ end
106
+
107
+ private
108
+ def find_file
109
+ files = []
110
+
111
+ Dir.foreach(path) do |file|
112
+ next unless ::File.file?(::File.join(path, file))
113
+ if (score = yield file) > 0
114
+ files.push({ :name => file, :score => score })
115
+ end
116
+ end
117
+
118
+ return if files.empty?
119
+ files.sort! { |a, b| b[:score] <=> a[:score] }
54
120
 
55
- def files
56
- @files ||= tree.map { |blob| ProjectFile.new(repository.lookup(blob[:oid]), blob[:name]) }
121
+ f = files.first
122
+ [::File.read(::File.join(path, f[:name])), f[:name]]
57
123
  end
58
124
  end
59
125
  end
@@ -1,91 +1,51 @@
1
+ # encoding=utf-8
1
2
  class Licensee
2
- class ProjectFile
3
-
4
- # Note: File can be a license file (e.g., `LICENSE.txt`)
5
- # or a package manager file (e.g, `package.json`)
6
-
7
- attr_reader :blob, :path
8
- alias_method :filename, :path
9
-
10
- include Licensee::ContentHelper
11
-
12
- def initialize(blob, path)
13
- @blob = blob
14
- @path = path
15
- end
16
-
17
- # Raw file contents
18
- def content
19
- @contents ||= blob.content.force_encoding("UTF-8")
20
- end
21
- alias_method :to_s, :content
22
- alias_method :contents, :content
23
-
24
- # File content with all whitespace replaced with a single space
25
- def content_normalized
26
- @content_normalized ||= normalize_content(content)
27
- end
28
-
29
- # Determines which matching strategy to use, returns an instane of that matcher
30
- def matcher
31
- return @matcher if defined? @matcher
32
- @matcher = Licensee.matchers.map { |m| m.new(self) }.find { |m| m.match }
33
- end
3
+ class Project
4
+ private
5
+ class File
6
+ attr_reader :content, :filename
7
+
8
+ def initialize(content, filename = nil)
9
+ @content = content
10
+ @content.force_encoding(Encoding::UTF_8)
11
+ @filename = filename
12
+ end
34
13
 
35
- # Returns an Licensee::License instance of the matches license
36
- def match
37
- @match ||= matcher.match if matcher
38
- end
14
+ def matcher
15
+ @matcher ||= possible_matchers.map { |m| m.new(self) }.find { |m| m.match }
16
+ end
39
17
 
40
- # Returns the percent confident with the match
41
- def confidence
42
- @condience ||= matcher.confidence if matcher
43
- end
18
+ # Returns the percent confident with the match
19
+ def confidence
20
+ matcher && matcher.confidence
21
+ end
44
22
 
45
- def similarity(other)
46
- blob.hashsig(Rugged::Blob::HashSignature::WHITESPACE_SMART)
47
- other.hashsig && blob.hashsig ? blob.similarity(other.hashsig) : 0
48
- rescue Rugged::InvalidError
49
- 0
50
- end
23
+ def license
24
+ matcher && matcher.match
25
+ end
51
26
 
52
- def license_score
53
- self.class.license_score(filename)
27
+ alias_method :match, :license
28
+ alias_method :path, :filename
54
29
  end
55
30
 
56
- def license?
57
- license_score != 0.0
58
- end
31
+ public
32
+ class LicenseFile < File
33
+ include Licensee::ContentHelper
59
34
 
60
- def attribution
61
- return nil unless license?
62
- matches = /^#{CopyrightMatcher::REGEX}$/i.match(content)
63
- matches[0].strip if matches
64
- end
35
+ def possible_matchers
36
+ [Matchers::Copyright, Matchers::Exact, Matchers::Dice]
37
+ end
65
38
 
66
- def package_score
67
- return 1.0 if filename =~ /[a-zA-Z0-9\-_]+\.gemspec/
68
- return 1.0 if filename =~ /package\.json/
69
- return 0.75 if filename =~ /bower.json/
70
- return 0.0
71
- end
39
+ def wordset
40
+ @wordset ||= create_word_set(content)
41
+ end
72
42
 
73
- def package?
74
- Licensee.package_manager_files? && package_score != 0.0
75
- end
43
+ def attribution
44
+ matches = /^#{Matchers::Copyright::REGEX}$/i.match(content)
45
+ matches[0].strip if matches
46
+ end
76
47
 
77
- class << self
78
- # Scores a given file as a potential license
79
- #
80
- # filename - (string) the name of the file to score
81
- #
82
- # Returns 1.0 if the file is definitely a license file (e.g, LICENSE)
83
- # Returns 0.9 if the file is almost certainly a license file (e.g., LICENSE.md)
84
- # Returns 0.8 if the file is probably a license file (e.g., COPYING, COPYING.md)
85
- # Returns 0.7 if the file is potentially a license file (e.g., LICENSE.php)
86
- # Returns 0.5 if the file is likely a license file (MIT-LICENSE)
87
- # Returns 0.0 if the file is definitely not a license file (e.g., index.php)
88
- def license_score(filename)
48
+ def self.name_score(filename)
89
49
  return 1.0 if filename =~ /\A(un)?licen[sc]e\z/i
90
50
  return 0.9 if filename =~ /\A(un)?licen[sc]e\.(md|markdown|txt)\z/i
91
51
  return 0.8 if filename =~ /\Acopy(ing|right)(\.[^.]+)?\z/i
@@ -94,5 +54,25 @@ class Licensee
94
54
  return 0.0
95
55
  end
96
56
  end
57
+
58
+ class PackageInfo < File
59
+ def possible_matchers
60
+ case ::File.extname(filename)
61
+ when ".gemspec"
62
+ [Matchers::Gemspec]
63
+ when ".json"
64
+ [Matchers::NpmBower]
65
+ else
66
+ []
67
+ end
68
+ end
69
+
70
+ def self.name_score(filename)
71
+ return 1.0 if ::File.extname(filename) == ".gemspec"
72
+ return 1.0 if filename == "package.json"
73
+ return 0.75 if filename == "bower.json"
74
+ return 0.0
75
+ end
76
+ end
97
77
  end
98
78
  end
@@ -1,3 +1,3 @@
1
1
  class Licensee
2
- VERSION = "5.0.0"
2
+ VERSION = "6.0.0b1"
3
3
  end
data/licensee.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ require File.expand_path("../lib/licensee/version", __FILE__)
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.name = 'licensee'
5
+ gem.version = Licensee::VERSION
6
+
7
+ gem.summary = "A Ruby Gem to detect under what license a project is distributed"
8
+ gem.description = "Licensee automates the process of reading LICENSE files and compares their contents to known licenses using a fancy math thing called Rabin-Karp rolling-hashes."
9
+
10
+ gem.authors = ['Ben Balter']
11
+ gem.email = 'ben.balter@github.com'
12
+ gem.homepage = 'http://github.com/benbalter/licensee'
13
+ gem.license = "MIT"
14
+
15
+ gem.bindir = 'bin'
16
+ gem.executables << 'licensee'
17
+
18
+ gem.add_dependency('rugged', '~> 0.23')
19
+ gem.add_development_dependency('pry', '~> 0.9')
20
+ gem.add_development_dependency('shoulda', '~> 3.5')
21
+ gem.add_development_dependency('rake', '~> 10.3')
22
+ gem.add_development_dependency('ruby-prof', '~> 0.15')
23
+
24
+ # ensure the gem is built out of versioned files
25
+ gem.files = Dir['Rakefile', '{bin,lib,man,test,vendor,spec}/**/*', 'README*', 'LICENSE*'] & `git ls-files -z`.split("\0")
26
+ end
@@ -0,0 +1 @@
1
+ ref: refs/heads/master
@@ -0,0 +1,4 @@
1
+ [core]
2
+ repositoryformatversion = 0
3
+ filemode = true
4
+ bare = true
@@ -0,0 +1,2 @@
1
+ P pack-03c0879445cabcc37f91d97c7955465adef26f4a.pack
2
+
@@ -0,0 +1,2 @@
1
+ # pack-refs with: peeled fully-peeled
2
+ 0a3da38fcc340e45990520699bc3ec5570e60c11 refs/heads/master