spandx 0.12.3 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -25
- data/README.md +11 -7
- data/exe/spandx +1 -2
- data/ext/spandx/extconf.rb +5 -0
- data/lib/spandx.rb +6 -3
- data/lib/spandx/cli.rb +1 -0
- data/lib/spandx/cli/commands/build.rb +13 -2
- data/lib/spandx/cli/commands/scan.rb +31 -25
- data/lib/spandx/cli/main.rb +1 -0
- data/lib/spandx/core/cache.rb +38 -51
- data/lib/spandx/core/concurrent.rb +40 -0
- data/lib/spandx/core/content.rb +5 -23
- data/lib/spandx/core/data_file.rb +66 -0
- data/lib/spandx/core/git.rb +8 -30
- data/lib/spandx/core/guess.rb +37 -40
- data/lib/spandx/core/http.rb +6 -1
- data/lib/spandx/core/index_file.rb +101 -0
- data/lib/spandx/core/license_plugin.rb +3 -3
- data/lib/spandx/core/line_io.rb +23 -0
- data/lib/spandx/core/path_traversal.rb +44 -0
- data/lib/spandx/core/relation.rb +38 -0
- data/lib/spandx/core/thread_pool.rb +15 -4
- data/lib/spandx/dotnet/index.rb +21 -79
- data/lib/spandx/java/index.rb +5 -2
- data/lib/spandx/python/index.rb +4 -33
- data/lib/spandx/spdx/catalogue.rb +4 -0
- data/lib/spandx/spdx/composite_license.rb +60 -0
- data/lib/spandx/spdx/expression.rb +114 -0
- data/lib/spandx/spdx/license.rb +4 -14
- data/lib/spandx/version.rb +1 -1
- data/spandx.gemspec +13 -10
- metadata +70 -27
- data/lib/spandx/core/null_gateway.rb +0 -11
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # Streams the results of work items that are processed on a pool.
    #
    # `Concurrent.map` schedules one job per item on the supplied pool. Each
    # job pushes its return value onto a shared queue, and the returned object
    # pops exactly `size` values off that queue when enumerated. Results are
    # therefore yielded in the order they were enqueued, which is not
    # necessarily the order of `items`.
    #
    # NOTE: enumeration drains the queue, so results can be consumed only once.
    # NOTE: if a job raises before enqueueing its result, fewer than `size`
    # values ever arrive and `each` blocks waiting for the missing ones.
    class Concurrent
      include Enumerable

      # Schedules `block` against every item and returns a streaming result.
      #
      # @param items [#each] the work items
      # @param pool [#schedule] receives `[item, block]` plus the job to run
      # @yieldparam item one element of `items`
      # @return [Concurrent] enumerable over the block results
      def self.map(items, pool:, &block)
        queue = Queue.new
        scheduled = 0

        items.each do |item|
          pool.schedule([item, block]) do |marshalled_item, callable|
            queue.enq(callable.call(marshalled_item))
          end
          # Count what was really scheduled; `items.size` is nil for
          # enumerators that do not know their length.
          scheduled += 1
        end

        new(queue, scheduled)
      end

      attr_reader :queue, :size

      # @param queue [Queue] queue the scheduled jobs push their results onto
      # @param size [Integer] number of results that will be pushed
      def initialize(queue, size)
        @queue = queue
        @size = size
      end

      # Yields each result, blocking until the next one is available.
      #
      # @return [Integer, Enumerator] `size`, or an Enumerator without a block
      def each
        return enum_for(:each) { size } unless block_given?

        size.times { yield queue.deq }
      end

      # Without arguments, wraps `each` in a fresh Enumerator. With arguments
      # it behaves like `Object#to_enum` so calls such as
      # `to_enum(:each_slice, 2)` keep working.
      #
      # @return [Enumerator]
      def to_enum(*args, &size_fn)
        return super unless args.empty?

        Enumerator.new do |yielder|
          each { |item| yielder.yield item }
        end
      end
    end
  end
end
|
data/lib/spandx/core/content.rb
CHANGED
@@ -13,30 +13,12 @@ module Spandx
|
|
13
13
|
@tokens ||= tokenize(canonicalize(raw)).to_set
|
14
14
|
end
|
15
15
|
|
16
|
-
def similar?(other,
|
17
|
-
|
18
|
-
when :dice_coefficient
|
19
|
-
similarity_score(other, algorithm: algorithm) > 89.0
|
20
|
-
when :levenshtein
|
21
|
-
similarity_score(other, algorithm: algorithm) < 3
|
22
|
-
when :jaro_winkler
|
23
|
-
similarity_score(other, algorithm: algorithm) > 89.0
|
24
|
-
end
|
16
|
+
def similar?(other, threshold: 89.0)
|
17
|
+
similarity_score(other) > threshold
|
25
18
|
end
|
26
19
|
|
27
|
-
def similarity_score(other
|
28
|
-
|
29
|
-
when :dice_coefficient
|
30
|
-
dice_coefficient(other)
|
31
|
-
when :levenshtein
|
32
|
-
require 'text'
|
33
|
-
|
34
|
-
Text::Levenshtein.distance(raw, other.raw, 100)
|
35
|
-
when :jaro_winkler
|
36
|
-
require 'jaro_winkler'
|
37
|
-
|
38
|
-
JaroWinkler.distance(raw, other.raw) * 100.0
|
39
|
-
end
|
20
|
+
def similarity_score(other)
|
21
|
+
dice_coefficient(other)
|
40
22
|
end
|
41
23
|
|
42
24
|
private
|
@@ -46,7 +28,7 @@ module Spandx
|
|
46
28
|
end
|
47
29
|
|
48
30
|
def tokenize(content)
|
49
|
-
content.to_s.scan(/[a-zA-Z]+/)
|
31
|
+
content.to_s.scan(/[a-zA-Z\d.]+/)
|
50
32
|
end
|
51
33
|
|
52
34
|
def blank?(content)
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # A CSV-backed data file of `name, version, licenses` rows, paired with an
    # IndexFile that is used to look rows up.
    class DataFile
      include Enumerable

      attr_reader :absolute_path

      # Wraps the path and makes sure its parent directory exists.
      #
      # @param absolute_path [String, Pathname] location of the data file
      def initialize(absolute_path)
        @absolute_path = Pathname.new(absolute_path)
        FileUtils.mkdir_p(@absolute_path.dirname)
      end

      # Yields every line of the file parsed by `CsvParser`.
      #
      # @return [Enumerator] when called without a block
      # @return [nil] when the file does not exist
      def each
        return enum_for(:each) unless block_given?
        return unless exist?

        open_file(mode: 'rb') do |io|
          while (line = io.gets)
            yield CsvParser.parse(line)
          end
        end
      end

      # Looks up the row for a package through the index.
      #
      # @param name [String] package name
      # @param version [String] package version
      # @return the matching row as returned by the index, or nil when either
      #   argument is blank, the file is missing, or nothing matches
      def search(name:, version:)
        # Both halves of the key must be present. The original guard checked
        # `name.empty?` twice and let an empty version through.
        return if blank?(name) || blank?(version)
        return unless exist?

        term = "#{name}-#{version}"
        index.search { |row| term <=> "#{row[0]}-#{row[1]}" }
      end

      # Appends one row; the licenses are joined with `-|-` into one column.
      #
      # @param name [String] package name
      # @param version [String] package version
      # @param licenses [Array<String>] license identifiers
      # @return [nil] without writing when name or version is blank
      def insert(name, version, licenses)
        return if blank?(name) || blank?(version)

        open_file(mode: 'a') do |io|
          io.write(to_csv([name, version, licenses.join('-|-')]))
        end
      end

      # @return [Boolean] whether the data file is present on disk
      def exist?
        absolute_path.exist?
      end

      # Opens the file and yields the IO. A missing file is logged through
      # `Spandx.logger` and reported as nil instead of raising.
      #
      # @param mode [String] mode passed to `Pathname#open`
      # @return the block's value, or nil when the file does not exist
      def open_file(mode: 'rb')
        absolute_path.open(mode) { |io| yield io }
      rescue Errno::ENOENT => error
        Spandx.logger.error(error)
        nil
      end

      # @return [IndexFile] memoized index for this data file
      def index
        @index ||= IndexFile.new(self)
      end

      private

      # True for nil and for anything that reports itself empty.
      def blank?(value)
        value.nil? || value.empty?
      end

      def to_csv(array)
        array.to_csv(force_quotes: true)
      end
    end
  end
end
|
data/lib/spandx/core/git.rb
CHANGED
@@ -3,37 +3,21 @@
|
|
3
3
|
module Spandx
|
4
4
|
module Core
|
5
5
|
class Git
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :root, :url
|
7
7
|
|
8
8
|
def initialize(url:)
|
9
9
|
@url = url
|
10
|
-
@
|
11
|
-
end
|
12
|
-
|
13
|
-
def update!
|
14
|
-
dotgit? ? pull! : clone!
|
15
|
-
end
|
16
|
-
|
17
|
-
def expand_path(relative_path)
|
18
|
-
File.join(path, relative_path)
|
10
|
+
@root = path_for(url)
|
19
11
|
end
|
20
12
|
|
21
13
|
def read(path)
|
22
|
-
|
14
|
+
full_path = File.join(root, path)
|
23
15
|
|
24
|
-
full_path = expand_path(path)
|
25
16
|
IO.read(full_path) if File.exist?(full_path)
|
26
17
|
end
|
27
18
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
full_path = expand_path(path)
|
32
|
-
return unless File.exist?(full_path)
|
33
|
-
|
34
|
-
File.open(full_path, mode) do |io|
|
35
|
-
yield io
|
36
|
-
end
|
19
|
+
def update!
|
20
|
+
dotgit? ? pull! : clone!
|
37
21
|
end
|
38
22
|
|
39
23
|
private
|
@@ -45,24 +29,18 @@ module Spandx
|
|
45
29
|
end
|
46
30
|
|
47
31
|
def dotgit?
|
48
|
-
File.directory?(File.join(
|
32
|
+
File.directory?(File.join(root, '.git'))
|
49
33
|
end
|
50
34
|
|
51
35
|
def clone!
|
52
|
-
system('git', 'clone', '--quiet', url,
|
36
|
+
system('git', 'clone', '--quiet', '--depth=1', '--single-branch', '--branch', 'master', url, root)
|
53
37
|
end
|
54
38
|
|
55
39
|
def pull!
|
56
|
-
|
40
|
+
Dir.chdir(root) do
|
57
41
|
system('git', 'pull', '--no-rebase', '--quiet', 'origin', 'master')
|
58
42
|
end
|
59
43
|
end
|
60
|
-
|
61
|
-
def within
|
62
|
-
Dir.chdir(path) do
|
63
|
-
yield
|
64
|
-
end
|
65
|
-
end
|
66
44
|
end
|
67
45
|
|
68
46
|
Database = Git
|
data/lib/spandx/core/guess.rb
CHANGED
@@ -9,80 +9,77 @@ module Spandx
|
|
9
9
|
@catalogue = catalogue
|
10
10
|
end
|
11
11
|
|
12
|
-
def license_for(raw
|
13
|
-
raw.is_a?(Hash) ? from_hash(raw
|
12
|
+
def license_for(raw)
|
13
|
+
raw.is_a?(Hash) ? from_hash(raw) : from_string(raw)
|
14
14
|
end
|
15
15
|
|
16
16
|
private
|
17
17
|
|
18
|
-
def from_hash(hash
|
19
|
-
from_string(hash[:name]
|
20
|
-
from_url(hash[:url]
|
18
|
+
def from_hash(hash)
|
19
|
+
from_string(hash[:name]) ||
|
20
|
+
from_url(hash[:url]) ||
|
21
21
|
unknown(hash[:name] || hash[:url])
|
22
22
|
end
|
23
23
|
|
24
|
-
def from_string(raw
|
24
|
+
def from_string(raw)
|
25
|
+
return if raw.nil?
|
26
|
+
|
25
27
|
content = Content.new(raw)
|
26
28
|
|
27
29
|
catalogue[raw] ||
|
28
|
-
|
29
|
-
|
30
|
+
catalogue[raw.split(' ').join('-')] ||
|
31
|
+
match_name(content) ||
|
32
|
+
match_body(content) ||
|
30
33
|
unknown(raw)
|
31
34
|
end
|
32
35
|
|
33
|
-
def from_url(url
|
36
|
+
def from_url(url)
|
34
37
|
return if url.nil? || url.empty?
|
35
38
|
|
36
39
|
response = Spandx.http.get(url)
|
37
40
|
return unless Spandx.http.ok?(response)
|
38
41
|
|
39
|
-
license_for(response.body
|
42
|
+
license_for(response.body)
|
40
43
|
end
|
41
44
|
|
42
|
-
def match_name(content
|
45
|
+
def match_name(content)
|
46
|
+
return if content.tokens.size < 2 || content.tokens.size > 10
|
47
|
+
|
48
|
+
result = from_expression(content)
|
49
|
+
return result if result
|
50
|
+
|
51
|
+
threshold = 85.0
|
43
52
|
catalogue.find do |license|
|
44
|
-
|
45
|
-
score > 85
|
53
|
+
content.similar?(Content.new(license.name), threshold: threshold)
|
46
54
|
end
|
47
55
|
end
|
48
56
|
|
49
|
-
def match_body(content
|
57
|
+
def match_body(content)
|
50
58
|
score = Score.new(nil, nil)
|
51
|
-
threshold =
|
52
|
-
direction = algorithm == :levenshtein ? method(:min) : method(:max)
|
53
|
-
|
59
|
+
threshold = 89.0
|
54
60
|
catalogue.each do |license|
|
55
|
-
|
61
|
+
next if license.deprecated_license_id?
|
62
|
+
|
63
|
+
percentage = content.similarity_score(content_for(license))
|
64
|
+
next if percentage < threshold
|
65
|
+
next if score.score >= percentage
|
66
|
+
|
67
|
+
score.update(percentage, license)
|
56
68
|
end
|
57
69
|
score&.item
|
58
70
|
end
|
59
71
|
|
60
|
-
def
|
61
|
-
::Spandx::
|
62
|
-
end
|
63
|
-
|
64
|
-
def threshold_for(algorithm)
|
65
|
-
{
|
66
|
-
dice_coefficient: 89.0,
|
67
|
-
jaro_winkler: 80.0,
|
68
|
-
levenshtein: 80.0,
|
69
|
-
}[algorithm.to_sym]
|
72
|
+
def content_for(license)
|
73
|
+
::Spandx::Core::Content.new(Spandx.git[:spdx].read("text/#{license.id}.txt") || '')
|
70
74
|
end
|
71
75
|
|
72
|
-
def
|
73
|
-
|
74
|
-
return if percentage > threshold
|
75
|
-
return if score.score > 0.0 && score.score < percentage
|
76
|
-
|
77
|
-
score.update(percentage, other)
|
76
|
+
def unknown(text)
|
77
|
+
::Spandx::Spdx::License.unknown(text)
|
78
78
|
end
|
79
79
|
|
80
|
-
def
|
81
|
-
|
82
|
-
|
83
|
-
return if score.score >= percentage
|
84
|
-
|
85
|
-
score.update(percentage, other)
|
80
|
+
def from_expression(content)
|
81
|
+
Spandx::Spdx::CompositeLicense
|
82
|
+
.from_expression(content.raw, catalogue)
|
86
83
|
end
|
87
84
|
end
|
88
85
|
end
|
data/lib/spandx/core/http.rb
CHANGED
@@ -8,7 +8,12 @@ module Spandx
|
|
8
8
|
def initialize(driver: Http.default_driver, retries: 3)
|
9
9
|
@driver = driver
|
10
10
|
@retries = retries
|
11
|
-
|
11
|
+
semaphore = Mutex.new
|
12
|
+
@circuits = Hash.new do |hash, key|
|
13
|
+
semaphore.synchronize do
|
14
|
+
hash[key] = Circuit.new(key)
|
15
|
+
end
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
14
19
|
def get(uri, default: nil, escape: true)
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # A sidecar `<data file>.idx` holding one little-endian unsigned 32-bit
    # byte offset per line of the data file. It lets `search` binary-search
    # the data file without reading it sequentially.
    class IndexFile
      UINT_32_DIRECTIVE = 'V'
      UINT_32_SIZE = 4

      attr_reader :data_file, :path

      # @param data_file [#absolute_path, #exist?, #open_file] the indexed file
      def initialize(data_file)
        @data_file = data_file
        @path = Pathname.new("#{data_file.absolute_path}.idx")
        @entries = Array.new(size)
      end

      # Yields the byte offset of every indexed row, in row order.
      #
      # @return [Integer, Enumerator] row count, or an Enumerator without a block
      def each
        return enum_for(:each) { size } unless block_given?

        # `size` is 0 when the index has not been built yet; asking the path
        # for its size directly would raise on a missing file.
        size.times { |n| yield position_for(n) }
      end

      # Binary search over the rows in `[min, max)`.
      #
      # @yieldparam row the row read at the probed position
      # @yieldreturn [Integer] 0 on a match, positive when the target sorts
      #   after `row`, negative when it sorts before it
      # @return the matching row, or nil when there is none
      def search(min: 0, max: size)
        scan do |reader|
          until min >= max
            mid = mid_for(min, max)
            row = reader.row(mid)
            comparison = yield row
            return row if comparison.zero?

            comparison.positive? ? (min = mid + 1) : (max = mid)
          end
        end
      end

      # @return [Integer] number of offsets stored in the index (0 if missing)
      def size
        path.exist? ? path.size / UINT_32_SIZE : 0
      end

      # Byte offset in the data file at which the given row starts.
      #
      # @param row_number [Integer] zero-based row number
      # @return [Integer, nil] nil when the row number is out of range
      def position_for(row_number)
        # Valid rows are 0...size. Reading at `size` would hit EOF and return
        # nil bytes, which cannot be unpacked.
        return unless row_number.between?(0, size - 1)

        entries[row_number] ||= path
          .binread(UINT_32_SIZE, offset_for(row_number))
          .unpack1(UINT_32_DIRECTIVE)
      end

      # Sorts and de-duplicates the data file, then rewrites the index.
      # Does nothing when the data file does not exist.
      def update!
        return unless data_file.exist?

        sort(data_file)
        rebuild_index!
      end

      private

      attr_reader :entries

      def scan
        data_file.open_file(mode: 'rb') do |io|
          yield Relation.new(io, self)
        end
      end

      def offset_for(row_number)
        row_number * UINT_32_SIZE
      end

      # Rewrites the data file with its lines sorted and de-duplicated.
      def sort(data_file)
        data_file.absolute_path.write(data_file.absolute_path.readlines.sort.uniq.join)
      end

      def rebuild_index!
        positions = data_file.open_file do |data_io|
          File.open(path, mode: 'wb') do |index_io|
            lines_in(data_io).each do |pos|
              index_io.write([pos].pack(UINT_32_DIRECTIVE))
            end
          end
        end
        # Offsets cached before the rebuild describe the old file layout.
        @entries = Array.new(size)
        positions
      end

      # Collects the starting byte offset of every line in `io`.
      def lines_in(io)
        io.seek(0)
        offsets = [0]
        offsets << io.pos while io.gets
        # The last entry is the EOF position (or the lone 0 of an empty file);
        # neither marks the start of a line.
        offsets.pop
        offsets
      end

      def mid_for(min, max)
        min + ((max - min) / 2)
      end
    end
  end
end
|