spandx 0.12.3 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -25
- data/README.md +11 -7
- data/exe/spandx +1 -2
- data/ext/spandx/extconf.rb +5 -0
- data/lib/spandx.rb +6 -3
- data/lib/spandx/cli.rb +1 -0
- data/lib/spandx/cli/commands/build.rb +13 -2
- data/lib/spandx/cli/commands/scan.rb +31 -25
- data/lib/spandx/cli/main.rb +1 -0
- data/lib/spandx/core/cache.rb +38 -51
- data/lib/spandx/core/concurrent.rb +40 -0
- data/lib/spandx/core/content.rb +5 -23
- data/lib/spandx/core/data_file.rb +66 -0
- data/lib/spandx/core/git.rb +8 -30
- data/lib/spandx/core/guess.rb +37 -40
- data/lib/spandx/core/http.rb +6 -1
- data/lib/spandx/core/index_file.rb +101 -0
- data/lib/spandx/core/license_plugin.rb +3 -3
- data/lib/spandx/core/line_io.rb +23 -0
- data/lib/spandx/core/path_traversal.rb +44 -0
- data/lib/spandx/core/relation.rb +38 -0
- data/lib/spandx/core/thread_pool.rb +15 -4
- data/lib/spandx/dotnet/index.rb +21 -79
- data/lib/spandx/java/index.rb +5 -2
- data/lib/spandx/python/index.rb +4 -33
- data/lib/spandx/spdx/catalogue.rb +4 -0
- data/lib/spandx/spdx/composite_license.rb +60 -0
- data/lib/spandx/spdx/expression.rb +114 -0
- data/lib/spandx/spdx/license.rb +4 -14
- data/lib/spandx/version.rb +1 -1
- data/spandx.gemspec +13 -10
- metadata +70 -27
- data/lib/spandx/core/null_gateway.rb +0 -11
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # Maps a block over a collection using a thread pool and exposes the
    # results as an Enumerable.
    #
    # Results are pushed onto a queue by each scheduled job and are yielded in
    # the order they were enqueued, which is not necessarily the order of the
    # input items.
    class Concurrent
      include Enumerable

      # Schedules one job per item on +pool+; each job calls +block+ with the
      # item and enqueues the return value.
      #
      # @param items [#each, #size] the collection to process
      # @param pool [#schedule] receives the job arguments and the job block
      # @return [Concurrent] a collection expecting +items.size+ results
      #
      # NOTE(review): when +block+ raises, nothing is enqueued for that item,
      # so #each would wait on the queue for a result that never arrives
      # unless the pool handles the failure - confirm against the pool.
      def self.map(items, pool:, &block)
        queue = Queue.new

        items.each do |item|
          pool.schedule([item, block]) do |marshalled_item, callable|
            queue.enq(callable.call(marshalled_item))
          end
        end

        new(queue, items.size)
      end

      attr_reader :queue, :size

      # @param queue [Queue] the queue the results are read from
      # @param size [Integer] the number of results to read from +queue+
      def initialize(queue, size)
        @queue = queue
        @size = size
      end

      # Yields +size+ results, blocking on the queue until each is available.
      # Returns an Enumerator when no block is given instead of raising
      # LocalJumpError after a result has already been dequeued.
      def each
        return to_enum unless block_given?

        size.times { yield queue.deq }
      end

      # @return [Enumerator] an enumerator that dequeues results on demand
      def to_enum
        Enumerator.new do |yielder|
          each { |item| yielder.yield item }
        end
      end
    end
  end
end
|
data/lib/spandx/core/content.rb
CHANGED
@@ -13,30 +13,12 @@ module Spandx
|
|
13
13
|
@tokens ||= tokenize(canonicalize(raw)).to_set
|
14
14
|
end
|
15
15
|
|
16
|
-
def similar?(other,
|
17
|
-
|
18
|
-
when :dice_coefficient
|
19
|
-
similarity_score(other, algorithm: algorithm) > 89.0
|
20
|
-
when :levenshtein
|
21
|
-
similarity_score(other, algorithm: algorithm) < 3
|
22
|
-
when :jaro_winkler
|
23
|
-
similarity_score(other, algorithm: algorithm) > 89.0
|
24
|
-
end
|
16
|
+
def similar?(other, threshold: 89.0)
|
17
|
+
similarity_score(other) > threshold
|
25
18
|
end
|
26
19
|
|
27
|
-
def similarity_score(other
|
28
|
-
|
29
|
-
when :dice_coefficient
|
30
|
-
dice_coefficient(other)
|
31
|
-
when :levenshtein
|
32
|
-
require 'text'
|
33
|
-
|
34
|
-
Text::Levenshtein.distance(raw, other.raw, 100)
|
35
|
-
when :jaro_winkler
|
36
|
-
require 'jaro_winkler'
|
37
|
-
|
38
|
-
JaroWinkler.distance(raw, other.raw) * 100.0
|
39
|
-
end
|
20
|
+
def similarity_score(other)
|
21
|
+
dice_coefficient(other)
|
40
22
|
end
|
41
23
|
|
42
24
|
private
|
@@ -46,7 +28,7 @@ module Spandx
|
|
46
28
|
end
|
47
29
|
|
48
30
|
def tokenize(content)
|
49
|
-
content.to_s.scan(/[a-zA-Z]+/)
|
31
|
+
content.to_s.scan(/[a-zA-Z\d.]+/)
|
50
32
|
end
|
51
33
|
|
52
34
|
def blank?(content)
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # A CSV file of rows written as (name, version, licenses), with a
    # companion IndexFile used for lookups.
    class DataFile
      include Enumerable

      attr_reader :absolute_path

      # Wraps the path and creates its parent directory if needed.
      #
      # @param absolute_path [String, Pathname] location of the data file
      def initialize(absolute_path)
        @absolute_path = Pathname.new(absolute_path)
        FileUtils.mkdir_p(@absolute_path.dirname)
      end

      # Yields every line of the file after passing it through
      # CsvParser.parse. Yields nothing when the file does not exist and
      # returns an Enumerator when no block is given.
      def each
        return to_enum(:each) unless block_given?
        return unless exist?

        open_file(mode: 'rb') do |io|
          while (line = io.gets)
            yield CsvParser.parse(line)
          end
        end
      end

      # Looks up the row for "name-version" through the index.
      #
      # @param name [String, nil]
      # @param version [String, nil]
      # @return the row returned by IndexFile#search, or nil when either
      #   argument is blank, the file is missing, or nothing matches
      def search(name:, version:)
        # The version guard previously re-tested `name.empty?`, letting an
        # empty version through to the index.
        return if blank?(name) || blank?(version)
        return unless exist?

        term = "#{name}-#{version}"
        index.search do |row|
          term <=> "#{row[0]}-#{row[1]}"
        end
      end

      # Appends one row to the file; the licenses are joined with '-|-' into a
      # single column. Does nothing when name or version is blank.
      #
      # @param name [String, nil]
      # @param version [String, nil]
      # @param licenses [Array<String>]
      def insert(name, version, licenses)
        return if blank?(name) || blank?(version)

        open_file(mode: 'a') do |io|
          io.write(to_csv([name, version, licenses.join('-|-')]))
        end
      end

      # @return [Boolean] whether the data file is present on disk
      def exist?
        absolute_path.exist?
      end

      # Opens the file in +mode+ and yields the IO. A missing file is logged
      # and results in nil rather than an exception.
      def open_file(mode: 'rb')
        absolute_path.open(mode) { |io| yield io }
      rescue Errno::ENOENT => error
        Spandx.logger.error(error)
        nil
      end

      # @return [IndexFile] the memoized index for this data file
      def index
        @index ||= IndexFile.new(self)
      end

      private

      # True for nil and for values that report themselves as empty.
      def blank?(value)
        value.nil? || value.empty?
      end

      # Serializes a row with every field quoted.
      def to_csv(array)
        array.to_csv(force_quotes: true)
      end
    end
  end
end
|
data/lib/spandx/core/git.rb
CHANGED
@@ -3,37 +3,21 @@
|
|
3
3
|
module Spandx
|
4
4
|
module Core
|
5
5
|
class Git
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :root, :url
|
7
7
|
|
8
8
|
def initialize(url:)
|
9
9
|
@url = url
|
10
|
-
@
|
11
|
-
end
|
12
|
-
|
13
|
-
def update!
|
14
|
-
dotgit? ? pull! : clone!
|
15
|
-
end
|
16
|
-
|
17
|
-
def expand_path(relative_path)
|
18
|
-
File.join(path, relative_path)
|
10
|
+
@root = path_for(url)
|
19
11
|
end
|
20
12
|
|
21
13
|
def read(path)
|
22
|
-
|
14
|
+
full_path = File.join(root, path)
|
23
15
|
|
24
|
-
full_path = expand_path(path)
|
25
16
|
IO.read(full_path) if File.exist?(full_path)
|
26
17
|
end
|
27
18
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
full_path = expand_path(path)
|
32
|
-
return unless File.exist?(full_path)
|
33
|
-
|
34
|
-
File.open(full_path, mode) do |io|
|
35
|
-
yield io
|
36
|
-
end
|
19
|
+
def update!
|
20
|
+
dotgit? ? pull! : clone!
|
37
21
|
end
|
38
22
|
|
39
23
|
private
|
@@ -45,24 +29,18 @@ module Spandx
|
|
45
29
|
end
|
46
30
|
|
47
31
|
def dotgit?
|
48
|
-
File.directory?(File.join(
|
32
|
+
File.directory?(File.join(root, '.git'))
|
49
33
|
end
|
50
34
|
|
51
35
|
def clone!
|
52
|
-
system('git', 'clone', '--quiet', url,
|
36
|
+
system('git', 'clone', '--quiet', '--depth=1', '--single-branch', '--branch', 'master', url, root)
|
53
37
|
end
|
54
38
|
|
55
39
|
def pull!
|
56
|
-
|
40
|
+
Dir.chdir(root) do
|
57
41
|
system('git', 'pull', '--no-rebase', '--quiet', 'origin', 'master')
|
58
42
|
end
|
59
43
|
end
|
60
|
-
|
61
|
-
def within
|
62
|
-
Dir.chdir(path) do
|
63
|
-
yield
|
64
|
-
end
|
65
|
-
end
|
66
44
|
end
|
67
45
|
|
68
46
|
Database = Git
|
data/lib/spandx/core/guess.rb
CHANGED
@@ -9,80 +9,77 @@ module Spandx
|
|
9
9
|
@catalogue = catalogue
|
10
10
|
end
|
11
11
|
|
12
|
-
def license_for(raw
|
13
|
-
raw.is_a?(Hash) ? from_hash(raw
|
12
|
+
def license_for(raw)
|
13
|
+
raw.is_a?(Hash) ? from_hash(raw) : from_string(raw)
|
14
14
|
end
|
15
15
|
|
16
16
|
private
|
17
17
|
|
18
|
-
def from_hash(hash
|
19
|
-
from_string(hash[:name]
|
20
|
-
from_url(hash[:url]
|
18
|
+
def from_hash(hash)
|
19
|
+
from_string(hash[:name]) ||
|
20
|
+
from_url(hash[:url]) ||
|
21
21
|
unknown(hash[:name] || hash[:url])
|
22
22
|
end
|
23
23
|
|
24
|
-
def from_string(raw
|
24
|
+
def from_string(raw)
|
25
|
+
return if raw.nil?
|
26
|
+
|
25
27
|
content = Content.new(raw)
|
26
28
|
|
27
29
|
catalogue[raw] ||
|
28
|
-
|
29
|
-
|
30
|
+
catalogue[raw.split(' ').join('-')] ||
|
31
|
+
match_name(content) ||
|
32
|
+
match_body(content) ||
|
30
33
|
unknown(raw)
|
31
34
|
end
|
32
35
|
|
33
|
-
def from_url(url
|
36
|
+
def from_url(url)
|
34
37
|
return if url.nil? || url.empty?
|
35
38
|
|
36
39
|
response = Spandx.http.get(url)
|
37
40
|
return unless Spandx.http.ok?(response)
|
38
41
|
|
39
|
-
license_for(response.body
|
42
|
+
license_for(response.body)
|
40
43
|
end
|
41
44
|
|
42
|
-
def match_name(content
|
45
|
+
def match_name(content)
|
46
|
+
return if content.tokens.size < 2 || content.tokens.size > 10
|
47
|
+
|
48
|
+
result = from_expression(content)
|
49
|
+
return result if result
|
50
|
+
|
51
|
+
threshold = 85.0
|
43
52
|
catalogue.find do |license|
|
44
|
-
|
45
|
-
score > 85
|
53
|
+
content.similar?(Content.new(license.name), threshold: threshold)
|
46
54
|
end
|
47
55
|
end
|
48
56
|
|
49
|
-
def match_body(content
|
57
|
+
def match_body(content)
|
50
58
|
score = Score.new(nil, nil)
|
51
|
-
threshold =
|
52
|
-
direction = algorithm == :levenshtein ? method(:min) : method(:max)
|
53
|
-
|
59
|
+
threshold = 89.0
|
54
60
|
catalogue.each do |license|
|
55
|
-
|
61
|
+
next if license.deprecated_license_id?
|
62
|
+
|
63
|
+
percentage = content.similarity_score(content_for(license))
|
64
|
+
next if percentage < threshold
|
65
|
+
next if score.score >= percentage
|
66
|
+
|
67
|
+
score.update(percentage, license)
|
56
68
|
end
|
57
69
|
score&.item
|
58
70
|
end
|
59
71
|
|
60
|
-
def
|
61
|
-
::Spandx::
|
62
|
-
end
|
63
|
-
|
64
|
-
def threshold_for(algorithm)
|
65
|
-
{
|
66
|
-
dice_coefficient: 89.0,
|
67
|
-
jaro_winkler: 80.0,
|
68
|
-
levenshtein: 80.0,
|
69
|
-
}[algorithm.to_sym]
|
72
|
+
def content_for(license)
|
73
|
+
::Spandx::Core::Content.new(Spandx.git[:spdx].read("text/#{license.id}.txt") || '')
|
70
74
|
end
|
71
75
|
|
72
|
-
def
|
73
|
-
|
74
|
-
return if percentage > threshold
|
75
|
-
return if score.score > 0.0 && score.score < percentage
|
76
|
-
|
77
|
-
score.update(percentage, other)
|
76
|
+
def unknown(text)
|
77
|
+
::Spandx::Spdx::License.unknown(text)
|
78
78
|
end
|
79
79
|
|
80
|
-
def
|
81
|
-
|
82
|
-
|
83
|
-
return if score.score >= percentage
|
84
|
-
|
85
|
-
score.update(percentage, other)
|
80
|
+
def from_expression(content)
|
81
|
+
Spandx::Spdx::CompositeLicense
|
82
|
+
.from_expression(content.raw, catalogue)
|
86
83
|
end
|
87
84
|
end
|
88
85
|
end
|
data/lib/spandx/core/http.rb
CHANGED
@@ -8,7 +8,12 @@ module Spandx
|
|
8
8
|
def initialize(driver: Http.default_driver, retries: 3)
|
9
9
|
@driver = driver
|
10
10
|
@retries = retries
|
11
|
-
|
11
|
+
semaphore = Mutex.new
|
12
|
+
@circuits = Hash.new do |hash, key|
|
13
|
+
semaphore.synchronize do
|
14
|
+
hash[key] = Circuit.new(key)
|
15
|
+
end
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
14
19
|
def get(uri, default: nil, escape: true)
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Spandx
  module Core
    # An index stored next to a data file as "<data file path>.idx". It holds
    # one 32-bit unsigned little-endian integer per data row: the byte offset
    # at which that row starts in the data file.
    class IndexFile
      UINT_32_DIRECTIVE = 'V'
      UINT_32_SIZE = 4

      attr_reader :data_file, :path

      # @param data_file [#absolute_path, #exist?, #open_file] the indexed file
      def initialize(data_file)
        @data_file = data_file
        @path = Pathname.new("#{data_file.absolute_path}.idx")
        @entries = Array.new(size)
      end

      # Yields the data file offset of every indexed row, in index order.
      # Yields nothing when the index file is missing and returns an
      # Enumerator when no block is given.
      def each
        return enum_for(:each) { size } unless block_given?

        size.times { |n| yield position_for(n) }
      end

      # Binary search over the rows in the half-open range [min, max).
      #
      # The block receives a row and returns a comparison result: zero for a
      # match, positive when the target sorts after the row, negative when it
      # sorts before.
      #
      # @return the matching row, or nil when no row matches
      def search(min: 0, max: size)
        scan do |reader|
          until min >= max
            mid = mid_for(min, max)
            row = reader.row(mid)
            comparison = yield row
            return row if comparison.zero?

            comparison.positive? ? (min = mid + 1) : (max = mid)
          end
        end
      end

      # @return [Integer] number of indexed rows; 0 when the index is missing
      def size
        path.exist? ? path.size / UINT_32_SIZE : 0
      end

      # Returns the data file offset of the given row, caching it after the
      # first read.
      #
      # @param row_number [Integer] zero-based row number
      # @return [Integer, nil] nil when the row number is outside the index
      def position_for(row_number)
        # Valid rows are 0...size; row_number == size would read past the end.
        return if row_number.negative? || row_number >= size

        entry = entries[row_number]
        return entry if entry

        bytes = IO.binread(path, UINT_32_SIZE, offset_for(row_number))
        entries[row_number] = bytes.unpack1(UINT_32_DIRECTIVE)
      end

      # Sorts and de-duplicates the data file, then rewrites the index.
      # Does nothing when the data file does not exist.
      def update!
        return unless data_file.exist?

        sort(data_file)
        rebuild_index!
        # Offsets cached before the rewrite no longer describe the file.
        @entries = Array.new(size)
      end

      private

      attr_reader :entries

      # Opens the data file and yields a Relation built from the IO and self.
      def scan
        data_file.open_file(mode: 'rb') do |io|
          yield Relation.new(io, self)
        end
      end

      # Byte offset of a row's entry within the index file.
      def offset_for(row_number)
        row_number * UINT_32_SIZE
      end

      # Rewrites the data file with its lines sorted and de-duplicated.
      def sort(data_file)
        data_file.absolute_path.write(data_file.absolute_path.readlines.sort.uniq.join)
      end

      # Writes the start offset of every data line to the index file.
      def rebuild_index!
        data_file.open_file do |data_io|
          File.open(path, mode: 'wb') do |index_io|
            lines_in(data_io).each do |pos|
              index_io.write([pos].pack(UINT_32_DIRECTIVE))
            end
          end
        end
      end

      # Collects the offset at which each line of +io+ starts. An empty file
      # yields no offsets, so it is not indexed as having a row at offset 0.
      def lines_in(io)
        io.seek(0)
        offsets = []
        until io.eof?
          offsets << io.pos
          io.gets
        end
        offsets
      end

      # Midpoint of [min, max), rounded down.
      def mid_for(min, max)
        min + ((max - min) / 2)
      end
    end
  end
end
|