spandx 0.12.3 → 0.13.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +74 -25
  3. data/README.md +11 -7
  4. data/exe/spandx +1 -2
  5. data/ext/spandx/extconf.rb +5 -0
  6. data/ext/spandx/spandx.c +55 -0
  7. data/ext/spandx/spandx.h +6 -0
  8. data/lib/spandx.rb +6 -3
  9. data/lib/spandx/cli.rb +2 -0
  10. data/lib/spandx/cli/commands/build.rb +13 -2
  11. data/lib/spandx/cli/commands/scan.rb +11 -20
  12. data/lib/spandx/cli/main.rb +3 -2
  13. data/lib/spandx/core/cache.rb +38 -51
  14. data/lib/spandx/core/content.rb +5 -23
  15. data/lib/spandx/core/data_file.rb +66 -0
  16. data/lib/spandx/core/dependency.rb +47 -13
  17. data/lib/spandx/core/git.rb +8 -32
  18. data/lib/spandx/core/guess.rb +48 -40
  19. data/lib/spandx/core/http.rb +7 -2
  20. data/lib/spandx/core/index_file.rb +103 -0
  21. data/lib/spandx/core/license_plugin.rb +15 -4
  22. data/lib/spandx/core/parser.rb +10 -3
  23. data/lib/spandx/core/path_traversal.rb +35 -0
  24. data/lib/spandx/core/relation.rb +38 -0
  25. data/lib/spandx/core/report.rb +6 -12
  26. data/lib/spandx/core/spinner.rb +51 -0
  27. data/lib/spandx/dotnet/index.rb +21 -79
  28. data/lib/spandx/dotnet/parsers/csproj.rb +7 -7
  29. data/lib/spandx/dotnet/parsers/packages_config.rb +7 -7
  30. data/lib/spandx/dotnet/parsers/sln.rb +10 -13
  31. data/lib/spandx/dotnet/project_file.rb +3 -3
  32. data/lib/spandx/java/index.rb +5 -2
  33. data/lib/spandx/java/parsers/maven.rb +7 -7
  34. data/lib/spandx/js/parsers/npm.rb +6 -6
  35. data/lib/spandx/js/parsers/yarn.rb +7 -7
  36. data/lib/spandx/php/parsers/composer.rb +7 -7
  37. data/lib/spandx/python/index.rb +4 -33
  38. data/lib/spandx/python/parsers/pipfile_lock.rb +4 -4
  39. data/lib/spandx/python/pypi.rb +0 -2
  40. data/lib/spandx/python/source.rb +12 -0
  41. data/lib/spandx/ruby/parsers/gemfile_lock.rb +10 -9
  42. data/lib/spandx/spdx/catalogue.rb +5 -1
  43. data/lib/spandx/spdx/composite_license.rb +60 -0
  44. data/lib/spandx/spdx/expression.rb +114 -0
  45. data/lib/spandx/spdx/license.rb +4 -14
  46. data/lib/spandx/version.rb +1 -1
  47. data/spandx.gemspec +16 -10
  48. metadata +100 -30
  49. data/lib/spandx/core/null_gateway.rb +0 -11
  50. data/lib/spandx/core/table.rb +0 -29
  51. data/lib/spandx/core/thread_pool.rb +0 -38
@@ -13,30 +13,12 @@ module Spandx
13
13
  @tokens ||= tokenize(canonicalize(raw)).to_set
14
14
  end
15
15
 
16
- def similar?(other, algorithm: :dice_coefficient)
17
- case algorithm
18
- when :dice_coefficient
19
- similarity_score(other, algorithm: algorithm) > 89.0
20
- when :levenshtein
21
- similarity_score(other, algorithm: algorithm) < 3
22
- when :jaro_winkler
23
- similarity_score(other, algorithm: algorithm) > 89.0
24
- end
16
+ def similar?(other, threshold: 89.0)
17
+ similarity_score(other) > threshold
25
18
  end
26
19
 
27
- def similarity_score(other, algorithm: :dice_coefficient)
28
- case algorithm
29
- when :dice_coefficient
30
- dice_coefficient(other)
31
- when :levenshtein
32
- require 'text'
33
-
34
- Text::Levenshtein.distance(raw, other.raw, 100)
35
- when :jaro_winkler
36
- require 'jaro_winkler'
37
-
38
- JaroWinkler.distance(raw, other.raw) * 100.0
39
- end
20
+ def similarity_score(other)
21
+ dice_coefficient(other)
40
22
  end
41
23
 
42
24
  private
@@ -46,7 +28,7 @@ module Spandx
46
28
  end
47
29
 
48
30
  def tokenize(content)
49
- content.to_s.scan(/[a-zA-Z]+/)
31
+ content.to_s.scan(/[a-zA-Z\d.]+/)
50
32
  end
51
33
 
52
34
  def blank?(content)
@@ -0,0 +1,66 @@
# frozen_string_literal: true

module Spandx
  module Core
    # A line-oriented CSV data file holding one `name, version, licenses`
    # record per line. Records are appended with #insert and looked up with
    # #search, which delegates to the companion IndexFile.
    class DataFile
      include Enumerable

      attr_reader :absolute_path

      # @param absolute_path [String, Pathname] location of the data file.
      #   The parent directory is created if it is missing; the file itself
      #   is not created until the first #insert.
      def initialize(absolute_path)
        @absolute_path = Pathname.new(absolute_path)
        FileUtils.mkdir_p(@absolute_path.dirname)
      end

      # Yields `CsvParser.parse(line)` for every line of the file.
      # Returns an Enumerator when called without a block, and yields
      # nothing when the file does not exist.
      # NOTE(review): CsvParser is defined outside this file; presumably it
      # returns the row as an array of fields - confirm.
      def each
        return enum_for(:each) unless block_given?
        return unless exist?

        open_file(mode: 'rb') do |io|
          while (line = io.gets)
            yield CsvParser.parse(line)
          end
        end
      end

      # Looks up the record for `name` at `version` through the index.
      #
      # @return the row handed back by IndexFile#search, or nil when either
      #   argument is nil/empty or the data file does not exist.
      #
      # NOTE(review): the search key is "name-version" while IndexFile sorts
      # raw CSV lines; verify both orderings agree for names containing
      # characters that sort between '"' and '-' (for example '+').
      def search(name:, version:)
        return if blank?(name) || blank?(version)
        return unless exist?

        term = "#{name}-#{version}"
        index.search do |row|
          term <=> "#{row[0]}-#{row[1]}"
        end
      end

      # Appends one record; the licenses are joined with '-|-' into a single
      # field. Does nothing when name or version is nil/empty.
      def insert(name, version, licenses)
        return if blank?(name) || blank?(version)

        open_file(mode: 'a') do |io|
          io.write(to_csv([name, version, licenses.join('-|-')]))
        end
      end

      def exist?
        absolute_path.exist?
      end

      # Opens the file in `mode` and yields the IO to the block.
      # A missing file (Errno::ENOENT) is logged and nil is returned instead
      # of raising.
      def open_file(mode: 'rb')
        absolute_path.open(mode) { |io| yield io }
      rescue Errno::ENOENT => error
        Spandx.logger.error(error)
        nil
      end

      # Lazily built index used by #search.
      def index
        @index ||= IndexFile.new(self)
      end

      private

      def blank?(value)
        value.nil? || value.empty?
      end

      # Every field is quoted so each record has a uniform shape.
      def to_csv(array)
        array.to_csv(force_quotes: true)
      end
    end
  end
end
@@ -3,46 +3,80 @@
3
3
  module Spandx
4
4
  module Core
5
5
  class Dependency
6
- attr_reader :package_manager, :name, :version, :licenses, :meta
6
+ PACKAGE_MANAGERS = {
7
+ Spandx::Dotnet::Parsers::Csproj => :nuget,
8
+ Spandx::Dotnet::Parsers::PackagesConfig => :nuget,
9
+ Spandx::Dotnet::Parsers::Sln => :nuget,
10
+ Spandx::Java::Parsers::Maven => :maven,
11
+ Spandx::Js::Parsers::Npm => :npm,
12
+ Spandx::Js::Parsers::Yarn => :yarn,
13
+ Spandx::Php::Parsers::Composer => :composer,
14
+ Spandx::Python::Parsers::PipfileLock => :pypi,
15
+ Spandx::Ruby::Parsers::GemfileLock => :rubygems,
16
+ }.freeze
17
+ attr_reader :path, :name, :version, :licenses, :meta
7
18
 
8
- def initialize(package_manager:, name:, version:, licenses: [], meta: {})
9
- @package_manager = package_manager
10
- @name = name
11
- @version = version
12
- @licenses = licenses
19
+ def initialize(name:, version:, path:, meta: {})
20
+ @path = Pathname.new(path).realpath
21
+ @name = name || @path.basename.to_s
22
+ @version = version || @path.mtime.to_i.to_s
23
+ @licenses = []
13
24
  @meta = meta
14
25
  end
15
26
 
16
- def managed_by?(value)
17
- package_manager == value&.to_sym
27
+ def package_manager
28
+ PACKAGE_MANAGERS[Parser.for(path).class]
18
29
  end
19
30
 
20
31
  def <=>(other)
21
- to_s <=> other.to_s
32
+ return 1 if other.nil?
33
+
34
+ score = (name <=> other.name)
35
+ score = score.zero? ? (version <=> other&.version) : score
36
+ score.zero? ? (path.to_s <=> other&.path.to_s) : score
22
37
  end
23
38
 
24
39
  def hash
25
40
  to_s.hash
26
41
  end
27
42
 
43
+ def ==(other)
44
+ eql?(other)
45
+ end
46
+
28
47
  def eql?(other)
29
48
  to_s == other.to_s
30
49
  end
31
50
 
32
51
  def to_s
33
- @to_s ||= [name, version].compact.join(' ')
52
+ @to_s ||= [name, version, path].compact.join(' ')
34
53
  end
35
54
 
36
55
  def inspect
37
- "#<Spandx::Core::Dependency name=#{name}, version=#{version}>"
56
+ "#<#{self.class} name=#{name} version=#{version} path=#{relative_path}>"
38
57
  end
39
58
 
40
59
  def to_a
41
- [name, version, licenses.map(&:id)]
60
+ [name, version, license_expression, relative_path.to_s]
42
61
  end
43
62
 
44
63
  def to_h
45
- { name: name, version: version, licenses: licenses.map(&:id) }
64
+ {
65
+ name: name,
66
+ version: version,
67
+ licenses: license_expression,
68
+ path: relative_path.to_s
69
+ }
70
+ end
71
+
72
+ private
73
+
74
+ def relative_path(from: Pathname.pwd)
75
+ path.relative_path_from(from)
76
+ end
77
+
78
+ def license_expression
79
+ licenses.map(&:id).join(' AND ')
46
80
  end
47
81
  end
48
82
  end
@@ -3,37 +3,21 @@
3
3
  module Spandx
4
4
  module Core
5
5
  class Git
6
- attr_reader :path, :url
6
+ attr_reader :root, :url
7
7
 
8
8
  def initialize(url:)
9
9
  @url = url
10
- @path = path_for(url)
11
- end
12
-
13
- def update!
14
- dotgit? ? pull! : clone!
15
- end
16
-
17
- def expand_path(relative_path)
18
- File.join(path, relative_path)
10
+ @root = path_for(url)
19
11
  end
20
12
 
21
13
  def read(path)
22
- update! unless dotgit?
14
+ full_path = File.join(root, path)
23
15
 
24
- full_path = expand_path(path)
25
16
  IO.read(full_path) if File.exist?(full_path)
26
17
  end
27
18
 
28
- def open(path, mode: 'r')
29
- update! unless dotgit?
30
-
31
- full_path = expand_path(path)
32
- return unless File.exist?(full_path)
33
-
34
- File.open(full_path, mode) do |io|
35
- yield io
36
- end
19
+ def update!
20
+ dotgit? ? pull! : clone!
37
21
  end
38
22
 
39
23
  private
@@ -45,26 +29,18 @@ module Spandx
45
29
  end
46
30
 
47
31
  def dotgit?
48
- File.directory?(File.join(path, '.git'))
32
+ File.directory?(File.join(root, '.git'))
49
33
  end
50
34
 
51
35
  def clone!
52
- system('git', 'clone', '--quiet', url, path)
36
+ system('git', 'clone', '--quiet', '--depth=1', '--single-branch', '--branch', 'master', url, root)
53
37
  end
54
38
 
55
39
  def pull!
56
- within do
40
+ Dir.chdir(root) do
57
41
  system('git', 'pull', '--no-rebase', '--quiet', 'origin', 'master')
58
42
  end
59
43
  end
60
-
61
- def within
62
- Dir.chdir(path) do
63
- yield
64
- end
65
- end
66
44
  end
67
-
68
- Database = Git
69
45
  end
70
46
  end
@@ -9,80 +9,88 @@ module Spandx
9
9
  @catalogue = catalogue
10
10
  end
11
11
 
12
- def license_for(raw, algorithm: :dice_coefficient)
13
- raw.is_a?(Hash) ? from_hash(raw, algorithm) : from_string(raw, algorithm)
12
+ def license_for(raw)
13
+ case raw
14
+ when Hash
15
+ from_hash(raw)
16
+ when Array
17
+ from_array(raw)
18
+ else
19
+ from_string(raw)
20
+ end
14
21
  end
15
22
 
16
23
  private
17
24
 
18
- def from_hash(hash, algorithm)
19
- from_string(hash[:name], algorithm) ||
20
- from_url(hash[:url], algorithm) ||
25
+ def from_hash(hash)
26
+ from_string(hash[:name]) ||
27
+ from_url(hash[:url]) ||
21
28
  unknown(hash[:name] || hash[:url])
22
29
  end
23
30
 
24
- def from_string(raw, algorithm)
31
+ def from_array(array)
32
+ from_string(array.join(' AND '))
33
+ end
34
+
35
+ def from_string(raw)
36
+ return if raw.nil?
37
+
25
38
  content = Content.new(raw)
26
39
 
27
40
  catalogue[raw] ||
28
- match_name(content, algorithm) ||
29
- match_body(content, algorithm) ||
41
+ catalogue[raw.split(' ').join('-')] ||
42
+ match_name(content) ||
43
+ match_body(content) ||
30
44
  unknown(raw)
31
45
  end
32
46
 
33
- def from_url(url, algorithm)
47
+ def from_url(url)
34
48
  return if url.nil? || url.empty?
35
49
 
36
50
  response = Spandx.http.get(url)
37
51
  return unless Spandx.http.ok?(response)
38
52
 
39
- license_for(response.body, algorithm: algorithm)
53
+ license_for(response.body)
40
54
  end
41
55
 
42
- def match_name(content, _algorithm)
56
+ def match_name(content)
57
+ return if content.tokens.size < 2 || content.tokens.size > 10
58
+
59
+ result = from_expression(content)
60
+ return result if result
61
+
62
+ threshold = 85.0
43
63
  catalogue.find do |license|
44
- score = content.similarity_score(::Spandx::Core::Content.new(license.name))
45
- score > 85
64
+ content.similar?(Content.new(license.name), threshold: threshold)
46
65
  end
47
66
  end
48
67
 
49
- def match_body(content, algorithm)
68
+ def match_body(content)
50
69
  score = Score.new(nil, nil)
51
- threshold = threshold_for(algorithm)
52
- direction = algorithm == :levenshtein ? method(:min) : method(:max)
53
-
70
+ threshold = 89.0
54
71
  catalogue.each do |license|
55
- direction.call(content, license, score, threshold, algorithm) unless license.deprecated_license_id?
72
+ next if license.deprecated_license_id?
73
+
74
+ percentage = content.similarity_score(content_for(license))
75
+ next if percentage < threshold
76
+ next if score.score >= percentage
77
+
78
+ score.update(percentage, license)
56
79
  end
57
80
  score&.item
58
81
  end
59
82
 
60
- def unknown(text)
61
- ::Spandx::Spdx::License.unknown(text)
62
- end
63
-
64
- def threshold_for(algorithm)
65
- {
66
- dice_coefficient: 89.0,
67
- jaro_winkler: 80.0,
68
- levenshtein: 80.0,
69
- }[algorithm.to_sym]
83
+ def content_for(license)
84
+ ::Spandx::Core::Content.new(Spandx.git[:spdx].read("text/#{license.id}.txt") || '')
70
85
  end
71
86
 
72
- def min(target, other, score, threshold, algorithm)
73
- percentage = target.similarity_score(other.content, algorithm: algorithm)
74
- return if percentage > threshold
75
- return if score.score > 0.0 && score.score < percentage
76
-
77
- score.update(percentage, other)
87
+ def unknown(text)
88
+ ::Spandx::Spdx::License.unknown(text)
78
89
  end
79
90
 
80
- def max(target, other, score, threshold, algorithm)
81
- percentage = target.similarity_score(other.content, algorithm: algorithm)
82
- return if percentage < threshold
83
- return if score.score >= percentage
84
-
85
- score.update(percentage, other)
91
+ def from_expression(content)
92
+ Spandx::Spdx::CompositeLicense
93
+ .from_expression(content.raw, catalogue)
86
94
  end
87
95
  end
88
96
  end
@@ -8,7 +8,12 @@ module Spandx
8
8
  def initialize(driver: Http.default_driver, retries: 3)
9
9
  @driver = driver
10
10
  @retries = retries
11
- @circuits = Hash.new { |hash, key| hash[key] = Circuit.new(key) }
11
+ semaphore = Mutex.new
12
+ @circuits = Hash.new do |hash, key|
13
+ semaphore.synchronize do
14
+ hash[key] = Circuit.new(key)
15
+ end
16
+ end
12
17
  end
13
18
 
14
19
  def get(uri, default: nil, escape: true)
@@ -22,7 +27,7 @@ module Spandx
22
27
  client.get(escape ? Addressable::URI.escape(uri) : uri)
23
28
  end
24
29
  end
25
- rescue *Net::Hippie::CONNECTION_ERRORS
30
+ rescue *Net::Hippie::CONNECTION_ERRORS, URI::InvalidURIError
26
31
  default
27
32
  end
28
33
 
@@ -0,0 +1,103 @@
# frozen_string_literal: true

module Spandx
  module Core
    # Sidecar index stored next to a data file as "<data file>.idx".
    # The index is a flat sequence of 32-bit unsigned little-endian integers,
    # one per data-file line, each holding the byte offset at which that line
    # starts. Because offsets are 32 bits wide, data files are assumed to be
    # smaller than 4 GiB.
    class IndexFile
      UINT_32_DIRECTIVE = 'V' # pack/unpack directive: 32-bit unsigned, little-endian
      UINT_32_SIZE = 4 # bytes per index entry

      attr_reader :data_file, :path

      # @param data_file [#absolute_path, #exist?, #open_file] the file to index
      def initialize(data_file)
        @data_file = data_file
        @path = Pathname.new("#{data_file.absolute_path}.idx")
        @entries = []
      end

      # Yields the byte offset of every indexed row, in row order.
      # Returns an Enumerator without a block; yields nothing when the index
      # file does not exist.
      def each
        return enum_for(:each) unless block_given?

        size.times { |row_number| yield position_for(row_number) }
      end

      # Binary search over the rows numbered min...max.
      #
      # The block receives a row and returns a comparison: zero for a match,
      # positive when the wanted row sorts after the given one, negative when
      # it sorts before.
      #
      # @return the matching row, or nil when nothing matches.
      # NOTE(review): rows come from Relation#row, defined outside this file;
      # presumably it reads the data line at the indexed offset - confirm.
      def search(min: 0, max: size)
        scan do |reader|
          while min < max
            mid = mid_for(min, max)
            row = reader.row(mid)
            return unless row

            comparison = yield row
            return row if comparison.zero?

            if comparison.positive?
              min = mid + 1
            else
              max = mid
            end
          end
          nil
        end
      end

      # Number of rows recorded in the index (0 when the index is missing).
      def size
        path.exist? ? path.size / UINT_32_SIZE : 0
      end

      # Byte offset in the data file at which row `row_number` starts.
      #
      # @return [Integer, nil] nil when row_number is outside 0...size
      def position_for(row_number)
        return if row_number.negative? || row_number >= size

        # Offsets are cached so repeated probes do not reread the index file.
        entries[row_number] ||=
          File.binread(path, UINT_32_SIZE, offset_for(row_number)).unpack1(UINT_32_DIRECTIVE)
      end

      # Sorts and de-duplicates the data file's lines, then rewrites the
      # index to match. Does nothing when the data file does not exist.
      def update!
        return unless data_file.exist?

        sort(data_file)
        rebuild_index!
      end

      private

      attr_reader :entries

      def scan
        data_file.open_file(mode: 'rb') do |io|
          yield Relation.new(io, self)
        end
      end

      def offset_for(row_number)
        row_number * UINT_32_SIZE
      end

      def sort(data_file)
        data_file.absolute_path.write(data_file.absolute_path.readlines.sort.uniq.join)
      end

      def rebuild_index!
        data_file.open_file do |data_io|
          path.binwrite(lines_in(data_io).pack("#{UINT_32_DIRECTIVE}*"))
        end
        # Cached offsets describe the previous index; drop them.
        @entries = []
      end

      # Start offset of every line in io; empty for an empty file.
      def lines_in(io)
        io.seek(0)
        offsets = []
        loop do
          offset = io.pos
          break unless io.gets

          offsets << offset
        end
        offsets
      end

      def mid_for(min, max)
        min + ((max - min) / 2)
      end
    end
  end
end