spandx 0.12.3 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Spandx
4
+ module Core
5
# Enumerable view over the results of work handed to a pool.
#
# `Concurrent.map` schedules one job per item; every job pushes its result
# onto a shared Queue. Enumerating an instance pops exactly `size` results
# from that queue, blocking while a result is not yet available.
#
# NOTE(review): results are yielded in the order jobs finish, which is not
# necessarily the order of `items` - confirm against the pool implementation.
# NOTE(review): if a job raises before enqueuing, `each` appears to wait
# forever for the missing result - verify how the pool reports job failures.
class Concurrent
  include Enumerable

  # Schedules `block` for each item on `pool` and returns a Concurrent that
  # enumerates the results.
  #
  # @param items [#each, #size] the work items
  # @param pool [#schedule] receives `[item, block]` plus a job block;
  #   presumably a thread pool - only `schedule` is required here
  # @yield [item] computes the result for a single item
  # @return [Concurrent]
  def self.map(items, pool:, &block)
    queue = Queue.new

    items.each do |item|
      # The item and callable are passed through `schedule` as arguments
      # rather than captured, and handed back to the job block by the pool.
      pool.schedule([item, block]) do |marshalled_item, callable|
        queue.enq(callable.call(marshalled_item))
      end
    end

    new(queue, items.size)
  end

  attr_reader :queue, :size

  # @param queue [Queue] queue that jobs push their results onto
  # @param size [Integer] number of results expected on the queue
  def initialize(queue, size)
    @queue = queue
    @size = size
  end

  # Yields `size` results popped from the queue.
  #
  # Without a block an Enumerator is returned instead of consuming a result
  # and then failing on `yield`.
  #
  # @yield [result] each dequeued result
  # @return [Integer, Enumerator] `size` when a block is given
  def each
    return to_enum unless block_given?

    size.times { yield queue.deq }
  end

  # @return [Enumerator] enumerator that pulls results from the queue on demand
  def to_enum
    Enumerator.new do |yielder|
      each { |item| yielder.yield item }
    end
  end
end
39
+ end
40
+ end
@@ -13,30 +13,12 @@ module Spandx
13
13
  @tokens ||= tokenize(canonicalize(raw)).to_set
14
14
  end
15
15
 
16
- def similar?(other, algorithm: :dice_coefficient)
17
- case algorithm
18
- when :dice_coefficient
19
- similarity_score(other, algorithm: algorithm) > 89.0
20
- when :levenshtein
21
- similarity_score(other, algorithm: algorithm) < 3
22
- when :jaro_winkler
23
- similarity_score(other, algorithm: algorithm) > 89.0
24
- end
16
+ def similar?(other, threshold: 89.0)
17
+ similarity_score(other) > threshold
25
18
  end
26
19
 
27
- def similarity_score(other, algorithm: :dice_coefficient)
28
- case algorithm
29
- when :dice_coefficient
30
- dice_coefficient(other)
31
- when :levenshtein
32
- require 'text'
33
-
34
- Text::Levenshtein.distance(raw, other.raw, 100)
35
- when :jaro_winkler
36
- require 'jaro_winkler'
37
-
38
- JaroWinkler.distance(raw, other.raw) * 100.0
39
- end
20
+ def similarity_score(other)
21
+ dice_coefficient(other)
40
22
  end
41
23
 
42
24
  private
@@ -46,7 +28,7 @@ module Spandx
46
28
  end
47
29
 
48
30
  def tokenize(content)
49
- content.to_s.scan(/[a-zA-Z]+/)
31
+ content.to_s.scan(/[a-zA-Z\d.]+/)
50
32
  end
51
33
 
52
34
  def blank?(content)
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Spandx
4
+ module Core
5
# A line-oriented CSV data file of `name, version, licenses` rows that can be
# appended to, enumerated, and searched through its companion IndexFile.
class DataFile
  include Enumerable

  attr_reader :absolute_path

  # Wraps the path and makes sure its parent directory exists.
  #
  # @param absolute_path [String, Pathname] location of the data file
  def initialize(absolute_path)
    @absolute_path = Pathname.new(absolute_path)
    FileUtils.mkdir_p(@absolute_path.dirname)
  end

  # Yields every line of the file after passing it through CsvParser.parse.
  # Does nothing when the file does not exist.
  #
  # @yield [row] the parsed line (shape is defined by CsvParser - not shown here)
  # @return [Enumerator] when called without a block
  def each
    return to_enum(:each) unless block_given?
    return unless exist?

    open_file(mode: 'rb') do |io|
      while (line = io.gets)
        yield CsvParser.parse(line)
      end
    end
  end

  # Looks up the row for `name`/`version` through the index.
  #
  # @param name [String, nil]
  # @param version [String, nil]
  # @return [Object, nil] the row whose "name-version" key equals the search
  #   term, or nil when either argument is blank, the file is missing, or no
  #   row matches
  def search(name:, version:)
    return if blank?(name) || blank?(version)
    return unless exist?

    term = "#{name}-#{version}"
    # The index performs a binary search; the block tells it which half to keep.
    index.search { |row| term <=> "#{row[0]}-#{row[1]}" }
  end

  # Appends one row to the file. Blank names or versions are ignored.
  #
  # @param name [String, nil]
  # @param version [String, nil]
  # @param licenses [Array<String>] stored in a single column joined by '-|-'
  def insert(name, version, licenses)
    return if blank?(name) || blank?(version)

    open_file(mode: 'a') do |io|
      io.write(to_csv([name, version, licenses.join('-|-')]))
    end
  end

  # @return [Boolean] whether the data file is present on disk
  def exist?
    absolute_path.exist?
  end

  # Opens the file in `mode` and yields the IO; the file is closed when the
  # block returns. A missing file is logged and results in nil.
  #
  # @param mode [String] mode string handed to Pathname#open
  # @yield [io] the opened file
  # @return [Object, nil] the block's value, or nil on Errno::ENOENT
  def open_file(mode: 'rb')
    absolute_path.open(mode) { |io| yield io }
  rescue Errno::ENOENT => error
    Spandx.logger.error(error)
    nil
  end

  # @return [IndexFile] memoised index for this data file
  def index
    @index ||= IndexFile.new(self)
  end

  private

  # @return [Boolean] true for nil and for values reporting themselves empty
  def blank?(value)
    value.nil? || value.empty?
  end

  # Serialises one row with every field quoted.
  def to_csv(array)
    array.to_csv(force_quotes: true)
  end
end
65
+ end
66
+ end
@@ -3,37 +3,21 @@
3
3
  module Spandx
4
4
  module Core
5
5
  class Git
6
- attr_reader :path, :url
6
+ attr_reader :root, :url
7
7
 
8
8
  def initialize(url:)
9
9
  @url = url
10
- @path = path_for(url)
11
- end
12
-
13
- def update!
14
- dotgit? ? pull! : clone!
15
- end
16
-
17
- def expand_path(relative_path)
18
- File.join(path, relative_path)
10
+ @root = path_for(url)
19
11
  end
20
12
 
21
13
  def read(path)
22
- update! unless dotgit?
14
+ full_path = File.join(root, path)
23
15
 
24
- full_path = expand_path(path)
25
16
  IO.read(full_path) if File.exist?(full_path)
26
17
  end
27
18
 
28
- def open(path, mode: 'r')
29
- update! unless dotgit?
30
-
31
- full_path = expand_path(path)
32
- return unless File.exist?(full_path)
33
-
34
- File.open(full_path, mode) do |io|
35
- yield io
36
- end
19
+ def update!
20
+ dotgit? ? pull! : clone!
37
21
  end
38
22
 
39
23
  private
@@ -45,24 +29,18 @@ module Spandx
45
29
  end
46
30
 
47
31
  def dotgit?
48
- File.directory?(File.join(path, '.git'))
32
+ File.directory?(File.join(root, '.git'))
49
33
  end
50
34
 
51
35
  def clone!
52
- system('git', 'clone', '--quiet', url, path)
36
+ system('git', 'clone', '--quiet', '--depth=1', '--single-branch', '--branch', 'master', url, root)
53
37
  end
54
38
 
55
39
  def pull!
56
- within do
40
+ Dir.chdir(root) do
57
41
  system('git', 'pull', '--no-rebase', '--quiet', 'origin', 'master')
58
42
  end
59
43
  end
60
-
61
- def within
62
- Dir.chdir(path) do
63
- yield
64
- end
65
- end
66
44
  end
67
45
 
68
46
  Database = Git
@@ -9,80 +9,77 @@ module Spandx
9
9
  @catalogue = catalogue
10
10
  end
11
11
 
12
- def license_for(raw, algorithm: :dice_coefficient)
13
- raw.is_a?(Hash) ? from_hash(raw, algorithm) : from_string(raw, algorithm)
12
+ def license_for(raw)
13
+ raw.is_a?(Hash) ? from_hash(raw) : from_string(raw)
14
14
  end
15
15
 
16
16
  private
17
17
 
18
- def from_hash(hash, algorithm)
19
- from_string(hash[:name], algorithm) ||
20
- from_url(hash[:url], algorithm) ||
18
+ def from_hash(hash)
19
+ from_string(hash[:name]) ||
20
+ from_url(hash[:url]) ||
21
21
  unknown(hash[:name] || hash[:url])
22
22
  end
23
23
 
24
- def from_string(raw, algorithm)
24
+ def from_string(raw)
25
+ return if raw.nil?
26
+
25
27
  content = Content.new(raw)
26
28
 
27
29
  catalogue[raw] ||
28
- match_name(content, algorithm) ||
29
- match_body(content, algorithm) ||
30
+ catalogue[raw.split(' ').join('-')] ||
31
+ match_name(content) ||
32
+ match_body(content) ||
30
33
  unknown(raw)
31
34
  end
32
35
 
33
- def from_url(url, algorithm)
36
+ def from_url(url)
34
37
  return if url.nil? || url.empty?
35
38
 
36
39
  response = Spandx.http.get(url)
37
40
  return unless Spandx.http.ok?(response)
38
41
 
39
- license_for(response.body, algorithm: algorithm)
42
+ license_for(response.body)
40
43
  end
41
44
 
42
- def match_name(content, _algorithm)
45
+ def match_name(content)
46
+ return if content.tokens.size < 2 || content.tokens.size > 10
47
+
48
+ result = from_expression(content)
49
+ return result if result
50
+
51
+ threshold = 85.0
43
52
  catalogue.find do |license|
44
- score = content.similarity_score(::Spandx::Core::Content.new(license.name))
45
- score > 85
53
+ content.similar?(Content.new(license.name), threshold: threshold)
46
54
  end
47
55
  end
48
56
 
49
- def match_body(content, algorithm)
57
+ def match_body(content)
50
58
  score = Score.new(nil, nil)
51
- threshold = threshold_for(algorithm)
52
- direction = algorithm == :levenshtein ? method(:min) : method(:max)
53
-
59
+ threshold = 89.0
54
60
  catalogue.each do |license|
55
- direction.call(content, license, score, threshold, algorithm) unless license.deprecated_license_id?
61
+ next if license.deprecated_license_id?
62
+
63
+ percentage = content.similarity_score(content_for(license))
64
+ next if percentage < threshold
65
+ next if score.score >= percentage
66
+
67
+ score.update(percentage, license)
56
68
  end
57
69
  score&.item
58
70
  end
59
71
 
60
- def unknown(text)
61
- ::Spandx::Spdx::License.unknown(text)
62
- end
63
-
64
- def threshold_for(algorithm)
65
- {
66
- dice_coefficient: 89.0,
67
- jaro_winkler: 80.0,
68
- levenshtein: 80.0,
69
- }[algorithm.to_sym]
72
+ def content_for(license)
73
+ ::Spandx::Core::Content.new(Spandx.git[:spdx].read("text/#{license.id}.txt") || '')
70
74
  end
71
75
 
72
- def min(target, other, score, threshold, algorithm)
73
- percentage = target.similarity_score(other.content, algorithm: algorithm)
74
- return if percentage > threshold
75
- return if score.score > 0.0 && score.score < percentage
76
-
77
- score.update(percentage, other)
76
+ def unknown(text)
77
+ ::Spandx::Spdx::License.unknown(text)
78
78
  end
79
79
 
80
- def max(target, other, score, threshold, algorithm)
81
- percentage = target.similarity_score(other.content, algorithm: algorithm)
82
- return if percentage < threshold
83
- return if score.score >= percentage
84
-
85
- score.update(percentage, other)
80
+ def from_expression(content)
81
+ Spandx::Spdx::CompositeLicense
82
+ .from_expression(content.raw, catalogue)
86
83
  end
87
84
  end
88
85
  end
@@ -8,7 +8,12 @@ module Spandx
8
8
  def initialize(driver: Http.default_driver, retries: 3)
9
9
  @driver = driver
10
10
  @retries = retries
11
- @circuits = Hash.new { |hash, key| hash[key] = Circuit.new(key) }
11
+ semaphore = Mutex.new
12
+ @circuits = Hash.new do |hash, key|
13
+ semaphore.synchronize do
14
+ hash[key] = Circuit.new(key)
15
+ end
16
+ end
12
17
  end
13
18
 
14
19
  def get(uri, default: nil, escape: true)
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Spandx
4
+ module Core
5
# Index of byte offsets into a data file, stored next to it as
# "<data file>.idx". Each entry is a 32-bit little-endian unsigned integer
# holding the position at which one line of the data file starts.
class IndexFile
  UINT_32_DIRECTIVE = 'V'
  UINT_32_SIZE = 4

  attr_reader :data_file, :path

  # @param data_file [#absolute_path, #exist?, #open_file] the indexed file
  def initialize(data_file)
    @data_file = data_file
    @path = Pathname.new("#{data_file.absolute_path}.idx")
    @entries = Array.new(size) # memoised offsets, filled lazily
  end

  # Yields the data-file offset of every indexed row, in index order.
  # Yields nothing when the index file does not exist.
  #
  # @yield [position] byte offset of a row in the data file
  # @return [Enumerator] when called without a block
  def each
    return to_enum(:each) unless block_given?

    size.times { |n| yield position_for(n) }
  end

  # Binary search over the rows in the half-open range [min, max).
  #
  # @param min [Integer] first row to consider
  # @param max [Integer] row after the last one to consider
  # @yield [row] must return a negative number when the wanted row sorts
  #   before `row`, zero on a match, and a positive number when it sorts after
  # @return [Object, nil] the matching row, or nil when none matches
  def search(min: 0, max: size)
    scan do |reader|
      until min >= max
        mid = mid_for(min, max)
        row = reader.row(mid)
        comparison = yield row
        return row if comparison.zero?

        if comparison.positive?
          min = mid + 1
        else
          max = mid
        end
      end
    end
  end

  # @return [Integer] number of entries in the index file; 0 when it is absent
  def size
    path.exist? ? path.size / UINT_32_SIZE : 0
  end

  # Reads (and memoises) the data-file offset stored for `row_number`.
  #
  # @param row_number [Integer] zero-based row
  # @return [Integer, nil] the offset, or nil when the row is not in the index
  def position_for(row_number)
    # Valid rows are 0...size; reading at `size` would hit end-of-file.
    return if row_number.negative? || row_number >= size

    entry = entries[row_number]
    return entry if entry

    bytes = path.binread(UINT_32_SIZE, offset_for(row_number))
    entries[row_number] = bytes.unpack1(UINT_32_DIRECTIVE)
  end

  # Sorts and de-duplicates the data file, then rewrites the index.
  # Does nothing when the data file does not exist.
  def update!
    return unless data_file.exist?

    sort(data_file)
    entries.clear # memoised offsets describe the previous file layout
    rebuild_index!
  end

  private

  attr_reader :entries

  # Opens the data file and yields a Relation used to read rows by number.
  def scan
    data_file.open_file(mode: 'rb') do |io|
      yield Relation.new(io, self)
    end
  end

  # Byte offset of an entry inside the index file.
  def offset_for(row_number)
    row_number * UINT_32_SIZE
  end

  # Rewrites the data file with its lines sorted and duplicates removed.
  def sort(data_file)
    data_file.absolute_path.write(data_file.absolute_path.readlines.sort.uniq.join)
  end

  # Writes the start offset of every data-file line to the index file.
  def rebuild_index!
    data_file.open_file do |data_io|
      File.open(path, mode: 'wb') do |index_io|
        lines_in(data_io).each do |pos|
          index_io.write([pos].pack(UINT_32_DIRECTIVE))
        end
      end
    end
  end

  # Collects the offset at which each line of `io` begins.
  def lines_in(io)
    lines = [0]
    io.seek(0)
    lines << io.pos while io.gets
    # The last recorded position is end-of-file, not the start of a line.
    lines.pop if lines.size > 1
    lines
  end

  # Midpoint of [min, max), rounded down.
  def mid_for(min, max)
    min + ((max - min) / 2)
  end
end
100
+ end
101
+ end