spandx 0.12.3 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
module Spandx
  module Core
    # Collects the results of mapping a block over a list of items through a
    # pool object.
    #
    # Every item is handed to +pool.schedule+ together with the block; the
    # scheduled job pushes the block's return value onto a shared queue.
    # Enumerating the instance pops exactly +size+ values from that queue, so
    # results are yielded in the order the jobs enqueued them.
    #
    # Results are consumed as they are yielded: once +size+ values have been
    # popped the queue is empty, and enumerating again would wait on it.
    #
    # NOTE(review): if a scheduled job never enqueues a value (for example
    # because the block raised inside the pool), +each+ waits indefinitely.
    # How the pool treats such failures is not visible here - confirm.
    class Concurrent
      include Enumerable

      # Schedules +block+ for every item and returns a handle to the results.
      #
      # @param items [#each, #size] the work items
      # @param pool [#schedule] receives +[item, block]+ plus the job block
      # @return [Concurrent] enumerable over the enqueued results
      def self.map(items, pool:, &block)
        queue = Queue.new

        items.each do |item|
          pool.schedule([item, block]) do |marshalled_item, callable|
            queue.enq(callable.call(marshalled_item))
          end
        end

        new(queue, items.size)
      end

      attr_reader :queue, :size

      # @param queue [Queue] queue the results are pushed onto
      # @param size [Integer] number of results to pop when enumerating
      def initialize(queue, size)
        @queue = queue
        @size = size
      end

      # Yields +size+ results, waiting on the queue for each one.
      #
      # @return [Enumerator] when called without a block
      def each
        return to_enum unless block_given?

        size.times { yield queue.deq }
      end

      # @return [Enumerator] an enumerator that pulls results through #each
      def to_enum
        Enumerator.new do |yielder|
          each do |item|
            yielder.yield item
          end
        end
      end
    end
  end
end
@@ -13,30 +13,12 @@ module Spandx
13
13
  @tokens ||= tokenize(canonicalize(raw)).to_set
14
14
  end
15
15
 
16
# Reports whether the similarity score against +other+ exceeds +threshold+.
#
# @param other [Content] the content to compare with
# @param threshold [Float] score that must be exceeded (default 89.0)
# @return [Boolean]
def similar?(other, threshold: 89.0)
  score = similarity_score(other)
  score > threshold
end
26
19
 
27
# Scores how similar this content is to +other+ by delegating to
# #dice_coefficient.
#
# @param other [Content] the content to compare with
# @return the value returned by #dice_coefficient
def similarity_score(other)
  dice_coefficient(other)
end
41
23
 
42
24
  private
@@ -46,7 +28,7 @@ module Spandx
46
28
  end
47
29
 
48
30
  def tokenize(content)
49
- content.to_s.scan(/[a-zA-Z]+/)
31
+ content.to_s.scan(/[a-zA-Z\d.]+/)
50
32
  end
51
33
 
52
34
  def blank?(content)
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
module Spandx
  module Core
    # A CSV-backed data file of +name, version, licenses+ rows with a
    # companion IndexFile used for lookups.
    class DataFile
      include Enumerable

      attr_reader :absolute_path

      # Wraps +absolute_path+ and makes sure its parent directory exists.
      #
      # @param absolute_path [String, Pathname] location of the data file
      def initialize(absolute_path)
        @absolute_path = Pathname.new(absolute_path)
        FileUtils.mkdir_p(@absolute_path.dirname)
      end

      # Yields every line of the file parsed by CsvParser.
      # Yields nothing when the file does not exist.
      #
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?
        return unless exist?

        open_file(mode: 'rb') do |io|
          while (line = io.gets)
            yield CsvParser.parse(line)
          end
        end
      end

      # Looks up the row for +name+ and +version+ through the index.
      #
      # @param name [String, nil]
      # @param version [String, nil]
      # @return the matching row, or nil when either argument is blank, the
      #   file does not exist, or the index search finds nothing
      def search(name:, version:)
        # Both halves of the search term must be present; an empty version
        # would otherwise build a term such as "name-".
        return if blank?(name) || blank?(version)
        return unless exist?

        term = "#{name}-#{version}"
        index.search do |row|
          term <=> "#{row[0]}-#{row[1]}"
        end
      end

      # Appends a row to the file. Does nothing when +name+ or +version+ is
      # nil or empty.
      #
      # @param name [String]
      # @param version [String]
      # @param licenses [Array<String>] joined with "-|-" into one column
      def insert(name, version, licenses)
        return if blank?(name) || blank?(version)

        open_file(mode: 'a') do |io|
          io.write(to_csv([name, version, licenses.join('-|-')]))
        end
      end

      # @return [Boolean] whether the data file is present on disk
      def exist?
        absolute_path.exist?
      end

      # Opens the file in +mode+ and yields the IO object.
      #
      # @param mode [String] mode string handed to Pathname#open
      # @return the block's value, or nil after logging when the file is missing
      def open_file(mode: 'rb')
        absolute_path.open(mode) { |io| yield io }
      rescue Errno::ENOENT => error
        Spandx.logger.error(error)
        nil
      end

      # @return [IndexFile] memoized index for this data file
      def index
        @index ||= IndexFile.new(self)
      end

      private

      # True for nil and for anything that reports itself as empty.
      def blank?(value)
        value.nil? || value.empty?
      end

      # Serialises one row with every field quoted.
      def to_csv(array)
        array.to_csv(force_quotes: true)
      end
    end
  end
end
@@ -3,37 +3,21 @@
3
3
  module Spandx
4
4
  module Core
5
5
  class Git
6
- attr_reader :path, :url
6
+ attr_reader :root, :url
7
7
 
8
8
  def initialize(url:)
9
9
  @url = url
10
- @path = path_for(url)
11
- end
12
-
13
- def update!
14
- dotgit? ? pull! : clone!
15
- end
16
-
17
- def expand_path(relative_path)
18
- File.join(path, relative_path)
10
+ @root = path_for(url)
19
11
  end
20
12
 
21
13
  def read(path)
22
- update! unless dotgit?
14
+ full_path = File.join(root, path)
23
15
 
24
- full_path = expand_path(path)
25
16
  IO.read(full_path) if File.exist?(full_path)
26
17
  end
27
18
 
28
# Brings the local copy up to date: pulls when a .git directory is already
# present under +root+, clones otherwise.
def update!
  if dotgit?
    pull!
  else
    clone!
  end
end
38
22
 
39
23
  private
@@ -45,24 +29,18 @@ module Spandx
45
29
  end
46
30
 
47
31
  def dotgit?
48
- File.directory?(File.join(path, '.git'))
32
+ File.directory?(File.join(root, '.git'))
49
33
  end
50
34
 
51
35
  def clone!
52
- system('git', 'clone', '--quiet', url, path)
36
+ system('git', 'clone', '--quiet', '--depth=1', '--single-branch', '--branch', 'master', url, root)
53
37
  end
54
38
 
55
39
  def pull!
56
- within do
40
+ Dir.chdir(root) do
57
41
  system('git', 'pull', '--no-rebase', '--quiet', 'origin', 'master')
58
42
  end
59
43
  end
60
-
61
- def within
62
- Dir.chdir(path) do
63
- yield
64
- end
65
- end
66
44
  end
67
45
 
68
46
  Database = Git
@@ -9,80 +9,77 @@ module Spandx
9
9
  @catalogue = catalogue
10
10
  end
11
11
 
12
# Resolves +raw+ to a license. A Hash is handled by #from_hash, anything
# else by #from_string.
#
# @param raw [Hash, String]
def license_for(raw)
  return from_hash(raw) if raw.is_a?(Hash)

  from_string(raw)
end
15
15
 
16
16
  private
17
17
 
18
# Tries the hash's :name first, then its :url, and finally falls back to an
# unknown license labelled with whichever of the two is present.
#
# @param hash [Hash] expected to carry :name and/or :url
def from_hash(hash)
  found = from_string(hash[:name])
  found ||= from_url(hash[:url])
  found || unknown(hash[:name] || hash[:url])
end
23
23
 
24
# Resolves a string to a license, trying in order: an exact catalogue lookup,
# a lookup with whitespace runs replaced by hyphens, a name match, a body
# match, and finally an unknown license.
#
# @param raw [String, nil]
# @return nil when +raw+ is nil, otherwise the first successful result
def from_string(raw)
  return if raw.nil?

  content = Content.new(raw)

  license = catalogue[raw]
  license ||= catalogue[raw.split(' ').join('-')]
  license ||= match_name(content)
  license ||= match_body(content)
  license || unknown(raw)
end
32
35
 
33
# Fetches +url+ and resolves the response body through #license_for.
#
# @param url [String, nil]
# @return nil when the url is blank or the response is not ok
def from_url(url)
  return if url.nil? || url.empty?

  response = Spandx.http.get(url)
  license_for(response.body) if Spandx.http.ok?(response)
end
41
44
 
42
# Matches short content (2 to 10 tokens) against license names.
#
# The content is first tried as a license expression; failing that, the
# first catalogue entry whose name is similar at a threshold of 85.0 wins.
#
# @param content [Content]
# @return the matching license, or nil
def match_name(content)
  return unless content.tokens.size.between?(2, 10)

  from_expression(content) || catalogue.find do |license|
    content.similar?(Content.new(license.name), threshold: 85.0)
  end
end
48
56
 
49
# Compares +content+ with the text of every non-deprecated license and keeps
# the highest score that reaches 89.0.
#
# @param content [Content]
# @return the item held by the best score (whatever Score#item reports when
#   nothing qualified)
def match_body(content)
  best = Score.new(nil, nil)
  minimum = 89.0

  catalogue.each do |license|
    next if license.deprecated_license_id?

    candidate = content.similarity_score(content_for(license))
    # Skip candidates below the threshold or no better than the current best.
    next if candidate < minimum || best.score >= candidate

    best.update(candidate, license)
  end

  best.item
end
59
71
 
60
# Builds a Content from the license text stored at "text/<id>.txt" in the
# :spdx git repository, using an empty string when the read returns nil.
#
# @param license [#id]
# @return [Spandx::Core::Content]
def content_for(license)
  text = Spandx.git[:spdx].read("text/#{license.id}.txt")
  ::Spandx::Core::Content.new(text || '')
end
71
75
 
72
# Fallback result: delegates to License.unknown with the given text.
#
# @param text [String, nil] the text that could not be resolved
def unknown(text)
  ::Spandx::Spdx::License.unknown(text)
end
79
79
 
80
# Hands the raw content and the catalogue to CompositeLicense.from_expression.
#
# @param content [Content]
# @return the value returned by CompositeLicense.from_expression
def from_expression(content)
  Spandx::Spdx::CompositeLicense.from_expression(content.raw, catalogue)
end
87
84
  end
88
85
  end
@@ -8,7 +8,12 @@ module Spandx
8
8
  def initialize(driver: Http.default_driver, retries: 3)
9
9
  @driver = driver
10
10
  @retries = retries
11
- @circuits = Hash.new { |hash, key| hash[key] = Circuit.new(key) }
11
+ semaphore = Mutex.new
12
+ @circuits = Hash.new do |hash, key|
13
+ semaphore.synchronize do
14
+ hash[key] = Circuit.new(key)
15
+ end
16
+ end
12
17
  end
13
18
 
14
19
  def get(uri, default: nil, escape: true)
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
module Spandx
  module Core
    # Index companion for a data file.
    #
    # The index lives next to the data file at "<data file path>.idx" and holds
    # one 32-bit unsigned little-endian integer per data row: the byte offset
    # at which that row starts. #update! sorts the data file and rewrites the
    # index; #search binary-searches the rows through those offsets.
    class IndexFile
      UINT_32_DIRECTIVE = 'V'
      UINT_32_SIZE = 4

      attr_reader :data_file, :path

      # @param data_file [#absolute_path, #exist?, #open_file] the indexed file
      def initialize(data_file)
        @data_file = data_file
        @path = Pathname.new("#{data_file.absolute_path}.idx")
        @entries = size.positive? ? Array.new(size) : []
      end

      # Yields the byte offset of every indexed row, in row order.
      # Yields nothing when the index file does not exist.
      def each
        size.times do |n|
          yield position_for(n)
        end
      end

      # Binary search over the rows in the half-open range +min...max+.
      #
      # The block receives a row and returns a comparison result: zero for a
      # match, positive to continue in the upper half, negative for the lower.
      #
      # @return the matching row, or nil when no row compares as zero
      def search(min: 0, max: size)
        scan do |reader|
          until min >= max
            mid = mid_for(min, max)
            row = reader.row(mid)
            comparison = yield row
            return row if comparison.zero?

            comparison.positive? ? (min = mid + 1) : (max = mid)
          end
        end
      end

      # @return [Integer] number of entries in the index (0 when it is missing)
      def size
        path.exist? ? path.size / UINT_32_SIZE : 0
      end

      # Byte offset in the data file at which row +row_number+ starts.
      # Offsets are cached after the first read.
      #
      # @param row_number [Integer] zero-based row number
      # @return [Integer, nil] nil when +row_number+ is outside the index
      def position_for(row_number)
        # Valid rows are 0...size; reading at row == size would hit the end of
        # the index file and come back empty-handed.
        return if row_number.negative? || row_number >= size

        entry = entries[row_number]
        return entry if entry

        bytes = IO.binread(path, UINT_32_SIZE, offset_for(row_number))
        entry = bytes.unpack1(UINT_32_DIRECTIVE)
        entries[row_number] = entry
        entry
      end

      # Sorts and de-duplicates the data file, then rewrites the index.
      # Does nothing when the data file does not exist.
      def update!
        return unless data_file.exist?

        sort(data_file)
        # Rows may have moved, so offsets cached before the rebuild are stale.
        @entries = []
        rebuild_index!
      end

      private

      attr_reader :entries

      # Opens the data file and yields a Relation bound to it and this index.
      def scan
        data_file.open_file(mode: 'rb') do |io|
          yield Relation.new(io, self)
        end
      end

      # Byte offset inside the index file of the entry for +row_number+.
      def offset_for(row_number)
        row_number * UINT_32_SIZE
      end

      # Rewrites the data file with its lines sorted and duplicates removed.
      def sort(data_file)
        data_file.absolute_path.write(data_file.absolute_path.readlines.sort.uniq.join)
      end

      # Writes the start offset of every data line to the index file.
      def rebuild_index!
        data_file.open_file do |data_io|
          File.open(path, mode: 'wb') do |index_io|
            lines_in(data_io).each do |pos|
              index_io.write([pos].pack(UINT_32_DIRECTIVE))
            end
          end
        end
      end

      # Start offset of every line in +io+; an empty file has no lines and so
      # produces no offsets.
      def lines_in(io)
        io.seek(0)
        offsets = []
        loop do
          offset = io.pos
          break unless io.gets

          offsets << offset
        end
        offsets
      end

      # Midpoint of +min...max+, rounded down.
      def mid_for(min, max)
        min + ((max - min) / 2)
      end
    end
  end
end