licensee 9.19.0 → 9.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/lib/licensee/content_helper/constants.rb +3 -1
  3. data/lib/licensee/content_helper/normalization_methods.rb +15 -3
  4. data/lib/licensee/content_helper/similarity_methods.rb +15 -1
  5. data/lib/licensee/content_helper.rb +19 -2
  6. data/lib/licensee/matchers/cabal.rb +5 -2
  7. data/lib/licensee/matchers/dice.rb +12 -2
  8. data/lib/licensee/matchers/gemspec.rb +0 -8
  9. data/lib/licensee/matchers/matcher.rb +3 -3
  10. data/lib/licensee/matchers/package.rb +1 -1
  11. data/lib/licensee/project_files/project_file.rb +1 -1
  12. data/lib/licensee/projects/project.rb +2 -2
  13. data/lib/licensee/version.rb +1 -1
  14. data/spec/fixture_spec.rb +2 -2
  15. data/spec/fixtures/bsd-3-linebreak-owner/LICENSE +30 -0
  16. data/spec/fixtures/bsd-3-multilinecopyright/LICENSE +27 -0
  17. data/spec/fixtures/detect.json +1 -1
  18. data/spec/fixtures/fixtures.yml +19 -11
  19. data/spec/fixtures/license-hashes.json +4 -4
  20. data/spec/licensee/content_helper_spec.rb +68 -4
  21. data/spec/licensee/hash_helper_spec.rb +6 -4
  22. data/spec/licensee/license_spec.rb +4 -0
  23. data/spec/licensee/matchers/cabal_matcher_spec.rb +36 -0
  24. data/spec/licensee/matchers/copyright_matcher_spec.rb +11 -0
  25. data/spec/licensee/matchers/dice_matcher_spec.rb +14 -1
  26. data/spec/licensee/matchers/matcher_spec.rb +10 -0
  27. data/spec/licensee/matchers/package_matcher_spec.rb +8 -0
  28. data/spec/licensee/matchers/reference_matcher_spec.rb +4 -0
  29. data/spec/licensee/project_files/project_file_spec.rb +16 -0
  30. data/spec/licensee/projects/project_spec.rb +15 -0
  31. data/spec/licensee/rule_spec.rb +4 -0
  32. data/spec/licensee_spec.rb +8 -0
  33. data/vendor/choosealicense.com/_licenses/bsd-4-clause.txt +1 -1
  34. data/vendor/choosealicense.com/_licenses/zlib.txt +1 -1
  35. data/vendor/license-list-XML/src/0BSD.xml +1 -1
  36. data/vendor/license-list-XML/src/AFL-3.0.xml +1 -1
  37. data/vendor/license-list-XML/src/AGPL-3.0.xml +1 -1
  38. data/vendor/license-list-XML/src/Apache-2.0.xml +1 -2
  39. data/vendor/license-list-XML/src/Artistic-2.0.xml +1 -1
  40. data/vendor/license-list-XML/src/BSD-2-Clause-Patent.xml +1 -1
  41. data/vendor/license-list-XML/src/BSD-2-Clause.xml +1 -1
  42. data/vendor/license-list-XML/src/BSD-3-Clause.xml +1 -1
  43. data/vendor/license-list-XML/src/BSD-4-Clause.xml +1 -1
  44. data/vendor/license-list-XML/src/BSL-1.0.xml +1 -1
  45. data/vendor/license-list-XML/src/ECL-2.0.xml +1 -1
  46. data/vendor/license-list-XML/src/EPL-1.0.xml +1 -1
  47. data/vendor/license-list-XML/src/EPL-2.0.xml +1 -1
  48. data/vendor/license-list-XML/src/EUPL-1.1.xml +1 -1
  49. data/vendor/license-list-XML/src/EUPL-1.2.xml +1 -1
  50. data/vendor/license-list-XML/src/GPL-2.0.xml +6 -3
  51. data/vendor/license-list-XML/src/GPL-3.0.xml +1 -1
  52. data/vendor/license-list-XML/src/ISC.xml +1 -1
  53. data/vendor/license-list-XML/src/LGPL-2.1.xml +6 -3
  54. data/vendor/license-list-XML/src/LGPL-3.0.xml +1 -1
  55. data/vendor/license-list-XML/src/LPPL-1.3c.xml +2 -2
  56. data/vendor/license-list-XML/src/MIT.xml +23 -20
  57. data/vendor/license-list-XML/src/MPL-2.0.xml +1 -1
  58. data/vendor/license-list-XML/src/MS-PL.xml +1 -1
  59. data/vendor/license-list-XML/src/MS-RL.xml +1 -1
  60. data/vendor/license-list-XML/src/NCSA.xml +1 -1
  61. data/vendor/license-list-XML/src/OFL-1.1.xml +1 -1
  62. data/vendor/license-list-XML/src/OSL-3.0.xml +1 -1
  63. data/vendor/license-list-XML/src/PostgreSQL.xml +1 -1
  64. data/vendor/license-list-XML/src/UPL-1.0.xml +1 -1
  65. data/vendor/license-list-XML/src/Zlib.xml +1 -1
  66. metadata +6 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4878becb1edfcb446503f89645e4daeeacc60715ca978a8d4391418b0493f255
4
- data.tar.gz: 651e41f44d60ab16669584edeb693967e4aa1a6a6c911c7a944fc6b35bd05503
3
+ metadata.gz: 5c28e30b13570d0619bc9ec837c115c8184e3ecd3a46f17a5a3146b425c75262
4
+ data.tar.gz: 79fdbe8f8702fa484280083757db94ec93ac7106f3adb5b54aa3287ad7edbc45
5
5
  SHA512:
6
- metadata.gz: c7e96fea4cd873710794c9e68f93edc8f016a620c55464f0f5c8d8ba82c90de8195376f858d8bc6125bb1ac170e980a98557fd135d68ab38ed98e2ba1d5e030e
7
- data.tar.gz: caf4cc661f13ca7ce0ea847e6cd95ec45dd21472eb03cb6862d0cb39c52823818a70373b1133f55570492297543e7db6d4c7adeeefdbef6dd8c5795f67b19919
6
+ metadata.gz: f761cf9ba0303f36693af8d2b91acf2c64b35daf7a2409e7ba7c463a8bba3eea1c7f16a8657e7af47694bbcb640d46507bf341ce62198241d66066e1a0fc8991
7
+ data.tar.gz: 66599cd5fda2597a8d063c547383a259b16b1d8854e7c88057096a2b054193683d3ff2ffd7b47738f30d81d49a68371eee8025fd139187cf9fd8a2bb71b68bc6
data/lib/licensee/content_helper/constants.rb CHANGED
@@ -24,6 +24,8 @@ module Licensee
24
24
  developed_by: /#{START_REGEX}developed by:.*?\n\n/im,
25
25
  cc_dedication: /The\s+text\s+of\s+the\s+Creative\s+Commons.*?Public\s+Domain\s+Dedication\./im,
26
26
  cc_wiki: /wiki\.creativecommons\.org/i,
27
+ cc_preamble: /creative\s+commons\s+corporation.*?(?=by\s+exercising\s+the\s+licensed\s+rights)/im,
28
+ cc_notice: /creative\s+commons\s+is\s+not\s+a\s+party\s+to\s+its\s+public\s+licenses\..*\z/im,
27
29
  cc_legal_code: /^\s*Creative Commons Legal Code\s*$/i,
28
30
  cc0_info: /For more information, please see\s*\S+zero\S+/im,
29
31
  cc0_disclaimer: /CREATIVE COMMONS CORPORATION.*?\n\n/im,
@@ -81,7 +83,7 @@ module Licensee
81
83
  'sub license' => 'sublicense',
82
84
  'utilisation' => 'utilization',
83
85
  'whilst' => 'while',
84
- 'wilful' => 'wilfull',
86
+ 'wilful' => 'willful',
85
87
  'non-commercial' => 'noncommercial',
86
88
  'per cent' => 'percent',
87
89
  'copyright owner' => 'copyright holder'
data/lib/licensee/content_helper/normalization_methods.rb CHANGED
@@ -65,8 +65,13 @@ module Licensee
65
65
  end
66
66
 
67
67
  def strip_copyright
68
- regex = Regexp.union(Matchers::Copyright::REGEX, ContentHelper::REGEXES[:all_rights_reserved])
69
- strip(regex) while _content =~ regex
68
+ copyright_notice_regex = Matchers::Copyright::MAIN_LINE_REGEX
69
+ copyright_regex = Regexp.union(Matchers::Copyright::REGEX, ContentHelper::REGEXES[:all_rights_reserved])
70
+ # Strip opening paragraph only when "All rights reserved." is present — confirms attribution, not license text.
71
+ strip(/\A.*?(?=\n\n)/m) if (p = _content[/\A.*?(?=\n\n)/m]) &&
72
+ p =~ copyright_notice_regex && /all rights reserved/i.match?(p)
73
+ # Strip any remaining copyright lines (e.g. when no blank line is present)
74
+ strip(copyright_regex) while _content =~ copyright_regex
70
75
  end
71
76
 
72
77
  def strip_cc0_optional
@@ -82,6 +87,8 @@ module Licensee
82
87
 
83
88
  strip(ContentHelper::REGEXES[:cc_dedication])
84
89
  strip(ContentHelper::REGEXES[:cc_wiki])
90
+ strip(ContentHelper::REGEXES[:cc_preamble]) if _content.include? 'creative commons corporation'
91
+ strip(ContentHelper::REGEXES[:cc_notice]) if _content.include? 'creative commons is not a party'
85
92
  end
86
93
 
87
94
  def strip_unlicense_optional
@@ -125,7 +132,12 @@ module Licensee
125
132
  end
126
133
 
127
134
  def normalize_spelling
128
- normalize(/\b#{Regexp.union(ContentHelper::VARIETAL_WORDS.keys)}\b/, ContentHelper::VARIETAL_WORDS)
135
+ # Use flexible whitespace between words so that line-wrapped content
136
+ # (e.g. "copyright\nowner") is still normalized correctly.
137
+ ContentHelper::VARIETAL_WORDS.each do |phrase, replacement|
138
+ pattern = phrase.split.map { |w| Regexp.escape(w) }.join('\s+')
139
+ @_content = _content.gsub(/\b#{pattern}\b/, replacement)
140
+ end
129
141
  end
130
142
 
131
143
  def normalize_bullets
data/lib/licensee/content_helper/similarity_methods.rb CHANGED
@@ -7,12 +7,26 @@ module Licensee
7
7
  # Given another license or project file, calculates the similarity
8
8
  # as a percentage of words in common, minus a tiny penalty that
9
9
  # increases with size difference between licenses so that false
10
- # positives for long licnses are ruled out by this score alone.
10
+ # positives for long licenses are ruled out by this score alone.
11
11
  def similarity(other)
12
12
  overlap = (wordset_fieldless & other.wordset).size
13
13
  (overlap * 200.0) / similarity_denominator(other)
14
14
  end
15
15
 
16
+ # Given another license or project file, calculates the Dice coefficient
17
+ # over bigrams (consecutive word pairs). Unlike wordset similarity this
18
+ # is sensitive to word order, making it resistant to adversarial scrambling
19
+ # where all the correct words appear but in the wrong sequence.
20
+ def bigram_similarity(other)
21
+ my_bigrams = bigrams
22
+ other_bigrams = other.bigrams
23
+ total = my_bigrams.size + other_bigrams.size
24
+ return 0.0 if total.zero?
25
+
26
+ overlap = (my_bigrams & other_bigrams).size
27
+ (overlap * 200.0) / total
28
+ end
29
+
16
30
  private
17
31
 
18
32
  def wordset_fieldless
data/lib/licensee/content_helper.rb CHANGED
@@ -14,7 +14,18 @@ module Licensee
14
14
 
15
15
  # A set of each word in the license, without duplicates
16
16
  def wordset
17
- @wordset ||= content_normalized&.scan(%r{(?:[\w/-](?:'s|(?<=s)')?)+})&.to_set
17
+ @wordset ||= words&.to_set
18
+ end
19
+
20
+ # A set of consecutive word pairs (bigrams) in the license, without duplicates.
21
+ # Unlike wordset, bigrams are order-sensitive, making similarity scores
22
+ # robust against adversarial word scrambling (see GitHub issue #602).
23
+ def bigrams
24
+ @bigrams ||= if words.nil? || words.length < 2
25
+ Set.new
26
+ else
27
+ words.each_cons(2).to_set { |a, b| "#{a} #{b}" }
28
+ end
18
29
  end
19
30
 
20
31
  # Number of characters in the normalized content
@@ -72,7 +83,7 @@ module Licensee
72
83
 
73
84
  def self.title_regex
74
85
  @title_regex ||= begin
75
- licenses = Licensee::License.all(hidden: true, psuedo: false)
86
+ licenses = Licensee::License.all(hidden: true, pseudo: false)
76
87
  titles = licenses.map(&:title_regex)
77
88
 
78
89
  # Title regex must include the version to support matching within
@@ -90,6 +101,12 @@ module Licensee
90
101
 
91
102
  private
92
103
 
104
+ # Ordered array of words extracted from the normalized content.
105
+ # Memoized so that both wordset and bigrams share the same scan result.
106
+ def words
107
+ @words ||= content_normalized&.scan(%r{(?:[\w/-](?:'s|(?<=s)')?)+})
108
+ end
109
+
93
110
  def _content
94
111
  @_content ||= content.to_s.dup.strip
95
112
  end
data/lib/licensee/matchers/cabal.rb CHANGED
@@ -6,7 +6,8 @@ module Licensee
6
6
  class Cabal < Licensee::Matchers::Package
7
7
  # While we could parse the cabal file, prefer
8
8
  # a lenient regex for speed and security. Moar parsing moar problems.
9
- LICENSE_REGEX = /^\s*license\s*:\s*([a-z\-0-9.]+)\s*$/ix
9
+ # The "+" suffix is the pre-SPDX Cabal notation for "or-later" (e.g. GPL-2+).
10
+ LICENSE_REGEX = /^\s*license\s*:\s*([a-z\-0-9.+]+)\s*$/ix
10
11
  LICENSE_CONVERSIONS = {
11
12
  'GPL-2' => 'GPL-2.0',
12
13
  'GPL-3' => 'GPL-3.0',
@@ -24,7 +25,9 @@ module Licensee
24
25
  end
25
26
 
26
27
  def spdx_name(cabal_name)
27
- LICENSE_CONVERSIONS[cabal_name] || cabal_name
28
+ # Strip pre-SPDX "or-later" suffix (+) before looking up conversions
29
+ normalized = cabal_name.chomp('+')
30
+ LICENSE_CONVERSIONS[normalized] || normalized
28
31
  end
29
32
  end
30
33
  end
data/lib/licensee/matchers/dice.rb CHANGED
@@ -43,8 +43,9 @@ module Licensee
43
43
  alias licenses_by_similarity matches_by_similarity
44
44
 
45
45
  def matches
46
- @matches ||= matches_by_similarity.select do |_, similarity|
47
- similarity >= minimum_confidence
46
+ @matches ||= matches_by_similarity.select do |license, similarity|
47
+ similarity >= minimum_confidence &&
48
+ license.bigram_similarity(file) >= minimum_bigram_confidence
48
49
  end
49
50
  end
50
51
 
@@ -58,6 +59,15 @@ module Licensee
58
59
  def minimum_confidence
59
60
  Licensee.confidence_threshold
60
61
  end
62
+
63
+ # A floor for bigram similarity, used to reject adversarially scrambled
64
+ # content that achieves high wordset similarity by including all the right
65
+ # words in the wrong order. Set to half the wordset threshold so that any
66
+ # genuine license match (which typically scores 90%+ on bigrams) passes,
67
+ # while scrambled content (which scores near 0%) is rejected.
68
+ def minimum_bigram_confidence
69
+ Licensee.confidence_threshold / 2.0
70
+ end
61
71
  end
62
72
  end
63
73
  end
data/lib/licensee/matchers/gemspec.rb CHANGED
@@ -13,10 +13,6 @@ module Licensee
13
13
  # non-value groups
14
14
  ARRAY_REGEX = /\s*\[#{VALUE_REGEX}(?:,#{VALUE_REGEX})*\]\s*/i
15
15
 
16
- DECLARATION_REGEX = /
17
- ^\s*[a-z0-9_]+\.([a-z0-9_]+)\s*=#{VALUE_REGEX}$
18
- /ix
19
-
20
16
  LICENSE_REGEX = /
21
17
  ^\s*[a-z0-9_]+\.license\s*=#{VALUE_REGEX}$
22
18
  /ix
@@ -45,10 +41,6 @@ module Licensee
45
41
  match = @file.content.match LICENSE_ARRAY_REGEX
46
42
  match.captures.compact.map(&:downcase) if match
47
43
  end
48
-
49
- def declarations
50
- @declarations ||= @file.content.match DECLARATION_REGEX
51
- end
52
44
  end
53
45
  end
54
46
  end
data/lib/licensee/matchers/matcher.rb CHANGED
@@ -19,17 +19,17 @@ module Licensee
19
19
  end
20
20
 
21
21
  def match
22
- raise 'Not implemented'
22
+ raise NotImplementedError, "#{self.class}#match is not implemented"
23
23
  end
24
24
 
25
25
  def confidence
26
- raise 'Not implemented'
26
+ raise NotImplementedError, "#{self.class}#confidence is not implemented"
27
27
  end
28
28
 
29
29
  private
30
30
 
31
31
  def potential_matches
32
- @potential_matches ||= Licensee.licenses(hidden: true, psuedo: false)
32
+ @potential_matches ||= Licensee.licenses(hidden: true, pseudo: false)
33
33
  end
34
34
  end
35
35
  end
data/lib/licensee/matchers/package.rb CHANGED
@@ -25,7 +25,7 @@ module Licensee
25
25
  end
26
26
 
27
27
  def license_property
28
- raise 'Not implemented'
28
+ raise NotImplementedError, "#{self.class}#license_property is not implemented"
29
29
  end
30
30
 
31
31
  private
data/lib/licensee/project_files/project_file.rb CHANGED
@@ -66,7 +66,7 @@ module Licensee
66
66
  alias relative_path path_relative_to_root
67
67
 
68
68
  def possible_matchers
69
- raise 'Not implemented'
69
+ raise NotImplementedError, "#{self.class}#possible_matchers is not implemented"
70
70
  end
71
71
 
72
72
  def matcher
data/lib/licensee/projects/project.rb CHANGED
@@ -152,11 +152,11 @@ module Licensee
152
152
  end
153
153
 
154
154
  def files
155
- raise 'Not implemented'
155
+ raise NotImplementedError, "#{self.class}#files is not implemented"
156
156
  end
157
157
 
158
158
  def load_file(_file)
159
- raise 'Not implemented'
159
+ raise NotImplementedError, "#{self.class}#load_file is not implemented"
160
160
  end
161
161
  end
162
162
  end
data/lib/licensee/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Licensee
4
- VERSION = '9.19.0'
4
+ VERSION = '9.20.0'
5
5
  end
data/spec/fixture_spec.rb CHANGED
@@ -18,9 +18,9 @@ RSpec.describe Fixture do
18
18
  Licensee::License.find('none')
19
19
  end
20
20
 
21
- it 'has an expected license in fixtures-licenses.yml' do
21
+ it 'has an expected license in fixtures.yml' do
22
22
  msg = +'Expected an entry in `'
23
- msg << fixture_path('fixtures-licenses.yml')
23
+ msg << fixture_path('fixtures.yml')
24
24
  msg << "` for the `#{fixture}` fixture. Please run "
25
25
  msg << 'script/dump-fixture-licenses and confirm the output.'
26
26
  expect(fixture_licenses).to have_key(fixture), msg
data/spec/fixtures/bsd-3-linebreak-owner/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Copyright (c) 2023, Karl Pettersson
2
+
3
+ All rights reserved.
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ * Redistributions in binary form must reproduce the above
12
+ copyright notice, this list of conditions and the following
13
+ disclaimer in the documentation and/or other materials provided
14
+ with the distribution.
15
+
16
+ * Neither the name of Karl Pettersson nor the names of other
17
+ contributors may be used to endorse or promote products derived
18
+ from this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/spec/fixtures/bsd-3-multilinecopyright/LICENSE ADDED
@@ -0,0 +1,27 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2016-2026 by University of Kassel and Fraunhofer Institute for Energy Economics
4
+ and Energy System Technology (IEE) Kassel and individual contributors (see AUTHORS file for details).
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without modification, are permitted
8
+ provided that the following conditions are met:
9
+
10
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions
11
+ and the following disclaimer.
12
+
13
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of
14
+ conditions and the following disclaimer in the documentation and/or other materials provided
15
+ with the distribution.
16
+
17
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to
18
+ endorse or promote products derived from this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
21
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22
+ FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
23
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
27
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/spec/fixtures/detect.json CHANGED
@@ -93,7 +93,7 @@
93
93
  },
94
94
  {
95
95
  "filename": "licensee.gemspec",
96
- "content": "# frozen_string_literal: true\n\nrequire File.expand_path('lib/licensee/version', __dir__)\n\nGem::Specification.new do |gem|\n gem.name = 'licensee'\n gem.version = Licensee::VERSION\n\n gem.summary = 'A Ruby Gem to detect open source project licenses'\n gem.description = <<-DESC\n Licensee automates the process of reading LICENSE files and\n compares their contents to known licenses using a fancy maths.\n DESC\n\n gem.authors = ['Ben Balter']\n gem.email = 'ben.balter@github.com'\n gem.homepage = 'https://github.com/benbalter/licensee'\n gem.license = 'MIT'\n gem.metadata['rubygems_mfa_required'] = 'true'\n\n gem.bindir = 'bin'\n gem.executables << 'licensee'\n\n gem.add_dependency('dotenv', '>= 2', '< 4')\n gem.add_dependency('octokit', '>= 4.20', '< 11.0')\n gem.add_dependency('reverse_markdown', '>= 1', '< 4')\n gem.add_dependency('rugged', '>= 0.24', '<2.0')\n gem.add_dependency('thor', '>= 0.19', '< 2.0')\n\n gem.add_development_dependency('gem-release', '~> 2.0')\n gem.add_development_dependency('mustache', '>= 0.9', '< 2.0')\n gem.add_development_dependency('pry', '~> 0.9')\n gem.add_development_dependency('rspec', '~> 3.5')\n gem.add_development_dependency('rubocop', '~> 1.0')\n gem.add_development_dependency('rubocop-performance', '~> 1.5')\n gem.add_development_dependency('rubocop-rspec', '~> 3.0')\n gem.add_development_dependency('simplecov', '~> 0.16')\n gem.add_development_dependency('webmock', '~> 3.1')\n\n gem.required_ruby_version = '>= 3.2'\n\n # ensure the gem is built out of versioned files\n gem.files = Dir[\n '{bin,lib,man,test,vendor,spec}/**/*',\n 'README*', 'LICENSE*'\n ] & `git ls-files -z`.split(\"\\0\")\nend\n",
96
+ "content": "# frozen_string_literal: true\n\nrequire File.expand_path('lib/licensee/version', __dir__)\n\nGem::Specification.new do |gem|\n gem.name = 'licensee'\n gem.version = Licensee::VERSION\n\n gem.summary = 'A Ruby Gem to detect open source project licenses'\n gem.description = <<-DESC\n Licensee automates the process of reading LICENSE files and\n compares their contents to known licenses using a fancy maths.\n DESC\n\n gem.authors = ['Ben Balter']\n gem.email = 'ben.balter@github.com'\n gem.homepage = 'https://github.com/licensee/licensee'\n gem.license = 'MIT'\n gem.metadata['rubygems_mfa_required'] = 'true'\n\n gem.bindir = 'bin'\n gem.executables << 'licensee'\n\n gem.add_dependency('dotenv', '>= 2', '< 4')\n gem.add_dependency('octokit', '>= 4.20', '< 11.0')\n gem.add_dependency('reverse_markdown', '>= 1', '< 4')\n gem.add_dependency('rugged', '>= 0.24', '<2.0')\n gem.add_dependency('thor', '>= 0.19', '< 2.0')\n\n gem.add_development_dependency('gem-release', '~> 2.0')\n gem.add_development_dependency('mustache', '>= 0.9', '< 2.0')\n gem.add_development_dependency('pry', '~> 0.9')\n gem.add_development_dependency('rspec', '~> 3.5')\n gem.add_development_dependency('rubocop', '~> 1.0')\n gem.add_development_dependency('rubocop-performance', '~> 1.5')\n gem.add_development_dependency('rubocop-rspec', '~> 3.0')\n gem.add_development_dependency('simplecov', '~> 0.16')\n gem.add_development_dependency('webmock', '~> 3.1')\n\n gem.required_ruby_version = '>= 3.2'\n\n # ensure the gem is built out of versioned files\n gem.files = Dir[\n '{bin,lib,man,test,vendor,spec}/**/*',\n 'README*', 'LICENSE*'\n ] & `git ls-files -z`.split(\"\\0\")\nend\n",
97
97
  "content_hash": null,
98
98
  "content_normalized": null,
99
99
  "matcher": {
data/spec/fixtures/fixtures.yml CHANGED
@@ -7,7 +7,7 @@ agpl-3.0_markdown:
7
7
  apache-2.0_markdown:
8
8
  key: apache-2.0
9
9
  matcher: dice
10
- hash: 62937105bf0baec879cc6cc96d2fc2ce4922fc6d
10
+ hash: 0b6213bf4bf883f67e804a39c23f92cb63c70f1d
11
11
  apache-with-readme-notice:
12
12
  key: apache-2.0
13
13
  matcher: exact
@@ -28,10 +28,18 @@ bsd-3-authorowner:
28
28
  key: bsd-3-clause
29
29
  matcher: dice
30
30
  hash: 2e6f215833d1a3d10e6194d479dbb2b4be2f64d7
31
+ bsd-3-multilinecopyright:
32
+ key: bsd-3-clause
33
+ matcher: exact
34
+ hash: a961b19cc6921d510e29a13b0ba1a826fcffe41c
31
35
  bsd-3-clause_markdown:
32
36
  key: bsd-3-clause
33
37
  matcher: dice
34
38
  hash: 2449fc8ece2fa342f2e82bbbf86f01d19329a531
39
+ bsd-3-linebreak-owner:
40
+ key: bsd-3-clause
41
+ matcher: dice
42
+ hash: 9a1ab486a9182629581b5598b415df85b48eb008
35
43
  bsd-3-lists:
36
44
  key: bsd-3-clause
37
45
  matcher:
@@ -54,28 +62,28 @@ case-sensitive:
54
62
  hash: da39a3ee5e6b4b0d3255bfef95601890afd80709
55
63
  cc-by-4.0_markdown:
56
64
  key: cc-by-4.0
57
- matcher: dice
58
- hash: b72312bbcd6400de2c6103f2b9981e6239d89f36
65
+ matcher: exact
66
+ hash: f2a70fcab522bfb2fbcdefb47b94d2a928e22091
59
67
  cc-by-nc-sa:
60
68
  key: other
61
69
  matcher:
62
- hash: c1c5bf7b6f130913568b20ed0441a6dbef5d6f11
70
+ hash: ec364756344b00a9f6a59cb239a16634f7b40770
63
71
  cc-by-nd:
64
72
  key: other
65
73
  matcher:
66
- hash: a006cd136405570d81aafb4cfd19de78012bbbe4
74
+ hash: b3cd0e6254ff1116ad09860ec03ea871cff30f64
67
75
  cc-by-sa-4.0_markdown:
68
76
  key: cc-by-sa-4.0
69
- matcher: dice
70
- hash: 973d41ed4a3fca2a1205ace75835029d66c1e224
77
+ matcher: exact
78
+ hash: 33464647bebf0a285d9df642ccb07c8e26b3e268
71
79
  cc-by-sa-mdlinks:
72
80
  key: cc-by-sa-4.0
73
- matcher: dice
74
- hash: 4b0a634a0db5015914cbdde602672b2addfd66e9
81
+ matcher: exact
82
+ hash: a81c2a3a07b59f58b3c09387874724670122fc90
75
83
  cc-by-sa-nocclicensor:
76
84
  key: cc-by-sa-4.0
77
85
  matcher: dice
78
- hash: 9cf238d2d4dd703a149d8b5ae7d3f5a04206f772
86
+ hash: 6fc6425689d3156be764c8e765ae2553a54e3589
79
87
  cc0-1.0_markdown:
80
88
  key: cc0-1.0
81
89
  matcher: dice
@@ -111,7 +119,7 @@ epl-1.0_markdown:
111
119
  eupl-cal2017:
112
120
  key: eupl-1.2
113
121
  matcher: exact
114
- hash: 7425a276b011dea63591fe8876146f2451cfd777
122
+ hash: d8debddd73476c481fc6ceed37c75f0ae97a6e81
115
123
  fcpl-modified-mpl:
116
124
  key: other
117
125
  matcher:
data/spec/fixtures/license-hashes.json CHANGED
@@ -11,8 +11,8 @@
11
11
  "bsd-3-clause-clear": "0fcdb12c4060ce8f406e17bc67787e50a9b36a61",
12
12
  "bsd-4-clause": "3b2917580b2b6f13efaaea37546b8b7a53716a30",
13
13
  "bsl-1.0": "27e28f20b57048cf04be07e1532b6fb501a0753b",
14
- "cc-by-4.0": "7ff5344de1b567d0bb090ea7dd6988b7fa4cd351",
15
- "cc-by-sa-4.0": "f8c9d796e80f6e19458f8b3bbe8bfadd615958a5",
14
+ "cc-by-4.0": "e8c6c40ba40ff2a44f19d74987731e98facc1451",
15
+ "cc-by-sa-4.0": "35168cd69d6ef5b9dd81f7793898fad53c1798ec",
16
16
  "cc0-1.0": "34dbb82be40b15f7c521d4f2d1a36ebe76246936",
17
17
  "cecill-2.1": "ea372810464d71db27e62ad499628991ea2818cf",
18
18
  "cern-ohl-p-2.0": "f10b4b8d75502ab65a7bdbe1d616e5eb8d157aed",
@@ -21,8 +21,8 @@
21
21
  "ecl-2.0": "296976ce9e84ba380866e4519b68a779c2059b3a",
22
22
  "epl-1.0": "5e3cb10996b4ba2821d04d5c99a912c924b3bdcb",
23
23
  "epl-2.0": "e2f3e266432478d9248422228a75a404cce1c43c",
24
- "eupl-1.1": "b35810b4113910f5f85af75f24e2538ba64c8876",
25
- "eupl-1.2": "2098182069695981c6dc71093888f6204c7bbdae",
24
+ "eupl-1.1": "2e384f67f0cb5adb7f63470c5dcea0280873f2b3",
25
+ "eupl-1.2": "169fa5fdd2118679d1453414d0a5d28b2a5fcdc4",
26
26
  "gfdl-1.3": "164a858691ea0a6fb0dd06c5ca00e5dd7620eef8",
27
27
  "gpl-2.0": "32108116603c30687d8d0d2f77f140fb6ecea082",
28
28
  "gpl-3.0": "7d4cdf499d39e2e1ce27b2878e22872f0f5a74dd",
data/spec/licensee/content_helper_spec.rb CHANGED
@@ -13,10 +13,6 @@ class ContentHelperTestHelper
13
13
  def filename
14
14
  @data[:filename]
15
15
  end
16
-
17
- def spdx_id
18
- @data[:spdx_id]
19
- end
20
16
  end
21
17
 
22
18
  RSpec.describe Licensee::ContentHelper do
@@ -57,10 +53,41 @@ RSpec.describe Licensee::ContentHelper do
57
53
  )
58
54
  end
59
55
 
56
+ def expected_bigrams
57
+ Set.new(
58
+ [
59
+ 'the made', 'made up', 'up license', 'license this', 'this license',
60
+ 'license provided', 'provided as', 'as is\'', 'is\' please', 'please respect',
61
+ 'respect the', 'the contributors\'', 'contributors\' wishes', 'wishes when',
62
+ 'when implementing', 'implementing the', 'the license\'s', 'license\'s software'
63
+ ]
64
+ )
65
+ end
66
+
60
67
  it 'creates the wordset' do
61
68
  expect(helper.wordset).to eql(expected_wordset)
62
69
  end
63
70
 
71
+ it 'creates bigrams' do
72
+ expect(helper.bigrams).to eql(expected_bigrams)
73
+ end
74
+
75
+ it 'returns empty set for content with fewer than two words' do
76
+ single_word = ContentHelperTestHelper.new('word', filename: 'LICENSE')
77
+ expect(single_word.bigrams).to eql(Set.new)
78
+ end
79
+
80
+ it 'calculates bigram_similarity for exact content' do
81
+ expect(mit.bigram_similarity(mit)).to eq(100.0)
82
+ end
83
+
84
+ it 'calculates bigram_similarity near zero for scrambled wordset' do
85
+ # All unique words from MIT sorted alphabetically: same wordset, different order.
86
+ sorted_words = mit.content_normalized.scan(%r{(?:[\w/-](?:'s|(?<=s)')?)+}).uniq.sort
87
+ scrambled = ContentHelperTestHelper.new(sorted_words.join(' '), filename: 'LICENSE')
88
+ expect(mit.bigram_similarity(scrambled)).to be < 5.0
89
+ end
90
+
64
91
  it 'knows the length' do
65
92
  expect(helper.length).to be(135)
66
93
  end
@@ -275,6 +302,14 @@ RSpec.describe Licensee::ContentHelper do
275
302
  end
276
303
  end
277
304
 
305
+ context 'when normalizing wilful to willful' do
306
+ let(:content) { 'wilful misconduct' }
307
+
308
+ it 'normalizes wilful to willful' do
309
+ expect(helper.content_normalized).to eql('willful misconduct')
310
+ end
311
+ end
312
+
278
313
  Licensee::License.all(hidden: true).each do |license|
279
314
  context "with the #{license.name} license" do
280
315
  let(:stripped_content) { helper.content_without_title_and_version }
@@ -334,6 +369,35 @@ RSpec.describe Licensee::ContentHelper do
334
369
  expect(normalized_content).to eql('foo')
335
370
  end
336
371
  end
372
+
373
+ context 'with a multi-line copyright holder name followed by All rights reserved' do
374
+ let(:content) do
375
+ "Copyright (c) 2020 by Corporation Name and\n" \
376
+ "its Subsidiaries (see AUTHORS).\n" \
377
+ "All rights reserved.\n\n" \
378
+ 'Foo'
379
+ end
380
+
381
+ it 'strips the wrapped copyright holder continuation and all rights reserved' do
382
+ expect(normalized_content).to eql('foo')
383
+ end
384
+ end
385
+
386
+ context 'with a multi-line copyright holder name without All rights reserved' do
387
+ let(:content) do
388
+ "Copyright (c) 2020 by Corporation Name and\n" \
389
+ "its Subsidiaries (see AUTHORS).\n\n" \
390
+ 'Foo'
391
+ end
392
+
393
+ it 'leaves the ambiguous continuation' do
394
+ expect(normalized_content).to include('subsidiaries')
395
+ end
396
+
397
+ it 'strips the copyright notice line' do
398
+ expect(normalized_content).not_to include('copyright (c) 2020 by corporation name and')
399
+ end
400
+ end
337
401
  end
338
402
 
339
403
  context 'when matching title regex' do
data/spec/licensee/hash_helper_spec.rb CHANGED
@@ -21,13 +21,15 @@ class HashHelperSpecFixture
21
21
  Licensee::Rule.all
22
22
  end
23
23
 
24
- def baz
25
- 'baz'
26
- end
27
-
28
24
  def nil_value
29
25
  nil
30
26
  end
27
+
28
+ # Method not listed in HASH_METHODS; used to ensure HashHelper#to_h
29
+ # does not expose arbitrary instance methods.
30
+ def baz
31
+ 'not included'
32
+ end
31
33
  end
32
34
 
33
35
  RSpec.describe Licensee::HashHelper do
data/spec/licensee/license_spec.rb CHANGED
@@ -204,6 +204,10 @@ RSpec.describe Licensee::License do
204
204
  expect(mit.key).to eql('mit')
205
205
  end
206
206
 
207
+ it 'has a useful inspect string' do
208
+ expect(mit.inspect).to eql('#<Licensee::License key=mit>')
209
+ end
210
+
207
211
  it 'exposes the SPDX ID' do
208
212
  expect(gpl.spdx_id).to eql('GPL-3.0')
209
213
  end
data/spec/licensee/matchers/cabal_matcher_spec.rb CHANGED
@@ -122,6 +122,42 @@ RSpec.describe Licensee::Matchers::Cabal do
122
122
  end
123
123
  end
124
124
 
125
+ context 'with pre-SPDX "or-later" (+) suffix' do
126
+ let(:content) { "license: #{cabal_license}" }
127
+
128
+ context 'with GPL-2+' do
129
+ let(:cabal_license) { 'GPL-2+' }
130
+
131
+ it 'returns GPL-2.0' do
132
+ expect(matcher.match).to eql(Licensee::License.find('GPL-2.0'))
133
+ end
134
+ end
135
+
136
+ context 'with GPL-3+' do
137
+ let(:cabal_license) { 'GPL-3+' }
138
+
139
+ it 'returns GPL-3.0' do
140
+ expect(matcher.match).to eql(Licensee::License.find('GPL-3.0'))
141
+ end
142
+ end
143
+
144
+ context 'with LGPL-3+' do
145
+ let(:cabal_license) { 'LGPL-3+' }
146
+
147
+ it 'returns LGPL-3.0' do
148
+ expect(matcher.match).to eql(Licensee::License.find('LGPL-3.0'))
149
+ end
150
+ end
151
+
152
+ context 'with AGPL-3+' do
153
+ let(:cabal_license) { 'AGPL-3+' }
154
+
155
+ it 'returns AGPL-3.0' do
156
+ expect(matcher.match).to eql(Licensee::License.find('AGPL-3.0'))
157
+ end
158
+ end
159
+ end
160
+
125
161
  context 'with no license field' do
126
162
  let(:content) { 'foo: bar' }
127
163
 
data/spec/licensee/matchers/copyright_matcher_spec.rb CHANGED
@@ -57,4 +57,15 @@ RSpec.describe Licensee::Matchers::Copyright do
57
57
  expect(matcher.match).to be_nil
58
58
  end
59
59
  end
60
+
61
+ context 'with encoding-incompatible content' do
62
+ # A string with non-ASCII bytes in an encoding incompatible with the
63
+ # UTF-8 copyright regex triggers Encoding::CompatibilityError
64
+ let(:raw_content) { (+"\xC2\xA9 2015 Ben Balter").force_encoding('EUC-KR') }
65
+
66
+ it 'returns nil gracefully' do
67
+ allow(file).to receive(:content).and_return(raw_content)
68
+ expect(matcher.match).to be_nil
69
+ end
70
+ end
60
71
  end