licensee 9.10.0 → 9.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. checksums.yaml +4 -4
  2. data/bin/licensee +5 -4
  3. data/lib/licensee.rb +4 -2
  4. data/lib/licensee/commands/detect.rb +10 -5
  5. data/lib/licensee/commands/diff.rb +7 -8
  6. data/lib/licensee/commands/license_path.rb +2 -0
  7. data/lib/licensee/commands/version.rb +2 -0
  8. data/lib/licensee/content_helper.rb +234 -85
  9. data/lib/licensee/hash_helper.rb +7 -5
  10. data/lib/licensee/license.rb +32 -25
  11. data/lib/licensee/license_field.rb +4 -1
  12. data/lib/licensee/license_meta.rb +3 -0
  13. data/lib/licensee/license_rules.rb +2 -0
  14. data/lib/licensee/matchers.rb +2 -0
  15. data/lib/licensee/matchers/cabal.rb +16 -2
  16. data/lib/licensee/matchers/cargo.rb +3 -1
  17. data/lib/licensee/matchers/copyright.rb +6 -4
  18. data/lib/licensee/matchers/cran.rb +7 -3
  19. data/lib/licensee/matchers/dice.rb +6 -4
  20. data/lib/licensee/matchers/dist_zilla.rb +3 -1
  21. data/lib/licensee/matchers/exact.rb +3 -0
  22. data/lib/licensee/matchers/gemspec.rb +8 -5
  23. data/lib/licensee/matchers/matcher.rb +3 -1
  24. data/lib/licensee/matchers/npm_bower.rb +3 -1
  25. data/lib/licensee/matchers/package.rb +3 -0
  26. data/lib/licensee/matchers/reference.rb +3 -1
  27. data/lib/licensee/matchers/spdx.rb +3 -1
  28. data/lib/licensee/project_files.rb +2 -0
  29. data/lib/licensee/project_files/license_file.rb +13 -10
  30. data/lib/licensee/project_files/package_manager_file.rb +3 -0
  31. data/lib/licensee/project_files/project_file.rb +12 -4
  32. data/lib/licensee/project_files/readme_file.rb +5 -3
  33. data/lib/licensee/projects.rb +2 -0
  34. data/lib/licensee/projects/fs_project.rb +3 -0
  35. data/lib/licensee/projects/git_project.rb +19 -11
  36. data/lib/licensee/projects/github_project.rb +6 -1
  37. data/lib/licensee/projects/project.rb +16 -5
  38. data/lib/licensee/rule.rb +2 -0
  39. data/lib/licensee/version.rb +3 -1
  40. data/licensee.gemspec +47 -0
  41. data/spec/bin_spec.rb +3 -1
  42. data/spec/fixture_spec.rb +46 -0
  43. data/spec/fixtures/bsd-3-noendorseslash/LICENSE +30 -0
  44. data/spec/fixtures/cc0-cal2013/LICENSE +116 -0
  45. data/spec/fixtures/cc0-cc/LICENSE +121 -0
  46. data/spec/fixtures/detect.json +9 -7
  47. data/spec/fixtures/fixtures.yml +130 -0
  48. data/spec/fixtures/html/license.html +262 -0
  49. data/spec/fixtures/license-hashes.json +41 -0
  50. data/spec/fixtures/mit-optional/LICENSE.txt +21 -0
  51. data/spec/fixtures/multiple-arrs/LICENSE +30 -0
  52. data/spec/fixtures/unlicense-noinfo/LICENSE +22 -0
  53. data/spec/integration_spec.rb +68 -2
  54. data/spec/licensee/commands/detect_spec.rb +10 -6
  55. data/spec/licensee/commands/license_path_spec.rb +3 -1
  56. data/spec/licensee/commands/version_spec.rb +3 -1
  57. data/spec/licensee/content_helper_spec.rb +184 -67
  58. data/spec/licensee/hash_helper_spec.rb +3 -1
  59. data/spec/licensee/license_field_spec.rb +5 -3
  60. data/spec/licensee/license_meta_spec.rb +16 -12
  61. data/spec/licensee/license_rules_spec.rb +6 -2
  62. data/spec/licensee/license_spec.rb +62 -37
  63. data/spec/licensee/matchers/cabal_matcher_spec.rb +97 -2
  64. data/spec/licensee/matchers/cargo_matcher_spec.rb +5 -2
  65. data/spec/licensee/matchers/copyright_matcher_spec.rb +7 -5
  66. data/spec/licensee/matchers/cran_matcher_spec.rb +5 -2
  67. data/spec/licensee/matchers/dice_matcher_spec.rb +15 -12
  68. data/spec/licensee/matchers/dist_zilla_matcher_spec.rb +5 -2
  69. data/spec/licensee/matchers/exact_matcher_spec.rb +5 -2
  70. data/spec/licensee/matchers/gemspec_matcher_spec.rb +5 -2
  71. data/spec/licensee/matchers/matcher_spec.rb +6 -2
  72. data/spec/licensee/matchers/npm_bower_matcher_spec.rb +5 -3
  73. data/spec/licensee/matchers/package_matcher_spec.rb +6 -2
  74. data/spec/licensee/matchers/reference_matcher_spec.rb +4 -2
  75. data/spec/licensee/matchers/spdx_matcher_spec.rb +5 -2
  76. data/spec/licensee/project_files/license_file_spec.rb +20 -18
  77. data/spec/licensee/project_files/package_info_spec.rb +5 -1
  78. data/spec/licensee/project_files/project_file_spec.rb +8 -2
  79. data/spec/licensee/project_files/readme_file_spec.rb +4 -1
  80. data/spec/licensee/project_spec.rb +24 -17
  81. data/spec/licensee/projects/git_project_spec.rb +23 -0
  82. data/spec/licensee/projects/github_project_spec.rb +8 -5
  83. data/spec/licensee/rule_spec.rb +6 -3
  84. data/spec/licensee_spec.rb +12 -9
  85. data/spec/spec_helper.rb +28 -9
  86. data/spec/vendored_license_spec.rb +29 -10
  87. data/vendor/choosealicense.com/_data/meta.yml +0 -4
  88. data/vendor/choosealicense.com/_data/rules.yml +3 -0
  89. data/vendor/choosealicense.com/_licenses/0bsd.txt +39 -0
  90. data/vendor/choosealicense.com/_licenses/afl-3.0.txt +7 -6
  91. data/vendor/choosealicense.com/_licenses/agpl-3.0.txt +0 -1
  92. data/vendor/choosealicense.com/_licenses/apache-2.0.txt +1 -2
  93. data/vendor/choosealicense.com/_licenses/artistic-2.0.txt +1 -2
  94. data/vendor/choosealicense.com/_licenses/bsd-2-clause.txt +8 -6
  95. data/vendor/choosealicense.com/_licenses/bsd-3-clause-clear.txt +2 -2
  96. data/vendor/choosealicense.com/_licenses/bsd-3-clause.txt +12 -10
  97. data/vendor/choosealicense.com/_licenses/bsd-4-clause.txt +61 -0
  98. data/vendor/choosealicense.com/_licenses/bsl-1.0.txt +5 -2
  99. data/vendor/choosealicense.com/_licenses/cc-by-4.0.txt +3 -1
  100. data/vendor/choosealicense.com/_licenses/cc-by-sa-4.0.txt +3 -1
  101. data/vendor/choosealicense.com/_licenses/cc0-1.0.txt +113 -105
  102. data/vendor/choosealicense.com/_licenses/cecill-2.1.txt +579 -0
  103. data/vendor/choosealicense.com/_licenses/ecl-2.0.txt +1 -2
  104. data/vendor/choosealicense.com/_licenses/epl-1.0.txt +1 -2
  105. data/vendor/choosealicense.com/_licenses/epl-2.0.txt +3 -4
  106. data/vendor/choosealicense.com/_licenses/eupl-1.1.txt +0 -1
  107. data/vendor/choosealicense.com/_licenses/eupl-1.2.txt +0 -1
  108. data/vendor/choosealicense.com/_licenses/gpl-2.0.txt +0 -1
  109. data/vendor/choosealicense.com/_licenses/gpl-3.0.txt +1 -2
  110. data/vendor/choosealicense.com/_licenses/isc.txt +0 -1
  111. data/vendor/choosealicense.com/_licenses/lgpl-2.1.txt +0 -1
  112. data/vendor/choosealicense.com/_licenses/lgpl-3.0.txt +1 -3
  113. data/vendor/choosealicense.com/_licenses/lppl-1.3c.txt +1 -2
  114. data/vendor/choosealicense.com/_licenses/mit.txt +1 -2
  115. data/vendor/choosealicense.com/_licenses/mpl-2.0.txt +0 -1
  116. data/vendor/choosealicense.com/_licenses/ms-pl.txt +0 -1
  117. data/vendor/choosealicense.com/_licenses/ms-rl.txt +0 -1
  118. data/vendor/choosealicense.com/_licenses/ncsa.txt +21 -22
  119. data/vendor/choosealicense.com/_licenses/odbl-1.0.txt +573 -0
  120. data/vendor/choosealicense.com/_licenses/ofl-1.1.txt +4 -2
  121. data/vendor/choosealicense.com/_licenses/osl-3.0.txt +1 -2
  122. data/vendor/choosealicense.com/_licenses/postgresql.txt +4 -5
  123. data/vendor/choosealicense.com/_licenses/unlicense.txt +1 -2
  124. data/vendor/choosealicense.com/_licenses/upl-1.0.txt +4 -5
  125. data/vendor/choosealicense.com/_licenses/vim.txt +111 -0
  126. data/vendor/choosealicense.com/_licenses/wtfpl.txt +0 -1
  127. data/vendor/choosealicense.com/_licenses/zlib.txt +4 -2
  128. metadata +77 -19
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa1cf2d146414880f019c339c2803443db7d63eb2568f7807079aac27a7aec98
4
- data.tar.gz: a6243b9cc4b9068090023c70c89a5ea0253580575b534da0c00deaa799441bb5
3
+ metadata.gz: bd74e74f07c0dc4111b3cbbebf62ae3ab140a788ed511491b5995d45f2371de0
4
+ data.tar.gz: 90561b8b85ded55614a88e38b0469e80d110bd113e3a6ddfee9bdd840dec237a
5
5
  SHA512:
6
- metadata.gz: cff2872a3d87b3c708ac2585dc41b64dd665008064303ec66a3501ec5160dbc87b5e78a4128921d5a95e3e7eb160ecc625d3d12f0fad36b05b59fa577088581b
7
- data.tar.gz: 96466e4da40eb17a01625110f236c40cc90cb99b2e39c7057f2867721c77c565ba16cf08d4c4b02c799d1675a11c765e0dd313c761e273354b19fa8cb0fa1528
6
+ metadata.gz: bc64456f4f05411ab8152ce3ade984aa52851f39399d0bffe1e6c51fa4f9de6b00fcd84008bed0d0b71b0569a9799ba0b539ce6323fee55b4436c583ae4ce92c
7
+ data.tar.gz: d797f174a17e2f91eb8f4e7c9eff9b3d4631f58531d4d57cc9da98e75e062345a52354a9ca3a8e1f991cb70a92b91e7b600814dddca345c3528742011e92c8ed
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'dotenv/load'
4
5
  require 'thor'
@@ -15,9 +16,9 @@ class LicenseeCLI < Thor
15
16
 
16
17
  def path
17
18
  @path ||= if !options[:remote] || args.first =~ %r{^https://}
18
- args.first || Dir.pwd
19
- else
20
- "https://github.com/#{args.first}"
19
+ args.first || Dir.pwd
20
+ else
21
+ "https://github.com/#{args.first}"
21
22
  end
22
23
  end
23
24
 
@@ -32,6 +33,6 @@ class LicenseeCLI < Thor
32
33
  end
33
34
 
34
35
  commands_dir = File.expand_path '../lib/licensee/commands/', __dir__
35
- Dir["#{commands_dir}/*.rb"].each { |c| require(c) }
36
+ Dir["#{commands_dir}/*.rb"].sort.each { |c| require(c) }
36
37
 
37
38
  LicenseeCLI.start(ARGV)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'licensee/version'
2
4
  require 'forwardable'
3
5
  require 'pathname'
@@ -19,7 +21,7 @@ module Licensee
19
21
  CONFIDENCE_THRESHOLD = 98
20
22
 
21
23
  # Base domain from which to build license URLs
22
- DOMAIN = 'http://choosealicense.com'.freeze
24
+ DOMAIN = 'http://choosealicense.com'
23
25
 
24
26
  class << self
25
27
  attr_writer :confidence_threshold
@@ -49,7 +51,7 @@ module Licensee
49
51
  end
50
52
 
51
53
  # Inverse of the confidence threshold, represented as a float
52
- # By default this will be 0.05
54
+ # By default this will be 0.02
53
55
  def inverse_confidence_threshold
54
56
  @inverse_confidence_threshold ||=
55
57
  (1 - Licensee.confidence_threshold / 100.0).round(2)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  # Methods to call when displaying information about ProjectFiles
3
5
  MATCHED_FILE_METHODS = %i[
@@ -21,11 +23,11 @@ class LicenseeCLI < Thor
21
23
 
22
24
  rows = []
23
25
  rows << if project.license
24
- ['License:', project.license.spdx_id]
25
- elsif !project.licenses.empty?
26
- ['Licenses:', project.licenses.map(&:spdx_id)]
27
- else
28
- ['License:', set_color('None', :red)]
26
+ ['License:', project.license.spdx_id]
27
+ elsif !project.licenses.empty?
28
+ ['Licenses:', project.licenses.map(&:spdx_id)]
29
+ else
30
+ ['License:', set_color('None', :red)]
29
31
  end
30
32
 
31
33
  unless project.matched_files.empty?
@@ -40,8 +42,10 @@ class LicenseeCLI < Thor
40
42
 
41
43
  MATCHED_FILE_METHODS.each do |method|
42
44
  next unless matched_file.respond_to? method
45
+
43
46
  value = matched_file.public_send method
44
47
  next if value.nil?
48
+
45
49
  rows << [humanize(method, :method), humanize(value, method)]
46
50
  end
47
51
  print_table rows, indent: 2
@@ -51,6 +55,7 @@ class LicenseeCLI < Thor
51
55
 
52
56
  licenses = licenses_by_similarity(matched_file)
53
57
  next if licenses.empty?
58
+
54
59
  say ' Closest non-matching licenses:'
55
60
  rows = licenses[0...3].map do |license, similarity|
56
61
  spdx_id = license.meta['spdx-id']
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'tmpdir'
2
4
 
3
5
  class LicenseeCLI < Thor
@@ -39,26 +41,23 @@ class LicenseeCLI < Thor
39
41
 
40
42
  def license_to_diff
41
43
  return options[:license_to_diff] if options[:license_to_diff]
42
- return project.license_file if remote?
44
+ return project.license_file if remote? || STDIN.tty? && project.license_file
43
45
 
44
46
  @license_to_diff ||= begin
45
- if STDIN.tty?
46
- error 'You must pipe license contents to the command via STDIN'
47
- exit 1
48
- end
49
-
50
47
  Licensee::ProjectFiles::LicenseFile.new(STDIN.read, 'LICENSE')
51
48
  end
52
49
  end
53
50
 
54
51
  def expected_license
55
- @expected_license ||= Licensee::License.find options[:license] if options[:license]
52
+ if options[:license]
53
+ @expected_license ||= Licensee::License.find options[:license]
54
+ end
56
55
  return @expected_license if @expected_license
57
56
 
58
57
  if options[:license]
59
58
  error "#{options[:license]} is not a valid license"
60
59
  else
61
- error 'You must provide an expected license'
60
+ error 'Usage: provide a license to diff against with --license (spdx name)'
62
61
  end
63
62
 
64
63
  error "Valid licenses: #{Licensee::License.all(hidden: true).map(&:key).join(', ')}"
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  desc 'license-path [PATH]', "Returns the path to the given project's license file"
3
5
  def license_path(_path)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  desc 'version', 'Return the Licensee version'
3
5
  def version
@@ -1,38 +1,129 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'set'
2
4
  require 'digest'
3
5
 
4
6
  module Licensee
5
7
  module ContentHelper
6
8
  DIGEST = Digest::SHA1
7
- END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i
8
- HR_REGEX = /[=\-\*][=\-\*\s]{3,}/
9
- ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
10
- ALL_RIGHTS_RESERVED_REGEX = /\Aall rights reserved\.?$/i
11
- WHITESPACE_REGEX = /\s+/
12
- MARKDOWN_HEADING_REGEX = /\A\s*#+/
13
- VERSION_REGEX = /\Aversion.*$/i
14
- MARKUP_REGEX = /[#_*=~\[\]()`|>]+/
15
- DEVELOPED_BY_REGEX = /\Adeveloped by:.*?\n\n/im
16
- QUOTE_BEGIN_REGEX = /[`'"‘“]/
17
- QUOTE_END_REGEX = /['"’”]/
9
+ START_REGEX = /\A\s*/.freeze
10
+ END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i.freeze
11
+ REGEXES = {
12
+ hrs: /^\s*[=\-\*]{3,}\s*$/,
13
+ all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
14
+ whitespace: /\s+/,
15
+ markdown_headings: /#{START_REGEX}#+/,
16
+ version: /#{START_REGEX}version.*$/i,
17
+ span_markup: /[_*~]+(.*?)[_*~]+/,
18
+ link_markup: /\[(.+?)\]\(.+?\)/,
19
+ block_markup: /^\s*>/,
20
+ border_markup: /^[\*-](.*?)[\*-]$/,
21
+ comment_markup: %r{^\s*?[/\*]{1,2}},
22
+ url: %r{#{START_REGEX}https?://[^ ]+\n},
23
+ bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
24
+ developed_by: /#{START_REGEX}developed by:.*?\n\n/im,
25
+ quote_begin: /[`'"‘“]/,
26
+ quote_end: /[`'"’”]/,
27
+ cc_legal_code: /^\s*Creative Commons Legal Code\s*$/i,
28
+ cc0_info: /For more information, please see\s*\S+zero\S+/im,
29
+ cc0_disclaimer: /CREATIVE COMMONS CORPORATION.*?\n\n/im,
30
+ unlicense_info: /For more information, please.*\S+unlicense\S+/im,
31
+ mit_optional: /\(including the next paragraph\)/i
32
+ }.freeze
33
+ NORMALIZATIONS = {
34
+ lists: { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
35
+ https: { from: /http:/, to: 'https:' },
36
+ ampersands: { from: '&', to: 'and' },
37
+ dashes: { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
38
+ quotes: {
39
+ from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
40
+ to: '"\1"'
41
+ }
42
+ }.freeze
43
+
44
+ # Legally equivalent words that should be ignored for comparison
45
+ # See https://spdx.org/spdx-license-list/matching-guidelines
46
+ VARIETAL_WORDS = {
47
+ 'acknowledgment' => 'acknowledgement',
48
+ 'analogue' => 'analog',
49
+ 'analyse' => 'analyze',
50
+ 'artefact' => 'artifact',
51
+ 'authorisation' => 'authorization',
52
+ 'authorised' => 'authorized',
53
+ 'calibre' => 'caliber',
54
+ 'cancelled' => 'canceled',
55
+ 'capitalisations' => 'capitalizations',
56
+ 'catalogue' => 'catalog',
57
+ 'categorise' => 'categorize',
58
+ 'centre' => 'center',
59
+ 'emphasised' => 'emphasized',
60
+ 'favour' => 'favor',
61
+ 'favourite' => 'favorite',
62
+ 'fulfil' => 'fulfill',
63
+ 'fulfilment' => 'fulfillment',
64
+ 'initialise' => 'initialize',
65
+ 'judgment' => 'judgement',
66
+ 'labelling' => 'labeling',
67
+ 'labour' => 'labor',
68
+ 'licence' => 'license',
69
+ 'maximise' => 'maximize',
70
+ 'modelled' => 'modeled',
71
+ 'modelling' => 'modeling',
72
+ 'offence' => 'offense',
73
+ 'optimise' => 'optimize',
74
+ 'organisation' => 'organization',
75
+ 'organise' => 'organize',
76
+ 'practise' => 'practice',
77
+ 'programme' => 'program',
78
+ 'realise' => 'realize',
79
+ 'recognise' => 'recognize',
80
+ 'signalling' => 'signaling',
81
+ 'sub-license' => 'sublicense',
82
+ 'sub license' => 'sublicense',
83
+ 'utilisation' => 'utilization',
84
+ 'whilst' => 'while',
85
+ 'wilful' => 'wilfull',
86
+ 'non-commercial' => 'noncommercial',
87
+ 'cent' => 'percent',
88
+ 'owner' => 'holder'
89
+ }.freeze
90
+ STRIP_METHODS = %i[
91
+ cc0_optional
92
+ unlicense_optional
93
+ hrs
94
+ markdown_headings
95
+ borders
96
+ title
97
+ version
98
+ url
99
+ copyright
100
+ title
101
+ block_markup
102
+ span_markup
103
+ link_markup
104
+ developed_by
105
+ end_of_terms
106
+ whitespace
107
+ mit_optional
108
+ ].freeze
18
109
 
19
110
  # A set of each word in the license, without duplicates
20
111
  def wordset
21
- @wordset ||= if content_normalized
22
- content_normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
23
- end
112
+ @wordset ||= content_normalized&.scan(%r{(?:[\w\/](?:'s|(?<=s)')?)+})&.to_set
24
113
  end
25
114
 
26
115
  # Number of characters in the normalized content
27
116
  def length
28
117
  return 0 unless content_normalized
118
+
29
119
  content_normalized.length
30
120
  end
31
121
 
32
122
  # Number of characters that could be added/removed to still be
33
123
  # considered a potential match
34
124
  def max_delta
35
- @max_delta ||= (length * Licensee.inverse_confidence_threshold).to_i
125
+ @max_delta ||= fields_normalized.size * 10 +
126
+ (length * Licensee.inverse_confidence_threshold).to_i
36
127
  end
37
128
 
38
129
  # Given another license or project file, calculates the difference in length
@@ -43,8 +134,9 @@ module Licensee
43
134
  # Given another license or project file, calculates the similarity
44
135
  # as a percentage of words in common
45
136
  def similarity(other)
46
- overlap = (wordset & other.wordset).size
47
- total = wordset.size + other.wordset.size
137
+ overlap = (wordset_fieldless & other.wordset).size
138
+ total = wordset_fieldless.size + other.wordset.size -
139
+ fields_normalized_set.size
48
140
  100.0 * (overlap * 2.0 / total)
49
141
  end
50
142
 
@@ -59,34 +151,21 @@ module Licensee
59
151
  # content with attribution first to detect attribution in LicenseFile
60
152
  def content_without_title_and_version
61
153
  @content_without_title_and_version ||= begin
62
- string = content.strip
63
- string = strip_markdown_headings(string)
64
- string = strip_hrs(string)
65
- string = strip_title(string) while string =~ ContentHelper.title_regex
66
- strip_version(string).strip
154
+ @_content = nil
155
+ ops = %i[html hrs comments markdown_headings title version]
156
+ ops.each { |op| strip(op) }
157
+ _content
67
158
  end
68
159
  end
69
160
 
70
- # Content without title, version, copyright, whitespace, or insturctions
71
- #
72
- # wrap - Optional width to wrap the content
73
- #
74
- # Returns a string
75
161
  def content_normalized(wrap: nil)
76
- return unless content
77
162
  @content_normalized ||= begin
78
- string = content_without_title_and_version.downcase
79
- while string =~ Matchers::Copyright::REGEX
80
- string = strip_copyright(string)
81
- end
82
- string = strip_all_rights_reserved(string)
83
- string = strip_developed_by(string)
84
- string, _partition, _instructions = string.partition(END_OF_TERMS_REGEX)
85
- string = normalize_lists(string)
86
- string = normalize_quotes(string)
87
- string = normalize_https(string)
88
- string = strip_markup(string)
89
- strip_whitespace(string)
163
+ @_content = content_without_title_and_version.downcase
164
+
165
+ (NORMALIZATIONS.keys + %i[spelling bullets]).each { |op| normalize(op) }
166
+ STRIP_METHODS.each { |op| strip(op) }
167
+
168
+ _content
90
169
  end
91
170
 
92
171
  if wrap.nil?
@@ -96,14 +175,24 @@ module Licensee
96
175
  end
97
176
  end
98
177
 
178
+ # Keep the old *_REGEX constants backwards compatible to avoid a breaking change
179
+ def self.const_missing(const)
180
+ key = const.to_s.downcase.gsub('_regex', '').to_sym
181
+ REGEXES[key] || super
182
+ end
183
+
99
184
  # Wrap text to the given line length
100
185
  def self.wrap(text, line_width = 80)
101
186
  return if text.nil?
187
+
102
188
  text = text.clone
189
+ text.gsub!(REGEXES[:bullet]) { |m| "\n#{m}\n" }
103
190
  text.gsub!(/([^\n])\n([^\n])/, '\1 \2')
104
191
 
105
192
  text = text.split("\n").collect do |line|
106
- if line.length > line_width
193
+ if line =~ REGEXES[:hrs]
194
+ line
195
+ elsif line.length > line_width
107
196
  line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
108
197
  else
109
198
  line
@@ -114,82 +203,142 @@ module Licensee
114
203
  end
115
204
 
116
205
  def self.format_percent(float)
117
- "#{format('%.2f', float)}%"
206
+ "#{format('%<float>.2f', float: float)}%"
118
207
  end
119
208
 
120
209
  def self.title_regex
121
- licenses = Licensee::License.all(hidden: true, psuedo: false)
122
- titles = licenses.map(&:title_regex)
123
-
124
- # Title regex must include the version to support matching within
125
- # families, but for sake of normalization, we can be less strict
126
- without_versions = licenses.map do |license|
127
- next if license.title == license.name_without_version
128
- Regexp.new Regexp.escape(license.name_without_version), 'i'
129
- end
130
- titles.concat(without_versions.compact)
210
+ @title_regex ||= begin
211
+ licenses = Licensee::License.all(hidden: true, psuedo: false)
212
+ titles = licenses.map(&:title_regex)
213
+
214
+ # Title regex must include the version to support matching within
215
+ # families, but for sake of normalization, we can be less strict
216
+ without_versions = licenses.map do |license|
217
+ next if license.title == license.name_without_version
218
+
219
+ Regexp.new Regexp.escape(license.name_without_version), 'i'
220
+ end
221
+ titles.concat(without_versions.compact)
131
222
 
132
- /\A\s*\(?(the )?#{Regexp.union titles}.*$/i
223
+ /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
224
+ end
133
225
  end
134
226
 
135
227
  private
136
228
 
137
- def strip_title(string)
138
- strip(string, ContentHelper.title_regex)
229
+ def _content
230
+ @_content ||= content.to_s.dup.strip
231
+ end
232
+
233
+ def strip(regex_or_sym)
234
+ return unless _content
235
+
236
+ if regex_or_sym.is_a?(Symbol)
237
+ meth = "strip_#{regex_or_sym}"
238
+ return send(meth) if respond_to?(meth, true)
239
+
240
+ unless REGEXES[regex_or_sym]
241
+ raise ArgumentError, "#{regex_or_sym} is an invalid regex reference"
242
+ end
243
+
244
+ regex_or_sym = REGEXES[regex_or_sym]
245
+ end
246
+
247
+ @_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip
248
+ end
249
+
250
+ def strip_title
251
+ while _content =~ ContentHelper.title_regex
252
+ strip(ContentHelper.title_regex)
253
+ end
254
+ end
255
+
256
+ def strip_borders
257
+ normalize(REGEXES[:border_markup], '\1')
258
+ end
259
+
260
+ def strip_comments
261
+ lines = _content.split("\n")
262
+ return if lines.count == 1
263
+ return unless lines.all? { |line| line =~ REGEXES[:comment_markup] }
264
+
265
+ strip(:comment_markup)
139
266
  end
140
267
 
141
- def strip_version(string)
142
- strip(string, VERSION_REGEX)
268
+ def strip_copyright
269
+ regex = Regexp.union(Matchers::Copyright::REGEX, REGEXES[:all_rights_reserved])
270
+ strip(regex) while _content =~ regex
271
+ end
272
+
273
+ def strip_cc0_optional
274
+ return unless _content.include? 'associating cc0'
275
+
276
+ strip(REGEXES[:cc_legal_code])
277
+ strip(REGEXES[:cc0_info])
278
+ strip(REGEXES[:cc0_disclaimer])
143
279
  end
144
280
 
145
- def strip_copyright(string)
146
- strip(string, Matchers::Copyright::REGEX)
281
+ def strip_unlicense_optional
282
+ return unless _content.include? 'unlicense'
283
+
284
+ strip(REGEXES[:unlicense_info])
147
285
  end
148
286
 
149
- # Strip HRs from MPL
150
- def strip_hrs(string)
151
- strip(string, HR_REGEX)
287
+ def strip_end_of_terms
288
+ body, _partition, _instructions = _content.partition(END_OF_TERMS_REGEX)
289
+ @_content = body
152
290
  end
153
291
 
154
- # Strip leading #s from the document
155
- def strip_markdown_headings(string)
156
- strip(string, MARKDOWN_HEADING_REGEX)
292
+ def strip_span_markup
293
+ normalize(REGEXES[:span_markup], '\1')
157
294
  end
158
295
 
159
- def strip_whitespace(string)
160
- strip(string, WHITESPACE_REGEX)
296
+ def strip_link_markup
297
+ normalize(REGEXES[:link_markup], '\1')
161
298
  end
162
299
 
163
- def strip_all_rights_reserved(string)
164
- strip(string, ALL_RIGHTS_RESERVED_REGEX)
300
+ def strip_html
301
+ return unless respond_to?(:filename) && filename
302
+ return unless File.extname(filename) =~ /\.html?/i
303
+
304
+ require 'reverse_markdown'
305
+ @_content = ReverseMarkdown.convert(_content, unknown_tags: :bypass)
165
306
  end
166
307
 
167
- def strip_markup(string)
168
- strip(string, MARKUP_REGEX)
308
+ def normalize(from_or_key, to = nil)
309
+ operation = { from: from_or_key, to: to } if to
310
+ operation ||= NORMALIZATIONS[from_or_key]
311
+
312
+ if operation
313
+ @_content = _content.gsub operation[:from], operation[:to]
314
+ elsif respond_to?("normalize_#{from_or_key}", true)
315
+ send("normalize_#{from_or_key}")
316
+ else
317
+ raise ArgumentError, "#{from_or_key} is an invalid normalization"
318
+ end
169
319
  end
170
320
 
171
- def strip_developed_by(string)
172
- strip(string, DEVELOPED_BY_REGEX)
321
+ def normalize_spelling
322
+ normalize(/\b#{Regexp.union(VARIETAL_WORDS.keys)}\b/, VARIETAL_WORDS)
173
323
  end
174
324
 
175
- def strip(string, regex)
176
- string.gsub(regex, ' ').squeeze(' ').strip
325
+ def normalize_bullets
326
+ normalize(REGEXES[:bullet], "\n\n* ")
327
+ normalize(/\)\s+\(/, ')(')
177
328
  end
178
329
 
179
- # Replace all enclosing quotes with double quotes
180
- # Single versus double quotes don't alter the meaning, and it's easier to
181
- # strip double quotes if we still want to allow possessives
182
- def normalize_quotes(string)
183
- string.gsub(/#{QUOTE_BEGIN_REGEX}+([\w -]*?\w)#{QUOTE_END_REGEX}+/,
184
- '"\1"')
330
+ def wordset_fieldless
331
+ @wordset_fieldless ||= wordset - fields_normalized_set
185
332
  end
186
333
 
187
- def normalize_https(string)
188
- string.gsub(/http:/, 'https:')
334
+ # Returns an array of strings of substitutable fields in normalized content
335
+ def fields_normalized
336
+ @fields_normalized ||=
337
+ content_normalized.scan(LicenseField::FIELD_REGEX).flatten
189
338
  end
190
339
 
191
- def normalize_lists(string)
192
- string.gsub(/^\s*(\d\.|\*)/, '-')
340
+ def fields_normalized_set
341
+ @fields_normalized_set ||= fields_normalized.to_set
193
342
  end
194
343
  end
195
344
  end